In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Load the raw liver-patient records and take a first look at their structure.
df = pd.read_csv("../Data/indian_liver_patient.csv")
print("Initial shape:", df.shape)
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
df.head()

Initial shape: (583, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB
None


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
# Flag every row in which at least one numeric column lies more than
# Z_THRESHOLD standard deviations away from that column's mean.
Z_THRESHOLD = 3

numeric_cols = df.select_dtypes(include=[np.number])

# Albumin_and_Globulin_Ratio still contains missing values at this point.
# With the default nan_policy='propagate' a single NaN turns the whole column's
# z-scores into NaN, so that column could never flag an outlier. 'omit' computes
# the mean/std from the observed values only; the missing cells themselves stay
# NaN and therefore compare False against the threshold.
z_scores = np.abs(
    stats.zscore(numeric_cols.to_numpy(dtype=float), nan_policy='omit')
)

df_outliers = df[(z_scores > Z_THRESHOLD).any(axis=1)]

print(f"Found {len(df_outliers)} outliers:")
display(df_outliers)

Found 45 outliers:


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
25,34,Male,4.1,2.0,289,875,731,5.0,2.7,1.1,1
26,34,Male,4.1,2.0,289,875,731,5.0,2.7,1.1,1
27,34,Male,6.2,3.0,240,1680,850,7.2,4.0,1.2,1
47,40,Male,1.1,0.3,230,1630,960,4.9,2.8,1.3,1
77,68,Female,0.6,0.1,1620,95,127,4.6,2.1,0.8,1
115,50,Male,7.3,3.6,1580,88,64,5.6,2.3,0.6,2
116,48,Male,0.7,0.1,1630,74,149,5.3,2.0,0.6,1
117,32,Male,12.7,6.2,194,2000,2946,5.7,3.3,1.3,1
118,32,Male,15.9,7.0,280,1350,1600,5.6,2.8,1.0,1
119,32,Male,18.0,8.2,298,1250,1050,5.4,2.6,0.9,1


In [4]:
# Fill gaps in the A/G ratio with the column median (only when gaps exist)
ratio_col = 'Albumin_and_Globulin_Ratio'
has_gaps = df[ratio_col].isna().any()
if has_gaps:
    med = df[ratio_col].median()
    df.loc[:, ratio_col] = df[ratio_col].fillna(med)

# Replace the Gender strings with integer codes (Female = 0, Male = 1)
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

# Pair each original label with the code the fitted encoder assigns to it
gender_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}

print("Nulls after imputation:\n", df.isnull().sum())
print("Gender mapping:", gender_mapping)
print("Data shape now:", df.shape)

df.head()

Nulls after imputation:
 Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64
Gender mapping: {'Female': 0, 'Male': 1}
Data shape now: (583, 11)


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [5]:
# Remove the rows flagged as outliers, then renumber the remaining rows from 0.
# NOTE: not safe to re-run on its own -- once the index has been reset, the
# labels held in df_outliers no longer refer to the same rows of df.
outlier_labels = df_outliers.index
df = df.drop(index=outlier_labels)
df = df.reset_index(drop=True)
print("After outlier removal:", df.shape)

After outlier removal: (538, 11)


In [6]:
# Final sanity check before saving: show the schema and the per-column null
# counts, then drop any row that still has a missing value.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(df.info()) would emit an extra "None" line.
df.info()
print("\nMissing per column:\n", df.isnull().sum())

# Reassign instead of inplace=True so the step reads as an explicit transform.
df = df.dropna()
print("\nAfter dropna, new shape:", df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538 entries, 0 to 537
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         538 non-null    int64  
 1   Gender                      538 non-null    int32  
 2   Total_Bilirubin             538 non-null    float64
 3   Direct_Bilirubin            538 non-null    float64
 4   Alkaline_Phosphotase        538 non-null    int64  
 5   Alamine_Aminotransferase    538 non-null    int64  
 6   Aspartate_Aminotransferase  538 non-null    int64  
 7   Total_Protiens              538 non-null    float64
 8   Albumin                     538 non-null    float64
 9   Albumin_and_Globulin_Ratio  538 non-null    float64
 10  Dataset                     538 non-null    int64  
dtypes: float64(5), int32(1), int64(5)
memory usage: 44.3 KB
None

Missing per column:
 Age                           0
Gender                        0
To

In [7]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
clean_csv_path = "../Data/indian_liver_patient_clean.csv"
df.to_csv(clean_csv_path, index=False)