In [1]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# loading dataset
df = pd.read_csv('/content/liver_disease_dataset.csv', encoding= 'unicode_escape')
df.head()

Unnamed: 0,Age,Gender,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
0,65.0,Female,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1
1,62.0,Male,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1
2,62.0,Male,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1
3,58.0,Male,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1
4,72.0,Male,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1


In [3]:
# getting the info of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30691 entries, 0 to 30690
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Age                                   30689 non-null  float64
 1   Gender                                29789 non-null  object 
 2   Total Bilirubin                       30043 non-null  float64
 3   Direct Bilirubin                      30130 non-null  float64
 4   Alkphos Alkaline Phosphotase          29895 non-null  float64
 5   Sgpt Alamine Aminotransferase         30153 non-null  float64
 6   Sgot Aspartate Aminotransferase       30229 non-null  float64
 7   Total Protiens                        30228 non-null  float64
 8   ALB Albumin                           30197 non-null  float64
 9   A/G Ratio Albumin and Globulin Ratio  30132 non-null  float64
 10  Result                                30691 non-null  int64  
dtypes: float64(9), 

In [4]:
# checking the nulls
df.isnull().sum()

Age                                       2
Gender                                  902
Total Bilirubin                         648
Direct Bilirubin                        561
Alkphos Alkaline Phosphotase            796
Sgpt Alamine Aminotransferase           538
Sgot Aspartate Aminotransferase         462
Total Protiens                          463
ALB Albumin                             494
A/G Ratio Albumin and Globulin Ratio    559
Result                                    0
dtype: int64

In [5]:
# mapping values in Gender
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})
df.head()

Unnamed: 0,Age,Gender,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
0,65.0,0.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1
1,62.0,1.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1
2,62.0,1.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1
3,58.0,1.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1
4,72.0,1.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1


In [6]:
# getting a stat summary
df.describe()

Unnamed: 0,Age,Gender,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
count,30689.0,29789.0,30043.0,30130.0,29895.0,30153.0,30229.0,30228.0,30197.0,30132.0,30691.0
mean,44.107205,0.738058,3.370319,1.528042,289.075364,81.488641,111.469979,6.480237,3.130142,0.943467,1.285882
std,15.981043,0.439699,6.255522,2.869592,238.537589,182.15885,280.851078,1.08198,0.792281,0.323164,0.451841
min,4.0,0.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,32.0,0.0,0.8,0.2,175.0,23.0,26.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,1.0,0.3,209.0,35.0,42.0,6.6,3.1,0.9,1.0
75%,55.0,1.0,2.7,1.3,298.0,62.0,88.0,7.2,3.8,1.1,2.0
max,90.0,1.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [7]:
# filling the nulls
df = df.fillna(method='bfill')
df.isnull().sum()

Age                                     0
Gender                                  0
Total Bilirubin                         0
Direct Bilirubin                        0
Alkphos Alkaline Phosphotase            0
Sgpt Alamine Aminotransferase           0
Sgot Aspartate Aminotransferase         0
Total Protiens                          0
ALB Albumin                             0
A/G Ratio Albumin and Globulin Ratio    0
Result                                  0
dtype: int64

In [8]:
# droping the duplicates
df = df.drop_duplicates()

In [9]:
# getting the value count of target
df['Result'].value_counts()

1    13603
2     5487
Name: Result, dtype: int64

In [10]:
# sampling for target
cls_0 = df[df['Result']==1]
cls_1 = df[df['Result']==2]

cls_1 = cls_1.sample(13770,replace=True)
df = pd.concat([cls_0,cls_1],axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27373 entries, 0 to 15396
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Age                                   27373 non-null  float64
 1   Gender                                27373 non-null  float64
 2   Total Bilirubin                       27373 non-null  float64
 3   Direct Bilirubin                      27373 non-null  float64
 4   Alkphos Alkaline Phosphotase          27373 non-null  float64
 5   Sgpt Alamine Aminotransferase         27373 non-null  float64
 6   Sgot Aspartate Aminotransferase       27373 non-null  float64
 7   Total Protiens                        27373 non-null  float64
 8   ALB Albumin                           27373 non-null  float64
 9   A/G Ratio Albumin and Globulin Ratio  27373 non-null  float64
 10  Result                                27373 non-null  int64  
dtypes: float64(10),

In [11]:
# normalizing the features
for x in df.columns:
    df[x]=(df[x] - df[x].min()) / (df[x].max() - df[x].min())
df.head()

Unnamed: 0,Age,Gender,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
0,0.709302,0.0,0.004021,0.0,0.060576,0.003015,0.001626,0.594203,0.521739,0.24,0.0
1,0.674419,1.0,0.140751,0.27551,0.310699,0.027136,0.018296,0.695652,0.5,0.176,0.0
2,0.674419,1.0,0.092493,0.204082,0.208598,0.025126,0.011791,0.623188,0.521739,0.236,0.0
3,0.627907,1.0,0.008043,0.015306,0.058134,0.00201,0.002033,0.594203,0.543478,0.28,0.0
4,0.790698,1.0,0.046917,0.096939,0.064485,0.008543,0.009961,0.666667,0.326087,0.04,0.0


In [12]:
# rechecking the target
df['Result'].value_counts()

1.0    13770
0.0    13603
Name: Result, dtype: int64

In [13]:
# creating X & y
X = df.drop('Result', axis=1)
y = df['Result']

In [14]:
# splitting the data in training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [15]:
# training a model
model = XGBClassifier(random_state=20)
model.fit(X_train,y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=20, ...)

In [16]:
# checking for the accuracy
print('Accuracy:', (accuracy_score(y_train, model.predict(X_train)) * 100).round(2))
print('Accuracy:', (accuracy_score(y_test, model.predict(X_test)) * 100).round(2))

Accuracy: 99.97
Accuracy: 99.73


In [17]:
# confusion matrix
print(confusion_matrix(y_test, model.predict(X_test)))

[[2730    9]
 [   6 2730]]


In [18]:
# printing a clf report
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2739
         1.0       1.00      1.00      1.00      2736

    accuracy                           1.00      5475
   macro avg       1.00      1.00      1.00      5475
weighted avg       1.00      1.00      1.00      5475

