In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
warnings.filterwarnings("ignore")
import joblib

In [4]:
df = pd.read_csv('liver_dataset.csv',encoding= 'unicode_escape')

In [5]:
df.size

337601

In [6]:
df.columns

Index(['Age of the patient', 'Gender of the patient', 'Total Bilirubin',
       'Direct Bilirubin', 'Alkphos Alkaline Phosphotase',
       'Sgpt Alamine Aminotransferase', 'Sgot Aspartate Aminotransferase',
       'Total Protiens', 'ALB Albumin', 'A/G Ratio Albumin and Globulin Ratio',
       'Result'],
      dtype='object')

In [31]:
df.shape

(30691, 11)

In [32]:
#checking_duplicates
df.duplicated().sum()

11323

In [33]:
#dropping the existing duplicates and printing shape
df=df.drop_duplicates()
df.shape

(19368, 11)

In [34]:
df.dtypes

Age of the patient                      float64
Gender of the patient                    object
Total Bilirubin                         float64
Direct Bilirubin                        float64
Alkphos Alkaline Phosphotase            float64
Sgpt Alamine Aminotransferase           float64
Sgot Aspartate Aminotransferase         float64
Total Protiens                          float64
ALB Albumin                             float64
A/G Ratio Albumin and Globulin Ratio    float64
Result                                    int64
dtype: object

In [9]:
df.head(4)

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
0,65.0,Female,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1
1,62.0,Male,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1
2,62.0,Male,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1
3,58.0,Male,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1


In [36]:
df.tail(4)

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
29976,28.0,Male,1.9,1.0,231.0,16.0,55.0,4.3,1.6,0.6,1
30686,50.0,Male,2.2,1.0,610.0,17.0,28.0,7.3,2.6,0.55,1
30688,54.0,Male,6.8,3.0,542.0,116.0,66.0,6.4,3.1,0.9,1
30689,48.0,Female,1.9,1.0,231.0,16.0,55.0,4.3,1.6,0.6,1


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19368 entries, 0 to 30689
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Age of the patient                    19367 non-null  float64
 1   Gender of the patient                 18572 non-null  object 
 2   Total Bilirubin                       18811 non-null  float64
 3   Direct Bilirubin                      18878 non-null  float64
 4   Alkphos Alkaline Phosphotase          18674 non-null  float64
 5   Sgpt Alamine Aminotransferase         18909 non-null  float64
 6   Sgot Aspartate Aminotransferase       18975 non-null  float64
 7   Total Protiens                        18998 non-null  float64
 8   ALB Albumin                           18955 non-null  float64
 9   A/G Ratio Albumin and Globulin Ratio  18932 non-null  float64
 10  Result                                19368 non-null  int64  
dtypes: float64(9), 

In [38]:
df.describe()

Unnamed: 0,Age of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
count,19367.0,18811.0,18878.0,18674.0,18909.0,18975.0,18998.0,18955.0,18932.0,19368.0
mean,43.809986,3.315767,1.515637,289.407411,80.598762,110.790672,6.494326,3.141045,0.945888,1.286917
std,16.454988,6.111851,2.863609,239.727272,181.134634,278.927937,1.089985,0.793602,0.322023,0.452334
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,32.0,0.8,0.2,175.0,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,55.0,2.6,1.3,298.0,61.0,88.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [39]:
df.isnull()

Unnamed: 0,Age of the patient,Gender of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
29974,False,False,False,False,False,False,False,False,False,False,False
29976,False,False,False,False,False,False,False,False,False,False,False
30686,False,False,False,False,False,False,False,False,False,False,False
30688,False,False,False,False,False,False,False,False,False,False,False


In [40]:
#checking if null values are present
df.isnull().sum()

Age of the patient                        1
Gender of the patient                   796
Total Bilirubin                         557
Direct Bilirubin                        490
Alkphos Alkaline Phosphotase            694
Sgpt Alamine Aminotransferase           459
Sgot Aspartate Aminotransferase         393
Total Protiens                          370
ALB Albumin                             413
A/G Ratio Albumin and Globulin Ratio    436
Result                                    0
dtype: int64

In [41]:
df.columns = df.columns.str.replace(' ','_')

In [42]:
df.columns

Index(['Age_of_the_patient', 'Gender_of_the_patient', 'Total_Bilirubin',
       'Direct_Bilirubin', 'Alkphos_Alkaline_Phosphotase',
       'Sgpt_Alamine_Aminotransferase', 'Sgot_Aspartate_Aminotransferase',
       'Total_Protiens', 'ALB_Albumin', 'A/G_Ratio_Albumin_and_Globulin_Ratio',
       'Result'],
      dtype='object')

In [43]:
df.Gender_of_the_patient=df.Gender_of_the_patient.map({'Female':1,'Male':0})

In [44]:
df.dtypes

Age_of_the_patient                      float64
Gender_of_the_patient                   float64
Total_Bilirubin                         float64
Direct_Bilirubin                        float64
Alkphos_Alkaline_Phosphotase            float64
Sgpt_Alamine_Aminotransferase           float64
Sgot_Aspartate_Aminotransferase         float64
Total_Protiens                          float64
ALB_Albumin                             float64
A/G_Ratio_Albumin_and_Globulin_Ratio    float64
Result                                    int64
dtype: object

In [45]:
df.isnull().sum()

Age_of_the_patient                        1
Gender_of_the_patient                   796
Total_Bilirubin                         557
Direct_Bilirubin                        490
Alkphos_Alkaline_Phosphotase            694
Sgpt_Alamine_Aminotransferase           459
Sgot_Aspartate_Aminotransferase         393
Total_Protiens                          370
ALB_Albumin                             413
A/G_Ratio_Albumin_and_Globulin_Ratio    436
Result                                    0
dtype: int64

In [46]:
df.fillna(df.Gender_of_the_patient.median(),inplace=True)
df.Total_Bilirubin.fillna(df.Total_Bilirubin.median(),inplace=True)
df.Direct_Bilirubin.fillna(df.Direct_Bilirubin.median(),inplace=True)
df.Alkphos_Alkaline_Phosphotase.fillna(df.Alkphos_Alkaline_Phosphotase.median(),inplace=True)
df.Sgpt_Alamine_Aminotran sferase.fillna(df.Sgpt_Alamine_Aminotransferase.median(),inplace=True)
df.Sgot_Aspartate_Aminotransferase.fillna(df.Sgot_Aspartate_Aminotransferase.median(),inplace=True)
df.Total_Protiens.fillna(df.Total_Protiens.median(),inplace=True)
df.ALB_Albumin.fillna(df.ALB_Albumin.median(),inplace=True)
df.rename(columns = {'A/G_Ratio_Albumin_and_Globulin_Ratio':'AbyG_Ratio'}, inplace = True)
df.AbyG_Ratio.fillna(df.AbyG_Ratio.median(),inplace=True)
df.dtypes

Age_of_the_patient                 float64
Gender_of_the_patient              float64
Total_Bilirubin                    float64
Direct_Bilirubin                   float64
Alkphos_Alkaline_Phosphotase       float64
Sgpt_Alamine_Aminotransferase      float64
Sgot_Aspartate_Aminotransferase    float64
Total_Protiens                     float64
ALB_Albumin                        float64
AbyG_Ratio                         float64
Result                               int64
dtype: object

In [47]:
df.isnull().sum()

Age_of_the_patient                 0
Gender_of_the_patient              0
Total_Bilirubin                    0
Direct_Bilirubin                   0
Alkphos_Alkaline_Phosphotase       0
Sgpt_Alamine_Aminotransferase      0
Sgot_Aspartate_Aminotransferase    0
Total_Protiens                     0
ALB_Albumin                        0
AbyG_Ratio                         0
Result                             0
dtype: int64

In [48]:
#Gender is dropped
X =df.drop(['Gender_of_the_patient','Result'], axis=1)
Y=df['Result']

In [49]:
X.shape

(19368, 9)

In [50]:
Y.shape

(19368,)

In [51]:
X.columns

Index(['Age_of_the_patient', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkphos_Alkaline_Phosphotase', 'Sgpt_Alamine_Aminotransferase',
       'Sgot_Aspartate_Aminotransferase', 'Total_Protiens', 'ALB_Albumin',
       'AbyG_Ratio'],
      dtype='object')

In [52]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=99)

In [53]:
g = GaussianNB()
g.fit(x_train, y_train)
y_pred = g.predict(x_test)
print('Accuracy: \n', accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 
 0.5508518327310273
[[1637 2543]
 [  67 1564]]
              precision    recall  f1-score   support

           1       0.96      0.39      0.56      4180
           2       0.38      0.96      0.55      1631

    accuracy                           0.55      5811
   macro avg       0.67      0.68      0.55      5811
weighted avg       0.80      0.55      0.55      5811



In [54]:
lr=LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
print('Accuracy: \n', accuracy_score(y_test,y_pred))
print('Confusion Matrix :\n',confusion_matrix(y_test,y_pred),'\n')
print('Classification Report :\n',classification_report(y_test,y_pred),'\n')

Accuracy: 
 0.7189812424711753
Confusion Matrix :
 [[3968  212]
 [1421  210]] 

Classification Report :
               precision    recall  f1-score   support

           1       0.74      0.95      0.83      4180
           2       0.50      0.13      0.20      1631

    accuracy                           0.72      5811
   macro avg       0.62      0.54      0.52      5811
weighted avg       0.67      0.72      0.65      5811
 



In [55]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print('Accuracy: \n', accuracy_score(y_test,y_pred))
print('Confusion Matrix :\n',confusion_matrix(y_test,y_pred),'\n')
print('Classification Report :\n',classification_report(y_test,y_pred),'\n')

Accuracy: 
 0.9962140767509895
Confusion Matrix :
 [[4176    4]
 [  18 1613]] 

Classification Report :
               precision    recall  f1-score   support

           1       1.00      1.00      1.00      4180
           2       1.00      0.99      0.99      1631

    accuracy                           1.00      5811
   macro avg       1.00      0.99      1.00      5811
weighted avg       1.00      1.00      1.00      5811
 



In [56]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix
neural = MLPClassifier()
neural.fit(x_train, y_train)

predicted = neural.predict(x_test)

neural_score = round(neural.score(x_train, y_train) * 100, 2)
neural_score_test = round(neural.score(x_test, y_test) * 100, 2)
print('Neural Train Score: \n', neural_score)
print('Neural Test Score: \n', neural_score_test)
print('Accuracy: \n', accuracy_score(y_test, predicted))
print(confusion_matrix(predicted,y_test))
print(classification_report(y_test,predicted))

Neural Train Score: 
 72.07
Neural Test Score: 
 72.35
Accuracy: 
 0.7234555154018242
[[4130 1557]
 [  50   74]]
              precision    recall  f1-score   support

           1       0.73      0.99      0.84      4180
           2       0.60      0.05      0.08      1631

    accuracy                           0.72      5811
   macro avg       0.66      0.52      0.46      5811
weighted avg       0.69      0.72      0.63      5811



In [57]:
joblib.dump(rf,"final.pkl")
final_model=joblib.load('final.pkl')
pred=final_model.predict(x_test)
acc=accuracy_score(y_test,pred,normalize=True)*float(100)
print(acc)

99.62140767509895
