# IMPORTING REQUIRED LIBRARIES

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# READING THE DATA

In [55]:
# Load the liver dataset from the working directory, then preview the first rows.
DATA_PATH = "liver_data.csv"
df = pd.read_csv(DATA_PATH)
df.head(n=5)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


# CHECKING THE SIZE OF THE DATA

In [56]:
# Shape is reported as (number of rows, number of columns).
df.shape

(500, 11)

# CHECKING THE DATA TYPE OF EACH COLUMN

In [57]:
# Summarise each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         500 non-null    int64  
 1   Gender                      500 non-null    object 
 2   Total_Bilirubin             500 non-null    float64
 3   Direct_Bilirubin            500 non-null    float64
 4   Alkaline_Phosphotase        500 non-null    int64  
 5   Alamine_Aminotransferase    500 non-null    int64  
 6   Aspartate_Aminotransferase  500 non-null    int64  
 7   Total_Protiens              500 non-null    float64
 8   Albumin                     500 non-null    float64
 9   Albumin_and_Globulin_Ratio  496 non-null    float64
 10  Liver_Problem               500 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 43.1+ KB


# CHECKING THE NULL VALUES

In [58]:
# Count missing entries per column (isna is the canonical spelling of isnull).
df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Liver_Problem                 0
dtype: int64

# FILLING NULL VALUES WITH MEAN VALUE

In [59]:
# Impute missing Albumin_and_Globulin_Ratio values with the column mean.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# acts on a temporary object under pandas Copy-on-Write and leaves df unchanged
# (and already raises a FutureWarning on pandas 2.x).
ratio_mean = df['Albumin_and_Globulin_Ratio'].mean()
df['Albumin_and_Globulin_Ratio'] = df['Albumin_and_Globulin_Ratio'].fillna(ratio_mean)

In [60]:
# Re-count missing entries per column to verify the frame after imputation.
df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Liver_Problem                 0
dtype: int64

# CHANGING CATEGORICAL VALUES INTO NUMERICAL VALUES



In [61]:
# Re-inspect dtypes before encoding; object-typed columns are the non-numeric ones.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         500 non-null    int64  
 1   Gender                      500 non-null    object 
 2   Total_Bilirubin             500 non-null    float64
 3   Direct_Bilirubin            500 non-null    float64
 4   Alkaline_Phosphotase        500 non-null    int64  
 5   Alamine_Aminotransferase    500 non-null    int64  
 6   Aspartate_Aminotransferase  500 non-null    int64  
 7   Total_Protiens              500 non-null    float64
 8   Albumin                     500 non-null    float64
 9   Albumin_and_Globulin_Ratio  500 non-null    float64
 10  Liver_Problem               500 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 43.1+ KB


In [62]:
# Encode Gender as an integer flag: Male -> 0, Female -> 1.
# Series.map is used instead of Series.replace because replace's implicit
# object -> int64 downcast is deprecated in recent pandas releases; the
# explicit astype pins the dtype and fails loudly if a label outside the
# mapping (which map turns into NaN) is present.
gender_codes = {'Male': 0, 'Female': 1}
df['Gender'] = df['Gender'].map(gender_codes).astype('int64')

In [64]:
# Check the dtypes again now that Gender has been converted to numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         500 non-null    int64  
 1   Gender                      500 non-null    int64  
 2   Total_Bilirubin             500 non-null    float64
 3   Direct_Bilirubin            500 non-null    float64
 4   Alkaline_Phosphotase        500 non-null    int64  
 5   Alamine_Aminotransferase    500 non-null    int64  
 6   Aspartate_Aminotransferase  500 non-null    int64  
 7   Total_Protiens              500 non-null    float64
 8   Albumin                     500 non-null    float64
 9   Albumin_and_Globulin_Ratio  500 non-null    float64
 10  Liver_Problem               500 non-null    int64  
dtypes: float64(5), int64(6)
memory usage: 43.1 KB


# PRINTING THE COLUMN NAMES

In [65]:
# List the column labels of the frame.
df.columns

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Liver_Problem'],
      dtype='object')

# CORRELATIONS

In [66]:
# Report the correlation of every column (the target included) with the target.
target = df['Liver_Problem']
for col_name in df.columns:
    coefficient = df[col_name].corr(target)
    print(col_name, "-------->", coefficient)

Age --------> -0.15058841617661606
Gender --------> 0.04600147207065952
Total_Bilirubin --------> -0.1831623759368882
Direct_Bilirubin --------> -0.22360384999927307
Alkaline_Phosphotase --------> -0.1922323146431627
Alamine_Aminotransferase --------> -0.16341478784074903
Aspartate_Aminotransferase --------> -0.1432847896744353
Total_Protiens --------> 0.02920455149125384
Albumin --------> 0.14028160102182924
Albumin_and_Globulin_Ratio --------> 0.17879513082928367
Liver_Problem --------> 1.0


# IMPORTING REQUIRED MODELS AND METRICS

In [67]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Train Test Split
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import confusion_matrix, accuracy_score

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# DIVIDING THE DEPENDENT AND INDEPENDENT VARIABLES

In [68]:
# Separate the feature matrix (every column but the label) from the target label.
X = df.drop(columns=['Liver_Problem'])
y = df['Liver_Problem']

# DIVIDING THE DATA INTO TRAIN AND TEST

In [69]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [77]:
# Print the dimensions of each split: train/test features, then train/test labels.
lst = [x_train, x_test, y_train, y_test]
for split in lst:
  dims = split.shape
  print("shape", "---->", dims)

shape ----> (400, 10)
shape ----> (100, 10)
shape ----> (400,)
shape ----> (100,)


# MAKING PIPELINE FOR EVERY MODEL

In [70]:
# One pipeline per candidate classifier; each standardises the features first.
# The tree-based estimators are seeded (same seed as the train/test split) so
# that their scores, and therefore the model comparison, are reproducible
# across kernel restarts.
mdl_1 = make_pipeline(StandardScaler(), KNeighborsClassifier())
mdl_2 = make_pipeline(StandardScaler(), LogisticRegression())
mdl_3 = make_pipeline(StandardScaler(), SVC())
mdl_4 = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))
mdl_5 = make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42))

# FITTING THE MODELS AND EVALUATING THEIR ACCURACY

In [71]:
# Fit every pipeline on the training split, print its confusion matrix and
# accuracy on the held-out split, and collect the accuracies in model order
# (accuracy_lst[k] belongs to mdl_lst[k]).
mdl_lst = [mdl_1, mdl_2, mdl_3, mdl_4, mdl_5]
accuracy_lst = []
for mdl in mdl_lst:
  mdl.fit(x_train, y_train)
  y_pred = mdl.predict(x_test)
  # Score once and reuse the value for both the report and the list.
  acc = accuracy_score(y_test, y_pred)
  print("")
  print("******CLASSIFICATION MODEL******")
  print("")
  print("CONFUSION MATRIX")
  print(confusion_matrix(y_test, y_pred))
  print("")
  print("ACCURACY SCORE")
  print(acc)
  print("----------------------------")
  accuracy_lst.append(acc)
 


******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[56 10]
 [25  9]]

ACCURACY SCORE
0.65
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[60  6]
 [28  6]]

ACCURACY SCORE
0.66
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[66  0]
 [34  0]]

ACCURACY SCORE
0.66
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[45 21]
 [24 10]]

ACCURACY SCORE
0.55
----------------------------

******CLASSIFICATION MODEL******

CONFUSION MATRIX
[[60  6]
 [28  6]]

ACCURACY SCORE
0.66
----------------------------


# CHECKING FOR THE HIGHEST ACCURACY SCORE AMONG THE MODELS

In [72]:
# Position (in mdl_lst order) of the first model reaching the top accuracy.
best_score = max(accuracy_lst)
print(accuracy_lst.index(best_score))


1


# SAVING THE MODEL IN PICKLE FILE

In [78]:
import pickle

In [79]:
# Persist the fitted mdl_2 pipeline to disk.
# The context manager guarantees the file handle is flushed and closed, which
# the bare open(...) passed straight to pickle.dump did not.
filename = 'liver_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(mdl_2, model_file)