Importing the Dependencies

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.impute import SimpleImputer

Data Collection and Processing

In [6]:
# loading the csv data to a Pandas DataFrame
liver_data = pd.read_csv('/content/liver_data.csv',encoding ='latin-1')

In [7]:
# print first 5 rows of the dataset
liver_data.head()

Unnamed: 0,Gender of the patient,Age of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
0,Female,65.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,1
1,Male,62.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,1
2,Male,62.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,1
3,Male,58.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,1
4,Male,72.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,1


In [8]:
# print last 5 rows of the dataset
liver_data.tail()

Unnamed: 0,Gender of the patient,Age of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
30686,Male,50.0,2.2,1.0,610.0,17.0,28.0,7.3,2.6,0.55,1
30687,Male,55.0,2.9,1.3,482.0,22.0,34.0,7.0,2.4,0.5,1
30688,Male,54.0,6.8,3.0,542.0,116.0,66.0,6.4,3.1,0.9,1
30689,Female,48.0,1.9,1.0,231.0,16.0,55.0,4.3,1.6,0.6,1
30690,Male,30.0,3.1,1.6,253.0,80.0,406.0,6.8,3.9,1.3,1


In [9]:
# number of rows and columns in the dataset
liver_data.shape

(30691, 11)

In [10]:
# getting some info about the data
liver_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30691 entries, 0 to 30690
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Gender of the patient                 29789 non-null  object 
 1   Age of the patient                    30689 non-null  float64
 2   Total Bilirubin                       30043 non-null  float64
 3   Direct Bilirubin                      30130 non-null  float64
 4    Alkphos Alkaline Phosphotase         29895 non-null  float64
 5    Sgpt Alamine Aminotransferase        30153 non-null  float64
 6   Sgot Aspartate Aminotransferase       30229 non-null  float64
 7   Total Protiens                        30228 non-null  float64
 8    ALB Albumin                          30197 non-null  float64
 9   A/G Ratio Albumin and Globulin Ratio  30132 non-null  float64
 10  Result                                30691 non-null  int64  
dtypes: float64(9), 

In [11]:
# checking for missing values
liver_data.isnull().sum()

Unnamed: 0,0
Gender of the patient,902
Age of the patient,2
Total Bilirubin,648
Direct Bilirubin,561
Alkphos Alkaline Phosphotase,796
Sgpt Alamine Aminotransferase,538
Sgot Aspartate Aminotransferase,462
Total Protiens,463
ALB Albumin,494
A/G Ratio Albumin and Globulin Ratio,559


In [12]:
# statistical measures about the data
liver_data.describe()

Unnamed: 0,Age of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio,Result
count,30689.0,30043.0,30130.0,29895.0,30153.0,30229.0,30228.0,30197.0,30132.0,30691.0
mean,44.107205,3.370319,1.528042,289.075364,81.488641,111.469979,6.480237,3.130142,0.943467,1.285882
std,15.981043,6.255522,2.869592,238.537589,182.15885,280.851078,1.08198,0.792281,0.323164,0.451841
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,32.0,0.8,0.2,175.0,23.0,26.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,209.0,35.0,42.0,6.6,3.1,0.9,1.0
75%,55.0,2.7,1.3,298.0,62.0,88.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [13]:
# checking the distribution of Target Variable
liver_data['Result'].value_counts()

Unnamed: 0_level_0,count
Result,Unnamed: 1_level_1
1,21917
2,8774


1 --> Liver Patient

2 --> Non liver patient

In [14]:
# grouping the data bas3ed on the target variable
liver_data.groupby('Result').mean(numeric_only=True)

Unnamed: 0_level_0,Age of the patient,Total Bilirubin,Direct Bilirubin,Alkphos Alkaline Phosphotase,Sgpt Alamine Aminotransferase,Sgot Aspartate Aminotransferase,Total Protiens,ALB Albumin,A/G Ratio Albumin and Globulin Ratio
Result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,44.155327,4.259,1.980382,316.735787,100.568506,139.656508,6.459446,3.049743,0.911167
2,43.987007,1.14,0.394508,219.919447,33.616897,41.028128,6.532168,3.330871,1.024537


Splitting the Features and Target

In [15]:
X = liver_data.drop(columns=['Gender of the patient','Result'], axis=1)
Y = liver_data['Result']

In [16]:
print(X)

       Age of the patient  Total Bilirubin  Direct Bilirubin  \
0                    65.0              0.7               0.1   
1                    62.0             10.9               5.5   
2                    62.0              7.3               4.1   
3                    58.0              1.0               0.4   
4                    72.0              3.9               2.0   
...                   ...              ...               ...   
30686                50.0              2.2               1.0   
30687                55.0              2.9               1.3   
30688                54.0              6.8               3.0   
30689                48.0              1.9               1.0   
30690                30.0              3.1               1.6   

        Alkphos Alkaline Phosphotase   Sgpt Alamine Aminotransferase  \
0                              187.0                            16.0   
1                              699.0                            64.0   
2              

In [17]:
print(Y)

0        1
1        1
2        1
3        1
4        1
        ..
30686    1
30687    1
30688    1
30689    1
30690    1
Name: Result, Length: 30691, dtype: int64


Splitting the Data into Training data & Test Data

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [19]:
print(X.shape, X_train.shape, X_test.shape)

(30691, 9) (24552, 9) (6139, 9)


Model Training

Logistic Regression

In [20]:
# Impute missing values using the mean
imputer = SimpleImputer(strategy='mean') # Create an imputer instance
X_train = imputer.fit_transform(X_train) # Fit and transform on training data
X_test = imputer.transform(X_test) # Transform test data using the trained imputer


In [21]:
# model = LogisticRegression()

# model = svm.SVC(kernel='linear')

model = RandomForestClassifier()

In [22]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [23]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [24]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9999185402411209


In [25]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [26]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9965792474344356


Building a Predictive System

In [27]:
input_data = (65,0.7, 0.1, 187, 16, 18, 6.8, 3.3, 0.9)  # Removed 'Female' and kept only numerical features

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 1):
  print('The Person have a liver disease')
else:
  print('The Person has does not have liver Disease')

[1]
The Person have a liver disease


Saving the trained model

In [28]:
import pickle

In [30]:
filename = 'liverdisease_model_rf.sav'
pickle.dump(model, open(filename, 'wb'))

In [31]:
# loading the saved model
loaded_model = pickle.load(open('liverdisease_model_rf.sav', 'rb'))

In [32]:
for column in X.columns:
  print(column)

Age of the patient
Total Bilirubin
Direct Bilirubin
 Alkphos Alkaline Phosphotase
 Sgpt Alamine Aminotransferase
Sgot Aspartate Aminotransferase
Total Protiens
 ALB Albumin
A/G Ratio Albumin and Globulin Ratio
