In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report

# Set a random seed
random_seed = 42
np.random.seed(random_seed)

In [64]:
# loading the diabetes dataset to a pands DataFrame
liver_data = pd.read_csv('liver.csv')

In [65]:
liver_data.columns = liver_data.columns.map(str.lower)
liver_data.head()

Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminotransferase,aspartate_aminotransferase,total_protiens,albumin,albumin_and_globulin_ratio,dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [66]:
liver_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         583 non-null    int64  
 1   gender                      583 non-null    object 
 2   total_bilirubin             583 non-null    float64
 3   direct_bilirubin            583 non-null    float64
 4   alkaline_phosphotase        583 non-null    int64  
 5   alamine_aminotransferase    583 non-null    int64  
 6   aspartate_aminotransferase  583 non-null    int64  
 7   total_protiens              583 non-null    float64
 8   albumin                     583 non-null    float64
 9   albumin_and_globulin_ratio  579 non-null    float64
 10  dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [67]:
liver_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,583.0,44.746141,16.189833,4.0,33.0,45.0,58.0,90.0
total_bilirubin,583.0,3.298799,6.209522,0.4,0.8,1.0,2.6,75.0
direct_bilirubin,583.0,1.486106,2.808498,0.1,0.2,0.3,1.3,19.7
alkaline_phosphotase,583.0,290.576329,242.937989,63.0,175.5,208.0,298.0,2110.0
alamine_aminotransferase,583.0,80.713551,182.620356,10.0,23.0,35.0,60.5,2000.0
aspartate_aminotransferase,583.0,109.910806,288.918529,10.0,25.0,42.0,87.0,4929.0
total_protiens,583.0,6.48319,1.085451,2.7,5.8,6.6,7.2,9.6
albumin,583.0,3.141852,0.795519,0.9,2.6,3.1,3.8,5.5
albumin_and_globulin_ratio,579.0,0.947064,0.319592,0.3,0.7,0.93,1.1,2.8
dataset,583.0,1.286449,0.45249,1.0,1.0,1.0,2.0,2.0


In [68]:
liver_data['albumin_and_globulin_ratio'].fillna(liver_data['albumin_and_globulin_ratio'].mean(), inplace=True)

In [69]:
liver_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         583 non-null    int64  
 1   gender                      583 non-null    object 
 2   total_bilirubin             583 non-null    float64
 3   direct_bilirubin            583 non-null    float64
 4   alkaline_phosphotase        583 non-null    int64  
 5   alamine_aminotransferase    583 non-null    int64  
 6   aspartate_aminotransferase  583 non-null    int64  
 7   total_protiens              583 non-null    float64
 8   albumin                     583 non-null    float64
 9   albumin_and_globulin_ratio  583 non-null    float64
 10  dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [70]:
le = LabelEncoder()
liver_data['gender'] = le.fit_transform(liver_data['gender'])
liver_data['dataset'] = le.fit_transform(liver_data['dataset'])
liver_data.head()

Unnamed: 0,age,gender,total_bilirubin,direct_bilirubin,alkaline_phosphotase,alamine_aminotransferase,aspartate_aminotransferase,total_protiens,albumin,albumin_and_globulin_ratio,dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,0
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,0
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,0
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,0
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,0


In [71]:
liver_data['dataset'].value_counts()

0    416
1    167
Name: dataset, dtype: int64

In [72]:
X = liver_data.drop('dataset', axis=1)
Y = liver_data['dataset']

In [73]:
oversampler = RandomOverSampler(random_state=random_seed)
X_resampled, Y_resampled = oversampler.fit_resample(X,Y)

In [74]:
Y_resampled.value_counts()

0    416
1    416
Name: dataset, dtype: int64

In [75]:
x_train, x_test, y_train, y_test = train_test_split(X_resampled,Y_resampled,test_size=0.2, random_state=random_seed)

In [76]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(665, 10) (167, 10) (665,) (167,)


In [77]:
model_pipeline = Pipeline([
    ('scaler',PowerTransformer()),
    ('classifier',ExtraTreesClassifier())
])
model_pipeline.fit(x_train,y_train)

In [78]:
y_pred = model_pipeline.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86        75
           1       0.91      0.86      0.88        92

    accuracy                           0.87       167
   macro avg       0.87      0.88      0.87       167
weighted avg       0.88      0.87      0.87       167



In [79]:
import pickle

In [80]:
filename = 'liver_model.sav'
pickle.dump(model_pipeline, open(filename, 'wb'))