In [107]:
import pandas as pd
import joblib
from sklearn import ensemble, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

import warnings
warnings.simplefilter("ignore")

In [108]:
# Read the raw liver-patient records and preview the first five rows.
LIVER_CSV_PATH = "../input/specific_disease/liver/indian_liver_patient.csv"
patients = pd.read_csv(LIVER_CSV_PATH)
patients.head(5)

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [109]:
# How many rows fall into each value of the label column "Dataset".
patients["Dataset"].value_counts()

1    416
2    167
Name: Dataset, dtype: int64

In [110]:
# Frequency of each albumin/globulin ratio value.
# value_counts() leaves out NaN rows by default, so missing ratios are not listed here.
patients["Albumin_and_Globulin_Ratio"].value_counts()

1.00    106
0.80     65
0.90     59
0.70     53
1.10     46
       ... 
0.58      1
0.69      1
1.27      1
1.12      1
0.37      1
Name: Albumin_and_Globulin_Ratio, Length: 69, dtype: int64

In [111]:
# clean data
# Encode Gender as 1 for "Male" and 0 for everything else.
# An existing 1 is also treated as male so that re-running this cell is a
# no-op: the previous `x == "Male"` test turned every row into 0 on a second
# run, because the column then held 1/0 instead of the original strings.
gender = patients["Gender"]
patients["Gender"] = (gender.eq("Male") | gender.eq(1)).astype("int64")

# Fill missing ratios with 1.00, the most frequent value in the
# value_counts() output of the previous cell.
patients["Albumin_and_Globulin_Ratio"] = patients["Albumin_and_Globulin_Ratio"].fillna(1.00)

In [112]:
# TRAIN MODEL
# Target is the "Dataset" column; features are all remaining columns.
y = patients["Dataset"]
X = patients.drop(columns="Dataset")
print(X.columns)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (416 vs 167 rows) -- consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Shape training set: X:{}, y:{}".format(X_train.shape, y_train.shape))
print("Shape test set: X:{}, y:{}".format(X_test.shape, y_test.shape))

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio'],
      dtype='object')
Shape training set: X:(466, 10), y:(466,)
Shape test set: X:(117, 10), y:(117,)


In [113]:
# Logistic-regression baseline.
# max_iter is raised from scikit-learn's default of 100: the features are
# unscaled (values in the hundreds sit next to ratios below 1), a setting in
# which the solver commonly stops at the iteration cap, and every warning is
# silenced at the top of the notebook, so a ConvergenceWarning would never
# be shown.
model = linear_model.LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)

# Warnings are suppressed, so report non-convergence explicitly instead of
# persisting a half-fitted model without notice.
if model.n_iter_.max() >= model.max_iter:
    print("WARNING: LogisticRegression reached max_iter without converging")

y_pred = model.predict(X_test)

# The labels are 1/2 rather than 0/1, so the positive class is named
# explicitly (1 is also scikit-learn's default pos_label, so the metric
# definitions are unchanged).
print("Accuracy : {}".format(accuracy_score(y_test, y_pred)))
print("Recall : {}".format(recall_score(y_test, y_pred, pos_label=1)))
print("Precision : {}".format(precision_score(y_test, y_pred, pos_label=1)))
print("F1-Score : {}".format(f1_score(y_test, y_pred, pos_label=1)))

# Per-class breakdown for both labels.
clf_report = classification_report(y_test, y_pred)
print("Classification report")
print("---------------------")
print(clf_report)
print("_____________________")

# Persist the fitted estimator; joblib.dump returns the list of written
# file names, which the notebook displays as the cell result.
joblib.dump(model, "../models/liver.pkl")

Accuracy : 0.7435897435897436
Recall : 0.9425287356321839
Precision : 0.7663551401869159
F1-Score : 0.845360824742268
Classification report
---------------------
              precision    recall  f1-score   support

           1       0.77      0.94      0.85        87
           2       0.50      0.17      0.25        30

    accuracy                           0.74       117
   macro avg       0.63      0.55      0.55       117
weighted avg       0.70      0.74      0.69       117

_____________________


['../models/liver.pkl']