In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [16]:
data = pd.read_csv('indian_liver_patient.csv')

In [18]:
data['Albumin_and_Globulin_Ratio'] = data['Albumin_and_Globulin_Ratio'].fillna(data['Albumin_and_Globulin_Ratio'].mean())

In [None]:
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

In [5]:
data = binary_encode(data, 'Gender', 'Male')
data = binary_encode(data, 'Dataset', 1)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    int64  
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  583 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(6)
memory usage: 50.2 KB


In [7]:
y = data['Dataset']
X = data.drop('Dataset', axis=1)

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size = 0.3, random_state = 9)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [9]:
def models(X_train,y_train):
  #Using DecisionTreeClassifier 
  from sklearn.tree import DecisionTreeClassifier
  tree = DecisionTreeClassifier( random_state = 9)
  tree.fit(X_train, y_train)
    
   #Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm
  from sklearn.ensemble import RandomForestClassifier
  forest = RandomForestClassifier(n_estimators = 10, random_state = 9)
  forest.fit(X_train, y_train)
    
  #Using Logistic Regression 

  from sklearn.linear_model import LogisticRegression
  logistic = LogisticRegression(random_state = 9)
  logistic.fit(X_train, y_train)

  #Using SVC rbf
  from sklearn.svm import SVC
  svc_lin = SVC(kernel = 'linear', random_state = 9)
  svc_lin.fit(X_train, y_train)
    
  #Using KNeighborsClassifier 
  from sklearn.neighbors import KNeighborsClassifier
  knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
  knn.fit(X_train, y_train)

  #Using SVC rbf
  from sklearn.svm import SVC
  svc_rbf = SVC(kernel = 'rbf', random_state = 9)
  svc_rbf.fit(X_train, y_train)

  #Using GaussianNB 
  from sklearn.naive_bayes import GaussianNB
  gauss = GaussianNB()
  gauss.fit(X_train, y_train)
 
  


  
  #print model accuracy on the training data.
  print('[0]Decision Tree Classifier Training Accuracy:', tree.score(X_test, y_test))
  print('[1]Support Vector Machine (Linear Classifier) Training Accuracy:', svc_lin.score(X_test, y_test))
  print('[2]K Nearest Neighbor Training Accuracy:', knn.score(X_test, y_test))
  print('[3]Support Vector Machine (RBF Classifier) Training Accuracy:', svc_rbf.score(X_test, y_test))
  print('[4]Gaussian Naive Bayes Training Accuracy:', gauss.score(X_test, y_test))
  print('[5]Logistic Regression Training Accuracy:', logistic.score(X_test, y_test))
  print('[6]Random Forest Classifier Training Accuracy:', forest.score(X_test, y_test))

  return tree,svc_lin,knn, svc_rbf, gauss,logistic, forest

In [10]:
model = models(X_train,y_train)

[0]Decision Tree Classifier Training Accuracy: 0.6628571428571428
[1]Support Vector Machine (Linear Classifier) Training Accuracy: 0.7142857142857143
[2]K Nearest Neighbor Training Accuracy: 0.6514285714285715
[3]Support Vector Machine (RBF Classifier) Training Accuracy: 0.7142857142857143
[4]Gaussian Naive Bayes Training Accuracy: 0.5828571428571429
[5]Logistic Regression Training Accuracy: 0.6971428571428572
[6]Random Forest Classifier Training Accuracy: 0.6971428571428572


In [11]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression(random_state = 9)
logistic.fit(X_train, y_train)
logistic.score(X_test,y_test)

0.6971428571428572

In [12]:
param_grid = [    
    {'penalty' : ['l1', 'l2', 'elasticnet'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }
]

In [13]:
np.random.seed(9)
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(logistic, param_grid = param_grid, cv = 10, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train, y_train)

Fitting 10 folds for each of 1200 candidates, totalling 12000 fits


In [38]:
best_clf.best_estimator_.score(X_train,y_train)

0.7450980392156863

In [24]:
print (f'Accuracy - : {best_clf.score(X_train,y_train):.3f}')

Accuracy - : 0.745


In [36]:
#Cross -valuation scores
models=[]
from sklearn.svm import SVC
models.append(('RFS', RandomForestClassifier()))
models.append(('LR', LogisticRegression()))
models.append(('SVM', SVC()))
results =[]
names=[]
for name , model in models:
    kfold=KFold(n_splits=25)
    cv_results= cross_val_score(model, X_train, y_train, cv=25,scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    
    msg= '%s:, %f, (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

RFS:, 0.718088, (0.079939)
LR:, 0.730441, (0.073689)
SVM:, 0.708382, (0.039333)


LogisticRegression(C=4.281332398719396, penalty='l1', random_state=9,
                   solver='liblinear')

In [26]:
y_test

317    1
98     1
39     1
40     1
356    1
      ..
223    0
51     1
181    1
122    0
316    0
Name: Dataset, Length: 175, dtype: int64

In [40]:
import pickle
liver_saved_model=pickle.dump(best_clf.best_estimator_,open('liver_saved_model.pkl','wb'))

In [43]:
data.columns


Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset'],
      dtype='object')