In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, KFold

import pandas as pd  

In [68]:
# Split data for modeling
df = pd.read_csv("data.csv")
X = df.drop("dataset", axis = 1)
y = df.dataset.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
X_train.head()


Unnamed: 0,age,gender,total_bilirubin,alkaline_phosphotase,alamine_aminotransferase,albumin_and_globulin_ratio
181,1.2,0.0,0.963474,0.088665,-0.060745,0.126945
55,-0.12,0.0,2.307429,0.506781,-0.125175,-1.234483
460,-0.92,-1.0,0.678072,0.062498,1.585264,-0.371664
170,1.08,0.0,0.432959,-0.074042,-0.229792,-0.041657
261,-0.48,0.0,0.321928,1.6774,1.853825,0.126945


In [69]:
# Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)
predict_train = model.predict(X_train)
predict_test = model.predict(X_test)

print(model)
print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, predict_train))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, predict_test))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, predict_test))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, predict_test))




LogisticRegression()
Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.76      0.93      0.83       314
           2       0.57      0.23      0.33       123

    accuracy                           0.73       437
   macro avg       0.66      0.58      0.58       437
weighted avg       0.70      0.73      0.69       437

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.73      0.95      0.83       102
           2       0.64      0.20      0.31        44

    accuracy                           0.73       146
   macro avg       0.69      0.58      0.57       146
weighted avg       0.71      0.73      0.67       146

Roc_auc score
-------------------------------------------------------
0.5777629233511586

Confusion matrix
-------------------------------------------------------
[[97  5]
 [35  

In [70]:
# Support vector machines
svm = SVC()
svm.fit(X_train, y_train)
predict_train_svm = svm.predict(X_train)
predict_test_svm = svm.predict(X_test)

print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, predict_train_svm))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, predict_test_svm))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, predict_test_svm))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, predict_test_svm))



Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.73      1.00      0.84       314
           2       1.00      0.05      0.09       123

    accuracy                           0.73       437
   macro avg       0.86      0.52      0.47       437
weighted avg       0.80      0.73      0.63       437

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.70      1.00      0.83       102
           2       1.00      0.02      0.04        44

    accuracy                           0.71       146
   macro avg       0.85      0.51      0.44       146
weighted avg       0.79      0.71      0.59       146

Roc_auc score
-------------------------------------------------------
0.5113636363636364

Confusion matrix
-------------------------------------------------------
[[102   0]
 [ 43   1]]


In [71]:
# Random Forest
rf = RandomForestClassifier(n_jobs=-1, random_state=123)
rf.fit(X_train, y_train)
predict_train_rf = rf.predict(X_train)
predict_test_rf = rf.predict(X_test)

print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, predict_train_rf))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, predict_test_rf))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, predict_test_rf))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, predict_test_rf))

Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       314
           2       1.00      1.00      1.00       123

    accuracy                           1.00       437
   macro avg       1.00      1.00      1.00       437
weighted avg       1.00      1.00      1.00       437

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.76      0.87      0.81       102
           2       0.55      0.36      0.44        44

    accuracy                           0.72       146
   macro avg       0.66      0.62      0.63       146
weighted avg       0.70      0.72      0.70       146

Roc_auc score
-------------------------------------------------------
0.6180926916221035

Confusion matrix
-------------------------------------------------------
[[89 13]
 [28 16]]


In [72]:
# K-NeighborsClassifier
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)
predict_train_k = kn.predict(X_train)
predict_test_k = kn.predict(X_test)

print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, predict_train_k))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, predict_test_k))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, predict_test_k))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, predict_test_k))


Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.83      0.90      0.86       314
           2       0.67      0.52      0.58       123

    accuracy                           0.79       437
   macro avg       0.75      0.71      0.72       437
weighted avg       0.78      0.79      0.78       437

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.76      0.89      0.82       102
           2       0.59      0.36      0.45        44

    accuracy                           0.73       146
   macro avg       0.68      0.63      0.64       146
weighted avg       0.71      0.73      0.71       146

Roc_auc score
-------------------------------------------------------
0.6278966131907309

Confusion matrix
-------------------------------------------------------
[[91 11]
 [28 16]]


In [73]:
# Extra Three
extra_t = ExtraTreesClassifier()
extra_t.fit(X_train, y_train)
predict_train_tree = extra_t.predict(X_train)
predict_test_tree = extra_t.predict(X_test)

print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, predict_train_tree))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, predict_test_tree))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, predict_test_tree))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, predict_test_tree))


Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       314
           2       1.00      1.00      1.00       123

    accuracy                           1.00       437
   macro avg       1.00      1.00      1.00       437
weighted avg       1.00      1.00      1.00       437

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.76      0.89      0.82       102
           2       0.59      0.36      0.45        44

    accuracy                           0.73       146
   macro avg       0.68      0.63      0.64       146
weighted avg       0.71      0.73      0.71       146

Roc_auc score
-------------------------------------------------------
0.6278966131907309

Confusion matrix
-------------------------------------------------------
[[91 11]
 [28 16]]


In [74]:
# XGboost 
xg = XGBClassifier()
xg.fit(X_train, y_train)
predict_train_xg = xg.predict(X_train)
predict_test_xg = xg.predict(X_test)

print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, predict_train_xg))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, predict_test_xg))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, predict_test_xg))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, predict_test_xg))




Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       314
           2       1.00      1.00      1.00       123

    accuracy                           1.00       437
   macro avg       1.00      1.00      1.00       437
weighted avg       1.00      1.00      1.00       437

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.78      0.83      0.81       102
           2       0.54      0.45      0.49        44

    accuracy                           0.72       146
   macro avg       0.66      0.64      0.65       146
weighted avg       0.71      0.72      0.71       146

Roc_auc score
-------------------------------------------------------
0.6439393939393939

Confusion matrix
-------------------------------------------------------
[[85 17]
 [24 20]]


In [45]:
#Model evaluation & optimization
# Random forest
params = {
    'n_estimators': [100, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [1,2,4,5],
    'min_samples_leaf': [1,2,4,5],
    'max_leaf_nodes': [4,10,20,50,None]
}

gs = GridSearchCV(rf, params, n_jobs=-1, cv= KFold(n_splits=3), scoring= 'roc_auc', verbose = 10)
gs.fit(X_train, y_train)


Fitting 3 folds for each of 480 candidates, totalling 1440 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   59.3s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 

NameError: name 'gs1' is not defined

In [59]:
print('Best score:', gs.best_score_)
print('Best score:', gs.best_params_)
print(gs.best_estimator_)

Best score: 0.7415139571578487
Best score: {'criterion': 'entropy', 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 500}
RandomForestClassifier(criterion='entropy', max_leaf_nodes=20,
                       min_samples_split=4, n_estimators=500, n_jobs=-1,
                       random_state=123)


In [63]:

# XGBoost

params = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01,0.05,0.1],
    'booster': ['gbtree', 'gblinear'],
    'gamma': [0, 0.5, 1],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [0.5, 1, 5],
    'base_score': [0.2, 0.5, 1]
}

gs2 = GridSearchCV(xg, params,  n_jobs=-1, cv=KFold(n_splits=3), scoring='roc_auc', verbose = 10)
gs2.fit(X_train, y_train)


print('Best score:', gs2.best_score_)
print('Best score:', gs2.best_params_)
print(gs2.best_estimator_)

Fitting 3 folds for each of 1458 candidates, totalling 4374 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:

In [79]:
# Extra Tree

params = {
    'n_estimators': [100, 200, 500],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [1,2,4,5],
    'min_samples_leaf': [1,2,4,5],
    'max_leaf_nodes': [4,10,20,50,None]
}

gs3 =  GridSearchCV(extra_t, params, n_jobs=-1, cv=KFold(n_splits=3), scoring='roc_auc',  verbose = 10)
gs3.fit(X_train, y_train)


print('Best score:', gs3.best_score_)
print('Best score:', gs3.best_params_)
print(gs3.best_estimator_)

Fitting 3 folds for each of 480 candidates, totalling 1440 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   42.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 

In [87]:
# Voting classifier
votes = [
    ('rf', gs.best_estimator_),
    ('xg', gs2.best_estimator_),
    ('extra_t', gs3.best_estimator_)
]

vote = VotingClassifier(estimators=votes, voting='hard', n_jobs=-1)
vote_cv = cross_validate(vote, X_train, y_train, cv= KFold(3, random_state=123))
vote.fit(X_train, y_train)
vote_cv

{'fit_time': array([1.32944417, 1.33044267, 1.03124118]),
 'score_time': array([0.14148712, 0.19049001, 0.17191982]),
 'test_score': array([0.74657534, 0.71917808, 0.69655172])}

In [88]:
model = vote
y_train_pred = vote.predict(X_train)
y_test_pred = vote.predict(X_test)

print('Train performance')
print('-------------------------------------------------------')
print(classification_report(y_train, y_train_pred))

print('Test performance')
print('-------------------------------------------------------')
print(classification_report(y_test, y_test_pred))

print('Roc_auc score')
print('-------------------------------------------------------')
print(roc_auc_score(y_test, y_test_pred))
print('')

print('Confusion matrix')
print('-------------------------------------------------------')
print(confusion_matrix(y_test, y_test_pred))

Train performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.74      1.00      0.85       314
           2       1.00      0.09      0.16       123

    accuracy                           0.74       437
   macro avg       0.87      0.54      0.51       437
weighted avg       0.81      0.74      0.66       437

Test performance
-------------------------------------------------------
              precision    recall  f1-score   support

           1       0.70      1.00      0.83       102
           2       1.00      0.02      0.04        44

    accuracy                           0.71       146
   macro avg       0.85      0.51      0.44       146
weighted avg       0.79      0.71      0.59       146

Roc_auc score
-------------------------------------------------------
0.5113636363636364

Confusion matrix
-------------------------------------------------------
[[102   0]
 [ 43   1]]
