In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, cohen_kappa_score

In [2]:
# Read the training CSV into a DataFrame; the path is relative to the notebook.
TRAIN_CSV_PATH = '../data/cleaned_train.csv'
train_file = pd.read_csv(TRAIN_CSV_PATH)
# The separate test CSV is currently not loaded (left commented out):
#test_file = pd.read_csv('../data/cleaned_test.csv')

In [3]:
# Separate the target column ('Class') from the feature columns.
y = train_file['Class']
X = train_file.drop(columns=['Class'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified. With so few rows and imbalanced
# classes, stratify=y may be worth considering - confirm first, because it
# changes every downstream score.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Baseline model: logistic regression with library-default settings, fitted on
# the original (non-resampled) training split.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [5]:
# Baseline predictions for the held-out rows.
y_pred_test = lr.predict(X=X_test)

In [7]:
# Show the baseline model's predicted labels for the held-out rows.
y_pred_test

array(['LIVE', 'LIVE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE'], dtype=object)

In [8]:
# Score the baseline model (all original columns) on the held-out split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
class_report = classification_report(y_true=y_test, y_pred=y_pred_test)
# Cohen's kappa for the baseline; kept under its own name for the final recap.
kappa1 = cohen_kappa_score(y_test, y_pred_test)

# Report the metrics in a fixed order: accuracy, confusion matrix, per-class
# report, then the test-set kappa.
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')
print(kappa1)

# Kappa on the training split; as the last expression it becomes the cell's
# output value, for comparison with the test kappa printed above.
y_pred_train = lr.predict(X_train)
cohen_kappa_score(y_train, y_pred_train)

Accuracy: 0.86
Confusion Matrix:
[[ 3  3]
 [ 0 15]]
Classification Report:
              precision    recall  f1-score   support

         DIE       1.00      0.50      0.67         6
        LIVE       0.83      1.00      0.91        15

    accuracy                           0.86        21
   macro avg       0.92      0.75      0.79        21
weighted avg       0.88      0.86      0.84        21

0.5882352941176471


0.48347826086956525

In [9]:
# Balance the training classes by random oversampling.
# RandomOverSampler is imported in the notebook's top import cell, so every
# dependency is visible in one place.
# The fixed seed makes the resampling reproducible.
ros = RandomOverSampler(random_state=42)

# Only the training split is resampled; X_test / y_test are not touched here,
# so the models are still scored on rows in their original proportions.
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Shapes of the resampled features and labels (shown as the cell output).
X_train_resampled.shape, y_train_resampled.shape

((132, 19), (132,))

In [10]:
# Second model: same default logistic regression, fitted on the oversampled
# training data instead of the original split.
lr2 = LogisticRegression()
lr2.fit(X=X_train_resampled, y=y_train_resampled)

In [11]:
# Predictions of the oversampled model on the same held-out rows, displayed
# as the cell output.
y_pred2_test = lr2.predict(X=X_test)
y_pred2_test

array(['LIVE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'DIE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE', 'DIE', 'LIVE'], dtype=object)

In [12]:
# Score the oversampled model (lr2) on the held-out split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred2_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred2_test)
class_report = classification_report(y_true=y_test, y_pred=y_pred2_test)
# Test-set kappa for the model trained on resampled data.
kappa2 = cohen_kappa_score(y_test, y_pred2_test)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')
print(kappa2)

# lr2 was fitted on the oversampled rows, but this training kappa is computed
# on the original (non-resampled) X_train / y_train. It is the cell's output.
y_pred2_train = lr2.predict(X_train)
cohen_kappa_score(y_train, y_pred2_train)

Accuracy: 0.90
Confusion Matrix:
[[ 5  1]
 [ 1 14]]
Classification Report:
              precision    recall  f1-score   support

         DIE       0.83      0.83      0.83         6
        LIVE       0.93      0.93      0.93        15

    accuracy                           0.90        21
   macro avg       0.88      0.88      0.88        21
weighted avg       0.90      0.90      0.90        21

0.7666666666666666


0.6880616174582799

In [47]:
# Reduced feature set for the third model: remove six columns from the
# resampled training features. Despite the variable name, more than PROTIME
# is dropped.
# drop() returns a new DataFrame, so X_train_resampled itself is left
# unchanged and no explicit copy / inplace mutation is needed.
x_train_resampled_drop_protime = X_train_resampled.drop(
    columns=['PROTIME', 'OneHotSex', 'ANTIVIRALS', 'LIVER BIG', 'STEROID', 'VARICES']
)
x_train_resampled_drop_protime

Unnamed: 0,FATIGUE,MALAISE,ANOREXIA,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,HISTOLOGY,AGE,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN
0,1,1,1,0,0,0,0,1,0.258065,0.064935,0.333702,0.000000,0.843750
1,1,0,0,1,0,0,0,1,0.548387,0.077922,0.225664,0.174242,0.562500
2,1,0,1,0,1,1,0,1,0.709677,0.149679,0.333702,0.251263,0.535282
3,0,0,0,0,0,0,0,1,0.370968,0.090909,0.243363,0.015152,0.593750
4,0,0,0,1,0,0,0,0,0.500000,0.012987,0.942478,0.132576,0.531250
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,1,1,0,1,0,0,1,0,0.645161,0.181818,0.247788,0.022727,0.000000
128,1,0,0,1,1,1,0,1,0.822581,0.220779,0.606195,0.863636,0.375000
129,1,1,0,1,0,0,1,0,0.645161,0.181818,0.247788,0.022727,0.000000
130,1,1,1,1,0,1,0,0,0.790323,0.337662,0.265487,0.526515,0.593750


In [48]:
# Third model: default logistic regression fitted on the oversampled training
# data with the reduced column set.
lr3 = LogisticRegression()
lr3.fit(X=x_train_resampled_drop_protime, y=y_train_resampled)

In [49]:
# Columns excluded from the third model. Kept in one list so the test and
# train frames below always drop exactly the same set; it has to match the
# columns removed from the training features before lr3 was fitted.
dropped_columns = ['PROTIME', 'OneHotSex', 'ANTIVIRALS', 'LIVER BIG', 'STEROID', 'VARICES']

# drop() already returns a new frame, so X_test is not modified.
X_test_drop_protime = X_test.drop(columns=dropped_columns)
y_pred3_test = lr3.predict(X_test_drop_protime)

accuracy = accuracy_score(y_test, y_pred3_test)
print(f'Accuracy: {accuracy:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred3_test)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred3_test)
print('Classification Report:')
print(class_report)

# Test-set kappa for the model trained on resampled data with the six
# columns above removed (not only PROTIME).
kappa3 = cohen_kappa_score(y_test, y_pred3_test)
print(kappa3)

# Training kappa on the original (non-resampled) split with the same columns
# removed; as the last expression it becomes the cell's output value.
y_pred3_train = lr3.predict(X_train.drop(columns=dropped_columns))
cohen_kappa_score(y_train, y_pred3_train)

Accuracy: 0.95
Confusion Matrix:
[[ 5  1]
 [ 0 15]]
Classification Report:
              precision    recall  f1-score   support

         DIE       1.00      0.83      0.91         6
        LIVE       0.94      1.00      0.97        15

    accuracy                           0.95        21
   macro avg       0.97      0.92      0.94        21
weighted avg       0.96      0.95      0.95        21

0.8771929824561403


0.6346863468634687

In [50]:
# Recap: for each of the three models, show its held-out predictions followed
# by its test-set Cohen's kappa, in the order the models were built.
display(
    y_pred_test, kappa1,    # default model, original training split
    y_pred2_test, kappa2,   # trained on oversampled data
    y_pred3_test, kappa3,   # oversampled data, reduced column set
)

array(['LIVE', 'LIVE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE'], dtype=object)

0.5882352941176471

array(['LIVE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'DIE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE', 'DIE', 'LIVE'], dtype=object)

0.7666666666666666

array(['LIVE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'DIE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE'], dtype=object)

0.8771929824561403