In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, cohen_kappa_score

In [2]:
# Training data: the 'Class' column of this frame is used as the target further down.
# (Presumably already preprocessed, judging by the file name — confirm upstream.)
train_file = pd.read_csv('../data/cleaned_train.csv')
# Separate test file; it is scored later and the predictions are written to a CSV.
test = pd.read_csv('../data/cleaned_test.csv')

In [3]:
# Separate the target column from the predictor columns
y = train_file['Class']
X = train_file.drop(columns='Class')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y — confirm that is acceptable
# given the class balance of the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from imblearn.over_sampling import RandomOverSampler

# Random over-sampling with a fixed seed so the resampled set is reproducible.
# Only the training split is resampled here; X_test / y_test are left as they are.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(
    X_train,
    y_train,
)

# Show the shapes of the resampled features and target
(X_train_resampled.shape, y_train_resampled.shape)

((132, 19), (132,))

In [5]:
# Build the reduced training frame by removing the features with the lower
# importance (the importance ranking itself is not computed in this cell).
# drop() returns a new DataFrame, so X_train_resampled stays intact and this
# cell can be re-run safely; `columns=` alone is enough, no `axis` needed.
x_train_resampled_new = X_train_resampled.drop(
    columns=['PROTIME', 'OneHotSex', 'ANTIVIRALS', 'LIVER BIG', 'STEROID', 'VARICES']
)
x_train_resampled_new

Unnamed: 0,FATIGUE,MALAISE,ANOREXIA,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,HISTOLOGY,AGE,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN
0,1,1,1,0,0,0,0,1,0.258065,0.064935,0.333702,0.000000,0.843750
1,1,0,0,1,0,0,0,1,0.548387,0.077922,0.225664,0.174242,0.562500
2,1,0,1,0,1,1,0,1,0.709677,0.149679,0.333702,0.251263,0.535282
3,0,0,0,0,0,0,0,1,0.370968,0.090909,0.243363,0.015152,0.593750
4,0,0,0,1,0,0,0,0,0.500000,0.012987,0.942478,0.132576,0.531250
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,1,1,0,1,0,0,1,0,0.645161,0.181818,0.247788,0.022727,0.000000
128,1,0,0,1,1,1,0,1,0.822581,0.220779,0.606195,0.863636,0.375000
129,1,1,0,1,0,0,1,0,0.645161,0.181818,0.247788,0.022727,0.000000
130,1,1,1,1,0,1,0,0,0.790323,0.337662,0.265487,0.526515,0.593750


In [6]:
# Logistic regression (default hyper-parameters) trained on the resampled,
# reduced-feature training frame. The fit call is the last expression so the
# fitted estimator is shown as the cell output.
lr3 = LogisticRegression()
lr3.fit(X=x_train_resampled_new, y=y_train_resampled)

In [7]:
# Give the hold-out split the same reduced feature set that lr3 was trained on,
# then predict its labels. drop() already returns a new DataFrame, so X_test is
# not modified and no explicit copy is required.
X_test_new = X_test.drop(
    columns=['PROTIME', 'OneHotSex', 'ANTIVIRALS', 'LIVER BIG', 'STEROID', 'VARICES']
)
y_pred3_test = lr3.predict(X_test_new)

In [8]:
# Evaluate lr3 on the hold-out split: compute every metric first, then report them.
accuracy = accuracy_score(y_test, y_pred3_test)
conf_matrix = confusion_matrix(y_test, y_pred3_test)
class_report = classification_report(y_test, y_pred3_test)
kappa3 = cohen_kappa_score(y_test, y_pred3_test)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')
# Cohen's kappa for the resampled + reduced-feature model (printed without a label)
print(kappa3)

# Optional overfitting check (not run): compute the same kappa on the training
# split, after dropping the same six columns from X_train.

Accuracy: 0.95
Confusion Matrix:
[[ 5  1]
 [ 0 15]]
Classification Report:
              precision    recall  f1-score   support

         DIE       1.00      0.83      0.91         6
        LIVE       0.94      1.00      0.97        15

    accuracy                           0.95        21
   macro avg       0.97      0.92      0.94        21
weighted avg       0.96      0.95      0.95        21

0.8771929824561403


In [9]:
# Apply the reduced-feature model (lr3) to the separate `test` frame.
# drop() returns a new DataFrame, so `test` itself is left unchanged and the
# cell is safe to re-run.
X_final = test.drop(
    columns=['PROTIME', 'OneHotSex', 'ANTIVIRALS', 'LIVER BIG', 'STEROID', 'VARICES']
)

y_pred_final = lr3.predict(X_final)
y_pred_final_df = pd.DataFrame(y_pred_final, columns=['Class'])

# Save the predictions to a CSV file with a single 'Class' column and no index column
y_pred_final_df.to_csv('group_7-3.csv', index=False)

In [10]:
# Read the saved predictions back in to check what was written to disk
pd.read_csv('group_7-3.csv')

Unnamed: 0,Class
0,LIVE
1,LIVE
2,LIVE
3,LIVE
4,LIVE
5,LIVE
6,LIVE
7,LIVE
8,DIE
9,LIVE


In [11]:
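# Testing other model variants for comparison: logistic regression on the original training split (lr) and on the oversampled split with all features (lr2)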

In [12]:
# Baseline: logistic regression on the original (non-resampled) training split
# with every feature kept; fit() returns the estimator, so it can be chained.
lr = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the hold-out split
y_pred_test = lr.predict(X_test)

In [13]:
# Display the baseline model's predicted labels for the hold-out split
y_pred_test

array(['LIVE', 'LIVE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE'], dtype=object)

In [14]:
# Evaluate the baseline model (lr) on the hold-out split: compute, then report.
accuracy = accuracy_score(y_test, y_pred_test)
conf_matrix = confusion_matrix(y_test, y_pred_test)
class_report = classification_report(y_test, y_pred_test)
# Cohen's kappa with the original columns and no resampling
kappa1 = cohen_kappa_score(y_test, y_pred_test)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')
print(kappa1)

# Optional overfitting check (not run): compute the same kappa from lr's
# predictions on X_train against y_train.

Accuracy: 0.86
Confusion Matrix:
[[ 3  3]
 [ 0 15]]
Classification Report:
              precision    recall  f1-score   support

         DIE       1.00      0.50      0.67         6
        LIVE       0.83      1.00      0.91        15

    accuracy                           0.86        21
   macro avg       0.92      0.75      0.79        21
weighted avg       0.88      0.86      0.84        21

0.5882352941176471


In [15]:
# Logistic regression on the oversampled training split with all features kept.
# The fit call is the last expression so the fitted estimator is the cell output.
lr2 = LogisticRegression()
lr2.fit(X=X_train_resampled, y=y_train_resampled)

In [16]:
# Predict hold-out labels with lr2 (the model fitted on the resampled training data);
# the bare name on the last line displays the predictions as the cell output.
y_pred2_test = lr2.predict(X_test)
y_pred2_test

array(['LIVE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'DIE', 'DIE', 'DIE', 'LIVE', 'LIVE', 'LIVE', 'LIVE', 'LIVE',
       'LIVE', 'LIVE', 'LIVE', 'DIE', 'LIVE'], dtype=object)

In [17]:
# Evaluate lr2 on the hold-out split: compute every metric first, then report them.
accuracy = accuracy_score(y_test, y_pred2_test)
conf_matrix = confusion_matrix(y_test, y_pred2_test)
class_report = classification_report(y_test, y_pred2_test)
# Cohen's kappa for the resampled training data without dropping any columns
kappa2 = cohen_kappa_score(y_test, y_pred2_test)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')
print(kappa2)

# Optional overfitting check (not run): compute the same kappa from lr2's
# predictions on X_train against y_train.

Accuracy: 0.90
Confusion Matrix:
[[ 5  1]
 [ 1 14]]
Classification Report:
              precision    recall  f1-score   support

         DIE       0.83      0.83      0.83         6
        LIVE       0.93      0.93      0.93        15

    accuracy                           0.90        21
   macro avg       0.88      0.88      0.88        21
weighted avg       0.90      0.90      0.90        21

0.7666666666666666
