In [52]:
import pandas as pd
import numpy as np

train = pd.read_csv('../data/non_scaled_cleaned_train.csv')
test = pd.read_csv('../data/non_scaled_data_cleaned_test.csv')
display(train.head())
display(train.shape)

Unnamed: 0,AGE,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,Class,OneHotSex
0,48,1,0,1,1,0,0,1,0,1,1,1,4.8,123.0,157.0,2.7,31.0,0,DIE,1.0
1,51,0,0,1,0,0,0,1,1,1,0,1,1.0,105.416667,20.0,3.0,63.0,0,LIVE,1.0
2,40,1,0,1,0,0,0,1,0,0,0,0,0.6,62.0,166.0,4.0,63.0,1,LIVE,1.0
3,25,0,0,1,0,0,1,1,1,1,1,1,1.3,181.0,181.0,4.5,57.0,0,LIVE,1.0
4,34,1,0,1,0,0,1,1,0,1,0,0,1.0,72.0,46.0,4.4,57.0,1,LIVE,1.0


(102, 20)

In [2]:
train['Class'].unique()

array(['DIE', 'LIVE'], dtype=object)

In [3]:
display(test.head())
display(test.shape)

Unnamed: 0,AGE,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY,OneHotSex
0,39,0,0,1,0,0,0,1,0,0,0,0,0.7,105.142857,48.0,4.4,61.030303,1,1.0
1,41,0,1,1,1,1,0,0,0,0,0,0,0.7,81.0,53.0,5.0,74.0,1,0.0
2,28,1,0,1,1,1,0,1,0,0,0,0,1.6,44.0,123.0,4.0,46.0,1,1.0
3,36,1,0,1,1,1,0,1,0,0,0,0,1.0,105.142857,45.0,4.0,57.0,1,1.0
4,32,0,0,0,0,0,0,0,0,0,0,0,0.7,102.0,64.0,4.0,90.0,1,1.0


(53, 19)

#### Checking Class Imbalance on the prediction column "Class

In [4]:
train['Class'].value_counts()

Class
LIVE    81
DIE     21
Name: count, dtype: int64

The Data Set is not very big (102 columns). Therefore I will use an upsampling approach to not further reduce the data set.

In [5]:
from sklearn.model_selection import train_test_split

y = train['Class']
X = train.drop(columns='Class')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from imblearn.over_sampling import RandomOverSampler

# resample the X_train and y_train data
ros = RandomOverSampler(random_state=42)

# for now, ros is only applied on the train data
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

X_train_resampled.shape, y_train_resampled.shape

((132, 19), (132,))

In [7]:
y_train_resampled.value_counts()

Class
LIVE    66
DIE     66
Name: count, dtype: int64

The data is now balanced and the RandomForestClassifier can be applied. It's investigated with hyperparameters are the best to use.

In [44]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, #default=100, number of trees in the forest
                             max_depth=5, # no. of tree levels; If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
                             min_samples_split=20, #
                             min_samples_leaf =20,
                             max_samples=0.8,
                             random_state=42)

clf.fit(X_train_resampled, y_train_resampled)

print("The Accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train_resampled, y_train_resampled)))
print("The Accuracy for the Random Forest in the TEST  set is {:.2f}".format(clf.score(X_test, y_test)))

The Accuracy for the Random Forest in the TRAIN set is 0.89
The Accuracy for the Random Forest in the TEST  set is 0.90


In [57]:
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=10)
print("The mean Accuracy of the folds was {:.2f}".format(np.mean(cross_val_scores)))

The mean Accuracy of the folds was 0.87


In [58]:
cross_val_scores

array([0.92857143, 0.85714286, 0.92307692, 0.92307692, 0.92307692,
       0.84615385, 0.92307692, 0.76923077, 0.76923077, 0.84615385])

In [39]:
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score

# Predictions on the test set
y_pred_test = clf.predict(X_test)

# Classification report and confusion matrix
print("Classification Report:\n", classification_report(y_test, y_pred_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))

print("Cohen Kappa:\n", cohen_kappa_score(y_test, y_pred_test))

Classification Report:
               precision    recall  f1-score   support

         DIE       0.75      1.00      0.86         6
        LIVE       1.00      0.87      0.93        15

    accuracy                           0.90        21
   macro avg       0.88      0.93      0.89        21
weighted avg       0.93      0.90      0.91        21

Confusion Matrix:
 [[ 6  0]
 [ 2 13]]
Cohen Kappa:
 0.7878787878787878


The accuracy scores indicate the proportion of correctly classified instances. The model shows good performance on both the training and test sets.

The precision, recall, and F1-score values for both 'DIE' and 'LIVE' classes are relatively high, indicating a balanced performance

Looking at the confusion matrix:
It shows that there are some instances where the model is making false negative predictions ('DIE' class), but overall, the model is performing well.

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier
clf2 = RandomForestClassifier(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [5, 10],
    'max_samples': [0.8]
}

# Create a GridSearchCV object with precision as the scoring metric
#grid_search = GridSearchCV(rf_classifier, scorer=scorer, param_grid, cv=5)
grid_search = GridSearchCV(clf2, param_grid, cv=5, return_train_score=True, n_jobs=-1, verbose = 20)

# Fit the model using GridSearchCV
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best parameters and the best precision score
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the results
print(best_params)
print(best_model)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
{'max_depth': 5, 'max_samples': 0.8, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 100}
RandomForestClassifier(max_depth=5, max_samples=0.8, min_samples_leaf=5,
                       min_samples_split=10, random_state=42)


In [9]:
grid_search.best_estimator_

In [32]:
names = best_model.feature_names_in_
values = best_model.feature_importances_

best_features = pd.DataFrame({"Name":names, "value":values})
best_features.sort_values(by="value", ascending=False)

Unnamed: 0,Name,value
12,BILIRUBIN,0.20817
15,ALBUMIN,0.204223
17,HISTOLOGY,0.089147
9,SPIDERS,0.074751
0,AGE,0.069679
13,ALK PHOSPHATE,0.060532
4,MALAISE,0.046591
3,FATIGUE,0.044963
14,SGOT,0.042315
16,PROTIME,0.038831


In [34]:
from sklearn.model_selection import cross_val_score
clf2 = grid_search.best_estimator_
cross_val_scores = cross_val_score(clf2, X_train_resampled, y_train_resampled, cv=10)
cross_val_scores

array([0.92857143, 0.85714286, 1.        , 0.92307692, 0.92307692,
       0.84615385, 0.92307692, 0.84615385, 0.84615385, 0.84615385])

In [36]:
print("The Accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf2.score(X_train_resampled, y_train_resampled)))
print("The Accuracy for the Random Forest in the TEST  set is {:.2f}".format(clf2.score(X_test, y_test)))

The Accuracy for the Random Forest in the TRAIN set is 0.92
The Accuracy for the Random Forest in the TEST  set is 0.90


In [37]:
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score

# Predictions on the test set
y_pred_test2 = clf2.predict(X_test)

# Classification report and confusion matrix
print("Classification Report:\n", classification_report(y_test, y_pred_test2))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test2))

print("Cohen Kappa:\n", cohen_kappa_score(y_test, y_pred_test2))

Classification Report:
               precision    recall  f1-score   support

         DIE       0.75      1.00      0.86         6
        LIVE       1.00      0.87      0.93        15

    accuracy                           0.90        21
   macro avg       0.88      0.93      0.89        21
weighted avg       0.93      0.90      0.91        21

Confusion Matrix:
 [[ 6  0]
 [ 2 13]]
Cohen Kappa:
 0.7878787878787878


In [32]:
X.columns == test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [55]:
# Predictions on the test set with first attempt parameters
predict1 = clf.predict(test)

In [56]:
# Predictions on the test set with best parameters
predict2 = clf2.predict(test)

In [59]:
test['Class'] = predict1
test['Class2'] = predict2

In [61]:
test['Class'].value_counts()

Class
LIVE    36
DIE     17
Name: count, dtype: int64

In [62]:
test['Class2'].value_counts()

Class2
LIVE    37
DIE     16
Name: count, dtype: int64

In [70]:
random_forest_pred = pd.DataFrame(test['Class2'])

In [72]:
random_forest_pred.columns = ["Class"]

In [73]:
random_forest_pred

Unnamed: 0,Class
0,LIVE
1,LIVE
2,LIVE
3,LIVE
4,LIVE
5,LIVE
6,LIVE
7,LIVE
8,DIE
9,LIVE


In [75]:
random_forest_pred.to_csv('group_7.csv', index=False)