<a href="https://colab.research.google.com/github/NvdSuni/Thesis-code-complete/blob/main/Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install optuna
!pip install imbalanced-learn

import optuna
from imblearn.over_sampling import SMOTE
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import joblib

In [None]:
from google.colab import drive
from imblearn.over_sampling import SMOTE
import optuna
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import __version__ as sklearn_version
from sklearn.ensemble import RandomForestClassifier
import joblib

# Report the versions of the libraries this notebook relies on, one per line,
# so that saved results can be tied to a concrete environment.
import imblearn
imblearn_version = imblearn.__version__

version_lines = [
    f"optuna: {optuna.__version__}",
    f"imbalanced-learn: {imblearn_version}",
    f"numpy: {np.__version__}",
    f"matplotlib: {plt.matplotlib.__version__}",
    f"scikit-learn: {sklearn_version}",
    f"joblib: {joblib.__version__}",
]
print("\n".join(version_lines))


In [None]:
# Load the saved combined train/validation feature and label arrays from the
# mounted Google Drive (order: X_train, X_val, y_train, y_val).
X_train_combined, X_val_combined, y_train_combined, y_val_combined = (
    np.load(array_path)
    for array_path in (
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/X_train_combined.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/X_val_combined.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/y_train_combined.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/y_val_combined.npy",
    )
)

#Untuned baseline model

In [None]:
# Baseline: a 100-tree forest with otherwise default hyperparameters, fitted on
# the combined training arrays (fit() returns the estimator itself).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_combined, y_train_combined
)

# Hard-label predictions -> accuracy on the validation arrays.
y_pred = rf_classifier.predict(X_val_combined)
accuracy = accuracy_score(y_val_combined, y_pred)
print(f"Accuracy on validation set: {accuracy}")

# Class probabilities -> one-vs-rest ROC-AUC on the validation arrays.
val_class_probabilities = rf_classifier.predict_proba(X_val_combined)
roc_auc = roc_auc_score(y_val_combined, val_class_probabilities, multi_class='ovr')
print(f"ROC-AUC score on validation set: {roc_auc}")

# Persist the fitted baseline model to Drive.
joblib.dump(rf_classifier, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model.joblib")

#Tuned model

In [None]:
def objective(trial):
    """Optuna objective for the random forest on the combined data.

    Samples one hyperparameter configuration from ``trial``, fits a forest on
    the notebook-level ``X_train_combined`` / ``y_train_combined`` arrays and
    scores it on ``X_val_combined`` / ``y_val_combined``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Macro-averaged one-vs-rest ROC-AUC on the validation arrays.
    """
    # Search space; random_state is fixed rather than sampled.
    hyperparameters = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42,
    }

    forest = RandomForestClassifier(**hyperparameters).fit(X_train_combined, y_train_combined)

    roc_auc = roc_auc_score(
        y_val_combined,
        forest.predict_proba(X_val_combined),
        multi_class='ovr',
        average='macro',
    )
    print(f'Trial {trial.number} ROC-AUC: {roc_auc}')
    return roc_auc

# Create a study that maximises the value returned by `objective`
# (the validation ROC-AUC).
study = optuna.create_study(direction='maximize')

# Optimize the objective function over 200 trials
study.optimize(objective, n_trials=200)

# Print the best hyperparameters found by Optuna
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Use the best hyperparameters to train the final model.
# best_params only holds the sampled parameters, so the fixed random_state
# used inside `objective` has to be passed again explicitly.
best_params = study.best_params
final_rf_classifier = RandomForestClassifier(**best_params, random_state=42)
final_rf_classifier.fit(X_train_combined, y_train_combined)

# Make predictions on the validation data for the classification report
y_pred = final_rf_classifier.predict(X_val_combined)

# Print the classification report
print(classification_report(y_val_combined, y_pred))

# Persist the tuned model to Drive.
joblib.dump(final_rf_classifier, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model_tuned.joblib")


#Class imbalance treated + Tuned

In [None]:
# Resample only the combined training arrays with SMOTE (seeded for
# reproducibility); the validation arrays are left as loaded.
smote = SMOTE(random_state=42)
resampled_features, resampled_labels = smote.fit_resample(X_train_combined, y_train_combined)
X_train_combined_balanced = resampled_features
y_train_combined_balanced = resampled_labels

In [None]:
def objective(trial):
    """Optuna objective for the random forest on the SMOTE-resampled combined data.

    Samples one hyperparameter configuration from ``trial``, fits a forest on
    the notebook-level ``X_train_combined_balanced`` /
    ``y_train_combined_balanced`` arrays and scores it on the (not resampled)
    ``X_val_combined`` / ``y_val_combined`` arrays.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Macro-averaged one-vs-rest ROC-AUC on the validation arrays.
    """
    # Search space; random_state is fixed rather than sampled.
    hyperparameters = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42,
    }

    forest = RandomForestClassifier(**hyperparameters).fit(
        X_train_combined_balanced, y_train_combined_balanced
    )

    roc_auc = roc_auc_score(
        y_val_combined,
        forest.predict_proba(X_val_combined),
        multi_class='ovr',
        average='macro',
    )
    print(f'Trial {trial.number} ROC-AUC: {roc_auc}')
    return roc_auc

# Run a fresh 200-trial search that maximises the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Summarise the best trial: its score followed by each sampled hyperparameter.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Refit on the SMOTE-resampled training arrays with the winning configuration
# (fit() returns the estimator itself).
best_params = study.best_params
final_rf_classifier = RandomForestClassifier(random_state=42, **best_params).fit(
    X_train_combined_balanced, y_train_combined_balanced
)

# Per-class report on the validation arrays.
y_pred = final_rf_classifier.predict(X_val_combined)
validation_report = classification_report(y_val_combined, y_pred)
print(validation_report)

# Persist the tuned SMOTE model to Drive.
joblib.dump(final_rf_classifier, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model_tuned_smote.joblib")


#Xray specific

In [None]:
# Load the saved X-ray train/validation feature and label arrays from the
# mounted Google Drive (order: X_train, X_val, y_train, y_val).
X_train_Xray, X_val_Xray, y_train_Xray, y_val_Xray = (
    np.load(array_path)
    for array_path in (
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/X_train_Xray_reduced.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/X_val_Xray_reduced.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/train_labels_complete_Xray.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/val_labels_complete_Xray.npy",
    )
)

In [None]:
# Shuffle features and labels together (same seed for both splits) so that
# rows stay aligned.
X_train_shuffled_Xray, y_train_shuffled_Xray = shuffle(X_train_Xray, y_train_Xray, random_state=42)
X_val_shuffled_Xray, y_val_shuffled_Xray = shuffle(X_val_Xray, y_val_Xray, random_state=42)

# Collapse each label row to the index of its largest entry along axis 1.
# NOTE(review): presumably the labels are stored one-hot encoded — confirm.
y_val_shuffled_Xray = y_val_shuffled_Xray.argmax(axis=1)
y_train_shuffled_Xray = y_train_shuffled_Xray.argmax(axis=1)

#Random Forest: Xray untuned

In [None]:
# Baseline: a 100-tree forest with otherwise default hyperparameters, fitted on
# the shuffled X-ray training arrays (fit() returns the estimator itself).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_shuffled_Xray, y_train_shuffled_Xray
)

# Hard-label predictions -> accuracy on the validation arrays.
y_pred_Xray = rf_classifier.predict(X_val_shuffled_Xray)
accuracy_Xray = accuracy_score(y_val_shuffled_Xray, y_pred_Xray)
print(f"Accuracy on validation set: {accuracy_Xray}")

# Class probabilities -> macro one-vs-rest ROC-AUC on the validation arrays.
val_class_probabilities_Xray = rf_classifier.predict_proba(X_val_shuffled_Xray)
y_pred_proba_Xray = np.array(val_class_probabilities_Xray)
roc_auc_Xray = roc_auc_score(
    y_val_shuffled_Xray,
    y_pred_proba_Xray,
    multi_class='ovr',
    average='macro',
)
print(f"ROC-AUC score on validation set: {roc_auc_Xray}")

# Persist the fitted baseline model to Drive.
joblib.dump(rf_classifier, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model_Xray.joblib")

#Xray tuned

In [None]:
def objective(trial):
    """Optuna objective for the random forest on the X-ray data.

    Samples one hyperparameter configuration from ``trial``, fits a forest on
    the notebook-level ``X_train_shuffled_Xray`` / ``y_train_shuffled_Xray``
    arrays and scores it on ``X_val_shuffled_Xray`` / ``y_val_shuffled_Xray``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Macro-averaged one-vs-rest ROC-AUC on the validation arrays.
    """
    # Search space; random_state is fixed rather than sampled.
    hyperparameters = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42,
    }

    forest = RandomForestClassifier(**hyperparameters).fit(
        X_train_shuffled_Xray, y_train_shuffled_Xray
    )

    roc_auc = roc_auc_score(
        y_val_shuffled_Xray,
        forest.predict_proba(X_val_shuffled_Xray),
        multi_class='ovr',
        average='macro',
    )
    print(f'Trial {trial.number} ROC-AUC: {roc_auc}')
    return roc_auc


# Run a fresh 200-trial search that maximises the value returned by `objective`.
study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=200)

best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

# Refit with the winning configuration. best_params only holds the sampled
# parameters, so the fixed random_state is passed again explicitly.
final_rf_classifier = RandomForestClassifier(**best_params, random_state=42)
final_rf_classifier.fit(X_train_shuffled_Xray, y_train_shuffled_Xray)

# The score below is computed on the X-ray *validation* arrays; the message
# says so. The variable keeps its historical name `roc_auc_test` so that any
# later reference to it still resolves.
y_val_pred_proba = final_rf_classifier.predict_proba(X_val_shuffled_Xray)
roc_auc_test = roc_auc_score(y_val_shuffled_Xray, y_val_pred_proba, multi_class='ovr', average='macro')
print(f'Final Model ROC-AUC on Validation Set: {roc_auc_test}')


# Persist the tuned X-ray model to Drive.
joblib.dump(final_rf_classifier, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model_tuned_Xray.joblib")


#Xray SMOTE

In [None]:
# Resample only the shuffled X-ray training arrays with SMOTE (seeded for
# reproducibility); the validation arrays are left untouched.
smote = SMOTE(random_state=42)
resampled_features_Xray, resampled_labels_Xray = smote.fit_resample(
    X_train_shuffled_Xray, y_train_shuffled_Xray
)
X_train_smote_Xray = resampled_features_Xray
y_train_smote_Xray = resampled_labels_Xray

In [None]:
def objective(trial):
    """Optuna objective for the random forest on the SMOTE-resampled X-ray data.

    Samples one hyperparameter configuration from ``trial``, fits a forest on
    the notebook-level ``X_train_smote_Xray`` / ``y_train_smote_Xray`` arrays
    and scores it on the (not resampled) ``X_val_shuffled_Xray`` /
    ``y_val_shuffled_Xray`` arrays.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Macro-averaged one-vs-rest ROC-AUC on the validation arrays.
    """
    # Search space; random_state is fixed rather than sampled.
    hyperparameters = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42,
    }

    forest = RandomForestClassifier(**hyperparameters).fit(
        X_train_smote_Xray, y_train_smote_Xray
    )

    roc_auc = roc_auc_score(
        y_val_shuffled_Xray,
        forest.predict_proba(X_val_shuffled_Xray),
        multi_class='ovr',
        average='macro',
    )
    print(f'Trial {trial.number} ROC-AUC: {roc_auc}')
    return roc_auc

# Run a fresh 200-trial search that maximises the value returned by `objective`.
study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=200)

best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

# Refit on the SMOTE-resampled X-ray training arrays with the winning
# configuration. best_params only holds the sampled parameters, so the fixed
# random_state is passed again explicitly.
final_rf_classifier_smote = RandomForestClassifier(**best_params, random_state=42)
final_rf_classifier_smote.fit(X_train_smote_Xray, y_train_smote_Xray)

y_val_pred_proba = final_rf_classifier_smote.predict_proba(X_val_shuffled_Xray)

roc_auc_val = roc_auc_score(y_val_shuffled_Xray, y_val_pred_proba, multi_class='ovr', average='macro')
print(f'ROC-AUC on Validation Set: {roc_auc_val}')


# Save the SMOTE-trained model fitted above. `final_rf_classifier` would be the
# tuned model from the non-SMOTE X-ray section and must not end up in this file.
joblib.dump(final_rf_classifier_smote, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model_tuned_Xray_smote.joblib")

#MRI Specific

In [None]:
# Load the saved MRNet train/validation feature and label arrays from the
# mounted Google Drive (order: X_train, X_val, y_train, y_val).
X_train_MRNet, X_val_MRNet, y_train_MRNet, y_val_MRNet = (
    np.load(array_path)
    for array_path in (
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/X_train_MRNet_reduced.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/X_val_MRNet_reduced.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/y_train_MRNet.npy",
        "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/y_val_MRNet.npy",
    )
)

In [None]:
# Baseline: a 100-tree forest with otherwise default hyperparameters, fitted on
# the MRNet training arrays (fit() returns the estimator itself).
rf_classifier_MRNet = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_MRNet, y_train_MRNet
)

# Score with the column-1 probabilities, which this notebook treats as the
# positive-class score.
y_pred_proba = rf_classifier_MRNet.predict_proba(X_val_MRNet)
y_pred_proba_positive_class = y_pred_proba[:, 1]
roc_auc_MRNet = roc_auc_score(y_val_MRNet, y_pred_proba_positive_class)
print(f"ROC-AUC score on validation set: {roc_auc_MRNet}")

# Persist the fitted baseline model to Drive.
joblib.dump(rf_classifier_MRNet, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model_MRNet.joblib")

#MRI Tuned

In [None]:
def objective(trial):
    """Optuna objective for the random forest on the MRNet data.

    Samples one hyperparameter configuration from ``trial``, fits a forest on
    the notebook-level ``X_train_MRNet`` / ``y_train_MRNet`` arrays and scores
    it on ``X_val_MRNet`` / ``y_val_MRNet``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC-AUC on the validation arrays, computed from the column-1
        (positive-class) probabilities.
    """
    # Search space; random_state is fixed rather than sampled.
    hyperparameters = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42,
    }

    forest = RandomForestClassifier(**hyperparameters).fit(X_train_MRNet, y_train_MRNet)

    positive_class_scores = forest.predict_proba(X_val_MRNet)[:, 1]
    roc_auc = roc_auc_score(y_val_MRNet, positive_class_scores)
    print(f'Trial {trial.number} ROC-AUC: {roc_auc}')
    return roc_auc

# Run a fresh 200-trial search that maximises the value returned by `objective`.
study = optuna.create_study(direction='maximize')

study.optimize(objective, n_trials=200)


best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

# Refit with the winning configuration. best_params only holds the sampled
# parameters, so the fixed random_state is passed again explicitly.
final_rf_classifier_MRNet = RandomForestClassifier(**best_params, random_state=42)
final_rf_classifier_MRNet.fit(X_train_MRNet, y_train_MRNet)

# Score the *tuned* model: the positive-class column must be sliced from this
# model's probabilities (y_val_pred_proba), not from the `y_pred_proba` array
# left over from the baseline MRNet cell.
y_val_pred_proba = final_rf_classifier_MRNet.predict_proba(X_val_MRNet)
y_pred_proba_positive_class = y_val_pred_proba[:, 1]

# A 1-D positive-class score is passed, as in the other MRNet cells, so no
# multi_class/average arguments are needed.
roc_auc_val = roc_auc_score(y_val_MRNet, y_pred_proba_positive_class)
print(f'ROC-AUC on Validation Set: {roc_auc_val}')

# Persist the tuned MRNet model to Drive.
joblib.dump(final_rf_classifier_MRNet, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model_tuned_MRNet.joblib")


#MRI Tuned + SMOTE

In [None]:
# Resample only the MRNet training arrays with SMOTE (seeded for
# reproducibility); the validation arrays are left as loaded.
smote = SMOTE(random_state=42)
resampled_features_MRNet, resampled_labels_MRNet = smote.fit_resample(X_train_MRNet, y_train_MRNet)
X_train_smote_MRNet = resampled_features_MRNet
y_train_smote_MRNet = resampled_labels_MRNet

In [None]:
def objective(trial):
    """Optuna objective for the random forest on the SMOTE-resampled MRNet data.

    Samples one hyperparameter configuration from ``trial``, fits a forest on
    the notebook-level ``X_train_smote_MRNet`` / ``y_train_smote_MRNet`` arrays
    and scores it on the (not resampled) ``X_val_MRNet`` / ``y_val_MRNet``
    arrays.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC-AUC on the validation arrays, computed from the column-1
        (positive-class) probabilities.
    """
    # Search space; random_state is fixed rather than sampled.
    hyperparameters = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'random_state': 42,
    }

    forest = RandomForestClassifier(**hyperparameters).fit(
        X_train_smote_MRNet, y_train_smote_MRNet
    )

    positive_class_scores = forest.predict_proba(X_val_MRNet)[:, 1]
    roc_auc = roc_auc_score(
        y_val_MRNet,
        positive_class_scores,
        multi_class='ovr',
        average='macro',
    )
    print(f'Trial {trial.number} ROC-AUC: {roc_auc}')
    return roc_auc

# Run a fresh 200-trial search that maximises the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

# Refit on the SMOTE-resampled MRNet training arrays with the winning
# configuration (fit() returns the estimator itself).
final_rf_classifier_smote_MRNet = RandomForestClassifier(random_state=42, **best_params).fit(
    X_train_smote_MRNet, y_train_smote_MRNet
)

# Hard labels feed the classification report; column-1 probabilities feed ROC-AUC.
y_val_pred = final_rf_classifier_smote_MRNet.predict(X_val_MRNet)
y_val_pred_proba = final_rf_classifier_smote_MRNet.predict_proba(X_val_MRNet)[:, 1]

roc_auc_val = roc_auc_score(y_val_MRNet, y_val_pred_proba)
print(f'ROC-AUC on Validation Set: {roc_auc_val}')

classification_rep = classification_report(y_val_MRNet, y_val_pred)
print(f'Classification Report:\n{classification_rep}')

# Persist the tuned SMOTE model to Drive.
joblib.dump(final_rf_classifier_smote_MRNet, "/content/drive/My Drive/Tilburg University/Master Thesis/Combined data/Models/Random Forest/random_forest_model_tuned_smote_MRNet.joblib")

