In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from joblib import dump

In [2]:
# Directory that holds the prepared train/test splits, relative to this notebook
prepared_data_dir = os.path.join('..', 'datasets', 'prepared_data')

# Dataset paths
X_train_resampled_path = os.path.join(prepared_data_dir, 'X_train_resampled.csv')  # SMOTE applied
X_train_reduced_path = os.path.join(prepared_data_dir, 'X_train_reduced.csv')      # PCA applied (it was resampled before)
X_test_path = os.path.join(prepared_data_dir, 'X_test_transformed.csv')
X_test_reduced_path = os.path.join(prepared_data_dir, 'X_test_reduced.csv')        # PCA applied
y_train_path = os.path.join(prepared_data_dir, 'y_train_resampled.csv')            # SMOTE applied
y_test_path = os.path.join(prepared_data_dir, 'y_test.csv')

# Load feature matrices
X_train_resampled = pd.read_csv(X_train_resampled_path)  # SMOTE-resampled training features
X_train_reduced = pd.read_csv(X_train_reduced_path)      # PCA applied (it was resampled before)
X_test_reduced = pd.read_csv(X_test_reduced_path)        # PCA applied
X_test = pd.read_csv(X_test_path)

# Load labels and flatten them to 1D arrays, as expected by the estimators
y_train = pd.read_csv(y_train_path).values.ravel()
y_test = pd.read_csv(y_test_path).values.ravel()

# Sanity checks: ravel() silently flattens a multi-column CSV (e.g. one saved
# together with its index), which would leave labels misaligned with the rows.
# Both training matrices share y_train; both test matrices share y_test.
assert len(X_train_resampled) == len(y_train), "X_train_resampled and y_train have different row counts"
assert len(X_train_reduced) == len(y_train), "X_train_reduced and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"
assert len(X_test_reduced) == len(y_test), "X_test_reduced and y_test have different row counts"

In [3]:
# Hyperparameter search space: 4 depths x 2 criteria x 3 forest sizes x
# 4 split thresholds = 96 candidate configurations.
param_grid = dict(
    max_depth=[5, 7, 9, 12],            # maximum depth of each tree
    criterion=['gini', 'entropy'],      # function measuring the quality of a split
    n_estimators=[75, 100, 150],        # number of trees in the forest
    min_samples_split=[4, 5, 6, 8],     # minimum samples required to split an internal node
)

# Base estimator with a fixed seed
rfc = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid with 5 cross-validation folds, ranking
# candidates by recall (switch `scoring` to 'roc_auc' or 'f1' if needed) and
# using all available CPU cores; verbose=1 prints progress details.
# NOTE(review): the folds are cut from the already SMOTE-resampled training
# set, so the cross-validated recall is presumably optimistic compared with
# the untouched test set -- confirm before quoting the CV score.
grid_search = GridSearchCV(rfc, param_grid, scoring='recall', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_resampled, y_train)

# Report the winning configuration and its score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Keep the best estimator found by the search for prediction and saving
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best parameters: {'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 4, 'n_estimators': 75}
Best score: 0.8790447704324633


In [4]:
# Predict test-set labels with the model tuned on X_train_resampled.
# X_test is the frame loaded from X_test_transformed.csv (not the PCA-reduced one).
# Note: `y_pred` is reassigned further down by the reduced-dimensionality model.
y_pred = best_model.predict(X_test)

In [5]:
# Per-class precision / recall / F1 on the test set for the model trained on X_train_resampled
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.78      0.83      1552
           1       0.55      0.75      0.63       561

    accuracy                           0.77      2113
   macro avg       0.72      0.76      0.73      2113
weighted avg       0.80      0.77      0.78      2113



In [6]:
# Save trained model.
# joblib's dump() does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout would fail here after
# the expensive grid search has already run.
trained_models_dir = os.path.join('..', 'trained_models')
os.makedirs(trained_models_dir, exist_ok=True)
dump(best_model, os.path.join(trained_models_dir, 'random_forest.joblib'))

['..\\trained_models\\random_forest.joblib']

## With Reduced Dimensionality

In [7]:
# Same search space as for the full feature set: 4 depths x 2 criteria x
# 3 forest sizes x 4 split thresholds = 96 candidate configurations.
param_grid = dict(
    max_depth=[5, 7, 9, 12],            # maximum depth of each tree
    criterion=['gini', 'entropy'],      # function measuring the quality of a split
    n_estimators=[75, 100, 150],        # number of trees in the forest
    min_samples_split=[4, 5, 6, 8],     # minimum samples required to split an internal node
)

# Fresh base estimator with a fixed seed
rfc = RandomForestClassifier(random_state=42)

# Exhaustive search with 5 cross-validation folds, ranked by recall (switch
# `scoring` to 'roc_auc' or 'f1' if needed), this time fitted on the
# PCA-reduced training features. Uses all CPU cores and prints progress.
# Note: this rebinds `rfc`, `param_grid` and `grid_search` from the earlier search.
grid_search = GridSearchCV(rfc, param_grid, scoring='recall', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train_reduced, y_train)

# Report the winning configuration and its score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Keep the best estimator; the double-underscore name is retained because the
# prediction cell below refers to it.
best_model__reduced = grid_search.best_estimator_

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best parameters: {'criterion': 'entropy', 'max_depth': 12, 'min_samples_split': 4, 'n_estimators': 75}
Best score: 0.8577961516479329


In [8]:
# Predict test-set labels with the model tuned on the PCA-reduced features.
# The test matrix must be the PCA-reduced one so it matches what the model was fitted on.
# Note: this overwrites the `y_pred` produced earlier by the full-feature model.
y_pred = best_model__reduced.predict(X_test_reduced)

In [9]:
# Per-class precision / recall / F1 on the test set for the model trained on X_train_reduced
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.77      0.82      1552
           1       0.52      0.71      0.60       561

    accuracy                           0.75      2113
   macro avg       0.70      0.74      0.71      2113
weighted avg       0.79      0.75      0.76      2113

