<a href="https://colab.research.google.com/github/RayNCode/code_collab/blob/main/RandomForest_final_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn
sklearn.set_config(display="diagram")
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from itertools import combinations
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

A note on the setup: to speed things up, I ran the preprocessing in a separate notebook and exported the result to CSV files, which is why there is no imputer step in this notebook. See the Main_1 notebook for the details of the merge.

In [3]:
# Load the pre-processed feature matrix and the target exported as CSV files.
final_df = pd.read_csv('/content/X_Data.csv', low_memory=False)
df = pd.read_csv('/content/target_data.csv')

# Binary target: 1 where the label is 'B', 0 otherwise.
y = np.where(df['target'] == 'B', 1, 0)

# These columns are one-hot encoded further down, so force them to string/object
# dtype whatever dtype read_csv inferred for them.
string_cast_columns = ['N1', 'N2', 'N3', 'Categorie', 'REG']
for col in string_cast_columns:
    # Remember where the values are missing BEFORE casting: astype(str) would
    # silently turn NaN into the literal string "nan", giving these columns a
    # different missing-value marker than the other categorical columns.
    missing = final_df[col].isna()
    final_df[col] = final_df[col].astype(str).astype('object').where(~missing, "None")

# Collect the categorical columns only AFTER the casts above, so that the freshly
# converted columns are part of the selection too.
categorical_columns = final_df.select_dtypes(include=['object']).columns
final_df[categorical_columns] = final_df[categorical_columns].fillna("None")
X = final_df.copy()

In [4]:
# Hold out 20% of the rows as a test set. The split is stratified on y and uses
# a fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [5]:
# Columns that are one-hot encoded. Categories not seen during fit are ignored
# (encoded as all zeros) instead of raising.
onehot_columns = [
    'insee_code', "is_student", "OCCUPATION_42", "ACTIVITY_TYPE", "household", "sex",
    "employer_category", "job_category", "Terms_of_emp", "Eco_sect", "Job_dep",
    "WORK_CONDITION", "work_description", "N3", "N2", "N1", "town_type", "dep",
    "Emp_contract", "Club", "Categorie", 'REG',
]

# Columns that are encoded as a single integer code each.
# NOTE(review): without an explicit `categories=` argument the codes follow the
# sorted order of the labels, which may not match the real ordering of the
# degrees / employee-count bands — confirm whether that matters here.
ordinal_columns = ["Highest_degree", "EMPLOYEE_COUNT"]

preprocessing = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), onehot_columns),
    # Map categories unseen during fit to -1 rather than raising, so that a rare
    # label landing only in a validation fold or the test set cannot make
    # transform/predict fail (same tolerance as the one-hot encoder above).
    (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_columns),
    # Every column not listed above is passed through unchanged.
    remainder='passthrough')

In [11]:
# Search space for the random-forest hyper-parameter search.
# (numpy and RandomizedSearchCV are already imported in the first cell, so they
# are not imported again here.)

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split
max_features = ['log2', 'sqrt']

# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)] + [None]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample of the training rows
bootstrap = [True, False]

# The "classifier__" prefix routes each entry to the 'classifier' step of the pipeline.
param_grid_rf = {
    'classifier__n_estimators': n_estimators,
    'classifier__max_features': max_features,
    'classifier__max_depth': max_depth,
    'classifier__min_samples_split': min_samples_split,
    'classifier__min_samples_leaf': min_samples_leaf,
    'classifier__bootstrap': bootstrap
}


In [12]:
# Full model: the column transformer followed by the random forest.
# The step names matter: the search grid addresses the forest's parameters
# through the "classifier__" prefix.
pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # Fixed seed so that fitting the forest gives the same model on every run,
    # in line with the seeded train/test split and cross-validation.
    ('classifier', RandomForestClassifier(random_state=0))
])

In [15]:
def tune_hyperparameters(pipeline, param_grid, X_train, y_train,
                         n_iter=50, n_splits=5, scoring='accuracy', random_state=0):
    """
    Tune the hyperparameters of a pipeline with a randomized search and
    stratified k-fold cross-validation, and return the best fitted model.

    Parameters
    ----------
    pipeline : estimator
        The estimator (here a Pipeline) whose hyperparameters are tuned.
    param_grid : dict
        Maps parameter names to the list of candidate values to sample from.
    X_train, y_train
        Training features and labels used for the cross-validated search.
    n_iter : int, default=50
        Number of parameter combinations sampled from `param_grid`.
    n_splits : int, default=5
        Number of cross-validation folds.
    scoring : str, default='accuracy'
        Metric used to rank the sampled combinations.
    random_state : int, default=0
        Seed used both for shuffling the folds and for sampling the combinations.

    Returns
    -------
    The `best_estimator_` of the fitted search, i.e. the pipeline configured
    with the best combination found.
    """

    # StratifiedKFold keeps the class distribution the same across all the folds
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Randomized search: only `n_iter` sampled combinations are evaluated,
    # not the full grid
    clf = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_grid,
        cv=cv, scoring=scoring,
        n_iter=n_iter,
        n_jobs=-1, random_state=random_state,
        verbose=2)

    # Run the search on the training data
    clf.fit(X_train, y_train)

    # Report the best combination found
    print("Best hyperparameters:\n", clf.best_params_)

    return clf.best_estimator_


In [None]:
# Run the hyper-parameter search on the training split and keep the best pipeline.
rfb_opt = tune_hyperparameters(
    pipeline=pipeline,
    param_grid=param_grid_rf,
    X_train=X_train,
    y_train=y_train,
)

In [None]:
def metrics_calculator(clf, X_test, y_test, model_name):
    '''
    Summarise a fitted classifier's performance on test data.

    Returns a one-column DataFrame (column named `model_name`) whose rows are
    accuracy, macro precision, macro recall, macro F1 and AUC, each formatted
    as a percentage string rounded to two decimals.
    '''
    predicted = clf.predict(X_test)

    # Insertion order of this dict fixes the row order of the resulting table.
    scores = {
        'Accuracy': accuracy_score(y_test, predicted),
        'Macro Precision': precision_score(y_test, predicted, average='macro'),
        'Macro Recall': recall_score(y_test, predicted, average='macro'),
        'Macro F1-score': f1_score(y_test, predicted, average='macro'),
        # AUC is computed from column 1 of predict_proba
        'Macro AUC': roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1], average='macro'),
    }

    summary = pd.DataFrame(data=list(scores.values()),
                           index=list(scores.keys()),
                           columns=[model_name])

    # Express every score as a percentage string, e.g. 0.8123 -> '81.23%'
    percentages = (summary * 100).round(2)
    return percentages.astype(str) + '%'

In [None]:
def model_evaluation(clf, X_train, X_test, y_train, y_test, model_name):
    '''
    Print the classification reports of a fitted classifier on the training and
    test sets, then draw a figure with the test-set confusion matrix on the left
    and a table of summary metrics on the right.
    '''
    # Larger fonts for the plots (seaborn global setting)
    sns.set(font_scale=1.5)

    # Text reports: training set first, then test set
    separator = "-"*55
    report_sections = (
        ("\n\t  Classification report for training set", X_train, y_train),
        ("\n\t   Classification report for test set", X_test, y_test),
    )
    for heading, features, labels in report_sections:
        predictions = clf.predict(features)
        print(heading)
        print(separator)
        print(classification_report(labels, predictions))
    print('\n')

    # One row, two panels: confusion matrix | metrics table
    _figure, (matrix_ax, summary_ax) = plt.subplots(1, 2, dpi=90, figsize=(12, 5))

    # Left panel: confusion matrix on the test set
    ConfusionMatrixDisplay.from_estimator(
        clf, X_test, y_test, colorbar=False, cmap=plt.cm.Purples, ax=matrix_ax)
    matrix_ax.set_title('Confusion Matrix for Test Data')
    matrix_ax.grid(False)

    # Right panel: summary metrics rendered as a table
    summary = metrics_calculator(clf, X_test, y_test, model_name)
    summary_table = summary_ax.table(
        cellText=summary.values,
        colLabels=summary.columns,
        rowLabels=summary.index,
        loc='center')
    summary_table.scale(0.6, 3.6)
    summary_table.set_fontsize(12)
    summary_ax.axis('tight')
    # The axes only serve as a container for the table, so hide them
    summary_ax.axis('off')
    summary_ax.set_title('{} Performance Summary on Test Data'.format(model_name), fontsize=18)

    # Colour the cells of row 0 (the column-label row) purple
    for (row, _col), cell in summary_table.get_celld().items():
        if row == 0:
            cell.set_color('purple')

    plt.tight_layout()
    plt.show()

In [None]:
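# NOTE(review): these look like leftover tuned parameters from an XGBoost
# experiment ('xgbclassifier__' prefix); none of the cells shown here use them.
# best_params = {'xgbclassifier__subsample': 0.9000000000000001,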
#               'xgbclassifier__scale_pos_weight': 0.8300000000000001,
#               'xgbclassifier__reg_lambda': 0.08,
#               'xgbclassifier__reg_alpha': 0.8,
#               'xgbclassifier__n_estimators': 151,
#               'xgbclassifier__min_child_weight': 3.279999999999994,
#               'xgbclassifier__max_depth': 5, 'xgbclassifier__max_delta_step': 1.54,
#               'xgbclassifier__learning_rate': 0.76,
#               'xgbclassifier__gamma': 0.9980000000000002,
#               'xgbclassifier__colsample_bytree': 0.7100000000000001,
#               'xgbclassifier__colsample_bylevel': 0.9400000000000002}

In [None]:
# Full report (classification reports, confusion matrix, summary table) for the tuned model.
model_evaluation(
    clf=rfb_opt,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name='Random Forest',
)