In [2]:
import matplotlib.pyplot as plt
from typing import Literal
import pandas as pd
import numpy as np
import gc


#Data pre processing 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#Scoring
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

#Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

#Hyperparam opti
import optuna

#Mlflow
import mlflow
import mlflow.sklearn

# Explainability
import shap

import warnings
warnings.simplefilter(action='ignore', category=AttributeError)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def clear_global_variable(variable_name):
    """
    Delete the named module-level variables, if they exist, then run the garbage collector.

    Parameters
    ----------
    variable_name : str or iterable of str
        A single name, or a collection of names, to remove from ``globals()``.
        Names that are not currently defined are skipped silently.
    """
    # A bare string is itself iterable: without this guard 'current_version'
    # would be processed as the one-letter names 'c', 'u', 'r', ...
    if isinstance(variable_name, str):
        names = [variable_name]
    else:
        names = list(variable_name)

    module_globals = globals()
    for name in names:
        # pop() with a default is a no-op for names that do not exist
        module_globals.pop(name, None)

    # Reclaim the memory held by the objects that were just unbound
    gc.collect()

In [None]:
def _strip_special_characters(columns):
    """Return ``columns`` with every character that is neither a word character nor whitespace removed."""
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would turn this clean-up into a silent no-op.
    return columns.str.replace(r'[^\w\s]', '', regex=True)


def _fill_na_and_replace_inf(frame, impute_mean):
    """
    Return a cleaned copy of ``frame``; the input frame is left untouched.

    When ``impute_mean`` is true, NaN cells are first filled with the mean of
    their column. Then, column by column, +inf is replaced with the largest
    remaining value and -inf with the smallest remaining value.
    """
    # NOTE(review): the mean of a column that contains inf is not finite, so the
    # NaNs of such a column are filled with inf and then clipped like the rest.
    # The fill-then-clip order is kept so dataset versions stay comparable with
    # earlier runs - confirm this is intended.
    if impute_mean:
        cleaned = frame.fillna(frame.mean(numeric_only=True))
    else:
        cleaned = frame.copy()

    for col in cleaned.columns:
        values = cleaned[col]
        # Assign whole columns back instead of chained in-place replace, which
        # does not reliably modify the parent frame under copy-on-write.
        if (values == np.inf).any():
            values = values.replace(np.inf, values[values != np.inf].max())
            cleaned[col] = values
        if (values == -np.inf).any():
            values = values.replace(-np.inf, values[values != -np.inf].min())
            cleaned[col] = values
    return cleaned


def change_dataset_version(dataset_version : Literal['1.0','1.1','2.0','2.1','3.0','4.0','4.1']):
    """
    Load ``./train_data.feather`` and build the requested variant of the dataset.

    1.0 : Full dataset from our data kernel with Na
    1.1 : Full dataset from our data kernel `without` Na (imputed by mean)
    2.0 : sampled data, 10% of the original data
    2.1 : sampled data, 10% of the original data `without` Na (imputed by mean)
    3.0 : scaled data (Na kept, infinite values replaced so the scaler accepts them)
    4.0 : Na imputed by mean, scaled and PCA (95% of explained variance)
    4.1 : stratified 10% sample, Na imputed by mean, scaled and PCA

    Infinite values are replaced by the column max / min in every version
    except 1.0 and 2.0.

    Every version returns ``X`` as a DataFrame that shares its index with ``y``.

    # How to use:
    `X, y, current_version = change_dataset_version('1.0')`

    Returns
    -------
    tuple
        ``(X, y, dataset_version)``.

    Raises
    ------
    ValueError
        If ``dataset_version`` is not one of the versions listed above.
    """
    supported_versions = ('1.0', '1.1', '2.0', '2.1', '3.0', '4.0', '4.1')
    if dataset_version not in supported_versions:
        # Fail before wiping the globals so a typo does not destroy the loaded data
        raise ValueError(
            "Unknown dataset_version {!r}; expected one of {}".format(dataset_version, supported_versions)
        )

    # Drop the previously loaded dataset first so two versions never coexist in RAM
    reset_global_var = ['X','y','current_version']
    clear_global_variable(reset_global_var)

    not_usable_col = ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']
    train_data = pd.read_feather("./train_data.feather")
    train_data.drop(columns='level_0', inplace=True)

    # 2.x : work on a 10% random sample
    if dataset_version in ('2.0', '2.1'):
        train_data = train_data.sample(frac=0.10, random_state=42)

    # x.1 of the raw versions : impute Na and infinite values on the whole frame
    if dataset_version in ('1.1', '2.1'):
        train_data = _fill_na_and_replace_inf(train_data, impute_mean=True)

    #y is just the target column
    y = train_data['TARGET'].copy()

    # Get the columns to be included in X
    columns_X = [col for col in train_data.columns if col not in not_usable_col]
    X = train_data[columns_X]

    #Clean unused data to free up ram
    del train_data
    gc.collect()

    if dataset_version == '4.1':
        # Stratified 10% sample: the class ratio of y is preserved
        X, _, y, _ = train_test_split(X, y, stratify=y, test_size=0.9, random_state=42)

    # NOTE(review): in 3.0 / 4.x the scaler and the PCA are fit on every row
    # returned here, i.e. before any cross-validation split is made downstream.
    if dataset_version in ('1.0', '1.1', '2.0', '2.1'):
        #Remove characters that model can't read
        X.columns = _strip_special_characters(X.columns)

    elif dataset_version == '3.0':
        # StandardScaler tolerates NaN but rejects infinite values
        X = _fill_na_and_replace_inf(X, impute_mean=False)
        scaled = StandardScaler().fit_transform(X)
        # Back to a DataFrame: the training code indexes X with .iloc
        X = pd.DataFrame(scaled, index=X.index, columns=_strip_special_characters(X.columns))

    else:
        # 4.0 / 4.1 : PCA accepts neither NaN nor infinite values
        X = _fill_na_and_replace_inf(X, impute_mean=True)
        row_index = X.index

        # Scaling the data.
        scaled = StandardScaler().fit_transform(X)

        # Apply PCA, keeping enough components to explain 95% of the variance.
        components = PCA(n_components=0.95).fit_transform(scaled)

        # Convert back to DataFrame, keeping the row index aligned with y.
        component_names = ["PC" + str(i) for i in range(1, components.shape[1] + 1)]
        X = pd.DataFrame(components, index=row_index, columns=component_names)

    print(X.shape)
    print(y.shape)
    ratio = y.sum()/len(y)*100
    print("Target 1-0 ratio: {:.2f}%".format(ratio))
    return X, y, dataset_version

In [None]:
def save_5_rows(X, y):
    """
    Write the first five rows of ``X`` and of ``y`` to CSV files under ``./test_df``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its head is written to ``./test_df/X_head``.
    y : pandas.Series or pandas.DataFrame
        Target values; their head is written to ``./test_df/y_head``.

    Returns
    -------
    None
    """
    # Local import: os is not among the notebook-level imports
    import os

    # to_csv does not create missing parent directories
    os.makedirs('./test_df', exist_ok=True)

    X.head().to_csv('./test_df/X_head', index = False)
    y.head().to_csv('./test_df/y_head', index = False)

    return

**Let's create functions that will become our baseline for model iteration**

Once the data is loaded, we pass it through these functions, which will:
- Find the best hyperparameters for the given data and model
- Save all the information about the model run, so we can compare it with other results and pick the best model at the end

In [None]:
def _build_random_forest(params):
    """Random forest configured from ``params['n_estimators']`` and ``params['max_depth']``."""
    return RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        random_state=42,
    )


def _build_lightgbm(params):
    """LightGBM classifier configured from ``num_leaves``, ``max_depth`` and ``n_estimators``."""
    return lgb.LGBMClassifier(
        num_leaves=params['num_leaves'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        random_state=42,
    )


def _build_logistic_regression(params):
    """Logistic regression with regularisation strength ``params['C']`` and at most 1000 iterations."""
    return LogisticRegression(C=params['C'], random_state=42, max_iter=1000)


# Registry: model name -> factory taking a hyperparameter dict and returning
# a fresh, unfitted estimator seeded with random_state=42.
models = {
    'randomForestClassifier': _build_random_forest,
    'lightGBM': _build_lightgbm,
    'logisticRegression': _build_logistic_regression,
    #'SVC': lambda params: SVC(C=params['C'], gamma=params['gamma'], probability=True, random_state=42)
    # Add more models here as needed
}

In [None]:
def train_and_evaluate_model(model_name, params, X, y, use_smote = False):
    """
    Cross-validate one model configuration with a 5-fold stratified split.

    Parameters
    ----------
    model_name : str
        Key of the ``models`` registry.
    params : dict
        Hyperparameters handed to the registry factory.
    X, y : features and binary target
        pandas objects are used as-is; anything without ``.iloc`` (e.g. a numpy
        array) is wrapped in a DataFrame / Series first.
    use_smote : bool
        When true, each training fold is oversampled with SMOTE. Validation
        folds are never resampled.

    Returns
    -------
    tuple
        ``(model, mean_accuracy, all_y_true, all_y_pred_prob, mean_recall,
        mean_f1_score, adjusted_scores, mean_adjusted_score)``.
        ``model`` is the estimator fitted on the LAST fold only (it has not seen
        that fold's validation rows). ``all_y_true`` / ``all_y_pred_prob`` are
        the out-of-fold labels and positive-class probabilities concatenated
        over the folds. ``adjusted_scores`` holds one ``10 * FN + FP`` value per fold.
    """
    # The fold selection below is positional (.iloc), which plain arrays lack
    if not hasattr(X, 'iloc'):
        X = pd.DataFrame(X)
    if not hasattr(y, 'iloc'):
        y = pd.Series(y)

    skf = StratifiedKFold(n_splits=5)

    accuracies = []
    adjusted_scores = []
    recalls = []
    f1_scores = []
    all_y_pred_prob = []
    all_y_true = []  # true labels of every validation fold, in fold order

    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        if use_smote:
            # Seeded like every other component so repeated runs give the same folds
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)

        #Create the model with given param
        model = models[model_name](params)
        model.fit(X_train, y_train)

        # Positive-class probabilities, kept for the ROC AUC computed by the caller
        y_pred_prob = model.predict_proba(X_val)[:,1]
        y_pred = model.predict(X_val)

        accuracies.append(accuracy_score(y_val, y_pred))
        recalls.append(recall_score(y_val, y_pred))
        f1_scores.append(f1_score(y_val, y_pred))

        # Accuracy is not everything: a false negative costs the bank far more
        # than a false positive, hence the 10x weight on FN.
        tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
        adjusted_scores.append(10 * fn + fp)

        all_y_pred_prob.extend(y_pred_prob)
        all_y_true.extend(y_val)

    mean_accuracy = np.mean(accuracies)
    mean_adjusted_score = np.mean(adjusted_scores)
    mean_recall = np.mean(recalls)
    mean_f1_score = np.mean(f1_scores)

    return model, mean_accuracy, all_y_true, all_y_pred_prob, mean_recall, mean_f1_score, adjusted_scores, mean_adjusted_score

In [None]:
def objective(model_name, params, X, y, use_smote : bool):
    """
    Quantity Optuna minimises during the hyperparameter search.

    Runs the cross-validated training for the given model and parameters and
    returns only the last element of its result tuple: the mean adjusted score.
    """
    *_, mean_adjusted_score = train_and_evaluate_model(model_name, params, X, y, use_smote)
    return mean_adjusted_score


In [None]:
def _suggest_random_forest(trial):
    """Search space for 'randomForestClassifier'."""
    return {'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10)}


def _suggest_lightgbm(trial):
    """Search space for 'lightGBM'."""
    return {'num_leaves': trial.suggest_int('num_leaves', 31, 50),
            'max_depth': trial.suggest_int('max_depth', -1, 50),
            'n_estimators': trial.suggest_int('n_estimators', 100, 200)}


def _suggest_logistic_regression(trial):
    """Search space for 'logisticRegression'."""
    return {'C': trial.suggest_float('C', 0.1, 10)}


def run_optuna_experiment(model_name : Literal['lightGBM','randomForestClassifier','logisticRegression'],
                           X, y, max_trials:int, use_smote = False):
    """
    Search hyperparameters for ``model_name`` with Optuna and return the best set found.

    Each trial draws parameters from the model's search space and is scored by
    ``objective`` (lower is better).

    Parameters
    ----------
    model_name : str
        One of 'lightGBM', 'randomForestClassifier', 'logisticRegression'.
    X, y : features and target passed through to ``objective``.
    max_trials : int
        Number of Optuna trials to run.
    use_smote : bool
        Forwarded to ``objective``.

    Returns
    -------
    dict
        ``study.best_params``.

    Raises
    ------
    ValueError
        If no search space is defined for ``model_name``.
    """
    # Add an entry here when a new model is added to the registry
    search_spaces = {
        'randomForestClassifier': _suggest_random_forest,
        'lightGBM': _suggest_lightgbm,
        'logisticRegression': _suggest_logistic_regression,
    }
    if model_name not in search_spaces:
        # Fail early instead of running trials with an empty parameter dict
        raise ValueError(
            "No hyperparameter search space defined for model {!r}; expected one of {}".format(
                model_name, sorted(search_spaces))
        )
    suggest_params = search_spaces[model_name]

    # 'minimize' for custom "métier" score; the sampler is seeded so that
    # repeated searches explore the same sequence of candidates.
    study = optuna.create_study(direction='minimize',
                                sampler=optuna.samplers.TPESampler(seed=42))

    # Optimize the objective function (number of trials specified by max_trials)
    study.optimize(
        lambda trial: objective(model_name, suggest_params(trial), X, y, use_smote),
        n_trials=max_trials,
    )

    # Retrieve the best hyperparameters from the study
    best_params = study.best_params
    return best_params



In [None]:
def run_best_model(model_name : Literal['lightGBM','randomForestClassifier','logisticRegression'],
                    best_params, X, y, dataset_version:str, use_smote = False, shap = False):
    """
    Cross-validate the model with ``best_params`` and record the run in MLflow.

    Logged to the MLflow run: the dataset version, the SMOTE flag, the model
    type, the hyperparameters, the mean accuracy / recall / F1 / adjusted score,
    the per-fold adjusted scores (metric 'adjusted_scores_CV', one step per
    fold), the out-of-fold ROC AUC, the fitted model, the ROC curve image and,
    when ``shap`` is true, a SHAP summary plot. Images are written to ``./Exports``.

    Parameters
    ----------
    model_name : str
        Key of the ``models`` registry.
    best_params : dict
        Hyperparameters to train with.
    X, y : features and target.
    dataset_version : str
        Label logged as the 'dataset_version' parameter.
    use_smote : bool
        Forwarded to ``train_and_evaluate_model``.
    shap : bool
        Whether to compute and log the SHAP summary plot.

    Returns
    -------
    None
    """
    # Local import: os is not among the notebook-level imports
    import os

    # savefig does not create missing directories
    os.makedirs('./Exports', exist_ok=True)

    # Start an MLflow run to track the training process
    with mlflow.start_run():
        # Log the dataset version
        mlflow.log_param('dataset_version', dataset_version)
        mlflow.log_param('SMOTE', use_smote)
        mlflow.log_param('model type', model_name)

        # Log the hyperparameters
        mlflow.log_params(best_params)

        # Train and evaluate the model using the best hyperparameters
        (model, accuracy, all_y_true, y_pred_prob, recall, mean_f1,
         adjusted_scores, adjusted_score) = train_and_evaluate_model(model_name, best_params, X, y, use_smote)

        # Log the metrics
        mlflow.log_metric('accuracy', accuracy)
        mlflow.log_metric('recall', recall)
        mlflow.log_metric('f1_score', mean_f1)
        mlflow.log_metric('adjusted_score', adjusted_score)
        # log_metrics() expects a dict, so the per-fold values are logged one
        # by one under the same key, using the fold number as the step.
        for fold_number, fold_score in enumerate(adjusted_scores):
            mlflow.log_metric('adjusted_scores_CV', float(fold_score), step=fold_number)

        # Save the model as an artifact
        mlflow.sklearn.log_model(model, 'model')

        #region ROC AUC
        # Calculate ROC AUC on the out-of-fold predictions
        roc_auc = roc_auc_score(all_y_true, y_pred_prob)
        mlflow.log_metric('roc_auc', roc_auc)

        # Plot ROC curve
        fpr, tpr, thresholds = roc_curve(all_y_true, y_pred_prob)
        fig, ax = plt.subplots()
        ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax.legend(loc='lower right')

        # Save the plot as an image file
        roc_curve_img_path = './Exports/roc_curve.png'
        fig.savefig(roc_curve_img_path)

        # Log the ROC curve image as an artifact
        mlflow.log_artifact(roc_curve_img_path)

        # Close the plot to free up memory
        plt.close(fig)
        # endregion

        #region Shap
        if shap:
            # The boolean parameter `shap` hides the module of the same name
            # inside this function, so the library is imported under an alias.
            import shap as shap_lib

            # Calculate SHAP values
            explainer = shap_lib.Explainer(model, X)
            shap_values = explainer(X, check_additivity=False)

            # Create a summary plot and save it
            shap_lib.summary_plot(shap_values, X, show=False)
            shap_summary_img_path = './Exports/shap_summary.png'
            plt.savefig(shap_summary_img_path)

            # Log the SHAP summary plot as an artifact
            mlflow.log_artifact(shap_summary_img_path)

            # Close the plot to free up memory
            plt.close()
        #endregion

In [None]:
# Track runs in a SQLite database file. With three slashes the path is relative
# to the working directory (app/mlflow.db); an absolute path would need four.
# NOTE(review): presumably the `app` folder must already exist - confirm.
mlflow.set_tracking_uri('sqlite:///app/mlflow.db')
# Every run started below is recorded under this experiment name.
mlflow.set_experiment('credits_models')


In [None]:
# Build dataset version '4.1'; current_version is passed to run_best_model below
# so the run records which data it was trained on.
X, y, current_version = change_dataset_version('4.1')

In [None]:
#save_5_rows(X,y)

In [None]:
# Hyperparameter search for LightGBM: 5 Optuna trials, SMOTE enabled on the training folds.
best_optuna_params = run_optuna_experiment('lightGBM',X,y,5,use_smote=True)

In [None]:
# Cross-validate LightGBM with the parameters found above and log the run to MLflow.
# NOTE(review): the search above ran with use_smote=True while this run uses
# use_smote=False, so the parameters were tuned under a different training
# setup than the one evaluated and logged here - confirm this is intended.
run_best_model('lightGBM',best_optuna_params,X,y,current_version,use_smote=False,shap=False)