### Add the scripts to the notebooks path

In [138]:
import os, sys

# Resolve the project root as the parent of the kernel's working directory.
# NOTE: this only points at the right place when the kernel is started from
# the notebooks/ folder.
current_dir = os.getcwd()
print(current_dir)

# Get the parent directory and the scripts folder that lives inside it
parent_dir = os.path.dirname(current_dir)
scripts_path = os.path.join(parent_dir, 'scripts')

# Put the project root and its scripts folder at the front of the import path
# (scripts_path ends up first, parent_dir second). Entries that are already
# present are skipped so re-running this cell does not keep growing sys.path
# with duplicates.
for import_root in (parent_dir, scripts_path):
    if import_root not in sys.path:
        sys.path.insert(0, import_root)

# Also register the absolute form of '..'. This normally resolves to
# parent_dir, in which case the guard makes it a no-op.
relative_parent = os.path.abspath(os.path.join('..'))
if relative_parent not in sys.path:
    sys.path.append(relative_parent)

d:\KifiyaAIM-Course\Week - 6\Bati-Bank-Credit-Scoring\notebooks


### Import Statements

In [139]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [140]:
# Apply seaborn's default theme to the plots drawn after this cell.
sns.set_theme()

### Load the Data

In [141]:
# Relative path: resolved against the kernel's working directory, so the
# notebook has to be run from the notebooks/ folder.
PATH = "../data/data.csv"

# Load the CSV into a DataFrame. `data` is rebound by several later cells, so
# the raw frame is not kept around under its own name.
data = pd.read_csv(PATH)

### Obtain RFMS scores and use the default estimator to add the target variable

In [142]:
from scripts.credit_risk_modeler import CreditScoreEngine

# Build the project's scoring engine on the loaded DataFrame; the cells below
# use it to compute, score and label the RFMS data.
credit_engine = CreditScoreEngine(transaction_data=data)

- obtain, score and label RFMS data

In [143]:
# Three steps on the engine: compute the RFMS values, score them, then label
# each score.
# NOTE(review): `calcualte_rfms` (sic) is spelled the way the engine is called
# here — presumably a typo in scripts/credit_risk_modeler.py; it must be
# renamed there first before it can be corrected in this cell.
rfms_data = credit_engine.calcualte_rfms()
scored_rfms = credit_engine.score_rfms(rfms_data=rfms_data)
# The call returns the labeled data together with a decision boundary; the
# boundary is not used again in the cells that follow.
labeled_rfms, decision_boundary = credit_engine.label_rfms_score(data=scored_rfms, score_column='RFMS_Score')

In [144]:
# join the labeled rfms with the original data
# A left join keeps every row of `data`. With `on='CustomerId'`, pandas matches
# that column of `data` against the index of `labeled_rfms` — this assumes
# labeled_rfms is indexed by CustomerId (TODO confirm).
data = data.join(other=labeled_rfms, on='CustomerId', how='left')

### Feature Engineering

In [145]:
from scripts.feature_engineering import FeatureEngineering

1) Break down the `TransactionStartTime` into Hour, Day, Month and Year features

In [146]:
# Called on the class itself; no FeatureEngineering instance is created.
# `data` is rebound to the result, so re-running this cell on its own feeds the
# already-transformed frame back into the helper.
data = FeatureEngineering.extract_date_features(data=data)

2. Handle missing values

In [147]:
# Rebinds `data` to whatever the helper returns. How gaps are filled or dropped
# is decided inside scripts/feature_engineering.py, not in this notebook.
data = FeatureEngineering.handle_missing_data(data=data)

3. Aggregate data per customer

In [148]:
# From here on `data` is the helper's aggregated output rather than the joined
# input rows — presumably one row per customer, per the step title (TODO
# confirm against scripts/feature_engineering.py).
data = FeatureEngineering.aggregate_customer_data(data=data)

4. Normalizing data

In [149]:
# The helper returns a (frame, scaler) pair. The scaler object is kept so it
# can be pickled in a later cell and reused on new data.
data, scaler = FeatureEngineering.normalize_numerical_features(data=data)

5. Encode categorical features

In [150]:
# Binary target encoding: 'Good' -> 1, 'Bad' -> 0.
risk_encoding = {'Good': 1, 'Bad': 0}

# Look each label up in the mapping. Any value outside the mapping (a missing
# label, or the 0/1 values left behind if this cell is run a second time)
# raises KeyError instead of being passed through silently.
data['RiskLabel'] = data['RiskLabel'].map(risk_encoding.__getitem__)

# Encode the remaining categorical columns with the project helper; it returns
# the encoded frame together with the encoders it used.
data, encoders = FeatureEngineering.encode_categorical_data(data=data)

6. Save the encoders and the numerical scalers to use when new data comes

In [151]:
import pickle

# Folder that receives the pickled preprocessing objects, relative to the
# notebook's working directory.
DUMP_PATH = '../model/'

# Create the folder if it is missing so open(..., 'wb') does not fail on a
# fresh checkout.
os.makedirs(DUMP_PATH, exist_ok=True)

# serialize the scaler
with open(os.path.join(DUMP_PATH, 'scaler.pkl'), 'wb') as file:
    pickle.dump(scaler, file)

# serialize the encoders
# (this used to dump `scaler` a second time, so encoder.pkl was only a copy of
# scaler.pkl and the fitted encoders were never saved)
with open(os.path.join(DUMP_PATH, 'encoder.pkl'), 'wb') as file:
    pickle.dump(encoders, file)

### Split data into training and testing sets

In [152]:
# Model inputs and the binary target column.
# NOTE(review): RiskLabel was derived from 'RFMS_Score' (see the earlier
# label_rfms_score(..., score_column='RFMS_Score') call), so keeping RFMS_Score
# as a feature probably leaks the target and may explain near-perfect scores —
# confirm before trusting the metrics.
features = ['RFMS_Score', 'RecencyScore', 'PricingStrategy', 'ProductCategory']
target = 'RiskLabel'

X, y = data[features], data[target]

- Create stratified training and test sets.

Stratifying on the target keeps the share of each risk label the same in the training and test sets; 30% of the rows are held out for testing.

In [153]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. `stratify=y` keeps the class
# proportions of the target the same in both splits, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=69)

- Fix data imbalance.

The reason is that when I trained the models without this step, every model gave an unusual accuracy score of 100%, so I added it.

In [154]:
from imblearn.over_sampling import SMOTE

# Seeded so the oversampling produces the same resampled set on every run.
smote = SMOTE(random_state=69)

# Fit and resample the training data
# Only the training split is oversampled; X_test / y_test are left untouched,
# so the evaluation below runs on the original held-out rows.
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

### Train and select models

In [155]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

1) Define the models in a dict

In [156]:
# Candidate classifiers, keyed by the display name that the parameter grids
# and the metrics table also use.
# Every estimator gets the same seed as the train/test split and SMOTE so that
# Restart & Run All reproduces the same fitted models; without it the
# tree-based estimators are randomised differently on each run.
models = {
    'Logistic Regression': LogisticRegression(random_state=69),
    'Random Forest': RandomForestClassifier(random_state=69),
    'Decision Tree': DecisionTreeClassifier(random_state=69),
    'Gradient Boosting': GradientBoostingClassifier(random_state=69)
}

2. Define hyperparameter search spaces for each model

In [157]:
# Hyperparameter values to try for each model. The keys match the names used
# in `models`, which is how the grid-search cell pairs a model with its grid.
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
    },
    'Decision Tree': {
        'max_depth': [3, 5, 7, None],
    },
    'Gradient Boosting': {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
    },
}

3) Use the models and the parameter grids to define a grid search

In [158]:
from sklearn.model_selection import GridSearchCV

# One grid search per candidate model: 5-fold cross-validation, with parameter
# combinations ranked by accuracy. Each model is paired with the grid stored
# under the same name.
grid_searches = {
    model_name: GridSearchCV(
        models[model_name],
        param_grids[model_name],
        scoring='accuracy',
        cv=5,
    )
    for model_name in models
}

4. Train the models

In [159]:
# Fit every grid search on the SMOTE-resampled training data and print the
# winning parameter combination for each model.
# NOTE(review): the cross-validation folds are cut from data that was already
# oversampled, so the CV scores used to pick parameters may be optimistic.
# NOTE: after the loop `name` and `grid_search` stay bound to the last entry
# ('Gradient Boosting'); they do not identify the best-performing model.
for name, grid_search in grid_searches.items():
    grid_search.fit(X_resampled, y_resampled)
    print(f"{name} model best parameters: {grid_search.best_params_}")

Logistic Regression model best parameters: {'C': 100}
Random Forest model best parameters: {'max_depth': None, 'n_estimators': 50}
Decision Tree model best parameters: {'max_depth': 3}
Gradient Boosting model best parameters: {'learning_rate': 0.01, 'n_estimators': 50}


5. Evaluate the models 

In [160]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score

In [161]:
# Score every fitted grid search on the held-out test set.
# `performance_metrics` starts as {model name: {metric: value}} and is turned
# into a DataFrame (one row per model) at the end; `y_probs` keeps the
# positive-class probabilities per model for ROC curves.
performance_metrics = {}
y_probs = {}

for name, model in grid_searches.items():
    # Hard class predictions and the probability of the positive class (column 1)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    y_probs[name] = y_prob

    performance_metrics[name] = {
        # Metrics computed from the hard predictions
        **{
            label: scorer(y_test, y_pred)
            for label, scorer in (
                ('Accuracy', accuracy_score),
                ('Precision', precision_score),
                ('Recall', recall_score),
                ('F1 Score', f1_score),
            )
        },
        # ROC AUC is the only metric computed from the probabilities
        'ROC AUC': roc_auc_score(y_test, y_prob),
    }

# Rows = models, columns = metrics
performance_metrics = pd.DataFrame(performance_metrics).transpose()

#### Obtain the best model

In [162]:
# Pick the winner from the test-set metrics instead of reusing the leftover
# `grid_search` loop variable, which always pointed at the last model trained
# (Gradient Boosting) regardless of how it performed.
# ROC AUC is used because it does not depend on a classification threshold;
# change SELECTION_METRIC to any other column of `performance_metrics`
# ('Accuracy', 'Precision', 'Recall', 'F1 Score') to rank differently.
SELECTION_METRIC = 'ROC AUC'
best_model_name = performance_metrics[SELECTION_METRIC].idxmax()
best_model = grid_searches[best_model_name].best_estimator_
print(f"Best model by {SELECTION_METRIC}: {best_model_name}")

In [163]:
# Bare expression: the notebook renders the selected estimator as the cell output.
best_model

In [164]:
# Destination of the pickled estimator, relative to the notebook's working
# directory.
MODEL_PATH = "../model/model.pkl"

# Persist the selected estimator in binary mode so it can be loaded for scoring
with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(best_model, model_file)