### Add the scripts directory to the notebook's import path

In [1]:
import os, sys

# Resolve the project layout relative to the notebook's working directory.
current_dir = os.getcwd()
print(current_dir)

# The project root is the parent of the working directory, and the helper
# modules live in its `scripts` sub-directory.
parent_dir = os.path.dirname(current_dir)
scripts_path = os.path.join(parent_dir, 'scripts')

# Put the project root and the scripts directory at the front of the import
# search path. Each entry is added only when it is missing, so re-running this
# cell does not pile up duplicate entries. `scripts_path` is handled last so
# that it ends up ahead of `parent_dir`, as before.
for path in (parent_dir, scripts_path):
    if path not in sys.path:
        sys.path.insert(0, path)

d:\KifiyaAIM-Course\Week - 6\Bati-Bank-Credit-Scoring\notebooks


### Import Statements

In [2]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Apply seaborn's default theme to the plots drawn after this cell.
sns.set_theme()

### Load the Data

In [4]:
# Location of the input CSV, relative to the current working directory.
PATH = "../data/data.csv"

# Read the CSV into a DataFrame; later cells keep re-binding the name `data`.
data = pd.read_csv(PATH)

### Obtain RFMS scores and use the default estimator to add the target variable

In [5]:
from scripts.credit_risk_modeler import CreditScoreEngine

# Instantiate the project's CreditScoreEngine with the loaded transaction data.
credit_engine = CreditScoreEngine(transaction_data=data)

- Obtain, score, and label the RFMS data

In [6]:
# Step 1: compute the RFMS data from the engine's transactions.
# NOTE(review): `calcualte_rfms` looks like a misspelling of "calculate";
# presumably it matches the method name in scripts/credit_risk_modeler.py —
# confirm there before renaming anything.
rfms_data = credit_engine.calcualte_rfms()
# Step 2: score the RFMS data.
scored_rfms = credit_engine.score_rfms(rfms_data=rfms_data)
# Step 3: label the scored data using its 'RFMS_Score' column. The call returns
# the labeled frame together with a second value stored as `decision_boundary`.
labeled_rfms, decision_boundary = credit_engine.label_rfms_score(data=scored_rfms, score_column='RFMS_Score')

In [7]:
# Join the labeled RFMS data onto the original transactions.
# `on='CustomerId'` matches data['CustomerId'] against the index of
# `labeled_rfms`; `how='left'` keeps every row of the original data.
data = data.join(other=labeled_rfms, on='CustomerId', how='left')

### Feature Engineering

In [8]:
from scripts.feature_engineering import FeatureEngineering

1. Break down the `TransactionStartTime` column into Hour, Day, Month and Year features

In [9]:
# Re-bind `data` to the output of the project's date-feature helper
# (implemented in scripts/feature_engineering.py).
data = FeatureEngineering.extract_date_features(data=data)

2. Handle missing values

In [10]:
# Re-bind `data` to the output of the project's missing-data helper; the
# strategy it applies is defined in scripts/feature_engineering.py.
data = FeatureEngineering.handle_missing_data(data=data)

3. Aggregate data per customer

In [11]:
# Re-bind `data` to the output of the project's customer-aggregation helper.
# NOTE(review): presumably one row per customer afterwards — confirm in
# scripts/feature_engineering.py.
data = FeatureEngineering.aggregate_customer_data(data=data)

4. Normalize the numerical features

In [12]:
# The helper returns the transformed data together with a `scaler` object;
# `scaler` is pickled in a later cell so it can be reused on new data.
data, scaler = FeatureEngineering.normalize_numerical_features(data=data)

5. Encode categorical features

In [13]:
# Binary encoding of the target label: 'Good' -> 1, 'Bad' -> 0.
risk_encoding = {'Good': 1, 'Bad': 0}
# Direct dict indexing raises KeyError for any value other than 'Good'/'Bad'
# (missing values included), so unexpected labels fail loudly here.
# NOTE(review): the column is overwritten, so re-running this line on the
# already-encoded 0/1 values would also raise KeyError.
data['RiskLabel'] = data['RiskLabel'].map(lambda x: risk_encoding[x])
# Encode categorical data with the project helper; it returns the transformed
# data plus `encoders` (structure defined in scripts/feature_engineering.py).
data, encoders = FeatureEngineering.encode_categorical_data(data=data)

6. Save the encoders and the numerical scaler so they can be reused when new data arrives

In [20]:
import pickle

# Directory (relative to the working directory) that receives the pickled
# preprocessing artifacts.
DUMP_PATH = '../data/'

# Serialize the fitted scaler returned by the normalization step.
with open(os.path.join(DUMP_PATH, 'scaler.pkl'), 'wb') as file:
    pickle.dump(scaler, file)

# Serialize the encoders returned by the categorical-encoding step.
# This used to dump `scaler` a second time, so encoder.pkl never actually
# contained the encoders.
with open(os.path.join(DUMP_PATH, 'encoder.pkl'), 'wb') as file:
    pickle.dump(encoders, file)

### Split data into training and testing sets

In [15]:
# Name of the label column and the list of predictor columns used for modeling.
target = 'RiskLabel'
features = [
    'RFMS_Score',
    'RecencyScore',
    'PricingStrategy',
    'ProductCategory',
]

# X holds the selected predictor columns, y the label column.
X, y = data[features], data[target]

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of the label the same in both splits, and `random_state=42`
# makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

### Train and select models

In [21]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

1. Define the models in a dict

In [27]:
# Candidate classifiers, keyed by the display name that is also used to look up
# each model's hyperparameter grid.
# The tree-based estimators draw random numbers while fitting, so they are
# seeded (with the same value as the train/test split) to make the grid-search
# results repeatable across runs.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

2. Define hyperparameter search spaces for each model

In [37]:
# Hyperparameter candidates for each model; the keys mirror those of `models`.
param_grids = {
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
    },
    'Decision Tree': {
        'max_depth': [3, 5, 7, None],
    },
    'Gradient Boosting': {
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
    },
}

3. Use the models and the parameter grids to define a grid search for each model

In [38]:
from sklearn.model_selection import GridSearchCV

# One 5-fold, accuracy-scored grid search per model, keyed by model name.
# Each search pairs the estimator with the grid stored under the same key.
grid_searches = {
    label: GridSearchCV(
        estimator=models[label],
        param_grid=param_grids[label],
        cv=5,
        scoring='accuracy',
    )
    for label in models
}

4. Train the models

In [39]:
# Fit every grid search on the training split and report the best
# hyperparameter combination found for each model.
for name, grid_search in grid_searches.items():
    # Runs the cross-validated search over that model's parameter grid.
    grid_search.fit(X_train, y_train)
    print(f"{name} model best parameters: {grid_search.best_params_}")

Logistic Regression model best parameters: {'C': 10}
Random Forest model best parameters: {'max_depth': None, 'n_estimators': 50}
Decision Tree model best parameters: {'max_depth': 3}
Gradient Boosting model best parameters: {'learning_rate': 0.01, 'n_estimators': 50}
