Importing Libraries

In [13]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio 
import sklearn 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score, confusion_matrix  
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder  
from bayes_opt import BayesianOptimization
import optuna
pio.templates.default = "plotly_white"  
import joblib

Loading and Viewing The Data

In [3]:
credit_card_data = pd.read_csv("train.csv") 
print(credit_card_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  int64  
 1   Customer_ID               100000 non-null  int64  
 2   Month                     100000 non-null  int64  
 3   Name                      100000 non-null  object 
 4   Age                       100000 non-null  float64
 5   SSN                       100000 non-null  float64
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     100000 non-null  float64
 9   Num_Bank_Accounts         100000 non-null  float64
 10  Num_Credit_Card           100000 non-null  float64
 11  Interest_Rate             100000 non-null  float64
 12  Num_of_Loan               100000 non-null  float64
 13  Type_of_Loan              100000 non-null  ob

In [4]:
# Checking if dataset has any null values or not 
print(credit_card_data.isnull().sum()) # No null values for any columns ; no need for imputation

ID                          0
Customer_ID                 0
Month                       0
Name                        0
Age                         0
SSN                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Type_of_Loan                0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
dtype: int64


Data Visualization

In [None]:
fig = px.box(credit_card_data, x = "Occupation", color = "Credit_Score", title = "Credit Scores Based on Occupation") 
fig.show() # No difference in credit scores among different occupations 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Annual_Income", color = "Credit_Score", title = "Credit Scores Based on Annual Income")
fig.show() # Average annual income increases as credit score increases 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Age", color = "Credit_Score", title = "Credit Scores Based on Annual Income")
fig.show() # Older people tend to have higher average credit scores 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Monthly_Inhand_Salary", color = "Credit_Score", title = "Credit Scores Based on Monthly Inhand Salary")
fig.show() # The more monthly inhand salary you receive, the higher your credit score will most likely be 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Num_Bank_Accounts", color = "Credit_Score", title = "Credit Scores Based on Number of Bank Accounts")
fig.show() # Having a large amount of bank accounts negatively impacts your credit score; 2-3 bank accounts are an optimal number 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Num_Credit_Card", color = "Credit_Score", title = "Credit Scores Based on Number of Credit Cards")
fig.show() # Having a large amount of credit card accounts is bad for your credit score 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Interest_Rate", color = "Credit_Score", title = "Credit Scores Based on Interest Rate")
fig.show() # Account holders with good credit scores are offered lower interest rates on average

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Num_of_Loan", color = "Credit_Score", title = "Credit Scores Based on Number of Loans")
fig.show() # People with good credit scores tend to take a lower amount of loans 

In [None]:
fig = px.box(credit_card_data, x = "Type_of_Loan", color = "Credit_Score", title = "Credit Scores Based on Type of Loan Taken")
fig.show() # No real correlation to credit score & the type of loan taken 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Delay_from_due_date", color = "Credit_Score", title = "Credit Scores Based on Avg Number of Days Delayed for Credit Card Payments")
fig.show() # People with lower number of days delayed for credit card have better credit scores 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Num_of_Delayed_Payment", color = "Credit_Score", title = "Credit Scores Based on Number of Delayed Card Payments")
fig.show() # People with lower number of delayed payments have higher credit scores(4-12 late payments are fine)

In [16]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Changed_Credit_Limit", color = "Credit_Score", title = "Credit Scores Based on % Change in Credit Card Limit")
fig.show() # People with good credit scores have lower percent change in credit card limit(lower correlation compared to other indicators) 

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Num_Credit_Inquiries", color = "Credit_Score", title = "Credit Scores Based on Number of Credit Card Inquiries")
fig.show() # People with good credit scores have lower number of credit card inquiries(lots of outliers for good credit scores)

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Outstanding_Debt", color = "Credit_Score", title = "Credit Scores Based on Outstanding Debt")
fig.show() # People with lower outstanding debt have higher credit scores(lots of outliers)

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Credit_Utilization_Ratio", color = "Credit_Score", title = "Credit Scores Based on Credit Utilization Ratio")
fig.show() # No effect on credit scores

In [None]:
fig = px.box(credit_card_data, x = "Credit_Score", y = "Total_EMI_per_month", color = "Credit_Score", title = "Credit Scores Based on Total EMI Per Month")
fig.show() # Low impact on credit score 

In [None]:
fig = px.box(credit_card_data, x="Credit_Score", y="Amount_invested_monthly", color="Credit_Score", title="Credit Scores Based on Amount Invested Monthly") 
fig.show() # On average, people with higher credit scores have invested more money each month(lots of outliers)

In [None]:
fig = px.box(credit_card_data, x="Credit_Score", y="Monthly_Balance", color="Credit_Score", title="Credit Scores Based on Monthly Balance") 
fig.show() # People with higher monthly balances have higher credit scores 


Creating a Training, Validation, & Test Set

In [5]:
train_df, test_df = train_test_split(credit_card_data,test_size=0.33,random_state=42)

print('train_df.shape :', train_df.shape)
print('test_df.shape :', test_df.shape)

train_df.shape : (67000, 28)
test_df.shape : (33000, 28)


In [6]:
input_cols = ["Age", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts", "Num_Credit_Card","Interest_Rate","Num_of_Loan","Delay_from_due_date","Changed_Credit_Limit", "Num_Credit_Inquiries", "Credit_Mix", "Credit_History_Age", "Amount_invested_monthly", "Monthly_Balance"]
target_col = ["Credit_Score"]

In [7]:
train_inputs = train_df[input_cols].copy() 
train_targets = train_df[target_col].copy() 

test_inputs = test_df[input_cols].copy() 
test_targets = test_df[target_col].copy() 


In [8]:
numeric_cols = train_inputs.select_dtypes(include = np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes("object").columns.tolist()  


Scaling & One-Hot Encoding 

In [9]:
scaler = MinMaxScaler().fit(credit_card_data[numeric_cols]) 
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])  


In [10]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(credit_card_data[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

train_inputs[encoded_cols] = encoder.transform(train_inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [12]:
x_train = train_inputs[numeric_cols + encoded_cols]
x_test = test_inputs[numeric_cols + encoded_cols] 
x_train

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_History_Age,Amount_invested_monthly,Monthly_Balance,Credit_Mix_Bad,Credit_Mix_Good,Credit_Mix_Standard
59428,0.642857,0.365062,0.361895,0.727273,0.454545,0.515152,0.222222,0.435484,0.635346,0.000000,0.622829,0.118635,0.531365,0.0,0.0,1.0
34957,0.357143,0.166786,0.175453,0.272727,0.545455,0.575758,0.111111,0.258065,0.526459,0.294118,0.761787,0.161097,0.342439,0.0,0.0,1.0
4264,0.071429,0.044339,0.068944,0.545455,0.272727,0.333333,0.777778,0.435484,0.394166,0.235294,0.362283,0.097594,0.212734,0.0,0.0,1.0
53791,0.309524,0.138575,0.171851,0.727273,0.272727,0.000000,0.111111,0.080645,0.243894,0.294118,0.776675,0.084403,0.378318,0.0,1.0,0.0
82114,0.476190,0.141639,0.143945,0.545455,0.909091,1.000000,0.777778,0.838710,0.551221,0.411765,0.379653,0.057639,0.284626,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.761905,0.433915,0.434073,0.454545,0.454545,0.060606,0.333333,0.193548,0.171303,0.058824,0.682382,0.108354,0.267744,0.0,1.0,0.0
54886,0.142857,0.045757,0.075214,0.545455,0.909091,0.454545,0.222222,0.983871,0.440299,0.705882,0.397022,0.123342,0.255456,0.0,0.0,1.0
76820,0.571429,0.346005,0.366880,0.272727,0.545455,0.575758,0.444444,0.129032,0.124152,0.235294,0.833747,0.176821,0.268026,0.0,0.0,1.0
860,0.976190,0.071669,0.107463,0.090909,0.636364,0.060606,0.333333,0.193548,0.115332,0.117647,0.493797,0.029249,0.212357,0.0,1.0,0.0


Training the Baseline Model

In [None]:
baseline_model = RandomForestClassifier(n_jobs = -1, random_state = 42).fit(x_train, train_targets) 
train_acc = accuracy_score(train_targets,baseline_model.predict(x_train)) 
test_acc = accuracy_score(test_targets, baseline_model.predict(x_test)) 
print(train_acc, test_acc) # Training Accuracy = 0.9999, Test Accuracy = 0.8099

Hyperparameter Tuning Using Optuna,Bayesian Optimization

In [None]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])  # Removed 'auto'

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )
    
    score = cross_val_score(model, x_train, train_targets, n_jobs=-1, cv=10, scoring='accuracy').mean()
    
    return score

# Create the Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)

Final Random Forests

In [15]:
final_rf_model = RandomForestClassifier(n_jobs=-1,random_state=42, n_estimators = 63, max_depth = 28, min_samples_split = 4, min_samples_leaf = 1, max_features = 'log2').fit(x_train,train_targets)
train_acc = accuracy_score(train_targets,final_rf_model.predict(x_train)) 
test_acc = accuracy_score(test_targets, final_rf_model.predict(x_test)) 
print(train_acc ,test_acc) # Lower training,validation, & testing accuracy 

# Saving the model for deployment 
joblib.dump(final_rf_model, 'final_rf_model.joblib')

  return fit_method(estimator, *args, **kwargs)


0.9650746268656717 0.8084242424242424


['final_rf_model.joblib']

In [None]:
def rf_objective(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features): 
    n_estimators = int(n_estimators)
    max_depth = int(max_depth) 
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf) 
    max_features = max_features 

    model = RandomForestClassifier(n_jobs = -1, 
                                   random_state=42, 
                                   n_estimators = n_estimators, 
                                   max_depth = max_depth,
                                   min_samples_split = min_samples_split,
                                   min_samples_leaf = min_samples_leaf,
                                   max_features = max_features).fit(x_train, train_targets)
    
    test_preds = model.predict(x_test)
    test_accuracy = accuracy_score(test_targets,test_preds)

    return test_accuracy 

# Setting bounds for hyperparameters 
pbounds = {
    'n_estimators': (10, 100),
    'max_depth': (5, 50),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20),
    'max_features': (0.1, 1.0) 
} 

# Initializing Bayesian Optimizer 
optimizer = BayesianOptimization(
    f = rf_objective, 
    pbounds = pbounds, 
    random_state = 42, 
    verbose = 2 
)

# Running optimization process 
optimizer.maximize(init_points = 10, n_iter = 30) 

# Retrieving the best parameters 
best_params = optimizer.max["params"]
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['min_samples_split'] = int(best_params['min_samples_split'])
best_params['min_samples_leaf'] = int(best_params['min_samples_leaf']) 


final_rf_model_1 = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    **best_params
).fit(x_train, train_targets)

train_acc = accuracy_score(train_targets, final_rf_model_1.predict(x_train))
test_acc = accuracy_score(test_targets, final_rf_model_1.predict(x_test))
print(train_acc, test_acc)


In [None]:
#Summary(Training Accuracy & Test Accuracy) 
#Baseline: 0.9999701492537313 0.809939393939394
#Optuna: 0.9650746268656717 0.8084242424242424
#Bayesian Optimization: 0.999910447761194 0.809030303030303
#Baseline: The highest test accuracy but significant overfitting.
#Optuna: Good balance between training and test accuracy, indicating better generalization.
#Bayesian Optimization: Similar to Baseline, slight overfitting but not as pronounced as Baseline. 