#  Read the CSV and Perform Basic Data Cleaning

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
columns = [
    "ID", "Customer_ID", "Month", "Annual_Income", "Monthly_Inhand_Salary", "Num_Bank_Accounts", "Num_Credit_Card", 
    "Interest_Rate", "Num_of_Loan", "Type_of_Loan", "Delay_from_due_date", "Num_of_Delayed_Payment", "Num_Credit_Inquiries", 
    "Outstanding_Debt", "Credit_Utilization_Ratio", "Credit_History_Age", "Payment_of_Min_Amount", "Total_EMI_per_month", 
    "Amount_invested_monthly", "Payment_Behaviour", "Monthly_Balance", "Credit_Score", "Name", "Age", "Occupation"
]
target = ["Credit_Score"]

In [4]:
# Load the data
file_path = Path('credit_score_new.csv')
df = pd.read_csv(file_path)
df = df.loc[:, columns].copy()
# df = pd.read_csv(file_path, skiprows=1)[:-2]
# df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Good' credit_score
credit_mask = df['Credit_Score'] != 'Good'
df = df.loc[credit_mask]

# Convert the target column values to low_risk and high_risk based on their values
x = {'Standard': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Poor'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head(10)

Unnamed: 0,ID,Customer_ID,Month,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,...,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score,Name,Age,Occupation
0,0x1626,CUS_0xb891,1,30689.89,2612.490833,2,5,4,1.0,Not Specified,...,17.3,No,16.415452,81.228859,Low_spent_Large_value_payments,433.604773,low_risk,Jasond,55.0,Entrepreneur
1,0x1627,CUS_0xb891,2,30689.89,2612.490833,2,5,4,1.0,Not Specified,...,17.4,No,16.415452,124.88182,Low_spent_Small_value_payments,409.951812,low_risk,Jasond,55.0,Entrepreneur
2,0x1628,CUS_0xb891,3,30689.89,2612.490833,2,5,4,1.0,Not Specified,...,17.5,NM,16.415452,83.406509,High_spent_Medium_value_payments,411.427123,low_risk,Jasond,55.0,Entrepreneur
3,0x1629,CUS_0xb891,4,30689.89,2612.490833,2,5,4,1.0,Not Specified,...,17.6,No,16.415452,272.334037,Low_spent_Small_value_payments,262.499594,low_risk,Jasond,55.0,Entrepreneur
4,0x162a,CUS_0xb891,5,30689.89,2612.490833,2,5,4,1.0,Not Specified,...,17.7,No,16.415452,10000.0,Low_spent_Large_value_payments,359.374916,low_risk,Jasond,55.0,Entrepreneur
5,0x162b,CUS_0xb891,6,30689.89,2612.490833,2,5,4,1.0,Not Specified,...,17.8,No,16.415452,84.952848,High_spent_Small_value_payments,419.880784,low_risk,Jasond,55.0,Entrepreneur
6,0x162d,CUS_0xb891,8,30689.89,2612.490833,2,5,4,-100.0,Not Specified,...,17.1,No,16.415452,125.617251,High_spent_Small_value_payments,379.216381,low_risk,Jasond,55.0,Entrepreneur
7,0x1634,CUS_0x1cdb,3,35547.71,2853.309167,7,5,5,-100.0,Not Specified,...,30.1,Yes,0.0,173.138651,Low_spent_Medium_value_payments,392.192266,low_risk,Deepaa,21.0,Developer
8,0x1635,CUS_0x1cdb,4,35547.71,2853.309167,7,5,5,0.0,Not Specified,...,30.11,Yes,0.0,96.785485,High_spent_Medium_value_payments,438.545432,low_risk,Deepaa,21.0,Developer
9,0x1636,CUS_0x1cdb,5,35547.71,2853.309167,7,5,5,0.0,Not Specified,...,31.0,Yes,0.0,62.723278,High_spent_Small_value_payments,482.607638,low_risk,Deepaa,21.0,Developer


# Split the Data into Training and Testing

In [6]:
# Create our features
X = pd.get_dummies(df.drop(columns='Credit_Score'))


# Create our target
y = df['Credit_Score']

MemoryError: Unable to allocate 1.48 GiB for an array with shape (39917, 39917) and data type uint8

In [None]:
X.describe()

In [None]:
# Check the balance of our target values
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

# Oversampling

## Naive Random Oversampling

In [None]:
# pip install imblearn

In [None]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
# Naive Random Oversampling
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

## SMOTE Oversampling

In [None]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
# SMOTE Oversampling
print(classification_report_imbalanced(y_test, y_pred))

## Undersampling

In [None]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
# Undersampling
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

# Combination (Over and Under) Sampling

In [None]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
# Combination (Over and Under) Sampling
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))