# Importing Libraries

In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
import re

# Seaborn look-and-feel for every plot: white background with grid lines, "paper" context at the default font scale
sns.set_style("whitegrid")
sns.set_context("paper", font_scale = 1.0)

# IPython magic: render matplotlib figures inside the notebook output
%matplotlib inline

# Set Options For Display
# (up to 1000 rows and 100 columns are shown; floats are printed with two decimals)
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format

# NOTE(review): this silences every warning for the rest of the session,
# including pandas warnings about deprecated or chained-assignment usage
import warnings
warnings.filterwarnings("ignore")

# Loading The Dataset

In [92]:
#read the training data, then normalise every column label to lowercase
df = pd.read_csv("Datasets/train.csv")
df.columns = list(map(str.lower, df.columns))
df.head()

Unnamed: 0,id,customer_id,month,name,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.84,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,11.27,4.0,_,809.98,26.82,22 Years and 1 Months,No,49.57,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,,11.27,4.0,Good,809.98,31.94,,No,49.57,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,7.0,_,4.0,Good,809.98,28.61,22 Years and 3 Months,No,49.57,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,4.0,6.27,4.0,Good,809.98,31.38,22 Years and 4 Months,No,49.57,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.84,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,,11.27,4.0,Good,809.98,24.8,22 Years and 5 Months,No,49.57,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


# EDA and Data Preprocessing

In [93]:
#columns that are not used in the rest of the analysis
columns_to_discard = [
    "id",
    "customer_id",
    "month",
    "name",
    "ssn",
    "type_of_loan",
    "credit_history_age",
    "total_emi_per_month",
]
df = df.drop(columns = columns_to_discard)

In [94]:
#preview the first 10 rows of the frame
df.head(10)

Unnamed: 0,age,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,payment_of_min_amount,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,23,Scientist,19114.12,1824.84,3,4,3,4,3,7,11.27,4.0,_,809.98,26.82,No,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,23,Scientist,19114.12,,3,4,3,4,-1,,11.27,4.0,Good,809.98,31.94,No,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,-500,Scientist,19114.12,,3,4,3,4,3,7,_,4.0,Good,809.98,28.61,No,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,23,Scientist,19114.12,,3,4,3,4,5,4,6.27,4.0,Good,809.98,31.38,No,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,23,Scientist,19114.12,1824.84,3,4,3,4,6,,11.27,4.0,Good,809.98,24.8,No,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good
5,23,Scientist,19114.12,,3,4,3,4,8,4,9.27,4.0,Good,809.98,27.26,No,62.430172331195294,!@9#%8,340.4792117872438,Good
6,23,Scientist,19114.12,1824.84,3,4,3,4,3,8_,11.27,4.0,Good,809.98,22.54,No,178.3440674122349,Low_spent_Small_value_payments,244.5653167062043,Good
7,23,Scientist,19114.12,1824.84,3,4,3,4,3,6,11.27,4.0,Good,809.98,23.93,No,24.785216509052056,High_spent_Medium_value_payments,358.12416760938714,Standard
8,28_,_______,34847.84,3037.99,2,4,6,1,3,4,5.42,2.0,Good,605.03,24.46,No,104.291825168246,Low_spent_Small_value_payments,470.69062692529184,Standard
9,28,Teacher,34847.84,3037.99,2,4,6,1,7,1,7.42,2.0,Good,605.03,38.55,No,40.39123782853101,High_spent_Large_value_payments,484.5912142650067,Good


In [95]:
#statistical measures (count, mean, std, min, quartiles, max)
#only columns already parsed as numbers are summarised; columns still stored as strings (e.g. age, annual_income) do not appear
df.describe()

Unnamed: 0,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,delay_from_due_date,num_credit_inquiries,credit_utilization_ratio
count,84998.0,100000.0,100000.0,100000.0,100000.0,98035.0,100000.0
mean,4194.17,17.09,22.47,72.47,21.07,27.75,32.29
std,3183.69,117.4,129.06,466.42,14.86,193.18,5.12
min,303.65,-1.0,0.0,1.0,-5.0,0.0,20.0
25%,1625.57,3.0,4.0,8.0,10.0,3.0,28.05
50%,3093.75,6.0,5.0,13.0,18.0,6.0,32.31
75%,5957.45,7.0,7.0,20.0,28.0,9.0,36.5
max,15204.63,1798.0,1499.0,5797.0,67.0,2597.0,50.0


In [96]:
#number of null values in each column
#placeholder strings such as '_' or '_______' are ordinary values here and are NOT counted as nulls
df.isnull().sum()

age                             0
occupation                      0
annual_income                   0
monthly_inhand_salary       15002
num_bank_accounts               0
num_credit_card                 0
interest_rate                   0
num_of_loan                     0
delay_from_due_date             0
num_of_delayed_payment       7002
changed_credit_limit            0
num_credit_inquiries         1965
credit_mix                      0
outstanding_debt                0
credit_utilization_ratio        0
payment_of_min_amount           0
amount_invested_monthly      4479
payment_behaviour               0
monthly_balance              1200
credit_score                    0
dtype: int64

In [97]:
#print the distinct values of every column to spot placeholders and malformed entries
separator = "--------------------------------------------------"
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name}: {distinct_values}")
    print(separator)

age: ['23' '-500' '28_' ... '4808_' '2263' '1342']
--------------------------------------------------
occupation: ['Scientist' '_______' 'Teacher' 'Engineer' 'Entrepreneur' 'Developer'
 'Lawyer' 'Media_Manager' 'Doctor' 'Journalist' 'Manager' 'Accountant'
 'Musician' 'Mechanic' 'Writer' 'Architect']
--------------------------------------------------
annual_income: ['19114.12' '34847.84' '34847.84_' ... '20002.88' '39628.99' '39628.99_']
--------------------------------------------------
monthly_inhand_salary: [1824.84333333           nan 3037.98666667 ... 3097.00833333 1929.90666667
 3359.41583333]
--------------------------------------------------
num_bank_accounts: [   3    2    1    7    4    0    8    5    6    9   10 1414 1231   67
  572 1488   91  528 1647 1696 1338  649  889 1668  685  857  975 1496
 1534 1620   37 1388 1429 1332 1588  120 1777 1096  803  494  744 1139
  831 1613  741  121  665 1748 1644  823 1356 1651   87  711  450  210
 1671 1722  648  672 1662 1495  510  666

monthly_balance: ['312.49408867943663' '284.62916249607184' '331.2098628537912' ...
 516.8090832742814 319.1649785257098 393.6736955618808]
--------------------------------------------------
credit_score: ['Good' 'Standard' 'Poor']
--------------------------------------------------


In [98]:
#handling age

#regular expression matching runs of underscores (_) or dashes (-) at the beginning or end of a string
pattern = r'^[_-]+|[_-]+$'
#compiled once so the helper does not depend on `pattern` being left untouched by later cells
_EDGE_JUNK = re.compile(pattern)

#function to clean the values
def clean_value(value):
    """Strip leading/trailing underscores and dashes from a single cell value.

    Parameters
    ----------
    value : str, float or null
        Raw cell value. Other types are cleaned through their ``str()`` form.

    Returns
    -------
    The input unchanged when it is null (None/NaN); a ``float`` when the input
    was a float; otherwise the cleaned string.

    Notes
    -----
    A leading dash is stripped too, so negative numbers lose their sign
    ('-500' -> '500', -5.0 -> 5.0).
    """
    if pd.isnull(value):
        return value
    #remember the input type BEFORE stringifying: checking isinstance(value, float)
    #after `value = str(value)` can never be true, which made floats come back as strings
    was_float = isinstance(value, float)
    cleaned_value = _EDGE_JUNK.sub('', str(value))
    #convert back to float if the original value was a float
    return float(cleaned_value) if was_float else cleaned_value

#apply the function to the column
df['age'] = df['age'].apply(clean_value)


#handling occupation

#count the occurrences of each value in the occupation column (sorted by descending frequency)
occupation_counts = df['occupation'].value_counts()

#most frequent REAL occupation: drop the '_______' placeholder from the counts and take the top label.
#(taking index[1] blindly is only correct when the placeholder happens to be the most frequent value;
# otherwise it picks the wrong occupation, or the placeholder itself)
second_mode = occupation_counts[occupation_counts.index != '_______'].index[0]

#replace the '_______' placeholder with that occupation
df['occupation'] = df['occupation'].replace('_______', second_mode)


#handling annual_income

#function to clean and convert the values
def clean_and_convert(value):
    """Strip underscores from both ends of ``value`` and convert it to ``float``.

    Parameters
    ----------
    value : str or number
        Raw cell value such as '34847.84_'.

    Returns
    -------
    float when the cleaned value can be parsed; otherwise the original
    ``value`` unchanged (e.g. a non-numeric string or None).
    """
    #only strings can carry stray underscores; anything else goes straight to float()
    #instead of failing on the missing .strip attribute
    candidate = value.strip('_') if isinstance(value, str) else value
    try:
        return float(candidate)  #convert to float
    except (TypeError, ValueError):
        return value  #return the original value if it cannot be converted to a float

#apply the function to every value of the annual_income column
#(values the function cannot parse are kept as they were)
df['annual_income'] = df['annual_income'].apply(clean_and_convert)


#handling monthly_inhand_salary

#function to convert and round the values
def convert_and_round(value):
    """Convert ``value`` to ``float`` and round it to two decimal places.

    Parameters
    ----------
    value : str or number
        Raw cell value.

    Returns
    -------
    The rounded float, or the original ``value`` unchanged when it cannot be
    converted (unparsable strings such as '_', or None). NaN stays NaN.
    """
    try:
        #convert to float and round to two decimal places
        return round(float(value), 2)
    except (TypeError, ValueError):
        #ValueError: unparsable string; TypeError: input float() rejects outright (e.g. None)
        return value  #return the original value if it cannot be converted to a float

#round monthly_inhand_salary to two decimals
df['monthly_inhand_salary'] = df['monthly_inhand_salary'].apply(convert_and_round)


#handling num_of_loan and num_of_delayed_payment
#both columns get the same treatment: stray underscores/dashes are stripped from the ends of each value
for noisy_column in ('num_of_loan', 'num_of_delayed_payment'):
    df[noisy_column] = df[noisy_column].apply(clean_value)


#handling delay_from_due_date

#negative delays are raised to zero, everything else is left alone
df['delay_from_due_date'] = df['delay_from_due_date'].clip(lower=0)


#handling changed_credit_limit

#round the values that parse, then coerce whatever is still non-numeric (the '_' placeholders) to NaN
df['changed_credit_limit'] = pd.to_numeric(
    df['changed_credit_limit'].apply(convert_and_round),
    errors='coerce',
)

#median of the column; NaN values are skipped (the default)
median_value = df['changed_credit_limit'].median()

#function to replace null entries (the coerced placeholders) with the median value
def replace_with_median(value):
    """Return the module-level ``median_value`` for a null input, otherwise the input itself."""
    return median_value if pd.isnull(value) else value
    
#apply the function to the column
df['changed_credit_limit'] = df['changed_credit_limit'].apply(replace_with_median)



#handling credit mix

#calculate the mode over the real categories only: underscore-only entries are the placeholders
#that are about to be replaced, so they must not be able to win the vote themselves
is_placeholder = df['credit_mix'].apply(lambda entry: entry.strip('_') == '')
mode_value = df.loc[~is_placeholder, 'credit_mix'].mode()[0]

#function to replace underscore-only placeholders with the mode value
def replace_with_mode(value):
    """Return the module-level ``mode_value`` when ``value`` is empty once underscores are stripped from its ends; otherwise return ``value``."""
    is_blank = value.strip('_') == ''
    return mode_value if is_blank else value

#apply the function to the column
df['credit_mix'] = df['credit_mix'].apply(replace_with_mode)


#handling outstanding_debt
df['outstanding_debt'] = df['outstanding_debt'].apply(clean_value)

#handling credit_utilization_ratio
df['credit_utilization_ratio'] = df['credit_utilization_ratio'].apply(convert_and_round)


#handling payment_of_min_amount

#calculate mode of the column, ignoring the 'NM' entries that are about to be replaced
#(otherwise 'NM' could itself be picked as the replacement value)
mode_value = df.loc[df['payment_of_min_amount'] != 'NM', 'payment_of_min_amount'].mode()[0]

#function to replace the 'NM' placeholder with the mode value
def replace_with_mode(value):
    """Return the module-level ``mode_value`` when ``value`` is exactly 'NM'; otherwise return ``value`` unchanged.

    An exact comparison is used because ``str.strip('NM')`` removes any mix of the
    characters 'N' and 'M', so it would also match values such as 'N', 'M' or ''.
    Non-string values (e.g. NaN) are passed through untouched.
    """
    return mode_value if value == 'NM' else value

#apply the function to the column
df['payment_of_min_amount'] = df['payment_of_min_amount'].apply(replace_with_mode)


#handling amount_invested_monthly
df['amount_invested_monthly'] = df['amount_invested_monthly'].apply(convert_and_round)
df['amount_invested_monthly'] = df['amount_invested_monthly'].apply(clean_value)


#handling payment_behaviour

#calculate mode of the column, ignoring the '!@9#%8' junk entries that are about to be replaced
#(otherwise the junk value could itself be picked as the replacement value)
mode_value = df.loc[df['payment_behaviour'] != '!@9#%8', 'payment_behaviour'].mode()[0]

#function to replace the '!@9#%8' junk value with the mode value
def replace_with_mode(value):
    """Return the module-level ``mode_value`` when ``value`` is exactly '!@9#%8'; otherwise return ``value`` unchanged.

    An exact comparison is used because ``str.strip('!@9#%8')`` removes any mix of
    those six characters, so it would also match values such as '98', '#' or ''.
    Non-string values (e.g. NaN) are passed through untouched.
    """
    return mode_value if value == '!@9#%8' else value

#swap the junk payment_behaviour entries for the mode computed above
df['payment_behaviour'] = df['payment_behaviour'].apply(replace_with_mode)


#handling monthly_balance
#two passes per value: round what parses as a number, then strip stray underscores/dashes from the ends
df['monthly_balance'] = (
    df['monthly_balance']
    .apply(convert_and_round)
    .apply(clean_value)
)

In [99]:
# handling null values in columns
#
# every fill is assigned back to the frame: `df[col].fillna(..., inplace=True)` works on a
# temporary Series obtained through chained indexing, which pandas warns about and which
# leaves the frame unchanged when Copy-on-Write is active

#monthly_inhand_salary

#calculate the median of the column
median_value = df['monthly_inhand_salary'].median()
#replace null values with the median
df['monthly_inhand_salary'] = df['monthly_inhand_salary'].fillna(median_value)


#num_of_delayed_payment

#calculate the mode of the column
mode_value = df['num_of_delayed_payment'].mode()[0]
#replace null values with the mode
df['num_of_delayed_payment'] = df['num_of_delayed_payment'].fillna(mode_value)


#num_credit_inquiries

#calculate the mode of the column
mode_value = df['num_credit_inquiries'].mode()[0]
#replace null values with the mode
df['num_credit_inquiries'] = df['num_credit_inquiries'].fillna(mode_value)


#amount_invested_monthly

#calculate the median of the column; the cleaned values may still be numeric strings here,
#so the median is taken over a numeric view (entries that do not parse are ignored)
median_value = pd.to_numeric(df['amount_invested_monthly'], errors='coerce').median()
#replace null values with the median
df['amount_invested_monthly'] = df['amount_invested_monthly'].fillna(median_value)


#monthly_balance

#calculate the median of the column (numeric view, same reason as above)
median_value = pd.to_numeric(df['monthly_balance'], errors='coerce').median()
#replace null values with the median
df['monthly_balance'] = df['monthly_balance'].fillna(median_value)

In [100]:
#cast the cleaned columns to their final numeric types

#count-like columns become integers, amount-like columns become floats
columns_to_int = ["age", "num_of_loan", "num_of_delayed_payment", "num_credit_inquiries"]
columns_to_float = ["outstanding_debt", "amount_invested_monthly", "monthly_balance"]

for target_type, column_group in ((int, columns_to_int), (float, columns_to_float)):
    df[column_group] = df[column_group].astype(target_type)

In [101]:
#removing rows that contain age less than 18 or more than 70
age_in_range = (df['age'] >= 18) & (df['age'] <= 70)
#take an explicit copy so that later column assignments operate on an independent frame
#rather than on a filtered selection of the original (avoids SettingWithCopyWarning)
df = df[age_in_range].copy()

In [102]:
df.head()

Unnamed: 0,age,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,credit_mix,outstanding_debt,credit_utilization_ratio,payment_of_min_amount,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,23,Scientist,19114.12,1824.84,3,4,3,4,3,7,11.27,4,Standard,809.98,26.82,No,80.42,High_spent_Small_value_payments,312.49,Good
1,23,Scientist,19114.12,3093.75,3,4,3,4,0,19,11.27,4,Good,809.98,31.94,No,118.28,Low_spent_Large_value_payments,284.63,Good
3,23,Scientist,19114.12,3093.75,3,4,3,4,5,4,6.27,4,Good,809.98,31.38,No,199.46,Low_spent_Small_value_payments,223.45,Good
4,23,Scientist,19114.12,1824.84,3,4,3,4,6,19,11.27,4,Good,809.98,24.8,No,41.42,High_spent_Medium_value_payments,341.49,Good
5,23,Scientist,19114.12,3093.75,3,4,3,4,8,4,9.27,4,Good,809.98,27.26,No,62.43,Low_spent_Small_value_payments,340.48,Good


In [103]:
#integer-encode every categorical column, keeping one fitted encoder per column
#so the original labels can be looked up again later
categorical_cols = ["occupation", "credit_mix", "payment_of_min_amount", "payment_behaviour", "credit_score"]
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

# Splitting The Dataset and Scaling

In [104]:
#separate the target from the features, then hold out 20% of the rows for testing
y = df["credit_score"]
x = df.drop(columns = "credit_score")
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, shuffle = True, random_state = 1)

#scaling: the scaler is fitted on the training split only and then applied to both splits
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Modelling

In [105]:
#fit a Random Forest classifier on the scaled training features
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(x_train, y_train)

#predict labels for the training split and for the held-out test split
y_pred_train = rf_classifier.predict(x_train)
y_pred_test = rf_classifier.predict(x_test)

#share of correctly predicted labels on each split
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

#report both figures as percentages
print("Training Accuracy:", f"{accuracy_train*100}%")
print("Testing Accuracy:", f"{accuracy_test*100}%")

Training Accuracy: 100.0%
Testing Accuracy: 78.14019559635032%
