# CS301 Project

Source: https://www.kaggle.com/datasets/parisrohan/credit-score-classification

## Preprocessing

In [64]:
# import packages
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, RobustScaler

In [65]:
# Load the training CSV and, in the same expression, discard the ID,
# Customer_ID, Month, Name and SSN columns so `data` is built in one step
# rather than being mutated in place after loading.
data = (
    pd.read_csv('./data/train.csv', low_memory=False)
    .drop(columns=['ID', 'Customer_ID', 'Month', 'Name', 'SSN'])
)

# Preview the first rows of the remaining columns.
data.head()

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,23,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,-500,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,23,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,23,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [66]:
# List each column's dtype and non-null count; columns whose non-null count is
# below the row count contain missing values that need imputing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Age                       100000 non-null  object 
 1   Occupation                100000 non-null  object 
 2   Annual_Income             100000 non-null  object 
 3   Monthly_Inhand_Salary     84998 non-null   float64
 4   Num_Bank_Accounts         100000 non-null  int64  
 5   Num_Credit_Card           100000 non-null  int64  
 6   Interest_Rate             100000 non-null  int64  
 7   Num_of_Loan               100000 non-null  object 
 8   Type_of_Loan              88592 non-null   object 
 9   Delay_from_due_date       100000 non-null  int64  
 10  Num_of_Delayed_Payment    92998 non-null   object 
 11  Changed_Credit_Limit      100000 non-null  object 
 12  Num_Credit_Inquiries      98035 non-null   float64
 13  Credit_Mix                100000 non-null  ob

In [67]:
# Fill every missing value so the later encoding/modelling cells see complete
# columns.
#
# Each result is assigned back to its column explicitly. Calling
# `data.<column>.fillna(..., inplace=True)` mutates an intermediate Series,
# which pandas warns about and which leaves `data` untouched once
# Copy-on-Write is enabled. `fillna(method='ffill')` is also deprecated in
# favour of `.ffill()`.

# Replace the corrupt Monthly_Balance placeholder string with 0 so the column
# can be cast to float at the end of this cell.
data.loc[data.Monthly_Balance == '__-333333333333333333333333333__', 'Monthly_Balance'] = 0

# Float columns: impute gaps with the column mean.
for column in ['Monthly_Inhand_Salary', 'Num_Credit_Inquiries']:
    data[column] = data[column].fillna(data[column].mean())

# Remaining columns with gaps: carry the previous row's value forward.
# NOTE(review): the fill uses whatever row precedes the gap; this presumably
# relies on rows of the same customer being adjacent in the CSV -- confirm.
for column in [
    'Type_of_Loan',
    'Num_of_Delayed_Payment',
    'Credit_History_Age',
    'Amount_invested_monthly',
    'Monthly_Balance',
]:
    data[column] = data[column].ffill()

# Monthly_Balance was read as object dtype; convert it to a numeric column.
data['Monthly_Balance'] = data['Monthly_Balance'].astype('float')

In [68]:
# Summarise only the columns still stored as object dtype; these are the ones
# that have to be converted before modelling.
(
    data
    .select_dtypes(include='object')
    .info()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Age                      100000 non-null  object
 1   Occupation               100000 non-null  object
 2   Annual_Income            100000 non-null  object
 3   Num_of_Loan              100000 non-null  object
 4   Type_of_Loan             100000 non-null  object
 5   Num_of_Delayed_Payment   100000 non-null  object
 6   Changed_Credit_Limit     100000 non-null  object
 7   Credit_Mix               100000 non-null  object
 8   Outstanding_Debt         100000 non-null  object
 9   Credit_History_Age       100000 non-null  object
 10  Payment_of_Min_Amount    100000 non-null  object
 11  Amount_invested_monthly  100000 non-null  object
 12  Payment_Behaviour        100000 non-null  object
 13  Credit_Score             100000 non-null  object
dtypes: object(14)
memory 

In [69]:
# Show the name of the first remaining column (displays 'Age' in the recorded output).
data.columns[0]

'Age'

In [70]:
# Integer-encode every object-dtype column with its own LabelEncoder.
#
# The columns are driven from one list instead of fourteen copy-pasted
# stanzas, so adding or removing a column is a one-line change.
#
# NOTE(review): several of these columns hold numbers stored as text (e.g.
# Age, Annual_Income, Outstanding_Debt). LabelEncoder numbers the sorted
# unique values, so the resulting codes do not follow numeric magnitude --
# in the displayed frame Age "23" becomes 308 and "-500" becomes 0. Consider
# parsing those columns as numbers instead; behaviour is left unchanged here.
LABEL_ENCODED_COLUMNS = [
    'Age',
    'Occupation',
    'Annual_Income',
    'Num_of_Loan',
    'Type_of_Loan',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Credit_Mix',
    'Outstanding_Debt',
    'Credit_History_Age',
    'Payment_of_Min_Amount',
    'Amount_invested_monthly',
    'Payment_Behaviour',
    'Credit_Score',
]

# Fitted encoder per column, keyed by column name, so codes can be mapped back
# with encoders[column].inverse_transform(...).
encoders = {}
for column in LABEL_ENCODED_COLUMNS:
    # Progress marker: prints e.g. "age_le" for the Age column.
    print(f"{column.lower()}_le")
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Keep the individual `<column>_le` names bound to the fitted encoders so any
# code that refers to them by name continues to work.
(
    age_le,
    occupation_le,
    annual_income_le,
    num_of_loan_le,
    type_of_loan_le,
    num_of_delayed_payment_le,
    changed_credit_limit_le,
    credit_mix_le,
    outstanding_debt_le,
    credit_history_age_le,
    payment_of_min_amount_le,
    amount_invested_monthly_le,
    payment_behaviour_le,
    credit_score_le,
) = (encoders[column] for column in LABEL_ENCODED_COLUMNS)

age_le
occupation_le
annual_income_le
num_of_loan_le
type_of_loan_le
num_of_delayed_payment_le
changed_credit_limit_le
credit_mix_le
outstanding_debt_le
credit_history_age_le
payment_of_min_amount_le
amount_invested_monthly_le
payment_behaviour_le
credit_score_le


In [71]:
# Preview the first 20 rows after encoding to check that the former object
# columns now hold integer codes.
data.head(n=20)

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,308,12,6011,1824.843333,3,4,3,244,128,3,...,3,12062,26.82262,180,1,49.574949,81513,3,312.494089,0
1,308,12,6011,4194.17085,3,4,3,244,128,-1,...,1,12062,31.94496,180,1,49.574949,7661,4,284.629162,0
2,0,12,6011,4194.17085,3,4,3,244,128,3,...,1,12062,28.609352,184,1,49.574949,82226,5,331.209863,0
3,308,12,6011,4194.17085,3,4,3,244,128,5,...,1,12062,31.377862,185,1,49.574949,30358,6,223.45131,0
4,308,12,6011,1824.843333,3,4,3,244,128,6,...,1,12062,24.797347,186,1,49.574949,55363,2,341.489231,0
5,308,12,6011,4194.17085,3,4,3,244,128,8,...,1,12062,27.262259,187,1,49.574949,70815,0,340.479212,0
6,308,12,6011,1824.843333,3,4,3,244,128,3,...,1,12062,22.537593,188,1,49.574949,25839,6,244.565317,0
7,308,12,6011,1824.843333,3,4,3,244,128,3,...,1,12062,23.933795,188,1,49.574949,37319,2,358.124168,2
8,450,15,10300,3037.986667,2,4,6,3,684,3,...,1,10796,24.464031,236,1,18.816215,1916,6,470.690627,2
9,429,13,10300,3037.986667,2,4,6,3,684,7,...,1,10796,38.550848,237,1,18.816215,54511,1,484.591214,0


## Modeling

In [72]:
# Hold out 20% of the rows for testing; Credit_Score is the target and every
# other column is a predictor. The seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns='Credit_Score'),
    data['Credit_Score'],
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = RobustScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)


In [76]:
# Random-forest baseline: 1000 trees, depth capped at 10, fixed seed.
# n_jobs=-1 fits and queries the trees on all available cores instead of a
# single one; it only controls parallelism, the seed still fixes the model.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
)
rf.fit(x_train, y_train)

# Score on the held-out split. In the confusion matrix, rows are the true
# classes and columns the predicted classes.
y_pred = rf.predict(x_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))


Accuracy:  0.70905
Confusion Matrix:  [[2386   53 1088]
 [ 393 3759 1722]
 [1312 1251 8036]]


In [74]:
# Use knn to predict the credit score: 5-nearest-neighbours on the scaled
# train/test split. KNeighborsClassifier is imported with the other packages
# in the imports cell at the top of the notebook rather than mid-notebook.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)

# y_pred is rebound here to the k-NN predictions for the held-out split.
y_pred = knn.predict(x_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))


Accuracy:  0.66725
Confusion Matrix:  [[2115  189 1223]
 [ 391 3594 1889]
 [1222 1741 7636]]
