In [1]:
#Download the cleaned dataset created from previous note book and uploaded to G-drive

!gdown 1Fr0VVlAVNZaGqZjqnEDsGT8pKGKXYxOa

Downloading...
From: https://drive.google.com/uc?id=1Fr0VVlAVNZaGqZjqnEDsGT8pKGKXYxOa
To: /content/cleaned_credit_score.csv
100% 18.1M/18.1M [00:00<00:00, 38.0MB/s]


In [2]:
# libraries to analyze data
import numpy as np
import pandas as pd

# libraries to visualize data
import matplotlib.pyplot as plt
import seaborn as sns

# libraries to perform stats
import scipy.stats as stats

In [3]:
# read the file
cs_df = pd.read_csv('/content/cleaned_credit_score.csv', encoding='unicode_escape')
cs_df.head(5)

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Credit-Builder Loan,Debt Consolidation Loan,Home Equity Loan,Mortgage Loan,Payday Loan,Personal Loan,Student Loan,Spent_Level,Value_Payments,Credit_History_Years
0,CUS_0xd40,January,23,Scientist,19114.0,1824.843333,3,4,3,4,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,High,Small,22.0
1,CUS_0xd40,February,23,Scientist,19114.0,1824.843333,3,4,3,4,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,Low,Large,22.0
2,CUS_0xd40,March,23,Scientist,19114.0,1824.843333,3,4,3,4,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,Low,Medium,22.0
3,CUS_0xd40,April,23,Scientist,19114.0,1824.843333,3,4,3,4,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,Low,Small,22.0
4,CUS_0xd40,May,23,Scientist,19114.0,1824.843333,3,4,3,4,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,High,Medium,22.0


# **Feature Engineering**

## **Monetary**

### **Debt-to-Income (DTI) Ratio:**

**Formula:**

Debt_to_Income_Ratio = (Outstanding_Debt / Annual_Income) * 100

***Note:*** The Debt_to_Income_Ratio value is rounded to 2 digits


**Annual_Income**

In [4]:
cs_df['Annual_Income'].describe()

Unnamed: 0,Annual_Income
count,100000.0
mean,50505.12592
std,38299.421504
min,7006.0
25%,19342.75
50%,37000.0
75%,71683.25
max,179987.0


**Outstanding_Debt**

In [5]:
cs_df['Outstanding_Debt'].describe()

Unnamed: 0,Outstanding_Debt
count,100000.0
mean,1426.220376
std,1155.129026
min,0.23
25%,566.0725
50%,1166.155
75%,1945.9625
max,4998.07


In [6]:
cs_df['Debt_to_Income_Ratio'] = cs_df['Outstanding_Debt'] / cs_df['Annual_Income']

In [7]:
cs_df['Debt_to_Income_Ratio'] = round(cs_df['Debt_to_Income_Ratio'] * 100,2)

In [8]:
cs_df['Debt_to_Income_Ratio'].describe()

Unnamed: 0,Debt_to_Income_Ratio
count,100000.0
mean,6.065734
std,8.742499
min,0.0
25%,0.96
50%,2.825
75%,6.97
max,68.33


In [9]:
#we convert the Debt_to_Income_Ratio into 10 buckets - Lower DTI will get higher points range 10 to 1
bins = [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 1000]
labels = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [10]:
# Segment the Debt_to_Income_Ratio into groups
cs_df['Debt_to_Income_Ratio_Score'] = pd.cut(cs_df['Debt_to_Income_Ratio'], bins=bins, labels=labels)

In [11]:
cs_df['Debt_to_Income_Ratio_Score'].unique()

[10, 8, 9, 5, 7, 6, 4]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

### **Monthly Balance remaining ratio**

**Formula:**

Monthly Balance remaining ratio = (Monthly_Balance / Monthly_Inhand_Salary) * 100

In [12]:
cs_df['Monthly_Balance'].describe()

Unnamed: 0,Monthly_Balance
count,100000.0
mean,397.711363
std,217.10652
min,0.0
25%,267.875403
50%,334.814045
75%,467.673454
max,1602.040519


In [13]:
cs_df['Monthly_Inhand_Salary'].describe()

Unnamed: 0,Monthly_Inhand_Salary
count,100000.0
mean,4198.350924
std,3187.402015
min,303.645417
25%,1626.761667
50%,3095.978333
75%,5961.6375
max,15204.63333


In [14]:
cs_df['Monthly_Bal_to_Income_Ratio'] = cs_df['Monthly_Balance'] / cs_df['Monthly_Inhand_Salary']

In [15]:
cs_df['Monthly_Bal_to_Income_Ratio'] = round(cs_df['Monthly_Bal_to_Income_Ratio'] * 100,2)

In [16]:
#we convert the Monthly_Invest_to_Income_Ratio into 10 buckets - Higher ratio will get higher points range 10 to 1
bins_lower = [-1,0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
labels_lower = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [17]:
# Segment the Monthly_Bal_to_Income_Ratio into groups
cs_df['Monthly_Bal_to_Income_Ratio_Score'] = pd.cut(cs_df['Monthly_Bal_to_Income_Ratio'], bins=bins_lower, labels=labels_lower)

In [18]:
cs_df['Monthly_Bal_to_Income_Ratio_Score'].unique()

[2, 1, 6, 5, 3, 4, 0, 7, 8, 9]
Categories (11, int64): [0 < 1 < 2 < 3 ... 7 < 8 < 9 < 10]

### **Monthly Debt to Income Ratio:**

**Formula:**

Monthly Debt to Income Ratio = (Total_EMI_per_month / Monthly_Inhand_Salary) * 100

In [19]:
cs_df['Monthly_debt_to_Income_Ratio'] = cs_df['Total_EMI_per_month'] / cs_df['Monthly_Inhand_Salary']

In [20]:
cs_df['Monthly_debt_to_Income_Ratio'] = round(cs_df['Monthly_debt_to_Income_Ratio'] * 100,2)

In [21]:
# Segment the Monthly_Debt_to_Income_Ratio into groups
cs_df['Monthly_debt_to_Income_Ratio_Score'] = pd.cut(cs_df['Monthly_debt_to_Income_Ratio'], bins=bins, labels=labels)

In [22]:
cs_df['Monthly_debt_to_Income_Ratio_Score'].unique()

[10, 9, 8]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

### **Monthly Savings Ratio:**

**Formula:**


Monthly Savings Ratio = (Amount_invested_monthly / Monthly_Inhand_Salary) * 100

In [23]:
cs_df['Monthly_Invest_to_Income_Ratio'] = cs_df['Amount_invested_monthly'] / cs_df['Monthly_Inhand_Salary']

In [24]:
cs_df['Monthly_Invest_to_Income_Ratio'] = round(cs_df['Monthly_Invest_to_Income_Ratio'] * 100,2)

In [25]:
#we convert the Monthly_Invest_to_Income_Ratio into 10 buckets - Higher ratio will get higher points range 10 to 1
bins_lower = [-1,0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
labels_lower = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [26]:
# Segment the Monthly_Debt_to_Income_Ratio into groups
cs_df['Monthly_Invest_to_Income_Ratio_Score'] = pd.cut(cs_df['Monthly_Invest_to_Income_Ratio'], bins=bins_lower, labels=labels_lower)

In [27]:
cs_df[['Customer_ID','Amount_invested_monthly','Monthly_Inhand_Salary' ,'Monthly_Invest_to_Income_Ratio_Score']].isnull().sum()

Unnamed: 0,0
Customer_ID,0
Amount_invested_monthly,0
Monthly_Inhand_Salary,0
Monthly_Invest_to_Income_Ratio_Score,26445


In [28]:
cs_df['Monthly_Invest_to_Income_Ratio_Score'] = cs_df['Monthly_Invest_to_Income_Ratio_Score'].fillna(0)

In [29]:
cs_df['Monthly_Invest_to_Income_Ratio_Score'].unique()

[1, 9, 0, 2, 10, 7, 8]
Categories (11, int64): [0 < 1 < 2 < 3 ... 7 < 8 < 9 < 10]

### **Monthly Other Expense Ratio:**

**Formula:**

Monthly Other Expense Ratio = ((Monthly_Inhand_Salary - (Monthly_Balance + Total_EMI_per_month + Amount_invested_monthly) ) / Monthly_Inhand_Salary) * 100

In [30]:
cs_df['Monthly_Other_Expense_to_Income_Ratio'] = (cs_df['Monthly_Inhand_Salary'] - (cs_df['Monthly_Balance'] + cs_df['Total_EMI_per_month'] + cs_df['Amount_invested_monthly'])) / cs_df['Monthly_Inhand_Salary']

In [31]:
cs_df['Monthly_Other_Expense_to_Income_Ratio'] = round(cs_df['Monthly_Other_Expense_to_Income_Ratio'] * 100,2)

In [32]:
# Segment the Monthly_Other_Expense_to_Income_Ratio into groups
cs_df['Monthly_Other_Expense_to_Income_Ratio_Score'] = pd.cut(cs_df['Monthly_Other_Expense_to_Income_Ratio'], bins=bins, labels=labels)

In [33]:
cs_df[['Customer_ID','Amount_invested_monthly','Monthly_Inhand_Salary' ,'Monthly_Other_Expense_to_Income_Ratio_Score']].isnull().sum()

Unnamed: 0,0
Customer_ID,0
Amount_invested_monthly,0
Monthly_Inhand_Salary,0
Monthly_Other_Expense_to_Income_Ratio_Score,27049


In [34]:
cs_df['Monthly_Other_Expense_to_Income_Ratio_Score'] = cs_df['Monthly_Other_Expense_to_Income_Ratio_Score'].fillna(0)

In [35]:
cs_df['Monthly_Other_Expense_to_Income_Ratio_Score'].unique()

[3, 2, 10, 9, 0, ..., 7, 6, 4, 5, 8]
Length: 11
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

## **Insights**

We have created 5 ratio scores related to money earned, spent and invested.

* Debt-to-Income (DTI) Ratio - Lower ratio means good so scored higher
* Monthly Balance remaining ratio - Higher ratio means good so scored higher
* Monthly Debt to Income Ratio - Lower ratio means good so scored higher
* Monthly Savings/Investment Ratio - Higher ratio means good so scored higher
* Monthly Other Expense Ratio - Lower ratio means good so scored higher



## **Behavior**

### **Credit_History_Years**

In [36]:
cs_df['Credit_History_Years'].nunique()

34

In [37]:
cs_df['Credit_History_Years'].describe()

Unnamed: 0,Credit_History_Years
count,100000.0
mean,18.23592
std,8.313256
min,0.0
25%,12.0
50%,18.0
75%,25.0
max,33.0


In [38]:
cs_df['Credit_History_Years_Ratio'] = cs_df['Credit_History_Years'] / cs_df['Credit_History_Years'].max()

In [39]:
cs_df['Credit_History_Years_Ratio'] = round(cs_df['Credit_History_Years_Ratio'] * 100,2)

In [40]:
# Segment the Credit_History_Years_Ratio into groups
cs_df['Credit_History_Years_Ratio_Score'] = pd.cut(cs_df['Credit_History_Years_Ratio'], bins=bins_lower, labels=labels_lower)

In [41]:
cs_df['Credit_History_Years_Ratio_Score'].unique()

[7, 9, 6, 10, 5, ..., 3, 4, 1, 0, 2]
Length: 11
Categories (11, int64): [0 < 1 < 2 < 3 ... 7 < 8 < 9 < 10]

**Insight**

* Credit History in Years is ranging from 0 to 33 years
* Data is suggesting that there are no outliers

### **Spent_Level and Value_Payments**

In [42]:
cs_df['Spent_Level'].unique()

array(['High', 'Low'], dtype=object)

In [43]:
cs_df['Value_Payments'].unique()

array(['Small', 'Large', 'Medium'], dtype=object)

Spent Level has 2 types - High and Low ==> High is risk and Lower score

Value Payments has 3 types - Small, Medium, Large ==> Large is risk and Lower score

Assign replacement in such a way combination of both will be on a scale of 1-10 inline with other Ratio Scores calculated




In [44]:
# Define replacement dictionaries
spent_level_replacements = {'High': 1, 'Low': 3}
value_payments_replacements = {'Small': 3, 'Medium': 2, 'Large': 1}

In [45]:
cs_df['Spent_Level_Score'] = cs_df['Spent_Level'].replace(spent_level_replacements)
cs_df['Value_Payments_Score'] = cs_df['Value_Payments'].replace(value_payments_replacements)

### **Num_of_Delayed_Payment**

In [46]:
cs_df['Num_of_Delayed_Payment'].unique()

array([ 4,  6, 15,  7,  2, 14, 11,  0, 20,  8,  9, 10, 12, 19, 21, 16, 17,
       18, 24,  5, 23, 22, 13,  3,  1, 25, 28, -2, -1, 27, 26])

In [47]:
cs_df['Num_of_Delayed_Payment'].describe()

Unnamed: 0,Num_of_Delayed_Payment
count,100000.0
mean,13.2756
std,6.194012
min,-2.0
25%,9.0
50%,14.0
75%,18.0
max,28.0


**Secured Loans:**

  These loans require collateral, which means the borrower must pledge an asset (like a car or house) to secure the loan. If the borrower defaults, the lender can seize the collateral.

* Auto Loan: Secured by the vehicle being purchased.
* Home Equity Loan: Secured by the borrower’s home.
* Mortgage Loan: Secured by the property being purchased.

**Unsecured Loans:**

  These loans do not require collateral. Approval is based on the borrower’s creditworthiness.

  * Credit-Builder Loan: Typically unsecured, designed to help build credit.
  * Debt Consolidation Loan: Usually unsecured, used to combine multiple debts into one.
  * Payday Loan: Unsecured, short-term loan with high interest rates.
  * Personal Loan: Can be either secured or unsecured, but often unsecured.
  * Student Loan: Generally unsecured, based on the borrower’s future earning potential.

**Assumption:**

Based on the types of loans assuming 30 years as the maximum tenure would be a good bet and assign score based how many payment defferred - higher will yeild low score

In [48]:
#we convert the Num_of_Delayed_Payment into 10 buckets - Lower will get higher points range 10 to 1
bins_loan = [-100, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31]
labels_loan = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [49]:
# Segment the Num_of_Delayed_Payment_Score into groups
cs_df['Num_of_Delayed_Payment_Score'] = pd.cut(cs_df['Num_of_Delayed_Payment'], bins=bins_loan, labels=labels_loan)

In [50]:
cs_df['Num_of_Delayed_Payment_Score'].unique()

[9, 8, 5, 6, 10, 3, 7, 4, 2, 1]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

### **Delay_from_due_date**

In [51]:
cs_df['Delay_from_due_date'].unique()

array([ 3, -1,  5,  6,  8,  7, 13, 10,  0,  4,  9,  1, 12, 11, 30, 31, 34,
       27, 14,  2, -2, 16, 17, 15, 23, 22, 21, 18, 19, 52, 51, 48, 53, 26,
       43, 28, 25, 20, 47, 46, 49, 24, 61, 29, 50, 58, 45, 59, 55, 56, 57,
       54, 62, 65, 64, 67, 36, 41, 33, 32, 39, 44, 42, 60, 35, 38, -3, 63,
       40, 37, -5, -4, 66])

In [52]:
cs_df['Delay_from_due_date'].describe()

Unnamed: 0,Delay_from_due_date
count,100000.0
mean,21.06878
std,14.860104
min,-5.0
25%,10.0
50%,18.0
75%,28.0
max,67.0


**Assumption:**

Based on the types of loans assuming 100 days as the maximum duration would be a good bet and assign score based how many days payment delayed - higher will yeild low score

*Note:* 90 days would be ideal for simplicity I am going with 100.

In [53]:
#we convert the Delay_from_due_date into 10 buckets - Lower will get higher points range 10 to 1
bins_days = [-10, 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101]
labels_days = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [54]:
# Segment the Delay_from_due_date into groups
cs_df['Delay_from_due_date_Score'] = pd.cut(cs_df['Delay_from_due_date'], bins=bins_days, labels=labels_days)

In [55]:
cs_df['Delay_from_due_date_Score'].unique()

[9, 10, 8, 7, 6, 4, 5, 3]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

### **Secured and Unsecured Loans**

**Secured Loans:**

* Auto Loan: Secured by the vehicle being purchased.
* Home Equity Loan: Secured by the borrower’s home.
* Mortgage Loan: Secured by the property being purchased.

**Unsecured Loans:**

  * Credit-Builder Loan: Typically unsecured, designed to help build credit.
  * Debt Consolidation Loan: Usually unsecured, used to combine multiple debts into one.
  * Payday Loan: Unsecured, short-term loan with high interest rates.
  * Personal Loan: Can be either secured or unsecured, but often unsecured.
  * Student Loan: Generally unsecured, based on the borrower’s future earning potential.

In [56]:
cs_df['Secured_Loan_Count'] = cs_df['Auto Loan'] + cs_df['Home Equity Loan'] + cs_df['Mortgage Loan']
cs_df['UnSecured_Loan_Count'] = cs_df['Credit-Builder Loan'] + cs_df['Debt Consolidation Loan'] + cs_df['Payday Loan'] + cs_df['Student Loan'] + cs_df['Personal Loan']

In [57]:
cs_df['Secured_Loan_Count'].isnull().sum()

0

In [58]:
cs_df['UnSecured_Loan_Count'].isnull().sum()

0

In [59]:
cs_df[['Secured_Loan_Count','UnSecured_Loan_Count']].describe()

Unnamed: 0,Secured_Loan_Count,UnSecured_Loan_Count
count,100000.0,100000.0
mean,1.16032,1.9764
std,1.188912,1.648427
min,0.0,0.0
25%,0.0,1.0
50%,1.0,2.0
75%,2.0,3.0
max,7.0,9.0


**Approach**

**Secured** loan values range from 0 - 7, for simplicity leave it as assuming 1 point for 1 loan and within 0-10 score would be good fit for this scenario means more number secured loans yeilds higher score.

**Unsecured** loan values range from 0 - 9, for simplicity we will reverse its range from 0-10 into 10-0 and within 0-10 score would be good fit for this scenario means low number of unsecured loans yeilds higher score.

In [66]:
#we convert the UnSecured_Loan_Count into 10 buckets - Lower will get higher points range 10 to 1
bins_unsecure = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
labels_unsecure = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [67]:
# Segment the Delay_from_due_date into groups
cs_df['UnSecured_Loan_Count_Score'] = pd.cut(cs_df['UnSecured_Loan_Count'], bins=bins_unsecure, labels=labels_unsecure)

In [68]:
cs_df['UnSecured_Loan_Count_Score'].unique()

[9, 10, NaN, 6, 8, 7, 5, 3, 4, 2]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

In [69]:
cs_df['UnSecured_Loan_Count_Score'] = cs_df['UnSecured_Loan_Count_Score'].fillna(0)

In [70]:
cs_df['UnSecured_Loan_Count_Score'].unique()

[9, 10, 0, 6, 8, 7, 5, 3, 4, 2]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

### **Payment_of_Min_Amount**

In [71]:
cs_df['Payment_of_Min_Amount'].unique()

array(['No', 'Yes'], dtype=object)

**Approach**

**No** will be awarded higher score of 10.

**Yes** will be asrded lower score of 0.

In [72]:
# Define replacement dictionaries
min_amount_pmt_level_replacements = {'Yes': 0, 'No': 10}

In [73]:
cs_df['Payment_of_Min_Amount_Score'] = cs_df['Payment_of_Min_Amount'].replace(min_amount_pmt_level_replacements)

In [74]:
cs_df['Payment_of_Min_Amount_Score'].unique()

array([10,  0])

### **Credit_Utilization_Ratio**

In [75]:
cs_df['Credit_Utilization_Ratio'].describe()

Unnamed: 0,Credit_Utilization_Ratio
count,100000.0
mean,32.285173
std,5.116875
min,20.0
25%,28.052567
50%,32.305784
75%,36.496663
max,50.0


In [76]:
# Segment the Credit_Utilization_Ratio into groups
cs_df['Credit_Utilization_Ratio_Score'] = pd.cut(cs_df['Credit_Utilization_Ratio'], bins=bins, labels=labels)

In [78]:
cs_df['Credit_Utilization_Ratio_Score'].unique()

[8, 7, 6, 9]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

## **Insights**

We have created 7 scores related to customer behavior.

* Credit_History_Years - Higher means good so scored higher
* Spent_Level and Value_Payments - Low spent and Low value payments means good so scored higher
* Num_of_Delayed_Payment - Lower means good so scored higher
* Delay_from_due_date - Less delay means good so scored higher
* Secured and Unsecured Loans - secured means good so scored higher and unsecured punished with lower score
* Payment_of_Min_Amount - Awarded for paying more than minimum amount
* Credit_Utilization_Ratio_Score - Low utilization means good so scored higher



## **Credit Availability**

### **Num_Credit_Inquiries**

In [80]:
cs_df['Num_Credit_Inquiries'].describe()

Unnamed: 0,Num_Credit_Inquiries
count,100000.0
mean,5.67776
std,3.827248
min,0.0
25%,3.0
50%,5.0
75%,8.0
max,17.0


**Approach**

Higher Inquires - Lower score. The data will be divided into bins of range 0-20 into 0-10 scale with lower range with high score and higher range with lower score

In [82]:
#we convert the Num_Credit_Inquiries into 10 buckets - Lower will get higher points range 10 to 1
bins_cr_inq = [-10, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 101]
labels_cr_inq = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [84]:
# Segment the Num_Credit_Inquiries into groups
cs_df['Num_Credit_Inquiries_Score'] = pd.cut(cs_df['Num_Credit_Inquiries'], bins=bins_cr_inq, labels=labels_cr_inq)

In [85]:
cs_df['Num_Credit_Inquiries_Score'].unique()

[9, 10, 8, 7, 2, 6, 5, 4, 3]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

### **Changed_Credit_Limit**

In [86]:
cs_df['Changed_Credit_Limit'].describe()

Unnamed: 0,Changed_Credit_Limit
count,100000.0
mean,10.392559
std,6.512894
min,-5.01
25%,5.5
50%,9.34
75%,14.67
max,29.98


**Approach**

Higher credit limit change - Lower score. The data will be divided into bins of range 0-30 into 0-10 scale with lower range with high score and higher range with lower score

In [87]:
#we convert the Changed_Credit_Limit into 10 buckets - Lower will get higher points range 10 to 1
bins_cr_limit = [-10, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 101]
labels_cr_limit = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [88]:
# Segment the Changed_Credit_Limit into groups
cs_df['Changed_Credit_Limit_Score'] = pd.cut(cs_df['Changed_Credit_Limit'], bins=bins_cr_limit, labels=labels_cr_limit)

In [89]:
cs_df['Changed_Credit_Limit_Score'].unique()

[7, 9, 8, 10, 5, 6, 2, 3, 1, 4]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

### **Interest_Rate**

In [90]:
cs_df['Interest_Rate'].describe()

Unnamed: 0,Interest_Rate
count,100000.0
mean,14.53208
std,8.74133
min,1.0
25%,7.0
50%,13.0
75%,20.0
max,34.0


**Approach**

Higher interest rate - Lower score. The data will be divided into bins of range 0-40 into 0-10 scale with lower interest rate with high score and higher interest rate with lower score

In [91]:
#we convert the Interest_Rate into 10 buckets - Lower will get higher points range 10 to 1
bins_int_rt = [-10, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 101]
labels_int_rt = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [95]:
# Segment the Changed_Credit_Limit into groups
cs_df['Interest_Rate_Score'] = pd.cut(cs_df['Interest_Rate'], bins=bins_int_rt, labels=labels_int_rt)

In [96]:
cs_df['Interest_Rate_Score'].unique()

[10, 9, 7, 8, 6, 3, 4, 5, 2]
Categories (11, int64): [10 < 9 < 8 < 7 ... 3 < 2 < 1 < 0]

### **Num_Credit_Card**

In [98]:
cs_df['Num_Credit_Card'].describe()

Unnamed: 0,Num_Credit_Card
count,100000.0
mean,5.53272
std,2.067504
min,0.0
25%,4.0
50%,5.0
75%,7.0
max,11.0


**Approach**

Higher Credit Cards - Lower score. The data will be divided into bins of range 0-20 into 0-10 scale with lower range with high score and higher range with lower score.

***Note:*** Having Zero Credit cards shouldn't be awarded with higher points - Excpetion

In [99]:
#we convert the Num_Credit_Card into 10 buckets - Lower will get higher points range 10 to 1
bins_cr_card = [-10, 1, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 101]
labels_cr_card = [0, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

In [101]:
# Segment the Num_Credit_Card into groups
cs_df['Num_Credit_Card_Score'] = pd.cut(cs_df['Num_Credit_Card'], bins=bins_cr_card, labels=labels_cr_card, ordered=False)

In [102]:
cs_df['Num_Credit_Card_Score'].unique()

[9, 8, 0, 7, 6, 10, 5]
Categories (11, int64): [0, 1, 2, 3, ..., 7, 8, 9, 10]

### **Credit_Mix**

In [104]:
cs_df['Credit_Mix'].unique()

array(['Good', 'Standard', nan, 'Bad'], dtype=object)

In [105]:
cs_df['Credit_Mix'].isnull().sum()

5712

**Approach**

The credit mix is having null values for 5712 rows which belongs to 714 customers.

We will impute the null values based on num of loans.

7-10 ==> Bad
4-6  ==> Standard
1-3  ==> Good
0    ==> Bad

In [106]:
# Define the function to determine Credit_Mix
def determine_credit_mix(num_of_loan):
    if 7 <= num_of_loan <= 10:
        return 'Bad'
    elif 4 <= num_of_loan <= 6:
        return 'Standard'
    elif 1 <= num_of_loan <= 3:
        return 'Good'
    elif num_of_loan == 0:
        return 'Bad'
    else:
        return 'Unknown'

In [107]:
# Apply the function only to rows where Credit_Mix is null
cs_df.loc[cs_df['Credit_Mix'].isnull(), 'Credit_Mix'] = cs_df.loc[cs_df['Credit_Mix'].isnull(), 'Num_of_Loan'].apply(determine_credit_mix)

In [108]:
cs_df['Credit_Mix'].isnull().sum()

0

In [110]:
cs_df['Credit_Mix'].unique()

array(['Good', 'Standard', 'Bad'], dtype=object)

**Approach**

The credit mix score we award 10 for Good, 5 for Standard and 0 for Bad.

In [111]:
# Define replacement dictionaries
credit_mix_level_replacements = {'Good': 10, 'Standard': 5, 'Bad': 0}

In [112]:
cs_df['Credit_Mix_Score'] = cs_df['Credit_Mix'].replace(credit_mix_level_replacements)

In [113]:
cs_df['Credit_Mix_Score'].unique()

array([10,  5,  0])

## **Insights**

We have created 5 scores related to Credit availability to the Customer.

* Num_Credit_Inquiries Score - Lower attempts to get new credit means good so scored higher
* Changed_Credit_Limit Score - Low credit limit change means good so scored higher
* Interest_Rate Score - Lower means good so scored higher
* Num_Credit_Card Score - Less credit cards means good so scored higher
* Credit_Mix Score - Good and Standard are good so awarded with score and bad punished with no score

In [114]:
cs_df.columns

Index(['Customer_ID', 'Month', 'Age', 'Occupation', 'Annual_Income',
       'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Payment_of_Min_Amount',
       'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance',
       'Auto Loan', 'Credit-Builder Loan', 'Debt Consolidation Loan',
       'Home Equity Loan', 'Mortgage Loan', 'Payday Loan', 'Personal Loan',
       'Student Loan', 'Spent_Level', 'Value_Payments', 'Credit_History_Years',
       'Debt_to_Income_Ratio', 'Debt_to_Income_Ratio_Score',
       'Monthly_Bal_to_Income_Ratio', 'Monthly_Bal_to_Income_Ratio_Score',
       'Monthly_debt_to_Income_Ratio', 'Monthly_debt_to_Income_Ratio_Score',
       'Monthly_Invest_to_Income_Ratio',
       'Monthly_Invest_to_Income_Ratio_Score',
       'Mont

In [115]:
#Copy the Score columns to new Dataset.

scores_attr = ['Customer_ID', 'Month', 'Age', 'Occupation',
       'Debt_to_Income_Ratio_Score',
       'Monthly_Bal_to_Income_Ratio_Score',
       'Monthly_debt_to_Income_Ratio_Score',
       'Monthly_Invest_to_Income_Ratio_Score',
       'Monthly_Other_Expense_to_Income_Ratio_Score',
       'Credit_History_Years_Ratio_Score',
       'Spent_Level_Score', 'Value_Payments_Score',
       'Num_of_Delayed_Payment_Score', 'Delay_from_due_date_Score',
       'Secured_Loan_Count',
       'UnSecured_Loan_Count_Score',
       'Payment_of_Min_Amount_Score',
       'Credit_Utilization_Ratio_Score', 'Num_Credit_Inquiries_Score',
       'Changed_Credit_Limit_Score', 'Interest_Rate_Score',
       'Num_Credit_Card_Score', 'Credit_Mix_Score']

cs_scores_df = cs_df[scores_attr]

In [116]:
cs_scores_df.head()

Unnamed: 0,Customer_ID,Month,Age,Occupation,Debt_to_Income_Ratio_Score,Monthly_Bal_to_Income_Ratio_Score,Monthly_debt_to_Income_Ratio_Score,Monthly_Invest_to_Income_Ratio_Score,Monthly_Other_Expense_to_Income_Ratio_Score,Credit_History_Years_Ratio_Score,...,Delay_from_due_date_Score,Secured_Loan_Count,UnSecured_Loan_Count_Score,Payment_of_Min_Amount_Score,Credit_Utilization_Ratio_Score,Num_Credit_Inquiries_Score,Changed_Credit_Limit_Score,Interest_Rate_Score,Num_Credit_Card_Score,Credit_Mix_Score
0,CUS_0xd40,January,23,Scientist,10,2,10,1,3,7,...,9,2.0,9,10,8,9,7,10,9,10
1,CUS_0xd40,February,23,Scientist,10,2,10,1,3,7,...,10,2.0,9,10,7,9,7,10,9,10
2,CUS_0xd40,March,23,Scientist,10,2,10,1,3,7,...,9,2.0,9,10,8,9,7,10,9,10
3,CUS_0xd40,April,23,Scientist,10,2,10,1,3,7,...,9,2.0,9,10,7,9,7,10,9,10
4,CUS_0xd40,May,23,Scientist,10,2,10,1,3,7,...,9,2.0,9,10,8,9,7,10,9,10


# **Hypothetical Credit Score Calculation:**

**Approach:**

All features are categorized into 3 types.

* Monetary with 5 features
* Behavior with 8 features
* Credit availability with 5 features

We will calculate the score at each category

In [117]:
monetary_features = ['Debt_to_Income_Ratio_Score',
       'Monthly_Bal_to_Income_Ratio_Score',
       'Monthly_debt_to_Income_Ratio_Score',
       'Monthly_Invest_to_Income_Ratio_Score',
       'Monthly_Other_Expense_to_Income_Ratio_Score']
behavior_features = ['Credit_History_Years_Ratio_Score',
       'Spent_Level_Score', 'Value_Payments_Score',
       'Num_of_Delayed_Payment_Score', 'Delay_from_due_date_Score',
       'Secured_Loan_Count',
       'UnSecured_Loan_Count_Score',
       'Payment_of_Min_Amount_Score',
       'Credit_Utilization_Ratio_Score']
credit_avail_features = ['Num_Credit_Inquiries_Score',
       'Changed_Credit_Limit_Score', 'Interest_Rate_Score',
       'Num_Credit_Card_Score', 'Credit_Mix_Score']

In [118]:
cs_scores_df['Monetary_Score'] = cs_scores_df[monetary_features].sum(axis=1)
cs_scores_df['Behaviour_Score'] = cs_scores_df[behavior_features].sum(axis=1)
cs_scores_df['Credit_Avail_Score'] = cs_scores_df[credit_avail_features].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs_scores_df['Monetary_Score'] = cs_scores_df[monetary_features].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs_scores_df['Behaviour_Score'] = cs_scores_df[behavior_features].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs_scores_df['Credit_Avail_Score'] = cs_scores_df[

In [119]:
cs_scores_df.head()

Unnamed: 0,Customer_ID,Month,Age,Occupation,Debt_to_Income_Ratio_Score,Monthly_Bal_to_Income_Ratio_Score,Monthly_debt_to_Income_Ratio_Score,Monthly_Invest_to_Income_Ratio_Score,Monthly_Other_Expense_to_Income_Ratio_Score,Credit_History_Years_Ratio_Score,...,Payment_of_Min_Amount_Score,Credit_Utilization_Ratio_Score,Num_Credit_Inquiries_Score,Changed_Credit_Limit_Score,Interest_Rate_Score,Num_Credit_Card_Score,Credit_Mix_Score,Monetary_Score,Behaviour_Score,Credit_Avail_Score
0,CUS_0xd40,January,23,Scientist,10,2,10,1,3,7,...,10,8,9,7,10,9,10,26,58.0,45
1,CUS_0xd40,February,23,Scientist,10,2,10,1,3,7,...,10,7,9,7,10,9,10,26,58.0,45
2,CUS_0xd40,March,23,Scientist,10,2,10,1,3,7,...,10,8,9,7,10,9,10,26,59.0,45
3,CUS_0xd40,April,23,Scientist,10,2,10,1,3,7,...,10,7,9,7,10,9,10,26,59.0,45
4,CUS_0xd40,May,23,Scientist,10,2,10,1,3,7,...,10,8,9,7,10,9,10,26,57.0,45


In [120]:
cs_hypo_scores_df = cs_scores_df[['Customer_ID', 'Month', 'Age', 'Occupation','Monetary_Score', 'Behaviour_Score', 'Credit_Avail_Score']]

In [122]:
# Group by Customer_ID and calculate the average scores
cs_hypo_scores_aggr_df = cs_hypo_scores_df.groupby('Customer_ID').agg({
    'Monetary_Score': 'mean',
    'Behaviour_Score': 'mean',
    'Credit_Avail_Score': 'mean'
}).reset_index()

In [123]:
cs_hypo_scores_aggr_df.head()

Unnamed: 0,Customer_ID,Monetary_Score,Behaviour_Score,Credit_Avail_Score
0,CUS_0x1000,25.0,31.125,27.0
1,CUS_0x1009,24.375,45.0,36.0
2,CUS_0x100b,21.375,42.0,46.0
3,CUS_0x1011,21.375,38.625,33.0
4,CUS_0x1013,23.75,49.75,47.0


**Approach for different Weighted final Score**

We can create 3 final scores with different weightages assigned.
The Scale will be increased by 5 times so that the **Score Range will be 0 - 900** (18 features at 10 point scale * 5)

#Different weightages

#1: 'Monetary_Score' : **50%**, 'Behaviour_Score' : **35%**, 'Credit_Avail_Score' : **15%**

#2: 'Monetary_Score' : **40%**, 'Behaviour_Score' : **40%**, 'Credit_Avail_Score' : **20%**

#3: 'Monetary_Score' : **40%**, 'Behaviour_Score' : **35%**, 'Credit_Avail_Score' : **25%**


In [131]:
cs_hypo_scores_aggr_df['Score_weighted_50_35_15'] = ((cs_hypo_scores_aggr_df['Monetary_Score'] * 0.5) * 5 + (cs_hypo_scores_aggr_df['Behaviour_Score'] * 0.35) * 5 + (cs_hypo_scores_aggr_df['Credit_Avail_Score'] * 0.15) * 5)
cs_hypo_scores_aggr_df['Score_weighted_40_40_20'] = ((cs_hypo_scores_aggr_df['Monetary_Score'] * 0.4)  * 5 + (cs_hypo_scores_aggr_df['Behaviour_Score'] * 0.4)  * 5 + (cs_hypo_scores_aggr_df['Credit_Avail_Score'] * 0.2) * 5)
cs_hypo_scores_aggr_df['Score_weighted_40_35_25'] = ((cs_hypo_scores_aggr_df['Monetary_Score'] * 0.4)  * 5 + (cs_hypo_scores_aggr_df['Behaviour_Score'] * 0.35)  * 5 + (cs_hypo_scores_aggr_df['Credit_Avail_Score'] * 0.25)  * 5)

In [132]:
cs_hypo_scores_aggr_df.head()

Unnamed: 0,Customer_ID,Monetary_Score,Behaviour_Score,Credit_Avail_Score,Score_weighted_50_35_15,Score_weighted_40_40_20,Score_weighted_40_35_25
0,CUS_0x1000,25.0,31.125,27.0,137.21875,139.25,138.21875
1,CUS_0x1009,24.375,45.0,36.0,166.6875,174.75,172.5
2,CUS_0x100b,21.375,42.0,46.0,161.4375,172.75,173.75
3,CUS_0x1011,21.375,38.625,33.0,145.78125,153.0,151.59375
4,CUS_0x1013,23.75,49.75,47.0,181.6875,194.0,193.3125


In [129]:
cs_hypo_scores_aggr_df[['Score_weighted_50_35_15', 'Score_weighted_40_40_20', 'Score_weighted_40_35_25']].describe()

Unnamed: 0,Score_weighted_50_35_15,Score_weighted_40_40_20,Score_weighted_40_35_25
count,12500.0,12500.0,12500.0
mean,163.408097,170.66634,168.600398
std,21.62507,24.874068,24.789403
min,105.0625,107.75,105.375
25%,146.5,150.75,149.023438
50%,161.796875,168.75,166.9375
75%,180.59375,191.25,189.28125
max,240.34375,248.5,244.65625


**Insight**

Comparing the descriptive statistics of different weightages doesn't show much variance in the final Customer Credit score

# **The Final Score built Range is 0 - 300 considering for weightages (900 / 3)**

Comparing 3 different weightage strategies

* Maximum Score is between 240 to 250
* Average score is between 160 to 170
* Minimum score is between 105 to 108
* Only 25% of Customers are below than 150 score
* 75% of Customers are above 150 and below 250 score

In [130]:
#Save the cleaned dataset to drive

cs_scores_df.to_csv('Scaled_credit_score_with_all_features.csv', index=False)

In [147]:
cs_hypo_scores_aggr_df.to_csv('Aggregated_credit_score.csv', index=False)