# <center> Credit Score Classification


<div>
<img src="attachment:dataset-cover.jpg" width="500"/>
</div>

## Table of content 
- Packages importing 
- Reading Data 
- Data Exploration
- Data cleaning 
- Data Preprocessing 
- Modeling & Evaluation


## Packages importing 

In [2]:
# Packages for EDA 
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd 
import numpy as np 

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from datasist.structdata import detect_outliers
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import category_encoders as ce
import re 

# Modeling and evaluation 
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier
)
from sklearn.metrics import classification_report 
import joblib

# Packages options 
sns.set(rc={'figure.figsize': [14, 7]}, font_scale=1.2) # Standard figure size for all 
np.seterr(divide='ignore', invalid='ignore', over='ignore') ;

import warnings 
warnings.filterwarnings("ignore")



## Reading Data 

In [3]:
df = pd.read_csv("../dataset/original/train.csv", low_memory=False)

## Data Exploration

In [4]:
df.shape

(100000, 28)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  object 
 1   Customer_ID               100000 non-null  object 
 2   Month                     100000 non-null  object 
 3   Name                      90015 non-null   object 
 4   Age                       100000 non-null  object 
 5   SSN                       100000 non-null  object 
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  object 
 8   Monthly_Inhand_Salary     84998 non-null   float64
 9   Num_Bank_Accounts         100000 non-null  int64  
 10  Num_Credit_Card           100000 non-null  int64  
 11  Interest_Rate             100000 non-null  int64  
 12  Num_of_Loan               100000 non-null  object 
 13  Type_of_Loan              88592 non-null   ob

In [6]:
df.describe()

Unnamed: 0,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_Credit_Inquiries,Credit_Utilization_Ratio,Total_EMI_per_month
count,84998.0,100000.0,100000.0,100000.0,100000.0,98035.0,100000.0,100000.0
mean,4194.17085,17.09128,22.47443,72.46604,21.06878,27.754251,32.285173,1403.118217
std,3183.686167,117.404834,129.05741,466.422621,14.860104,193.177339,5.116875,8306.04127
min,303.645417,-1.0,0.0,1.0,-5.0,0.0,20.0,0.0
25%,1625.568229,3.0,4.0,8.0,10.0,3.0,28.052567,30.30666
50%,3093.745,6.0,5.0,13.0,18.0,6.0,32.305784,69.249473
75%,5957.448333,7.0,7.0,20.0,28.0,9.0,36.496663,161.224249
max,15204.633333,1798.0,1499.0,5797.0,67.0,2597.0,50.0,82331.0


In [7]:
df.duplicated().sum()

0

### Identify issues 
1. ~~ID, Name and SSN (Not useful)~~
2. Age, Annual_Income, Num_of_Loan, Num_of_Delayed_Payment, Changed_Credit_Limit, Amount_invested_monthly, Outstanding_Debt Credit_Mix, Monthly_Balance __Numerical but show as catogery (need to be fixed)__
3. Occupation, Credit_Mix has value "_____"
4. Data contains outliers 
5. Num_Credit_Card has __zeros__
6. Type_of_Loan Need to rewrite as 8 columns 
7. Num_Bank_Accounts contains negative values
8. Credit_History_Age,Payment_of_Min_Amount,Payment_Behaviour,'Credit_Mix' (needs Feature Engineering)
9. Target Columns is Imbalanced
10. A lot of missing data 

## Data cleaning

#### removing unuseful Columns (Uniques)

In [8]:
del df['ID'] # Identification 
del df['Name'] # Name of client 
del df['SSN'] # SSN (social security number of a person)

#### Fix Numerical Columns 

1. replace _  
2. convert into float 

In [9]:
N_to_fix = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Amount_invested_monthly', 'Outstanding_Debt' , 'Monthly_Balance']

In [10]:
def fix_nums(num):
    try : 
        return float(num.replace("_",""))
    except :
        return np.nan

In [11]:
for col in N_to_fix :
    df[col] = df[col].apply(fix_nums)

### Type_of_Loan

In [12]:
## Rebuild Type of loans Columns 
for i in df['Type_of_Loan'].value_counts().head(9).index[1:] : 
    df[i] = df['Type_of_Loan'].str.contains(i)

del df['Type_of_Loan']

#### Num_Bank_Accounts

In [13]:
df['Num_Bank_Accounts'] = df['Num_Bank_Accounts'].apply(lambda x :abs (x))

#### Num_Credit_Card

In [14]:
df['Num_Credit_Card'].replace(0,1,inplace=True)

#### Credit_History_Age

In [15]:
def History_age(age):
    try : 
        years = int("".join(re.findall('[0-9]',''.join(age.split("and")[0]))))
        month = int("".join(re.findall('[0-9]',''.join(age.split("and")[1]))))
        return years*12 + month
    except :
        return np.nan

In [16]:
df['Credit_History_Age'] = df['Credit_History_Age'].apply(History_age)

#### Payment_of_Min_Amount

In [17]:
df['Payment_of_Min_Amount'].replace("NM","No",inplace=True)

In [18]:
df['Payment_of_Min_Amount'].value_counts()

Payment_of_Min_Amount
Yes    52326
No     47674
Name: count, dtype: int64

#### Payment_Behaviour

In [19]:
df['Payment_Behaviour']= df['Payment_Behaviour'].replace("!@9#%8",np.nan)

In [20]:
df['Payment_Behaviour'].value_counts()

Payment_Behaviour
Low_spent_Small_value_payments      25513
High_spent_Medium_value_payments    17540
Low_spent_Medium_value_payments     13861
High_spent_Large_value_payments     13721
High_spent_Small_value_payments     11340
Low_spent_Large_value_payments      10425
Name: count, dtype: int64

### Occupation

In [21]:
df['Occupation'].value_counts()

Occupation
_______          7062
Lawyer           6575
Architect        6355
Engineer         6350
Scientist        6299
Mechanic         6291
Accountant       6271
Developer        6235
Media_Manager    6232
Teacher          6215
Entrepreneur     6174
Doctor           6087
Journalist       6085
Manager          5973
Musician         5911
Writer           5885
Name: count, dtype: int64

In [22]:
occs = df['Occupation'].value_counts().index[1:]
occs

Index(['Lawyer', 'Architect', 'Engineer', 'Scientist', 'Mechanic',
       'Accountant', 'Developer', 'Media_Manager', 'Teacher', 'Entrepreneur',
       'Doctor', 'Journalist', 'Manager', 'Musician', 'Writer'],
      dtype='object', name='Occupation')

In [23]:
# id_ = "CUS_0xb891"
# oc = df[df['Customer_ID'] == id_]['Occupation'].mode()[0]
# df[df['Customer_ID'] == id_].replace("_______",oc)

In [24]:
# for ID in df[df['Occupation'] == "_______"]['Customer_ID'] : 
#     oc = df[df['Customer_ID'] == ID]['Occupation'].mode()[0]
#     df[df['Customer_ID'] == ID] = df[df['Customer_ID'] == ID].replace("_______",oc)

In [25]:
df['Occupation'].value_counts()

Occupation
_______          7062
Lawyer           6575
Architect        6355
Engineer         6350
Scientist        6299
Mechanic         6291
Accountant       6271
Developer        6235
Media_Manager    6232
Teacher          6215
Entrepreneur     6174
Doctor           6087
Journalist       6085
Manager          5973
Musician         5911
Writer           5885
Name: count, dtype: int64

In [26]:
# df['Occupation'] = df['Occupation'].replace("_______",df['Occupation'].mode()[0])

In [27]:
df['Occupation'].value_counts()

Occupation
_______          7062
Lawyer           6575
Architect        6355
Engineer         6350
Scientist        6299
Mechanic         6291
Accountant       6271
Developer        6235
Media_Manager    6232
Teacher          6215
Entrepreneur     6174
Doctor           6087
Journalist       6085
Manager          5973
Musician         5911
Writer           5885
Name: count, dtype: int64

#### Credit_Mix

In [28]:
df['Credit_Mix'].value_counts()

Credit_Mix
Standard    36479
Good        24337
_           20195
Bad         18989
Name: count, dtype: int64

In [29]:
m = {
    "Bad":0,
    "Standard":1,
    "Good":2,
    "_":np.nan
}

In [30]:
df['Credit_Mix'] = df['Credit_Mix'].map(m)

### Advanced Handling Missing Data 

In [31]:
# Edit Columns from bool to int 
for col in list(df.columns[-8:]):
    df[col] = df[col].astype(float)

In [32]:
# IDs = 1 
# for ID in df['Customer_ID'].unique() :
#     df['Customer_ID'] = df['Customer_ID'].replace(ID,IDs)
#     IDs += 1 

In [33]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=1)

In [34]:
Numericals = df.select_dtypes(exclude='object').columns[1:]
Numericals

Index(['Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance', 'Credit-Builder Loan',
       'Personal Loan', 'Debt Consolidation Loan', 'Student Loan',
       'Payday Loan', 'Mortgage Loan', 'Auto Loan', 'Home Equity Loan'],
      dtype='object')

In [35]:
# for col in Numericals[1:]:
#     imputer.fit(df[['Customer_ID',col]])
#     df[['Customer_ID',col]] = imputer.transform(df[['Customer_ID',col]])

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 32 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Customer_ID               100000 non-null  object 
 1   Month                     100000 non-null  object 
 2   Age                       100000 non-null  float64
 3   Occupation                100000 non-null  object 
 4   Annual_Income             100000 non-null  float64
 5   Monthly_Inhand_Salary     84998 non-null   float64
 6   Num_Bank_Accounts         100000 non-null  int64  
 7   Num_Credit_Card           100000 non-null  int64  
 8   Interest_Rate             100000 non-null  int64  
 9   Num_of_Loan               100000 non-null  float64
 10  Delay_from_due_date       100000 non-null  int64  
 11  Num_of_Delayed_Payment    92998 non-null   float64
 12  Changed_Credit_Limit      97909 non-null   float64
 13  Num_Credit_Inquiries      98035 non-null   fl

In [37]:
# imputer = SimpleImputer(strategy="most_frequent")
# imputer.fit(df[['Payment_Behaviour']])
# df[['Payment_Behaviour']] = imputer.transform(df[['Payment_Behaviour']])

### Handling Outliers 

In [38]:
# ## replace Outliers with median 
# for col in Numericals :
#     outliers_indecies = detect_outliers(df,0,[col])
#     median = df[col].median()
#     df[col].iloc[outliers_indecies] = median

# Data Preprocessing

### Handling Catogerical 

In [39]:
df.select_dtypes(include="object")

Unnamed: 0,Customer_ID,Month,Occupation,Payment_of_Min_Amount,Payment_Behaviour,Credit_Score
0,CUS_0xd40,January,Scientist,No,High_spent_Small_value_payments,Good
1,CUS_0xd40,February,Scientist,No,Low_spent_Large_value_payments,Good
2,CUS_0xd40,March,Scientist,No,Low_spent_Medium_value_payments,Good
3,CUS_0xd40,April,Scientist,No,Low_spent_Small_value_payments,Good
4,CUS_0xd40,May,Scientist,No,High_spent_Medium_value_payments,Good
...,...,...,...,...,...,...
99995,CUS_0x942c,April,Mechanic,No,High_spent_Large_value_payments,Poor
99996,CUS_0x942c,May,Mechanic,No,High_spent_Medium_value_payments,Poor
99997,CUS_0x942c,June,Mechanic,No,High_spent_Large_value_payments,Poor
99998,CUS_0x942c,July,Mechanic,No,Low_spent_Large_value_payments,Standard


In [40]:
df['Credit_Score'].value_counts()

Credit_Score
Standard    53174
Poor        28998
Good        17828
Name: count, dtype: int64

In [41]:
m = {
    "Poor":0,
    "Standard":1,
    "Good":2
}

In [42]:
df['Credit_Score'] = df['Credit_Score'].map(m)

In [43]:
del df['Customer_ID']

In [44]:
df = pd.get_dummies(df,drop_first=True)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 55 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   Age                                                 100000 non-null  float64
 1   Annual_Income                                       100000 non-null  float64
 2   Monthly_Inhand_Salary                               84998 non-null   float64
 3   Num_Bank_Accounts                                   100000 non-null  int64  
 4   Num_Credit_Card                                     100000 non-null  int64  
 5   Interest_Rate                                       100000 non-null  int64  
 6   Num_of_Loan                                         100000 non-null  float64
 7   Delay_from_due_date                                 100000 non-null  int64  
 8   Num_of_Delayed_Payment                              92998 non-nul

In [46]:
# df.to_csv("Preprocessed_Data.csv")
# train_data, rest_data = train_test_split(mid_prices, train_size=0.8, shuffle=False)
# validation_data, test_data = train_test_split(rest_data, test_size=0.5, shuffle=False)

# save 80% and validation 10 % and test 10%
train, test = train_test_split(df, test_size=0.2, random_state=42)
validation, test = train_test_split(test, test_size=0.5, random_state=42)

train.to_csv("../dataset/train.csv",index=False)
test.to_csv("../dataset/test.csv",index=False)
validation.to_csv("../dataset/validation.csv",index=False)

In [47]:
# df = pd.read_csv("../input/credit-score-data-preprocessed/Preprocessed_Data.csv",low_memory=False)