# Installations

In [3]:
!pip3 install pandas
!pip3 install numpy
!pip3 install seaborn
!pip3 install scikit-learn
!pip3 install matplotlib



# Imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# EDA
We use dataset for credit score classification: https://www.kaggle.com/datasets/parisrohan/credit-score-classification/data

In [183]:
df = pd.read_csv('data/train.csv')
print(f"Dataset length: {len(df)}")

Dataset length: 100000


  df = pd.read_csv('data/train.csv')


In [184]:
df.columns = df.columns.str.lower()

In [211]:
df.head()

Unnamed: 0,id,customer_id,month,name,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,...,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1592.843333,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,1592.843333,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1592.843333,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [185]:
df.dtypes

id                           object
customer_id                  object
month                        object
name                         object
age                          object
ssn                          object
occupation                   object
annual_income                object
monthly_inhand_salary       float64
num_bank_accounts             int64
num_credit_card               int64
interest_rate                 int64
num_of_loan                  object
type_of_loan                 object
delay_from_due_date           int64
num_of_delayed_payment       object
changed_credit_limit         object
num_credit_inquiries        float64
credit_mix                   object
outstanding_debt             object
credit_utilization_ratio    float64
credit_history_age           object
payment_of_min_amount        object
total_emi_per_month         float64
amount_invested_monthly      object
payment_behaviour            object
monthly_balance              object
credit_score                

In [186]:
df.isnull().sum()

id                              0
customer_id                     0
month                           0
name                         9985
age                             0
ssn                             0
occupation                      0
annual_income                   0
monthly_inhand_salary       15002
num_bank_accounts               0
num_credit_card                 0
interest_rate                   0
num_of_loan                     0
type_of_loan                11408
delay_from_due_date             0
num_of_delayed_payment       7002
changed_credit_limit            0
num_credit_inquiries         1965
credit_mix                      0
outstanding_debt                0
credit_utilization_ratio        0
credit_history_age           9030
payment_of_min_amount           0
total_emi_per_month             0
amount_invested_monthly      4479
payment_behaviour               0
monthly_balance              1200
credit_score                    0
dtype: int64

## Dealing with missing values and problematic types

## month
Let's add numeric column with month number, so sorting data in the future would be easy

In [248]:
df.month.unique()

array(['January', 'February', 'March', 'April', 'May', 'June', 'July',
       'August'], dtype=object)

In [249]:
months_mapping = {
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
}

In [250]:
df['month_number'] = df.month.map(months_mapping)
df['month_number'].head(10)

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    1
9    2
Name: month_number, dtype: int64

### name column
All customers with same id have the same name. So let's try to fill empty names using customer_id

In [198]:
print(f"Initiall number of empty names: {df[['name']].isna().sum().values[0]}")

Initiall number of empty names: 0


In [199]:
df['name'] = df[['customer_id', 'name']].groupby('customer_id').fillna(method='bfill')
df['name'] = df[['customer_id', 'name']].groupby('customer_id').fillna(method='ffill')
print(f'Number of empty names after fillna: {df["name"].isna().sum()}')

  df['name'] = df[['customer_id', 'name']].groupby('customer_id').fillna(method='bfill')
  df['name'] = df[['customer_id', 'name']].groupby('customer_id').fillna(method='bfill')
  df['name'] = df[['customer_id', 'name']].groupby('customer_id').fillna(method='ffill')
  df['name'] = df[['customer_id', 'name']].groupby('customer_id').fillna(method='ffill')


Number of empty names after fillna: 0


### monthly_inhand_salary
Now we want to fill monthly_inhand_salary column. 
* We assume that it depends on annual_income column. 
* Unfortunately annual_income columns is of object and not numeric.
* If we try to cast annual income to numeric type we get an error:

In [200]:
try:
    df['annual_income'] = df['annual_income'].apply(pd.to_numeric)
except Exception as e:
    print(e)

Unable to parse string "34847.84_" at position 0
The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


Let's see what are those problematic values we are dealing with:

In [225]:
problematic_values = df[df['annual_income'].apply(pd.to_numeric, errors='coerce').isna()]['annual_income'].values
problematic_values

array([], dtype=float64)

It seems like all of those problematic values have "_" in the end. Let's check it

In [202]:
for val in problematic_values:
    assert '_' == val[-1]

Let's check that we can cast all of those values to float if we delete "_" in the end

In [203]:
for val in problematic_values:
    float(val[:-1])

So now let's write function that will cast annual_income column to numeric

In [204]:
def custom_to_numeric(value: str):
    if value[-1] == '_':
        return float(value[:-1])
    return float(value)

In [205]:
df['annual_income'] = df['annual_income'].apply(custom_to_numeric)
df['annual_income'].dtype

dtype('float64')

Let's now fill the values for monthly_inhand_salary column.  
Let's check how monthly_inhand_salary depend on annual_income. To do that let's see the proportion between those values

In [206]:
not_empty_montly_salary_index = df['monthly_inhand_salary'].dropna().index
proportion = df['annual_income'][not_empty_montly_salary_index] / df['monthly_inhand_salary'][not_empty_montly_salary_index]
proportion.describe()

count    84998.000000
mean        67.244578
std        846.095019
min          8.089821
25%         11.553886
50%         12.025936
75%         12.553691
max      54110.522117
dtype: float64

In [207]:
np.quantile(proportion, 0.1), np.quantile(proportion, 0.9)

(np.float64(10.884076704509429), np.float64(13.568605658582612))

I assume there are some extreme values of annual_income, because mean is 67, but as you can see most of the values are between 10 and 13 with median being around 12, which makes sence, because year has 12 months. So let's fill empty values of this column with **annual_income/12**

In [208]:
df['monthly_inhand_salary'] = df['monthly_inhand_salary'].fillna(df['annual_income']/12)

### num_of_loan
This column represents the number of loans taken from the bank.  
This column should be numerical, but it's an object. Let's fix it.

In [224]:
s = df['num_of_loan'].unique()
s.sort()
s

array(['-100', '0', '0_', '1', '100', '1001', '1002', '1006', '1008',
       '101', '1014', '1015', '1017', '1019', '1023', '1027_', '103',
       '1030', '1035', '1036', '1039', '1040', '1046', '1047', '1048',
       '1053', '1054', '1070', '1074', '1077', '1085', '1088', '1091',
       '1094', '1096', '1103', '1106', '1110', '1112', '1127', '1129',
       '1129_', '1131', '1131_', '1132_', '1135', '1137', '1150', '1151',
       '1152', '1154', '1159', '1160', '1171_', '1178', '1181', '1182',
       '1185_', '1187', '1189', '119', '1196', '1202', '1204', '1209',
       '1210', '1214', '1216', '1217', '1219_', '1222', '1225', '1225_',
       '1227', '1228', '123', '1236', '1241', '1257', '1259', '126',
       '1265', '1271', '1274', '1279', '1289', '1294', '1296', '1297',
       '1298', '1300', '1302', '1307', '1311_', '1312', '1313', '1318',
       '1319', '131_', '132', '1320', '1320_', '1329', '1340', '1345',
       '1347_', '1348', '1353', '1354', '1359', '136', '1363', '1365',
   

We once again see that some of the values have underscores in the end. We will cast this field to numeric using the same function

In [226]:
df['num_of_loan'] = df['num_of_loan'].apply(custom_to_numeric)
df['num_of_loan'].dtype

dtype('float64')

Also we see that num_of_loan can be negative, which is strange.   
Let's look at types of loans correspoding to num_of_loan == -100:

In [239]:
num_of_loans_check = df[df['num_of_loan'] == -100][['num_of_loan', 'customer_id', 'name']]
num_of_loans_check

Unnamed: 0,num_of_loan,customer_id,name
31,-100.0,CUS_0xb891,Jasond
34,-100.0,CUS_0x1cdb,Deepaa
39,-100.0,CUS_0x1cdb,Deepaa
53,-100.0,CUS_0x284a,Nadiaq
61,-100.0,CUS_0x5407,Annk
...,...,...,...
99877,-100.0,CUS_0x3855,Xolai
99901,-100.0,CUS_0x4986,Charles Abbotta
99902,-100.0,CUS_0x4986,Charles Abbotta
99969,-100.0,CUS_0xf16,Maria Sheahanb


Let's look at some of the clients with num_of_loan = -100:

In [242]:
df[df['customer_id'] == 'CUS_0x5407'][['month', 'type_of_loan', 'num_of_loan']]

Unnamed: 0,month,type_of_loan,num_of_loan
56,January,"Not Specified, Auto Loan, and Student Loan",3.0
57,February,"Not Specified, Auto Loan, and Student Loan",3.0
58,March,"Not Specified, Auto Loan, and Student Loan",3.0
59,April,"Not Specified, Auto Loan, and Student Loan",3.0
60,May,"Not Specified, Auto Loan, and Student Loan",3.0
61,June,"Not Specified, Auto Loan, and Student Loan",-100.0
62,July,"Not Specified, Auto Loan, and Student Loan",3.0
63,August,"Not Specified, Auto Loan, and Student Loan",3.0


In [243]:
df[df['customer_id'] == 'CUS_0x4986'][['month', 'type_of_loan', 'num_of_loan']]

Unnamed: 0,month,type_of_loan,num_of_loan
99896,January,Not Specified,1.0
99897,February,Not Specified,1.0
99898,March,Not Specified,1.0
99899,April,Not Specified,1.0
99900,May,Not Specified,1.0
99901,June,Not Specified,-100.0
99902,July,Not Specified,-100.0
99903,August,Not Specified,1.0


We can see that in those cases -100 is invalid.   
We don't want to check that number of loans if always the same for the client. Let's just make it an average in group

In [254]:
df.loc[df['num_of_loan'] == -100, 'num_of_loan'] = np.nan

In [256]:
df[['customer_id', 'num_of_loan', 'month_number']].sort_values('month_number')

Unnamed: 0,customer_id,num_of_loan,month_number
0,CUS_0xd40,4.0,1
36968,CUS_0x3dd5,6.0,1
36960,CUS_0x54bf,2.0,1
36952,CUS_0x48fe,0.0,1
36944,CUS_0x5e05,2.0,1
...,...,...,...
59239,CUS_0x885f,4.0,8
59231,CUS_0x7b40,5.0,8
59223,CUS_0x4d4,4.0,8
59287,CUS_0x242b,5.0,8


In [255]:
df['num_of_loan'] = df[['customer_id', 'num_of_loan', 'month_number']].sort_values('month_number').groupby('customer_id').fillna(method='ffill')
df[df['customer_id'] == 'CUS_0x4986'][['month', 'type_of_loan', 'num_of_loan']]

# groupby('customer_id').sortfillna(method='bfill')
# df['num_of_loan'] = df[['customer_id', 'num_of_loan']].groupby('customer_id').fillna(method='ffill')
# print(f'Number of empty names after fillna: {df["name"].isna().sum()}')
# df['num_of_loan']

  df['num_of_loan'] = df[['customer_id', 'num_of_loan', 'month_number']].sort_values('month_number').groupby('customer_id').fillna(method='ffill')
  df['num_of_loan'] = df[['customer_id', 'num_of_loan', 'month_number']].sort_values('month_number').groupby('customer_id').fillna(method='ffill')


ValueError: Columns must be same length as key

### type_of_loan

Let's look closer at this column, that represents the types of loan taken by a person. I assume we can fill empty values by using previuos data from the same client.



In [227]:
df[['customer_id', 'name']][df['type_of_loan'].isna()].head()

Unnamed: 0,customer_id,name
32,CUS_0x1cdb,Deepaa
33,CUS_0x1cdb,Deepaa
34,CUS_0x1cdb,Deepaa
35,CUS_0x1cdb,Deepaa
36,CUS_0x1cdb,Deepaa


##### If we look at Deepa's loans:

In [228]:
df[df['customer_id'] == 'CUS_0x1cdb'][['type_of_loan', 'num_of_loan']]

Unnamed: 0,type_of_loan,num_of_loan
32,,0.0
33,,0.0
34,,-100.0
35,,0.0
36,,0.0
37,,0.0
38,,0.0
39,,-100.0


We are not 100% sure that type_of_loan is NaN only in case num_of_loan = 0, because we see weird values in 

##### We want to turn some obviously numerical columns to float64 type

In [None]:
potential_numerical_columns = ['Monthly_Balance', 'Amount_invested_monthly', 'Credit_History_Age', 'Outstanding_Debt', 'Credit_Mix']

In [20]:
df['Annual_Income'].astype(np.float64)

ValueError: could not convert string to float: '34847.84_'