In [1]:
import pandas as pd
import sys
import os

## 1. Load Dataset

In [2]:
DATAPATH = "Data/loan_data.csv"

df = pd.read_csv(DATAPATH)
df.columns

Index(['customer_id', 'age', 'gender', 'income', 'employment_type',
       'credit_score', 'loan_amount', 'loan_term', 'num_delinquencies',
       'default'],
      dtype='object')

The dataset contains customer information regarding their background and financial information including their credit score and associated loans. 

Default (binary 0 vs 1) where 1 = failed to pay loan when it is due, 0 = paid interest.

In [3]:
df

Unnamed: 0,customer_id,age,gender,income,employment_type,credit_score,loan_amount,loan_term,num_delinquencies,default
0,C00001,59,Female,37880,Self-employed,624,18486,60,0,0
1,C00002,49,Female,22965,Self-employed,618,4142,60,0,0
2,C00003,35,Female,51856,Salaried,770,9660,60,2,0
3,C00004,63,Male,51565,Salaried,619,19513,36,2,0
4,C00005,28,Male,44771,Unemployed,660,20154,24,0,0
...,...,...,...,...,...,...,...,...,...,...
9995,C09996,29,Male,58641,Salaried,712,15500,36,0,0
9996,C09997,37,Male,40758,Salaried,783,7690,36,2,0
9997,C09998,56,Male,78882,Self-employed,749,16465,36,0,0
9998,C09999,37,Female,60081,Unemployed,658,4920,36,1,0


In [4]:
df.dtypes

customer_id          object
age                   int64
gender               object
income                int64
employment_type      object
credit_score          int64
loan_amount           int64
loan_term             int64
num_delinquencies     int64
default               int64
dtype: object

#### Glimps at data

In [5]:
df['gender'].value_counts()

gender
Female    5005
Male      4995
Name: count, dtype: int64

In [6]:
df['employment_type'].value_counts()

employment_type
Salaried         5956
Self-employed    3059
Unemployed        985
Name: count, dtype: int64

There are few object data types containing categories such as gender and employment_type. 

Gender = F vs. M

Employment_type = Salaried vs. Self-employed vs. Unemployed

Checking for missing data

In [9]:
df.isna().sum()

customer_id          0
age                  0
gender               0
income               0
employment_type      0
credit_score         0
loan_amount          0
loan_term            0
num_delinquencies    0
default              0
credit_rank          0
dtype: int64

Descriptive stats about the data with integer values

In [10]:
df.describe(include=[int])

Unnamed: 0,age,income,credit_score,loan_amount,loan_term,num_delinquencies,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,42.4903,49960.914,679.5128,10136.3257,36.1644,0.9977,0.082
std,12.578375,14944.638332,49.778034,4870.921073,13.248099,0.992569,0.274378
min,21.0,-1297.0,499.0,1000.0,12.0,0.0,0.0
25%,32.0,39825.5,646.0,6691.75,24.0,0.0,0.0
50%,42.5,49914.0,679.0,10037.5,36.0,1.0,0.0
75%,53.0,60011.25,713.0,13440.5,48.0,2.0,0.0
max,64.0,112218.0,884.0,27376.0,60.0,7.0,1.0


## 2. Data Preprocessing

#### 2a. Segmentation

In [11]:
#credit score bins divided into ranking
credit_score_bin = [300, 579, 580, 669, 670, 739, 740, 799, 800, 900]
df['credit_rank'] = pd.cut(df['credit_score'], bins=5, labels=[5, 4, 3, 2, 1])
df

Unnamed: 0,customer_id,age,gender,income,employment_type,credit_score,loan_amount,loan_term,num_delinquencies,default,credit_rank
0,C00001,59,Female,37880,Self-employed,624,18486,60,0,0,4
1,C00002,49,Female,22965,Self-employed,618,4142,60,0,0,4
2,C00003,35,Female,51856,Salaried,770,9660,60,2,0,2
3,C00004,63,Male,51565,Salaried,619,19513,36,2,0,4
4,C00005,28,Male,44771,Unemployed,660,20154,24,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...
9995,C09996,29,Male,58641,Salaried,712,15500,36,0,0,3
9996,C09997,37,Male,40758,Salaried,783,7690,36,2,0,2
9997,C09998,56,Male,78882,Self-employed,749,16465,36,0,0,2
9998,C09999,37,Female,60081,Unemployed,658,4920,36,1,0,3


Common FICO scores bin:

Exceptional/Excellent (1): 800-850 | Very Good (2): 740-799 | Good (3): 670-739 | Fair (4): 580-669 | Poor (5): 300-579 

ranking 1 - 5. where 1 is highest and 5 is lowest

## 3. Exploratory Data Analysis (EDA)

Find patterns that related to Default