#### Training Set Preprocessing

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

In [45]:
# importing the dataset
dataset = pd.read_csv("../Data/credit_risk_dataset.csv")

#### Splitting the datasets

Before we start tampering with the dataset we first need to split the dataset into train and test sets in order to prevent data leakage.

In [46]:
y = dataset['loan_status']
X = dataset.drop(['loan_status'],axis=1)

In [47]:
print(y.shape,X.shape)

(32581,) (32581, 11)


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42, test_size=.2)

In [49]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(26064, 11) (26064,) (6517, 11) (6517,)


In [50]:
train_data = pd.concat([X_train,y_train],axis=1)
test_data = pd.concat([X_test,y_test],axis=1)

#### Handling missing values

The missing values are pretty less relative to the amount of data so simply imputing the data is fine. We shall impute the missing values with the respective median value of the column.

In [51]:
features_with_na = [feature for feature in train_data.columns if train_data[feature].isnull().sum()>0]
features_with_na

['person_emp_length', 'loan_int_rate']

In [52]:
def imputer(columns,dataset):
    for column in columns:
        median = dataset[column].median()
        dataset[column].fillna(median,inplace=True)
    return dataset

In [53]:
train_data = imputer(features_with_na,train_data)

In [54]:
train_data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [55]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32377,64,46000,RENT,2.0,PERSONAL,C,4800,11.09,0.1,Y,24,0
1338,26,26000,OWN,0.0,DEBTCONSOLIDATION,E,8500,16.45,0.33,N,3,1
7047,23,51000,MORTGAGE,3.0,PERSONAL,C,16000,13.11,0.31,Y,3,0
8225,22,56004,MORTGAGE,6.0,MEDICAL,A,6000,7.88,0.11,N,4,0
7178,24,79000,RENT,3.0,PERSONAL,C,7000,12.54,0.09,N,3,0


#### Handling outliers

In [56]:
numerical_features = [feature for feature in dataset.columns if dataset[feature].dtype != 'O' and feature != 'loan_status']
numerical_features

['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_cred_hist_length']

In [57]:
def detect_and_replace_outliers(features,dataset):
    dataframe = dataset.copy()
    for column in features:
        data = dataframe[column]
        data2 = sorted(data)
        q1 = np.percentile(data2,25)
        q3 = np.percentile(data2,75)
        IQR = q3-q1
        lower_bound = q1 - (1.5*IQR)
        upper_bound = q3 + (1.5*IQR)
        print(f'Column : {column}',np.round(lower_bound,5),np.round(upper_bound,5))
        values = []
        for i in data:
            if i<lower_bound or i>upper_bound:
                i = np.median(data)
                values.append(i)
                continue
            else:
                values.append(i)
        data = values
        dataframe[column] = data
    return dataframe

In [58]:
demo = detect_and_replace_outliers(numerical_features,train_data)

Column : person_age 12.5 40.5


Column : person_income -21750.0 140250.0
Column : person_emp_length -5.5 14.5
Column : loan_amnt -5875.0 23125.0
Column : loan_int_rate 1.56 20.04
Column : loan_percent_income -0.12 0.44
Column : cb_person_cred_hist_length -4.5 15.5


In [59]:
demo.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32377,26.0,46000.0,RENT,2.0,PERSONAL,C,4800.0,11.09,0.1,Y,4.0,0
1338,26.0,26000.0,OWN,0.0,DEBTCONSOLIDATION,E,8500.0,16.45,0.33,N,3.0,1
7047,23.0,51000.0,MORTGAGE,3.0,PERSONAL,C,16000.0,13.11,0.31,Y,3.0,0
8225,22.0,56004.0,MORTGAGE,6.0,MEDICAL,A,6000.0,7.88,0.11,N,4.0,0
7178,24.0,79000.0,RENT,3.0,PERSONAL,C,7000.0,12.54,0.09,N,3.0,0


In [60]:
demo.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,26.781768,58559.615216,4.404428,8648.812538,10.987761,0.163085,5.321017,0.217273
std,4.428679,26903.138347,3.353342,4880.118592,3.038411,0.094802,3.227007,0.412398
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,39000.0,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,29.0,74000.0,6.0,12000.0,13.11,0.22,7.0,0.0
max,40.0,140004.0,14.0,23100.0,20.03,0.44,15.0,1.0


In [61]:
train_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,27.764695,66171.84,4.765577,9601.07332,11.008203,0.170446,5.81672,0.217273
std,6.3925,63599.33,4.054371,6315.753396,3.071511,0.106991,4.054342,0.412398
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,39000.0,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,30.0,79500.0,7.0,12250.0,13.11,0.23,8.0,0.0
max,144.0,6000000.0,123.0,35000.0,22.48,0.78,30.0,1.0


We can clearly notice that outliers have been significantly removed. For visual representation one can plot boxplots to view the outliers.

In [62]:
train_data = demo.copy()

Adding income slabs for all the people

In [63]:
demo.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32377,26.0,46000.0,RENT,2.0,PERSONAL,C,4800.0,11.09,0.1,Y,4.0,0
1338,26.0,26000.0,OWN,0.0,DEBTCONSOLIDATION,E,8500.0,16.45,0.33,N,3.0,1
7047,23.0,51000.0,MORTGAGE,3.0,PERSONAL,C,16000.0,13.11,0.31,Y,3.0,0
8225,22.0,56004.0,MORTGAGE,6.0,MEDICAL,A,6000.0,7.88,0.11,N,4.0,0
7178,24.0,79000.0,RENT,3.0,PERSONAL,C,7000.0,12.54,0.09,N,3.0,0


In [64]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32377,26.0,46000.0,RENT,2.0,PERSONAL,C,4800.0,11.09,0.1,Y,4.0,0
1338,26.0,26000.0,OWN,0.0,DEBTCONSOLIDATION,E,8500.0,16.45,0.33,N,3.0,1
7047,23.0,51000.0,MORTGAGE,3.0,PERSONAL,C,16000.0,13.11,0.31,Y,3.0,0
8225,22.0,56004.0,MORTGAGE,6.0,MEDICAL,A,6000.0,7.88,0.11,N,4.0,0
7178,24.0,79000.0,RENT,3.0,PERSONAL,C,7000.0,12.54,0.09,N,3.0,0


In [65]:
q1 = np.percentile(train_data.person_income,33)
q2 = np.percentile(train_data.person_income,66)
income_slabs = []
for income in train_data.person_income.values:
    if income<=q1:
        income_slabs.append('lower')
    elif income>q1 and income<=q2:
        income_slabs.append('middle')
    else:
        income_slabs.append('upper')

In [66]:
train_data['income_slab'] = income_slabs

In [67]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
32377,26.0,46000.0,RENT,2.0,PERSONAL,C,4800.0,11.09,0.1,Y,4.0,0,middle
1338,26.0,26000.0,OWN,0.0,DEBTCONSOLIDATION,E,8500.0,16.45,0.33,N,3.0,1,lower
7047,23.0,51000.0,MORTGAGE,3.0,PERSONAL,C,16000.0,13.11,0.31,Y,3.0,0,middle
8225,22.0,56004.0,MORTGAGE,6.0,MEDICAL,A,6000.0,7.88,0.11,N,4.0,0,middle
7178,24.0,79000.0,RENT,3.0,PERSONAL,C,7000.0,12.54,0.09,N,3.0,0,upper


#### Log transformation of the non Gaussian distributions of numerical features

In [68]:
def log_transformation(features, dataset):
    data = dataset.copy()
    for feature in features:
        if 0 in data[feature].unique():
            pass
        else:
            data[feature] = np.log(dataset[feature])
    return data

In [69]:
demo = log_transformation(numerical_features,train_data)

In [70]:
demo.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,3.275042,10.867479,4.404428,8.881418,2.356117,0.163085,1.501581,0.217273
std,0.156509,0.487163,3.353342,0.65155,0.291264,0.094802,0.578727,0.412398
min,2.995732,8.29405,0.0,6.214608,1.690096,0.0,0.693147,0.0
25%,3.135494,10.571317,2.0,8.517193,2.138889,0.09,1.098612,0.0
50%,3.258097,10.915088,4.0,8.987197,2.396986,0.15,1.386294,0.0
75%,3.367296,11.21182,6.0,9.392662,2.573375,0.22,1.94591,0.0
max,3.688879,11.849426,14.0,10.047588,2.997231,0.44,2.70805,1.0


In [71]:
train_data = demo.copy()

In [72]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
32377,3.258097,10.736397,RENT,2.0,PERSONAL,C,8.476371,2.406044,0.1,Y,1.386294,0,middle
1338,3.258097,10.165852,OWN,0.0,DEBTCONSOLIDATION,E,9.047821,2.800325,0.33,N,1.098612,1,lower
7047,3.135494,10.839581,MORTGAGE,3.0,PERSONAL,C,9.680344,2.573375,0.31,Y,1.098612,0,middle
8225,3.091042,10.933178,MORTGAGE,6.0,MEDICAL,A,8.699515,2.064328,0.11,N,1.386294,0,middle
7178,3.178054,11.277203,RENT,3.0,PERSONAL,C,8.853665,2.528924,0.09,N,1.098612,0,upper


#### Label encoding the categorical features

In [73]:
from sklearn.preprocessing import LabelEncoder

In [74]:
LabelEncoder = LabelEncoder()

In [75]:
categorical_features = [feature for feature in train_data.columns if train_data[feature].dtype == 'O']

In [76]:
df = train_data.copy()

In [77]:
for feature in categorical_features:
    df[feature] = LabelEncoder.fit_transform(df[feature])

In [78]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
32377,3.258097,10.736397,3,2.0,4,2,8.476371,2.406044,0.1,1,1.386294,0,1
1338,3.258097,10.165852,2,0.0,0,4,9.047821,2.800325,0.33,0,1.098612,1,0
7047,3.135494,10.839581,0,3.0,4,2,9.680344,2.573375,0.31,1,1.098612,0,1
8225,3.091042,10.933178,0,6.0,3,0,8.699515,2.064328,0.11,0,1.386294,0,1
7178,3.178054,11.277203,3,3.0,4,2,8.853665,2.528924,0.09,0,1.098612,0,2


In [79]:
train_data = df.copy()

Scaling the features

In [80]:
from sklearn.preprocessing import StandardScaler

In [81]:
scaler = StandardScaler()

In [82]:
train_data_columns =  train_data.columns
train_data = pd.DataFrame(scaler.fit_transform(train_data),columns=train_data_columns)

In [83]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
0,-0.108271,-0.269079,0.924046,-0.717038,0.849923,0.674664,-0.621678,0.171418,-0.66545,2.174477,-0.19921,-0.526863,0.007903
1,-0.108271,-1.44026,0.226253,-1.313469,-1.459482,2.39492,0.255402,1.525138,1.760709,-0.459881,-0.696314,1.898027,-1.225607
2,-0.891641,-0.057268,-1.169333,-0.418822,0.849923,0.674664,1.226217,0.745931,1.549739,2.174477,-0.696314,-0.526863,0.007903
3,-1.175666,0.134863,-1.169333,0.475825,0.272572,-1.045591,-0.27919,-1.001823,-0.559965,-0.459881,-0.19921,-0.526863,0.007903
4,-0.619706,0.841057,0.924046,-0.418822,0.849923,0.674664,-0.042595,0.593311,-0.770936,-0.459881,-0.696314,-0.526863,1.241414


In [85]:
train_data.to_csv('../Data/data-created/train_data_preprocessed.csv',index=False)