#### Training Set Preprocessing

In [204]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

In [205]:
# importing the dataset
dataset = pd.read_csv("../Data/credit_risk_dataset.csv")

#### Splitting the datasets

Before we start tampering with the dataset we first need to split the dataset into train and test sets in order to prevent data leakage.

In [206]:
y = dataset['loan_status']
X = dataset.drop(['loan_status'],axis=1)

In [207]:
print(y.shape,X.shape)

(32581,) (32581, 11)


In [208]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42, test_size=.2, stratify=y)

In [209]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(26064, 11) (26064,) (6517, 11) (6517,)


In [210]:
train_data = pd.concat([X_train,y_train],axis=1)
test_data = pd.concat([X_test,y_test],axis=1)

#### Handling missing values

The missing values are pretty less relative to the amount of data so simply imputing the data is fine. We shall impute the missing values with the respective median value of the column.

In [211]:
def features_with_na(dataset):
    data = dataset.copy()
    na_features= [feature for feature in dataset.columns if dataset[feature].isnull().sum()>0]
    return na_features

In [212]:

na_features = features_with_na(train_data)
na_features

['person_emp_length', 'loan_int_rate']

In [213]:
def imputer(columns,dataset):
    for column in columns:
        median = dataset[column].median()
        dataset[column].fillna(median,inplace=True)
    return dataset

In [214]:
train_data = imputer(na_features,train_data)

In [215]:
train_data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [216]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
15884,25,241875,MORTGAGE,4.0,EDUCATION,A,16000,7.05,0.07,N,4,0
15138,21,18000,RENT,5.0,PERSONAL,B,1500,12.18,0.08,N,4,1
7474,25,53000,MORTGAGE,10.0,MEDICAL,B,16000,12.53,0.3,N,2,0
18212,28,16800,OWN,4.0,MEDICAL,C,5000,13.98,0.3,N,8,1
6493,25,50000,MORTGAGE,2.0,VENTURE,A,10000,7.9,0.2,N,2,0


#### Handling outliers

In [217]:
def find_numerical_features(dataset):
    numerical_features = [feature for feature in dataset.columns if dataset[feature].dtype != 'O' and feature != 'loan_status']
    return numerical_features

In [218]:

numerical_features = find_numerical_features(train_data)
numerical_features

['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_cred_hist_length']

In [219]:
def detect_and_replace_outliers(features,dataset):
    dataframe = dataset.copy()
    for column in features:
        data = dataframe[column]
        data2 = sorted(data)
        q1 = np.percentile(data2,25)
        q3 = np.percentile(data2,75)
        IQR = q3-q1
        lower_bound = q1 - (1.5*IQR)
        upper_bound = q3 + (1.5*IQR)
        print(f'Column : {column}',np.round(lower_bound,5),np.round(upper_bound,5))
        values = []
        for i in data:
            if i<lower_bound or i>upper_bound:
                i = np.median(data)
                values.append(i)
                continue
            else:
                values.append(i)
        data = values
        dataframe[column] = data
    return dataframe

In [220]:
demo = detect_and_replace_outliers(numerical_features,train_data)

Column : person_age 12.5 40.5


Column : person_income -22391.875 139835.125
Column : person_emp_length -5.5 14.5
Column : loan_amnt -5500.0 22500.0
Column : loan_int_rate 1.56 20.04
Column : loan_percent_income -0.12 0.44
Column : cb_person_cred_hist_length -4.5 15.5


In [221]:
demo.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,26.78008,58144.440493,4.401473,8601.489603,10.993925,0.162853,5.297153,0.218155
std,4.439239,26371.744327,3.341072,4826.13077,3.045156,0.094574,3.212842,0.413001
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,38443.25,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,29.0,73000.0,6.0,12000.0,13.11,0.22,7.0,0.0
max,40.0,139500.0,14.0,22500.0,20.03,0.44,15.0,1.0


In [222]:
train_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,27.702732,66043.5,4.772099,9569.114487,11.013171,0.170179,5.786641,0.218155
std,6.327287,63716.0,4.115082,6295.931312,3.076108,0.106807,4.034795,0.413001
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,38443.25,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,30.0,79000.0,7.0,12000.0,13.11,0.23,8.0,0.0
max,144.0,6000000.0,123.0,35000.0,22.48,0.83,30.0,1.0


We can clearly notice that outliers have been significantly removed. For visual representation one can plot boxplots to view the outliers.

In [223]:
train_data = demo.copy()

Adding income slabs for all the people

In [224]:
demo.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
15884,25.0,55000.0,MORTGAGE,4.0,EDUCATION,A,16000.0,7.05,0.07,N,4.0,0
15138,21.0,18000.0,RENT,5.0,PERSONAL,B,1500.0,12.18,0.08,N,4.0,1
7474,25.0,53000.0,MORTGAGE,10.0,MEDICAL,B,16000.0,12.53,0.3,N,2.0,0
18212,28.0,16800.0,OWN,4.0,MEDICAL,C,5000.0,13.98,0.3,N,8.0,1
6493,25.0,50000.0,MORTGAGE,2.0,VENTURE,A,10000.0,7.9,0.2,N,2.0,0


In [225]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
15884,25.0,55000.0,MORTGAGE,4.0,EDUCATION,A,16000.0,7.05,0.07,N,4.0,0
15138,21.0,18000.0,RENT,5.0,PERSONAL,B,1500.0,12.18,0.08,N,4.0,1
7474,25.0,53000.0,MORTGAGE,10.0,MEDICAL,B,16000.0,12.53,0.3,N,2.0,0
18212,28.0,16800.0,OWN,4.0,MEDICAL,C,5000.0,13.98,0.3,N,8.0,1
6493,25.0,50000.0,MORTGAGE,2.0,VENTURE,A,10000.0,7.9,0.2,N,2.0,0


In [226]:
# function to add income slabs in the dataset
def add_income_slab(dataset):
    q1 = np.percentile(dataset.person_income,33)
    q2 = np.percentile(dataset.person_income,66)
    income_slabs = []
    for income in dataset.person_income.values:
        if income<=q1:
            income_slabs.append('lower')
        elif income>q1 and income<=q2:
            income_slabs.append('middle')
        else:
            income_slabs.append('upper')
    dataset['income_slab'] = income_slabs
    return dataset

In [227]:
train_data = add_income_slab(train_data)
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
15884,25.0,55000.0,MORTGAGE,4.0,EDUCATION,A,16000.0,7.05,0.07,N,4.0,0,middle
15138,21.0,18000.0,RENT,5.0,PERSONAL,B,1500.0,12.18,0.08,N,4.0,1,lower
7474,25.0,53000.0,MORTGAGE,10.0,MEDICAL,B,16000.0,12.53,0.3,N,2.0,0,middle
18212,28.0,16800.0,OWN,4.0,MEDICAL,C,5000.0,13.98,0.3,N,8.0,1,lower
6493,25.0,50000.0,MORTGAGE,2.0,VENTURE,A,10000.0,7.9,0.2,N,2.0,0,middle


#### Log transformation of the non Gaussian distributions of numerical features

In [228]:
def log_transformation(features, dataset):
    data = dataset.copy()
    for feature in features:
        if 0 in data[feature].unique():
            pass
        else:
            data[feature] = np.log(dataset[feature])
    return data

In [229]:
demo = log_transformation(numerical_features,train_data)

In [230]:
demo.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,3.274919,10.86234,4.401473,8.877669,2.356499,0.162853,1.497878,0.218155
std,0.156864,0.48366,3.341072,0.648724,0.291991,0.094574,0.576739,0.413001
min,2.995732,8.29405,0.0,6.214608,1.690096,0.0,0.693147,0.0
25%,3.135494,10.556938,2.0,8.517193,2.138889,0.09,1.098612,0.0
50%,3.258097,10.915088,4.0,8.987197,2.396986,0.15,1.386294,0.0
75%,3.367296,11.198215,6.0,9.392662,2.573375,0.22,1.94591,0.0
max,3.688879,11.84582,14.0,10.021271,2.997231,0.44,2.70805,1.0


In [231]:
train_data = demo.copy()

In [232]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
15884,3.218876,10.915088,MORTGAGE,4.0,EDUCATION,A,9.680344,1.953028,0.07,N,1.386294,0,middle
15138,3.044522,9.798127,RENT,5.0,PERSONAL,B,7.31322,2.499795,0.08,N,1.386294,1,lower
7474,3.218876,10.878047,MORTGAGE,10.0,MEDICAL,B,9.680344,2.528126,0.3,N,0.693147,0,middle
18212,3.332205,9.729134,OWN,4.0,MEDICAL,C,8.517193,2.637628,0.3,N,2.079442,1,lower
6493,3.218876,10.819778,MORTGAGE,2.0,VENTURE,A,9.21034,2.066863,0.2,N,0.693147,0,middle


#### Label encoding the categorical features

In [233]:
from sklearn.preprocessing import LabelEncoder

In [234]:
LabelEncoder = LabelEncoder()

In [235]:
def find_categorical_features(dataset):
    categorical_features = [feature for feature in dataset.columns if dataset[feature].dtype == 'O']
    return categorical_features

In [236]:
categorical_features = find_categorical_features(train_data)
categorical_features

['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file',
 'income_slab']

In [237]:
df = train_data.copy()

In [238]:
for feature in categorical_features:
    df[feature] = LabelEncoder.fit_transform(df[feature])

In [239]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
15884,3.218876,10.915088,0,4.0,1,0,9.680344,1.953028,0.07,0,1.386294,0,1
15138,3.044522,9.798127,3,5.0,4,1,7.31322,2.499795,0.08,0,1.386294,1,0
7474,3.218876,10.878047,0,10.0,3,1,9.680344,2.528126,0.3,0,0.693147,0,1
18212,3.332205,9.729134,2,4.0,3,2,8.517193,2.637628,0.3,0,2.079442,1,0
6493,3.218876,10.819778,0,2.0,5,0,9.21034,2.066863,0.2,0,0.693147,0,1


In [240]:
train_data = df.copy()

Scaling the features

In [241]:
from sklearn.preprocessing import StandardScaler

In [242]:
scaler = StandardScaler()

In [243]:
train_data_columns =  train_data.columns
y = train_data.iloc[:,-2]
df = train_data.drop('loan_status',axis=1)

In [244]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,income_slab
15884,3.218876,10.915088,0,4.0,1,0,9.680344,1.953028,0.07,0,1.386294,1
15138,3.044522,9.798127,3,5.0,4,1,7.31322,2.499795,0.08,0,1.386294,0
7474,3.218876,10.878047,0,10.0,3,1,9.680344,2.528126,0.3,0,0.693147,1
18212,3.332205,9.729134,2,4.0,3,2,8.517193,2.637628,0.3,0,2.079442,0
6493,3.218876,10.819778,0,2.0,5,0,9.21034,2.066863,0.2,0,0.693147,1


In [245]:
df = pd.DataFrame(scaler.fit_transform(df),index=train_data.index)

In [246]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
15884,-0.357282,0.109064,-1.175573,-0.120165,-0.885275,-1.046681,1.237338,-1.381823,-0.981828,-0.463611,-0.193477,-0.007246
15138,-1.468799,-2.200374,0.920052,0.179146,0.848642,-0.187122,-2.411624,0.490766,-0.876088,-0.463611,-0.193477,-1.225624
7474,-0.357282,0.032477,-1.175573,1.6757,0.270669,-0.187122,1.237338,0.587793,1.450183,-0.463611,-1.395338,-0.007246
18212,0.365197,-2.343024,0.22151,-0.120165,0.270669,0.672438,-0.555679,0.962819,1.450183,-0.463611,1.008383,-1.225624
6493,-0.357282,-0.088,-1.175573,-0.718787,1.426614,-1.046681,0.512819,-0.991957,0.392787,-0.463611,-1.395338,-0.007246


In [247]:
df.insert(11, 'loan_status', y)

In [248]:
df.columns = train_data_columns

In [249]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
15884,-0.357282,0.109064,-1.175573,-0.120165,-0.885275,-1.046681,1.237338,-1.381823,-0.981828,-0.463611,-0.193477,0,-0.007246
15138,-1.468799,-2.200374,0.920052,0.179146,0.848642,-0.187122,-2.411624,0.490766,-0.876088,-0.463611,-0.193477,1,-1.225624
7474,-0.357282,0.032477,-1.175573,1.6757,0.270669,-0.187122,1.237338,0.587793,1.450183,-0.463611,-1.395338,0,-0.007246
18212,0.365197,-2.343024,0.22151,-0.120165,0.270669,0.672438,-0.555679,0.962819,1.450183,-0.463611,1.008383,1,-1.225624
6493,-0.357282,-0.088,-1.175573,-0.718787,1.426614,-1.046681,0.512819,-0.991957,0.392787,-0.463611,-1.395338,0,-0.007246


In [250]:
train_data = df.copy()

In [251]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
15884,-0.357282,0.109064,-1.175573,-0.120165,-0.885275,-1.046681,1.237338,-1.381823,-0.981828,-0.463611,-0.193477,0,-0.007246
15138,-1.468799,-2.200374,0.920052,0.179146,0.848642,-0.187122,-2.411624,0.490766,-0.876088,-0.463611,-0.193477,1,-1.225624
7474,-0.357282,0.032477,-1.175573,1.6757,0.270669,-0.187122,1.237338,0.587793,1.450183,-0.463611,-1.395338,0,-0.007246
18212,0.365197,-2.343024,0.22151,-0.120165,0.270669,0.672438,-0.555679,0.962819,1.450183,-0.463611,1.008383,1,-1.225624
6493,-0.357282,-0.088,-1.175573,-0.718787,1.426614,-1.046681,0.512819,-0.991957,0.392787,-0.463611,-1.395338,0,-0.007246


In [252]:
train_data.to_csv('../Data/data-created/train_data_preprocessed.csv',index=False)

#### Test Data Preprocessing

In [253]:
test_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
6616,22,50000,RENT,6.0,PERSONAL,B,6000,11.89,0.12,N,2,0
21802,32,52000,RENT,0.0,PERSONAL,A,7125,7.49,0.14,N,10,0
31886,37,205000,MORTGAGE,3.0,EDUCATION,B,18000,10.39,0.09,N,14,0
17799,26,100000,MORTGAGE,5.0,DEBTCONSOLIDATION,B,24000,10.99,0.24,N,4,0
5207,23,44340,OWN,0.0,VENTURE,C,3000,12.98,0.07,Y,3,0


In [254]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6517 entries, 6616 to 9938
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  6517 non-null   int64  
 1   person_income               6517 non-null   int64  
 2   person_home_ownership       6517 non-null   object 
 3   person_emp_length           6360 non-null   float64
 4   loan_intent                 6517 non-null   object 
 5   loan_grade                  6517 non-null   object 
 6   loan_amnt                   6517 non-null   int64  
 7   loan_int_rate               5902 non-null   float64
 8   loan_percent_income         6517 non-null   float64
 9   cb_person_default_on_file   6517 non-null   object 
 10  cb_person_cred_hist_length  6517 non-null   int64  
 11  loan_status                 6517 non-null   int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 661.9+ KB


In [255]:
test_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,6517.0,6517.0,6360.0,6517.0,5902.0,6517.0,6517.0,6517.0
mean,27.862053,66200.23,4.770126,9670.385147,10.995983,0.170302,5.874482,0.218199
std,6.42947,54508.96,4.021876,6425.478279,3.261444,0.106687,4.134393,0.413055
min,20.0,5500.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,39000.0,2.0,5000.0,7.9,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,30.0,80000.0,7.0,12500.0,13.47,0.23,8.0,0.0
max,80.0,1900000.0,41.0,35000.0,23.22,0.76,30.0,1.0


In [256]:
test_data.isna().sum()

person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length             157
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 615
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
loan_status                     0
dtype: int64

In [257]:
na_features = features_with_na(test_data)
na_features

['person_emp_length', 'loan_int_rate']

In [258]:
test_data = imputer(na_features,test_data)

In [259]:
test_data.isna().all()

person_age                    False
person_income                 False
person_home_ownership         False
person_emp_length             False
loan_intent                   False
loan_grade                    False
loan_amnt                     False
loan_int_rate                 False
loan_percent_income           False
cb_person_default_on_file     False
cb_person_cred_hist_length    False
loan_status                   False
dtype: bool

In [260]:
numerical_features = find_numerical_features(test_data)

In [261]:
test_data = detect_and_replace_outliers(numerical_features,test_data)

Column : person_age 12.5 40.5
Column : person_income -22500.0 141500.0
Column : person_emp_length -5.5 14.5
Column : loan_amnt -6250.0 23750.0
Column : loan_int_rate 1.56 20.04
Column : loan_percent_income -0.12 0.44
Column : cb_person_cred_hist_length -4.5 15.5


In [262]:
test_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0
mean,26.758324,58903.567439,4.430566,8651.357987,10.96607,0.163655,5.324843,0.218199
std,4.388485,27221.510331,3.394599,4912.510834,3.055314,0.095565,3.230203,0.413055
min,20.0,5500.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,39000.0,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,29.0,75000.0,6.0,12000.0,13.06,0.22,7.0,0.0
max,40.0,141000.0,14.0,23750.0,20.03,0.44,15.0,1.0


In [263]:
test_data = add_income_slab(test_data)

In [264]:
test_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
6616,22.0,50000.0,RENT,6.0,PERSONAL,B,6000.0,11.89,0.12,N,2.0,0,middle
21802,32.0,52000.0,RENT,0.0,PERSONAL,A,7125.0,7.49,0.14,N,10.0,0,middle
31886,37.0,55000.0,MORTGAGE,3.0,EDUCATION,B,18000.0,10.39,0.09,N,14.0,0,middle
17799,26.0,100000.0,MORTGAGE,5.0,DEBTCONSOLIDATION,B,8000.0,10.99,0.24,N,4.0,0,upper
5207,23.0,44340.0,OWN,0.0,VENTURE,C,3000.0,12.98,0.07,Y,3.0,0,middle


In [265]:
test_data = log_transformation(numerical_features,test_data)

In [266]:
test_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
6616,3.091042,10.819778,RENT,6.0,PERSONAL,B,8.699515,2.475698,0.12,N,0.693147,0,middle
21802,3.465736,10.858999,RENT,0.0,PERSONAL,A,8.871365,2.013569,0.14,N,2.302585,0,middle
31886,3.610918,10.915088,MORTGAGE,3.0,EDUCATION,B,9.798127,2.340844,0.09,N,2.639057,0,middle
17799,3.258097,11.512925,MORTGAGE,5.0,DEBTCONSOLIDATION,B,8.987197,2.396986,0.24,N,1.386294,0,upper
5207,3.135494,10.699642,OWN,0.0,VENTURE,C,8.006368,2.56341,0.07,Y,1.098612,0,middle


In [267]:
test_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0
mean,3.274388,10.872548,4.430566,8.881329,2.353685,0.163655,1.501592,0.218199
std,0.155108,0.487374,3.394599,0.650128,0.292675,0.095565,0.580403,0.413055
min,2.995732,8.612503,0.0,6.214608,1.690096,0.0,0.693147,0.0
25%,3.135494,10.571317,2.0,8.517193,2.138889,0.09,1.098612,0.0
50%,3.258097,10.915088,4.0,8.987197,2.396986,0.15,1.386294,0.0
75%,3.367296,11.225243,6.0,9.392662,2.569554,0.22,1.94591,0.0
max,3.688879,11.856515,14.0,10.075338,2.997231,0.44,2.70805,1.0


In [268]:
for feature in categorical_features:
    test_data[feature] = LabelEncoder.fit_transform(test_data[feature])

In [269]:
test_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
6616,3.091042,10.819778,3,6.0,4,1,8.699515,2.475698,0.12,0,0.693147,0,1
21802,3.465736,10.858999,3,0.0,4,0,8.871365,2.013569,0.14,0,2.302585,0,1
31886,3.610918,10.915088,0,3.0,1,1,9.798127,2.340844,0.09,0,2.639057,0,1
17799,3.258097,11.512925,0,5.0,0,1,8.987197,2.396986,0.24,0,1.386294,0,2
5207,3.135494,10.699642,2,0.0,5,2,8.006368,2.56341,0.07,1,1.098612,0,1


In [270]:
test_data_columns =  test_data.columns
y = test_data.iloc[:,-2]
df = test_data.drop('loan_status',axis=1)

In [271]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,income_slab
6616,3.091042,10.819778,3,6.0,4,1,8.699515,2.475698,0.12,0,0.693147,1
21802,3.465736,10.858999,3,0.0,4,0,8.871365,2.013569,0.14,0,2.302585,1
31886,3.610918,10.915088,0,3.0,1,1,9.798127,2.340844,0.09,0,2.639057,1
17799,3.258097,11.512925,0,5.0,0,1,8.987197,2.396986,0.24,0,1.386294,2
5207,3.135494,10.699642,2,0.0,5,2,8.006368,2.56341,0.07,1,1.098612,1


In [272]:
df = pd.DataFrame(scaler.fit_transform(df),index=df.index)

In [273]:
df.insert(11, 'loan_status', y)

In [274]:
df.columns = test_data_columns

In [275]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
6616,-1.182142,-0.108282,0.938524,0.462368,0.840102,-0.186927,-0.279681,0.416919,-0.456846,-0.458981,-1.39301,0,-0.004331
21802,1.233738,-0.027803,0.938524,-1.305281,0.840102,-1.035847,-0.015327,-1.162185,-0.247548,-0.458981,1.380169,0,-0.004331
31886,2.169816,0.087291,-1.146362,-0.421457,-0.88901,-0.186927,1.41029,-0.04388,-0.770793,-0.458981,1.959935,0,-0.004331
17799,-0.105041,1.314036,-1.146362,0.16776,-1.46538,-0.186927,0.162854,0.147958,0.798942,-0.458981,-0.198667,0,1.222923
5207,-0.895534,-0.354798,0.243562,-1.305281,1.416473,0.661994,-1.345933,0.716632,-0.980091,2.178741,-0.694364,0,-0.004331


In [276]:
test_data = df.copy()

In [277]:
test_data.isna().any().sum()

0

In [278]:
test_data.to_csv('../Data/data-created/test_data_preprocessed.csv',index=False)

The following cells tell us the proportion of the labels in both the training and testing sets.

In [283]:
train_data.groupby(train_data['loan_status'])['loan_status'].value_counts()

loan_status
0    20378
1     5686
Name: count, dtype: int64

In [284]:
test_data.groupby(test_data['loan_status'])['loan_status'].value_counts()

loan_status
0    5095
1    1422
Name: count, dtype: int64

As we can see that the proportion is constant across both the sets. This is the result of the 'stratify' parameter in the train-test-split function. This parameter presserves the proportion which is present in the original dataset.