#### Training Set Preprocessing

In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

In [126]:
# importing the dataset
dataset = pd.read_csv("../Data/credit_risk_dataset.csv")

#### Splitting the datasets

Before we start tampering with the dataset we first need to split the dataset into train and test sets in order to prevent data leakage.

In [127]:
y = dataset['loan_status']
X = dataset.drop(['loan_status'],axis=1)

In [128]:
print(y.shape,X.shape)

(32581,) (32581, 11)


In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42, test_size=.2)

In [130]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(26064, 11) (26064,) (6517, 11) (6517,)


In [131]:
train_data = pd.concat([X_train,y_train],axis=1)
test_data = pd.concat([X_test,y_test],axis=1)

#### Handling missing values

The missing values are pretty less relative to the amount of data so simply imputing the data is fine. We shall impute the missing values with the respective median value of the column.

In [132]:
def features_with_na(dataset):
    data = dataset.copy()
    na_features= [feature for feature in dataset.columns if dataset[feature].isnull().sum()>0]
    return na_features

In [133]:

na_features = features_with_na(train_data)
na_features

['person_emp_length', 'loan_int_rate']

In [134]:
def imputer(columns,dataset):
    for column in columns:
        median = dataset[column].median()
        dataset[column].fillna(median,inplace=True)
    return dataset

In [135]:
train_data = imputer(na_features,train_data)

In [136]:
train_data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [137]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32377,64,46000,RENT,2.0,PERSONAL,C,4800,11.09,0.1,Y,24,0
1338,26,26000,OWN,0.0,DEBTCONSOLIDATION,E,8500,16.45,0.33,N,3,1
7047,23,51000,MORTGAGE,3.0,PERSONAL,C,16000,13.11,0.31,Y,3,0
8225,22,56004,MORTGAGE,6.0,MEDICAL,A,6000,7.88,0.11,N,4,0
7178,24,79000,RENT,3.0,PERSONAL,C,7000,12.54,0.09,N,3,0


#### Handling outliers

In [138]:
def find_numerical_features(dataset):
    numerical_features = [feature for feature in dataset.columns if dataset[feature].dtype != 'O' and feature != 'loan_status']
    return numerical_features

In [139]:

numerical_features = find_numerical_features(train_data)
numerical_features

['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_cred_hist_length']

In [140]:
def detect_and_replace_outliers(features,dataset):
    dataframe = dataset.copy()
    for column in features:
        data = dataframe[column]
        data2 = sorted(data)
        q1 = np.percentile(data2,25)
        q3 = np.percentile(data2,75)
        IQR = q3-q1
        lower_bound = q1 - (1.5*IQR)
        upper_bound = q3 + (1.5*IQR)
        print(f'Column : {column}',np.round(lower_bound,5),np.round(upper_bound,5))
        values = []
        for i in data:
            if i<lower_bound or i>upper_bound:
                i = np.median(data)
                values.append(i)
                continue
            else:
                values.append(i)
        data = values
        dataframe[column] = data
    return dataframe

In [141]:
demo = detect_and_replace_outliers(numerical_features,train_data)

Column : person_age 12.5 40.5


Column : person_income -21750.0 140250.0
Column : person_emp_length -5.5 14.5
Column : loan_amnt -5875.0 23125.0
Column : loan_int_rate 1.56 20.04
Column : loan_percent_income -0.12 0.44
Column : cb_person_cred_hist_length -4.5 15.5


In [142]:
demo.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,26.781768,58559.615216,4.404428,8648.812538,10.987761,0.163085,5.321017,0.217273
std,4.428679,26903.138347,3.353342,4880.118592,3.038411,0.094802,3.227007,0.412398
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,39000.0,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,29.0,74000.0,6.0,12000.0,13.11,0.22,7.0,0.0
max,40.0,140004.0,14.0,23100.0,20.03,0.44,15.0,1.0


In [143]:
train_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,27.764695,66171.84,4.765577,9601.07332,11.008203,0.170446,5.81672,0.217273
std,6.3925,63599.33,4.054371,6315.753396,3.071511,0.106991,4.054342,0.412398
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,39000.0,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,30.0,79500.0,7.0,12250.0,13.11,0.23,8.0,0.0
max,144.0,6000000.0,123.0,35000.0,22.48,0.78,30.0,1.0


We can clearly notice that outliers have been significantly removed. For visual representation one can plot boxplots to view the outliers.

In [144]:
train_data = demo.copy()

Adding income slabs for all the people

In [145]:
demo.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32377,26.0,46000.0,RENT,2.0,PERSONAL,C,4800.0,11.09,0.1,Y,4.0,0
1338,26.0,26000.0,OWN,0.0,DEBTCONSOLIDATION,E,8500.0,16.45,0.33,N,3.0,1
7047,23.0,51000.0,MORTGAGE,3.0,PERSONAL,C,16000.0,13.11,0.31,Y,3.0,0
8225,22.0,56004.0,MORTGAGE,6.0,MEDICAL,A,6000.0,7.88,0.11,N,4.0,0
7178,24.0,79000.0,RENT,3.0,PERSONAL,C,7000.0,12.54,0.09,N,3.0,0


In [146]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32377,26.0,46000.0,RENT,2.0,PERSONAL,C,4800.0,11.09,0.1,Y,4.0,0
1338,26.0,26000.0,OWN,0.0,DEBTCONSOLIDATION,E,8500.0,16.45,0.33,N,3.0,1
7047,23.0,51000.0,MORTGAGE,3.0,PERSONAL,C,16000.0,13.11,0.31,Y,3.0,0
8225,22.0,56004.0,MORTGAGE,6.0,MEDICAL,A,6000.0,7.88,0.11,N,4.0,0
7178,24.0,79000.0,RENT,3.0,PERSONAL,C,7000.0,12.54,0.09,N,3.0,0


In [147]:
# function to add income slabs in the dataset
def add_income_slab(dataset):
    q1 = np.percentile(dataset.person_income,33)
    q2 = np.percentile(dataset.person_income,66)
    income_slabs = []
    for income in dataset.person_income.values:
        if income<=q1:
            income_slabs.append('lower')
        elif income>q1 and income<=q2:
            income_slabs.append('middle')
        else:
            income_slabs.append('upper')
    dataset['income_slab'] = income_slabs
    return dataset

In [148]:
train_data = add_income_slab(train_data)
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
32377,26.0,46000.0,RENT,2.0,PERSONAL,C,4800.0,11.09,0.1,Y,4.0,0,middle
1338,26.0,26000.0,OWN,0.0,DEBTCONSOLIDATION,E,8500.0,16.45,0.33,N,3.0,1,lower
7047,23.0,51000.0,MORTGAGE,3.0,PERSONAL,C,16000.0,13.11,0.31,Y,3.0,0,middle
8225,22.0,56004.0,MORTGAGE,6.0,MEDICAL,A,6000.0,7.88,0.11,N,4.0,0,middle
7178,24.0,79000.0,RENT,3.0,PERSONAL,C,7000.0,12.54,0.09,N,3.0,0,upper


#### Log transformation of the non Gaussian distributions of numerical features

In [149]:
def log_transformation(features, dataset):
    data = dataset.copy()
    for feature in features:
        if 0 in data[feature].unique():
            pass
        else:
            data[feature] = np.log(dataset[feature])
    return data

In [150]:
demo = log_transformation(numerical_features,train_data)

In [151]:
demo.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,3.275042,10.867479,4.404428,8.881418,2.356117,0.163085,1.501581,0.217273
std,0.156509,0.487163,3.353342,0.65155,0.291264,0.094802,0.578727,0.412398
min,2.995732,8.29405,0.0,6.214608,1.690096,0.0,0.693147,0.0
25%,3.135494,10.571317,2.0,8.517193,2.138889,0.09,1.098612,0.0
50%,3.258097,10.915088,4.0,8.987197,2.396986,0.15,1.386294,0.0
75%,3.367296,11.21182,6.0,9.392662,2.573375,0.22,1.94591,0.0
max,3.688879,11.849426,14.0,10.047588,2.997231,0.44,2.70805,1.0


In [152]:
train_data = demo.copy()

In [153]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
32377,3.258097,10.736397,RENT,2.0,PERSONAL,C,8.476371,2.406044,0.1,Y,1.386294,0,middle
1338,3.258097,10.165852,OWN,0.0,DEBTCONSOLIDATION,E,9.047821,2.800325,0.33,N,1.098612,1,lower
7047,3.135494,10.839581,MORTGAGE,3.0,PERSONAL,C,9.680344,2.573375,0.31,Y,1.098612,0,middle
8225,3.091042,10.933178,MORTGAGE,6.0,MEDICAL,A,8.699515,2.064328,0.11,N,1.386294,0,middle
7178,3.178054,11.277203,RENT,3.0,PERSONAL,C,8.853665,2.528924,0.09,N,1.098612,0,upper


#### Label encoding the categorical features

In [154]:
from sklearn.preprocessing import LabelEncoder

In [155]:
LabelEncoder = LabelEncoder()

In [156]:
def find_categorical_features(dataset):
    categorical_features = [feature for feature in dataset.columns if dataset[feature].dtype == 'O']
    return categorical_features

In [157]:
categorical_features = find_categorical_features(train_data)
categorical_features

['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file',
 'income_slab']

In [158]:
df = train_data.copy()

In [159]:
for feature in categorical_features:
    df[feature] = LabelEncoder.fit_transform(df[feature])

In [160]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
32377,3.258097,10.736397,3,2.0,4,2,8.476371,2.406044,0.1,1,1.386294,0,1
1338,3.258097,10.165852,2,0.0,0,4,9.047821,2.800325,0.33,0,1.098612,1,0
7047,3.135494,10.839581,0,3.0,4,2,9.680344,2.573375,0.31,1,1.098612,0,1
8225,3.091042,10.933178,0,6.0,3,0,8.699515,2.064328,0.11,0,1.386294,0,1
7178,3.178054,11.277203,3,3.0,4,2,8.853665,2.528924,0.09,0,1.098612,0,2


In [161]:
train_data = df.copy()

Scaling the features

In [162]:
from sklearn.preprocessing import StandardScaler

In [163]:
scaler = StandardScaler()

In [164]:
train_data_columns =  train_data.columns
y = train_data.iloc[:,-2]
df = train_data.drop('loan_status',axis=1)

In [165]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,income_slab
32377,3.258097,10.736397,3,2.0,4,2,8.476371,2.406044,0.1,1,1.386294,1
1338,3.258097,10.165852,2,0.0,0,4,9.047821,2.800325,0.33,0,1.098612,0
7047,3.135494,10.839581,0,3.0,4,2,9.680344,2.573375,0.31,1,1.098612,1
8225,3.091042,10.933178,0,6.0,3,0,8.699515,2.064328,0.11,0,1.386294,1
7178,3.178054,11.277203,3,3.0,4,2,8.853665,2.528924,0.09,0,1.098612,2


In [166]:
df = pd.DataFrame(scaler.fit_transform(df),index=train_data.index)

In [167]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
32377,-0.108271,-0.269079,0.924046,-0.717038,0.849923,0.674664,-0.621678,0.171418,-0.66545,2.174477,-0.19921,0.007903
1338,-0.108271,-1.44026,0.226253,-1.313469,-1.459482,2.39492,0.255402,1.525138,1.760709,-0.459881,-0.696314,-1.225607
7047,-0.891641,-0.057268,-1.169333,-0.418822,0.849923,0.674664,1.226217,0.745931,1.549739,2.174477,-0.696314,0.007903
8225,-1.175666,0.134863,-1.169333,0.475825,0.272572,-1.045591,-0.27919,-1.001823,-0.559965,-0.459881,-0.19921,0.007903
7178,-0.619706,0.841057,0.924046,-0.418822,0.849923,0.674664,-0.042595,0.593311,-0.770936,-0.459881,-0.696314,1.241414


In [168]:
df.insert(11, 'loan_status', y)

In [169]:
df.columns = train_data_columns

In [170]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
32377,-0.108271,-0.269079,0.924046,-0.717038,0.849923,0.674664,-0.621678,0.171418,-0.66545,2.174477,-0.19921,0,0.007903
1338,-0.108271,-1.44026,0.226253,-1.313469,-1.459482,2.39492,0.255402,1.525138,1.760709,-0.459881,-0.696314,1,-1.225607
7047,-0.891641,-0.057268,-1.169333,-0.418822,0.849923,0.674664,1.226217,0.745931,1.549739,2.174477,-0.696314,0,0.007903
8225,-1.175666,0.134863,-1.169333,0.475825,0.272572,-1.045591,-0.27919,-1.001823,-0.559965,-0.459881,-0.19921,0,0.007903
7178,-0.619706,0.841057,0.924046,-0.418822,0.849923,0.674664,-0.042595,0.593311,-0.770936,-0.459881,-0.696314,0,1.241414


In [171]:
train_data = df.copy()

In [172]:
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
32377,-0.108271,-0.269079,0.924046,-0.717038,0.849923,0.674664,-0.621678,0.171418,-0.66545,2.174477,-0.19921,0,0.007903
1338,-0.108271,-1.44026,0.226253,-1.313469,-1.459482,2.39492,0.255402,1.525138,1.760709,-0.459881,-0.696314,1,-1.225607
7047,-0.891641,-0.057268,-1.169333,-0.418822,0.849923,0.674664,1.226217,0.745931,1.549739,2.174477,-0.696314,0,0.007903
8225,-1.175666,0.134863,-1.169333,0.475825,0.272572,-1.045591,-0.27919,-1.001823,-0.559965,-0.459881,-0.19921,0,0.007903
7178,-0.619706,0.841057,0.924046,-0.418822,0.849923,0.674664,-0.042595,0.593311,-0.770936,-0.459881,-0.696314,0,1.241414


In [173]:
train_data.to_csv('../Data/data-created/train_data_preprocessed.csv',index=False)

#### Test Data Preprocessing

In [174]:
test_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
14668,24,28000,OWN,6.0,HOMEIMPROVEMENT,B,10000,10.37,0.36,N,2,0
24614,27,64000,RENT,0.0,PERSONAL,C,10000,15.27,0.16,Y,10,0
11096,26,72000,MORTGAGE,10.0,EDUCATION,D,16000,,0.22,N,3,0
10424,23,27996,RENT,7.0,DEBTCONSOLIDATION,A,10000,,0.36,N,2,1
26007,30,44500,RENT,2.0,MEDICAL,E,13000,16.32,0.29,N,6,1


In [175]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6517 entries, 14668 to 24385
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  6517 non-null   int64  
 1   person_income               6517 non-null   int64  
 2   person_home_ownership       6517 non-null   object 
 3   person_emp_length           6339 non-null   float64
 4   loan_intent                 6517 non-null   object 
 5   loan_grade                  6517 non-null   object 
 6   loan_amnt                   6517 non-null   int64  
 7   loan_int_rate               5880 non-null   float64
 8   loan_percent_income         6517 non-null   float64
 9   cb_person_default_on_file   6517 non-null   object 
 10  cb_person_cred_hist_length  6517 non-null   int64  
 11  loan_status                 6517 non-null   int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 661.9+ KB


In [176]:
test_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,6517.0,6517.0,6339.0,6517.0,5880.0,6517.0,6517.0,6517.0
mean,27.61424,65686.93,4.799495,9542.569434,11.018027,0.169234,5.754181,0.221728
std,6.166234,55049.68,4.273842,6347.62295,3.286681,0.105942,4.057562,0.415441
min,20.0,4200.0,0.0,700.0,5.42,0.0,2.0,0.0
25%,23.0,38000.0,2.0,5000.0,7.9,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,30.0,79000.0,7.0,12000.0,13.47,0.23,8.0,0.0
max,123.0,2039784.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [177]:
test_data.isna().sum()

person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length             178
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 637
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
loan_status                     0
dtype: int64

In [178]:
na_features = features_with_na(test_data)
na_features

['person_emp_length', 'loan_int_rate']

In [179]:
test_data = imputer(na_features,test_data)

In [180]:
test_data.isna().all()

person_age                    False
person_income                 False
person_home_ownership         False
person_emp_length             False
loan_intent                   False
loan_grade                    False
loan_amnt                     False
loan_int_rate                 False
loan_percent_income           False
cb_person_default_on_file     False
cb_person_cred_hist_length    False
loan_status                   False
dtype: bool

In [181]:
numerical_features = find_numerical_features(test_data)

In [182]:
test_data = detect_and_replace_outliers(numerical_features,test_data)

Column : person_age 12.5 40.5
Column : person_income -23500.0 140500.0
Column : person_emp_length -5.5 14.5
Column : loan_amnt -5500.0 22500.0
Column : loan_int_rate 1.56 20.04
Column : loan_percent_income -0.12 0.44
Column : cb_person_cred_hist_length -4.5 15.5


In [183]:
test_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0
mean,26.751573,58455.863894,4.418751,8532.315483,10.990726,0.16273,5.2294,0.221728
std,4.430917,26888.940765,3.34593,4793.466863,3.082156,0.094659,3.17226,0.415441
min,20.0,4200.0,0.0,700.0,5.42,0.0,2.0,0.0
25%,23.0,38000.0,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,29.0,74004.0,6.0,11800.0,13.11,0.22,7.0,0.0
max,40.0,140304.0,14.0,22500.0,20.03,0.44,15.0,1.0


In [184]:
test_data = add_income_slab(test_data)

In [185]:
test_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
14668,24.0,28000.0,OWN,6.0,HOMEIMPROVEMENT,B,10000.0,10.37,0.36,N,2.0,0,lower
24614,27.0,64000.0,RENT,0.0,PERSONAL,C,10000.0,15.27,0.16,Y,10.0,0,middle
11096,26.0,72000.0,MORTGAGE,10.0,EDUCATION,D,16000.0,10.99,0.22,N,3.0,0,upper
10424,23.0,27996.0,RENT,7.0,DEBTCONSOLIDATION,A,10000.0,10.99,0.36,N,2.0,1,lower
26007,30.0,44500.0,RENT,2.0,MEDICAL,E,13000.0,16.32,0.29,N,6.0,1,middle


In [186]:
test_data = log_transformation(numerical_features,test_data)

In [187]:
test_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
14668,3.178054,10.23996,OWN,6.0,HOMEIMPROVEMENT,B,9.21034,2.338917,0.36,N,0.693147,0,lower
24614,3.295837,11.066638,RENT,0.0,PERSONAL,C,9.21034,2.72589,0.16,Y,2.302585,0,middle
11096,3.258097,11.184421,MORTGAGE,10.0,EDUCATION,D,9.680344,2.396986,0.22,N,1.098612,0,upper
10424,3.135494,10.239817,RENT,7.0,DEBTCONSOLIDATION,A,9.21034,2.396986,0.36,N,0.693147,1,lower
26007,3.401197,10.703244,RENT,2.0,MEDICAL,E,9.472705,2.792391,0.29,N,1.791759,1,middle


In [188]:
test_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0,6517.0
mean,3.273899,10.865324,4.418751,8.871309,2.355214,0.16273,1.486785,0.221728
std,0.156531,0.487631,3.34593,0.643567,0.295567,0.094659,0.572291,0.415441
min,2.995732,8.34284,0.0,6.55108,1.690096,0.0,0.693147,0.0
25%,3.135494,10.545341,2.0,8.517193,2.138889,0.09,1.098612,0.0
50%,3.258097,10.915088,4.0,8.987197,2.396986,0.15,1.386294,0.0
75%,3.367296,11.211874,6.0,9.375855,2.573375,0.22,1.94591,0.0
max,3.688879,11.851567,14.0,10.021271,2.997231,0.44,2.70805,1.0


In [189]:
for feature in categorical_features:
    test_data[feature] = LabelEncoder.fit_transform(test_data[feature])

In [190]:
test_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
14668,3.178054,10.23996,2,6.0,2,1,9.21034,2.338917,0.36,0,0.693147,0,0
24614,3.295837,11.066638,3,0.0,4,2,9.21034,2.72589,0.16,1,2.302585,0,1
11096,3.258097,11.184421,0,10.0,1,3,9.680344,2.396986,0.22,0,1.098612,0,2
10424,3.135494,10.239817,3,7.0,0,0,9.21034,2.396986,0.36,0,0.693147,1,0
26007,3.401197,10.703244,3,2.0,3,4,9.472705,2.792391,0.29,0,1.791759,1,1


In [191]:
test_data_columns =  test_data.columns
y = test_data.iloc[:,-2]
df = test_data.drop('loan_status',axis=1)

In [192]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,income_slab
14668,3.178054,10.23996,2,6.0,2,1,9.21034,2.338917,0.36,0,0.693147,0
24614,3.295837,11.066638,3,0.0,4,2,9.21034,2.72589,0.16,1,2.302585,1
11096,3.258097,11.184421,0,10.0,1,3,9.680344,2.396986,0.22,0,1.098612,2
10424,3.135494,10.239817,3,7.0,0,0,9.21034,2.396986,0.36,0,0.693147,0
26007,3.401197,10.703244,3,2.0,3,4,9.472705,2.792391,0.29,0,1.791759,1


In [193]:
df = pd.DataFrame(scaler.fit_transform(df),index=df.index)

In [195]:
df.insert(11, 'loan_status', y)

In [197]:
df.columns = test_data_columns

In [198]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_slab
14668,-0.612356,-1.282551,0.224636,0.472625,-0.322812,-0.193472,0.526841,-0.055143,2.084175,-0.473856,-1.386879,0,-1.216936
24614,0.140161,0.412874,0.92242,-1.320736,0.835013,0.653308,0.526841,1.254212,-0.02884,2.110345,1.425608,0,0.004686
11096,-0.100962,0.654434,-1.170933,1.668199,-0.901725,1.500089,1.257208,0.141337,0.605064,-0.473856,-0.678331,0,1.226309
10424,-0.88427,-1.282844,0.92242,0.771519,-1.480638,-1.040252,0.526841,0.141337,2.084175,-0.473856,-1.386879,1,-1.216936
26007,0.81331,-0.332406,0.92242,-0.722949,0.2561,2.346869,0.934545,1.479224,1.34462,-0.473856,0.532942,1,0.004686


In [199]:
test_data = df.copy()

In [203]:
test_data.isna().any().sum()

0

In [201]:
test_data.to_csv('../Data/data-created/test_data_preprocessed.csv',index=False)