In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Load the raw credit-risk dataset (path is relative to the notebook's working directory).
dataset = pd.read_csv("../Data/credit_risk_dataset.csv")

#### Splitting the datasets

Before transforming the dataset, we first split it into train and test sets. Any statistics used during preprocessing (such as medians or outlier bounds) are then computed from the training data only, which prevents data leakage.

In [3]:
# Separate the predictors from the target column.
X = dataset.drop(columns=['loan_status'])
y = dataset['loan_status']

In [4]:
# Sanity check: the target and the feature matrix should have the same number of rows.
print(y.shape,X.shape)

(32581,) (32581, 11)


In [5]:
# Hold out 20% of the rows as the test set; the fixed seed makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
    shuffle=True,
)

In [6]:
# Sanity check: row counts of the train and test pieces after the split.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(26064, 11) (26064,) (6517, 11) (6517,)


In [7]:
# Re-attach the target to each split so features and label are kept in one frame.
train_data = pd.concat((X_train, y_train), axis="columns")
test_data = pd.concat((X_test, y_test), axis="columns")

In [8]:
# Optional: uncomment the lines below to persist the train/test splits as CSV files.
#train_data.to_csv("../Data/data-created/train_data.csv", index=False)
#test_data.to_csv("../Data/data-created/test_data.csv", index=False)

#### Handling missing values

There are relatively few missing values compared to the size of the dataset, so simple imputation is sufficient. We impute the missing values in each affected column with that column's median.

In [9]:
# Names of the training columns that contain at least one missing value.
features_with_na = train_data.columns[train_data.isnull().any().to_numpy()].tolist()
features_with_na

['person_emp_length', 'loan_int_rate']

In [10]:
def imputer(columns, dataset, medians=None):
    """Fill missing values in the given columns with a per-column median.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns to impute.
    dataset : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    medians : mapping of str to scalar, optional
        Pre-computed fill values keyed by column name. When a column is
        present in this mapping its value is used instead of the median of
        ``dataset``; this allows medians learned on the training split to be
        applied to another split. Defaults to ``None`` (always use the median
        of ``dataset`` itself).

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with the listed columns filled.
    """
    for column in columns:
        if medians is not None and column in medians:
            median = medians[column]
        else:
            median = dataset[column].median()
        # Assign the filled column back to the frame. Calling
        # ``fillna(..., inplace=True)`` on ``dataset[column]`` operates on an
        # intermediate object and is not guaranteed to update ``dataset``.
        dataset[column] = dataset[column].fillna(median)
    return dataset

In [11]:
# Fill the missing values of the training split with medians computed from that same split.
train_data = imputer(features_with_na,train_data)

In [12]:
# Per-column count of missing values still present after imputation.
train_data.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [13]:
# Preview the first rows of the imputed training frame.
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
32377,64,46000,RENT,2.0,PERSONAL,C,4800,11.09,0.1,Y,24,0
1338,26,26000,OWN,0.0,DEBTCONSOLIDATION,E,8500,16.45,0.33,N,3,1
7047,23,51000,MORTGAGE,3.0,PERSONAL,C,16000,13.11,0.31,Y,3,0
8225,22,56004,MORTGAGE,6.0,MEDICAL,A,6000,7.88,0.11,N,4,0
7178,24,79000,RENT,3.0,PERSONAL,C,7000,12.54,0.09,N,3,0


#### Handling outliers

In [14]:
# Numeric predictor columns (the target `loan_status` is excluded). Selecting by
# numeric dtype, rather than by "not object", keeps text columns out even when
# pandas stores them with a dedicated string or categorical dtype.
numerical_features = [
    feature
    for feature in dataset.select_dtypes(include='number').columns
    if feature != 'loan_status'
]
numerical_features

['person_age',
 'person_income',
 'person_emp_length',
 'loan_amnt',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_cred_hist_length']

In [15]:
def detect_and_replace_outliers(features, dataset):
    """Replace IQR outliers in the given columns with the column median.

    For every column in ``features`` the quartiles Q1 and Q3 are computed and
    any value outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` is replaced by the
    median of that column. The median is taken over all values of the column,
    outliers included. The bounds of each column are printed as they are
    computed.

    Parameters
    ----------
    features : iterable of str
        Names of the numeric columns to process.
    dataset : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` with the outliers of the listed columns replaced.
    """
    dataframe = dataset.copy()
    for column in features:
        series = dataframe[column]
        values = series.to_numpy()
        # np.percentile orders the data itself, so no explicit sort is needed.
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        print(f'Column : {column}', lower_bound, upper_bound)
        is_outlier = (series < lower_bound) | (series > upper_bound)
        if is_outlier.any():
            # The replacement value does not depend on the individual outlier,
            # so it is computed once per column and applied with a mask.
            dataframe[column] = series.mask(is_outlier, np.median(values))
    return dataframe

In [16]:
# Apply the outlier replacement; the function works on a copy, so train_data stays unchanged for comparison.
demo = detect_and_replace_outliers(numerical_features,train_data)

Column : person_age 12.5 40.5
Column : person_income -21750.0 140250.0
Column : person_emp_length -5.5 14.5
Column : loan_amnt -5875.0 23125.0
Column : loan_int_rate 1.5600000000000014 20.04
Column : loan_percent_income -0.12000000000000002 0.44000000000000006
Column : cb_person_cred_hist_length -4.5 15.5


In [17]:
# Summary statistics after outlier replacement.
demo.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,26.781768,58559.615216,4.404428,8648.812538,10.987761,0.163085,5.321017,0.217273
std,4.428679,26903.138347,3.353342,4880.118592,3.038411,0.094802,3.227007,0.412398
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,39000.0,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,29.0,74000.0,6.0,12000.0,13.11,0.22,7.0,0.0
max,40.0,140004.0,14.0,23100.0,20.03,0.44,15.0,1.0


In [18]:
# Summary statistics of the training data before outlier replacement, for comparison.
train_data.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0,26064.0
mean,27.764695,66171.84,4.765577,9601.07332,11.008203,0.170446,5.81672,0.217273
std,6.3925,63599.33,4.054371,6315.753396,3.071511,0.106991,4.054342,0.412398
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,39000.0,2.0,5000.0,8.49,0.09,3.0,0.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.15,4.0,0.0
75%,30.0,79500.0,7.0,12250.0,13.11,0.23,8.0,0.0
max,144.0,6000000.0,123.0,35000.0,22.48,0.78,30.0,1.0


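Comparing the two summaries, the extreme values have been replaced by the column median rather than dropped: the row count is unchanged, while the maxima (for example, `person_age` falls from 144 to 40 and `person_emp_length` from 123 to 14) now lie within the IQR bounds. For a visual check, boxplots of each column before and after the replacement can be plotted.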

In [19]:
# Rebind train_data to the outlier-treated frame.
train_data = demo