In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the preliminarily cleaned train/test splits produced by the previous section.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./train_clean.csv', './test_clean.csv')
)

In [3]:
# Show (rows, columns) for the training split, then the test split, one per line.
print(train.shape, test.shape, sep='\n')

(1175854, 70)
(120010, 70)


In [4]:
# List the columns still stored with dtype `object`; these need encoding or parsing.
train.dtypes[train.dtypes == object].index

Index(['term', 'int_rate', 'grade', 'sub_grade', 'home_ownership',
       'verification_status', 'purpose', 'title', 'addr_state',
       'earliest_cr_line', 'initial_list_status', 'application_type',
       'hardship_flag', 'debt_settlement_flag'],
      dtype='object')

## Categorical variable

### term

In [5]:
# Count the training rows in each `term` category.
train['term'].value_counts()

 36 months    886826
 60 months    289028
Name: term, dtype: int64

In [6]:
# Count the test rows in each `term` category to compare with the training split.
test['term'].value_counts()

 36 months    86646
 60 months    33364
Name: term, dtype: int64

There are only two categories: 36 months and 60 months. Use a label encoder to encode the column, as its order matters.

In [7]:
from sklearn.preprocessing import LabelEncoder

# Learn the term -> integer mapping from the training split only, then apply
# that same mapping to both splits so their codes agree.
term_encoder = LabelEncoder().fit(train['term'])

for frame in (train, test):
    frame['term'] = term_encoder.transform(frame['term'])

### Int_rate

In [8]:
# Strip the "%" sign, parse the remainder as float, and rescale from
# percent to a fraction by dividing by 100.
for frame in (train, test):
    rate_text = frame['int_rate'].str.strip("%")
    frame['int_rate'] = rate_text.astype(float) / 100

### Grade & sub_grade

In [9]:
# Grades are ordered, so encode them as integer codes. The encoder is fitted
# on the training split and reused unchanged for the test split.
grade_encoder = LabelEncoder().fit(train['grade'])

for frame in (train, test):
    frame['grade'] = grade_encoder.transform(frame['grade'])

In [10]:
# Encode `sub_grade` with its own encoder, fitted on the training split only.
# Using a dedicated LabelEncoder (instead of refitting `grade_encoder`) keeps
# `grade_encoder` holding the mapping it learned for `grade`, so both encoders
# remain usable later (e.g. for `inverse_transform`).
sub_grade_encoder = LabelEncoder().fit(train['sub_grade'])

train['sub_grade'] = sub_grade_encoder.transform(train['sub_grade'])
test['sub_grade'] = sub_grade_encoder.transform(test['sub_grade'])

### home_ownership

In [11]:
# Count the test rows in each `home_ownership` category.
test['home_ownership'].value_counts()

MORTGAGE    59588
RENT        44917
OWN         15324
ANY           181
Name: home_ownership, dtype: int64

In [12]:
# Count the training rows in each `home_ownership` category to compare with the test split.
train['home_ownership'].value_counts()

MORTGAGE    577403
RENT        465761
OWN         132366
ANY            320
NONE             4
Name: home_ownership, dtype: int64

In [13]:
# The training split has a 'NONE' category that the test split lacks;
# fold those rows into 'ANY' so both splits share the same categories.
train.loc[train['home_ownership'] == 'NONE', 'home_ownership'] = 'ANY'

# The categories have no natural order, so the column is left as text here
# and will be one-hot encoded later.

### Verification_status
Order does not matter. Keep it as is for now.

### purpose & title
These two features represent a similar attribute: why the borrower is taking out the loan.
It is reasonable to keep only one and delete the other. Here, I choose to keep purpose and delete title.

In [14]:
# Count the training rows for each loan `purpose` to spot rare categories.
train['purpose'].value_counts()

debt_consolidation    686835
credit_card           260243
home_improvement       77891
other                  67222
major_purchase         24461
medical                13723
small_business         11563
car                    11500
moving                  8183
vacation                8144
house                   5344
renewable_energy         730
wedding                   13
educational                2
Name: purpose, dtype: int64

In [15]:
# Purposes with fewer than 1,000 training rows are folded into 'other'.
lists = ['renewable_energy', 'wedding', 'educational']

for frame in (train, test):
    rare_rows = frame['purpose'].isin(lists)
    frame.loc[rare_rows, 'purpose'] = 'other'

# `purpose` is kept as the single "reason for the loan" feature, so remove `title`.
train = train.drop(columns='title')
test = test.drop(columns='title')

### earliest_cr_line

In [16]:
# Extract `earliest_year`: the second '-'-separated field of `earliest_cr_line`,
# cast to int (presumably the values look like 'Mon-YYYY' — TODO confirm
# against the raw data).
# The vectorised `.str` accessor replaces the per-row Python lambda and the
# separate cast, producing the integer column in one step.
for frame in (train, test):
    frame['earliest_year'] = (
        frame['earliest_cr_line'].str.split('-').str[1].astype(int)
    )

In [17]:
# Add 'credit_length': the `year` column minus the year of the earliest credit line.
for frame in (train, test):
    frame['credit_length'] = frame['year'] - frame['earliest_year']

In [18]:
# `credit_length` has been derived, so drop the columns it was computed from.
date_helper_cols = ['year', 'earliest_cr_line', 'earliest_year']
train = train.drop(columns=date_helper_cols)
test = test.drop(columns=date_helper_cols)

### installment, annual_inc, and loan_amnt

For installment, use the ratio installment / loan_amnt, which better reflects the size of each payment relative to the loan.

For loan_amnt, income varies from borrower to borrower, so express it relative to income with the variable loan_amnt / annual_inc.

In [None]:
# Build two ratio features on each split:
#   installment_ratio = installment / loan_amnt
#   loan_to_inc       = loan_amnt / annual_inc
# NOTE(review): a zero `annual_inc` or `loan_amnt` would produce inf/NaN here —
# confirm the earlier cleaning step rules that out.
for frame in (train, test):
    frame['installment_ratio'] = frame['installment'] / frame['loan_amnt']
    frame['loan_to_inc'] = frame['loan_amnt'] / frame['annual_inc']

# The raw installment is now represented by `installment_ratio`, so drop it.
train = train.drop(columns='installment')
test = test.drop(columns='installment')

In [None]:
# Persist the engineered splits to disk.
# NOTE(review): these are the same paths this notebook reads at the top, so the
# input files are overwritten and a second run would start from data that has
# already been transformed. Consider writing to new file names.
for frame, out_path in ((train, './train_clean.csv'), (test, './test_clean.csv')):
    frame.to_csv(out_path, index=False)

print('Training set:\t', train.shape)
print('Test set:\t', test.shape)