In [29]:
import pandas as pd 
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

data = pd.read_csv('./train.csv')


# Data Cleaning / Exploration 

In [30]:
data.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [55]:
data.shape

(58645, 13)

In [31]:
# Checking for null values 
data.isna().sum()

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

In [32]:
# Checking for data types 
# Need to convert Objects to ML ready variables (Int or Floats)
print(data.dtypes)
print('\n')
print('Features that need to be converted are:')
list(data.select_dtypes(include='object').columns)

id                              int64
person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
loan_status                     int64
dtype: object


Features that need to be converted are:


['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file']

In [12]:
list(data.select_dtypes(include='object').columns)


['person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file']

In [85]:
# Copying Data into data2 to keep source data untouched

data2 = data.copy()

In [77]:
# Converting each Object feature into ML friendly feature

# Identify unique variables for home ownership
print(data2['person_home_ownership'].unique())
print('\n')
print(data2['loan_intent'].unique())
print('\n')
print(data2['loan_grade'].unique())
print('\n')
print(data2['cb_person_default_on_file'].unique())

['RENT' 'OWN' 'MORTGAGE' 'OTHER']


['EDUCATION' 'MEDICAL' 'PERSONAL' 'VENTURE' 'DEBTCONSOLIDATION'
 'HOMEIMPROVEMENT']


['B' 'C' 'A' 'D' 'E' 'F' 'G']


['N' 'Y']


In [89]:
# THere are 89 Rows of OTHER 
data2[data2['person_home_ownership']=='OTHER']

# Drop "OTHER" since it represents less than 0.001% of the data
data2 = data2[data2['person_home_ownership']!='OTHER']

In [90]:
# Utilize LabelEncoder to encode the string objects
list(data2['loan_intent'].unique())

# Initialize labelEncoder 
encoder = LabelEncoder() 

# Fit and transform the data for 'loan_intent' column
data2['loan_intent'] = encoder.fit_transform(data2['loan_intent'])

# Fit and transform the data for 'loan_grade' column
data2['loan_grade'] = encoder.fit_transform(data2['loan_grade'])


data2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['loan_intent'] = encoder.fit_transform(data2['loan_intent'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['loan_grade'] = encoder.fit_transform(data2['loan_grade'])


Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,1,1,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,3,2,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,4,0,6000,8.90,0.21,N,10,0
3,3,30,70000,RENT,14.0,5,1,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,3,0,6000,6.92,0.10,N,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58640,58640,34,120000,MORTGAGE,5.0,1,3,25000,15.95,0.21,Y,10,0
58641,58641,28,28800,RENT,0.0,3,2,10000,12.73,0.35,N,8,1
58642,58642,23,44000,RENT,7.0,1,3,6800,16.00,0.15,N,2,1
58643,58643,22,30000,RENT,2.0,1,0,5000,8.90,0.17,N,3,0


In [64]:
data3 = data2.copy()

In [65]:
data3 = data3[data3['person_home_ownership']!='OTHER']

In [67]:
data3['person_home_ownership'].unique()

array(['RENT', 'OWN', 'MORTGAGE'], dtype=object)