In [11]:
# Table of Contents

# 01. Identify the data types of each variable.
# 02. Load data in Python
# 03. Sub Data set where home ownership is not "own home", 
    ## years of credit history is larger than 12 and 
    ## sort number of years of credit history in a descending way
# 04. Strong correlation or dependency with the variable “Term”
# 05. Convert nominal variables to numerical ones
# 06. Normalize columns to scale [1, 5]

In [12]:
#Loading CSV file

#Import Python Libraries
import numpy as np
import pandas as pd
from IPython.display import display, HTML

df1=pd.read_csv('Loans_20K.CSV', header=0)


In [13]:
# 01. Identify the data types of each variable.
dataType = df1.dtypes
print(dataType)

Loan Status                      object
Current Loan Amount               int64
Term                             object
Credit Score                    float64
Annual Income                   float64
Years in current job             object
Home Ownership                   object
Purpose                          object
Monthly Debt                    float64
Years of Credit History         float64
Months since last delinquent    float64
Number of Open Accounts           int64
Number of Credit Problems         int64
Current Credit Balance            int64
Maximum Open Credit               int64
Bankruptcies                    float64
Tax Liens                       float64
dtype: object


In [14]:
# 02. Load data in Python. Check whether there are missing values. 
cols = df1.columns
print('Column Name, DataTypes, MissingValues\n')
for i in cols:
    print(i,',', df1[i].dtype , ',', df1[i].isnull().any())

Column Name, DataTypes, MissingValues

Loan Status , object , False
Current Loan Amount , int64 , False
Term , object , False
Credit Score , float64 , True
Annual Income , float64 , True
Years in current job , object , True
Home Ownership , object , False
Purpose , object , False
Monthly Debt , float64 , False
Years of Credit History , float64 , False
Months since last delinquent , float64 , True
Number of Open Accounts , int64 , False
Number of Credit Problems , int64 , False
Current Credit Balance , int64 , False
Maximum Open Credit , int64 , False
Bankruptcies , float64 , True
Tax Liens , float64 , True


In [15]:
##If Yes, fill in missing values by mean values or most frequent nominal values.

df1["Credit Score"].fillna(df1["Credit Score"].mean(), inplace=True)
df1["Annual Income"].fillna(df1["Annual Income"].mean(), inplace=True)
df1["Years in current job"].fillna(df1["Years in current job"].mode().iloc[0], inplace=True)
df1["Months since last delinquent"].fillna(df1["Months since last delinquent"].mean(), inplace=True)
df1["Bankruptcies"].fillna(df1["Bankruptcies"].mean(), inplace=True)
df1["Tax Liens"].fillna(df1["Tax Liens"].mean(), inplace=True)

for i in cols:
    print(i,',', df1[i].dtype , ',', df1[i].isnull().any())

Loan Status , object , False
Current Loan Amount , int64 , False
Term , object , False
Credit Score , float64 , False
Annual Income , float64 , False
Years in current job , object , False
Home Ownership , object , False
Purpose , object , False
Monthly Debt , float64 , False
Years of Credit History , float64 , False
Months since last delinquent , float64 , False
Number of Open Accounts , int64 , False
Number of Credit Problems , int64 , False
Current Credit Balance , int64 , False
Maximum Open Credit , int64 , False
Bankruptcies , float64 , False
Tax Liens , float64 , False


In [16]:
# 03. Sub Data set where home ownership is not "own home", 
    ## years of credit history is larger than 12 and 
    ## sort number of years of credit history in a descending way
    

In [17]:
df2 =df1[(df1["Home Ownership"]!="Own Home") & (df1["Years of Credit History"]>12)].sort_values(by =["Years of Credit History"] , ascending=False)
df3 = df2[["Home Ownership", "Term", "Annual Income", "Credit Score", "Years of Credit History", "Bankruptcies"]]


In [21]:
# 04. Strong correlation or dependency with the variable “Term”
from scipy.stats import f_oneway

term_LT_with_HomeOwnership = df3[df3['Term']=='Long Term']['Home Ownership']
term_ST_with_HomeOwnership = df3[df3['Term']=='Short Term']['Home Ownership']

f_oneway(term_LT_with_HomeOwnership,term_ST_with_HomeOwnership)

KeyError: 'Term'

In [19]:
# 05. Convert nominal variables to numerical ones

df_dummies_HO=pd.get_dummies(df3['Home Ownership'])
df_dummies_Term=pd.get_dummies(df3['Term'])

# add binary variables to dataframe
df3=df3.join(df_dummies_HO)
df3=df3.join(df_dummies_Term)
# remove the original categorical variable
df3=df3.drop('Home Ownership',1)
df3=df3.drop('Term',1)


# N-1 binary variable is enough, drop 1
df3=df3.drop('Home Mortgage',1)
df3=df3.drop('Short Term',1)

display(HTML(df3.to_html()))

       HaveMortgage  Home Mortgage  Rent
13715             0              1     0
12502             0              1     0
16069             0              1     0
6173              0              0     1
5253              0              1     0


  df3=df3.drop('Home Ownership',1)
  df3=df3.drop('Term',1)
  df3=df3.drop('Home Mortgage',1)
  df3=df3.drop('Short Term',1)


Unnamed: 0,Annual Income,Credit Score,Years of Credit History,Bankruptcies,HaveMortgage,Rent,Long Term
13715,899878.0,721.0,65.0,0.0,0,0,0
12502,2044552.0,723.0,56.5,0.0,0,0,0
16069,4428121.0,735.0,54.0,0.0,0,0,0
6173,2107347.0,750.0,53.2,0.120565,0,1,0
5253,1376165.0,1094.310471,52.8,0.0,0,0,0
15733,1376165.0,1094.310471,51.5,0.0,0,1,1
7701,3864600.0,750.0,51.4,0.0,0,0,0
10253,1865800.0,750.0,51.0,0.0,0,0,0
3596,1376165.0,1094.310471,50.9,0.0,0,0,0
4174,2736266.0,628.0,50.1,0.0,0,0,1


In [None]:
# 06. Normalize columns to scale [1, 5]