In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
traindf = pd.read_csv('train.csv').set_index(['Loan_ID'])
testdf = pd.read_csv('test.csv').set_index(['Loan_ID'])

# Label Encoding

In [3]:
dfs = [traindf, testdf]
for df in dfs:
    df.replace({'Gender' : 'Male', 'Married' : 'Yes', 'Education' : 'Graduate', 'Self_Employed' : 'Yes', 'Property_Area' : 'Urban'}, 1, inplace = True) 
    df.replace({'Gender' : 'Female', 'Married' : 'No', 'Education' : 'Not Graduate', 'Self_Employed' : 'No', 'Property_Area' : 'Rural'}, 0, inplace = True)
    df.replace({'Property_Area' : 'Semiurban', 'Dependents' : '3+'}, 3, inplace = True)
traindf.replace(np.nan, 0, inplace = True)
testdf.replace(np.nan, 0, inplace = True)

In [4]:
Xtrain = traindf.drop(['Loan_Status'], axis = 1)
Xtrain = Xtrain.astype('int64')
Xtest = testdf.astype('int64')
Ytrain = traindf['Loan_Status']
payers = traindf[traindf['Loan_Status'] == 'Y']
nonPayers = traindf[traindf['Loan_Status'] == 'N']
Xtrain.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LP001002,1,0,0,1,0,5849,0,0,360,1,1
LP001003,1,1,1,1,0,4583,1508,128,360,1,0
LP001005,1,1,0,1,1,3000,0,66,360,1,1
LP001006,1,1,0,0,0,2583,2358,120,360,1,1
LP001008,1,0,0,1,0,6000,0,141,360,1,1


In [5]:
Xtrain.to_csv('X_Train.csv')
Xtest.to_csv('X_Test.csv')
pd.DataFrame(Ytrain).to_csv('Y_Train.csv')
payers.to_csv('payers.csv')
nonPayers.to_csv('nonPayers.csv')

# Scaling the Data

In [6]:
minmax = MinMaxScaler()
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
sXtrain = minmax.fit_transform(traindf[cols])
sXtest = minmax.fit_transform(testdf[cols])

In [7]:
sXtrain = pd.DataFrame(sXtrain, columns = cols)
traindf.reset_index(inplace = True)
sXtrain['Loan_ID'] = traindf['Loan_ID'].tolist()
sXtrain.set_index(['Loan_ID'], inplace = True)

sXtest = pd.DataFrame(sXtest, columns = cols)
testdf.reset_index(inplace = True)
sXtest['Loan_ID'] = testdf['Loan_ID'].tolist()
sXtest.set_index(['Loan_ID'], inplace = True)

In [8]:
print(sXtrain.head())
print('===========================================================================')
print(sXtest.head())

          ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term
Loan_ID                                                                   
LP001002         0.070489           0.000000    0.000000              0.75
LP001003         0.054830           0.036192    0.182857              0.75
LP001005         0.035250           0.000000    0.094286              0.75
LP001006         0.030093           0.056592    0.171429              0.75
LP001008         0.072356           0.000000    0.201429              0.75
          ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term
Loan_ID                                                                   
LP001015         0.078865           0.000000    0.200000              0.75
LP001022         0.042411           0.062500    0.229091              0.75
LP001031         0.068938           0.075000    0.378182              0.75
LP001035         0.032263           0.106083    0.181818              0.75
LP001051         0.045168

In [9]:
sXtrain.to_csv('Scaled_Xtrain.csv')
sXtest.to_csv('Scaled_Xtest.csv')

# Loan Payers and NonPayers Data

In [10]:
payers.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,1.0,0.0,0,1,0.0,5849,0.0,0.0,360.0,1.0,1,Y
LP001005,1.0,1.0,0,1,1.0,3000,0.0,66.0,360.0,1.0,1,Y
LP001006,1.0,1.0,0,0,0.0,2583,2358.0,120.0,360.0,1.0,1,Y
LP001008,1.0,0.0,0,1,0.0,6000,0.0,141.0,360.0,1.0,1,Y
LP001011,1.0,1.0,2,1,1.0,5417,4196.0,267.0,360.0,1.0,1,Y


In [11]:
nonPayers.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001003,1.0,1.0,1,1,0.0,4583,1508.0,128.0,360.0,1.0,0,N
LP001014,1.0,1.0,3,1,0.0,3036,2504.0,158.0,360.0,0.0,3,N
LP001020,1.0,1.0,1,1,0.0,12841,10968.0,349.0,360.0,1.0,3,N
LP001029,1.0,0.0,0,1,0.0,1853,2840.0,114.0,360.0,1.0,0,N
LP001036,0.0,0.0,0,1,0.0,3510,0.0,76.0,360.0,0.0,1,N
