In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from numpy import NaN

In [2]:
X_train = pd.read_csv("Data\\train.csv")

In [3]:
X_train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
X_train.loc[23]

Loan_ID                  LP001050
Gender                        NaN
Married                       Yes
Dependents                      2
Education            Not Graduate
Self_Employed                  No
ApplicantIncome              3365
CoapplicantIncome            1917
LoanAmount                    112
Loan_Amount_Term              360
Credit_History                  0
Property_Area               Rural
Loan_Status                     N
Name: 23, dtype: object

In [5]:
X_train[X_train["Gender"].isnull()]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
23,LP001050,,Yes,2,Not Graduate,No,3365,1917.0,112.0,360.0,0.0,Rural,N
126,LP001448,,Yes,3+,Graduate,No,23803,0.0,370.0,360.0,1.0,Rural,Y
171,LP001585,,Yes,3+,Graduate,No,51763,0.0,700.0,300.0,1.0,Urban,Y
188,LP001644,,Yes,0,Graduate,Yes,674,5296.0,168.0,360.0,1.0,Rural,Y
314,LP002024,,Yes,0,Graduate,No,2473,1843.0,159.0,360.0,1.0,Rural,N
334,LP002103,,Yes,1,Graduate,Yes,9833,1833.0,182.0,180.0,1.0,Urban,Y
460,LP002478,,Yes,0,Graduate,Yes,2083,4083.0,160.0,360.0,,Semiurban,Y
467,LP002501,,Yes,0,Graduate,No,16692,0.0,110.0,360.0,1.0,Semiurban,Y
477,LP002530,,Yes,2,Graduate,No,2873,1872.0,132.0,360.0,0.0,Semiurban,N
507,LP002625,,No,0,Graduate,No,3583,0.0,96.0,360.0,1.0,Urban,N


In [6]:
#X_test.head(3)

In [7]:
X_train.shape

(614, 13)

In [8]:
X_train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
def fill_gender(gen):
    gen.fillna(gen.max,inplace = True)

In [10]:
def fill_mean(gen):
    gen.fillna(gen.mean(),inplace = True)

In [11]:
def fill_zero(gen):
    gen.fillna(0,inplace = True)

In [12]:
X_train.Gender.fillna(method="bfill" , inplace = True)
X_train.Married.fillna(method="bfill" , inplace = True)
X_train.Self_Employed.fillna(method="bfill" , inplace = True)
X_train.Credit_History.fillna(method="bfill" , inplace = True)

In [13]:
fill_mean(X_train.Loan_Amount_Term)
fill_mean(X_train.LoanAmount)

In [14]:
fill_zero(X_train.Dependents)

In [15]:
X_train.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [16]:
X_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [17]:
def gender(gen):
    return pd.get_dummies(gen,prefix='gender').gender_Female

In [18]:
def married(marr):
    return pd.get_dummies(marr,prefix="marriage").marriage_Yes

In [19]:
def education(edu):
    return pd.get_dummies(edu,prefix="education").education_Graduate

In [20]:
def emoloyed(emp):
    return pd.get_dummies(emp,prefix="employed").employed_Yes

In [21]:
def propert(property_area):
    return pd.get_dummies(property_area,prefix="property")

In [22]:
def statu(status):
    return pd.get_dummies(status,prefix="status").status_Y

In [23]:
gender_cat = gender(X_train.Gender)

In [24]:
married_cat = married(X_train.Married)

In [25]:
education_cat = education(X_train.Education)

In [26]:
emoloyed_cat = emoloyed(X_train.Self_Employed)

In [27]:
propert = propert(X_train.Property_Area)

In [28]:
X_train.shape

(614, 13)

In [29]:
X_train = pd.concat([X_train,gender_cat,education_cat,emoloyed_cat,propert],axis=1)
X_train.shape

(614, 19)

In [30]:
feature_column = ["ApplicantIncome","CoapplicantIncome","LoanAmount","Loan_Amount_Term",
                  "Credit_History","gender_Female","education_Graduate","employed_Yes"]

In [31]:
y = statu(X_train.Loan_Status)

In [32]:
x_train = X_train[feature_column]

In [33]:
x_train,x_test,y_train,y_test = train_test_split(x_train,y,test_size = 0.15)

In [34]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((521, 8), (93, 8), (521,), (93,))

In [35]:
x_train.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,gender_Female,education_Graduate,employed_Yes
522,5677,1424.0,100.0,360.0,1.0,0,1,1
131,2014,1929.0,74.0,360.0,1.0,0,1,0
147,1538,1425.0,30.0,360.0,1.0,0,1,0
596,6383,1000.0,187.0,360.0,1.0,0,0,1
576,3087,2210.0,136.0,360.0,0.0,0,1,0


In [36]:
randForest = RandomForestClassifier(n_estimators=25, min_samples_split=30,max_depth=5, max_features=1)
randForest.fit(x_train,y_train)
y_pred_class  = randForest.predict(x_test)
randForestScore = accuracy_score(y_test,y_pred_class)
print(randForestScore)

0.709677419355


In [37]:
logReg = LogisticRegression()
logReg.fit(x_train,y_train)
logREg_predict =logReg.predict(x_test)
accuracy_score(y_test,logREg_predict)

0.77419354838709675

In [38]:
sgdClassifier = SGDClassifier(alpha=0.001)
sgdClassifier.fit(x_train,y_train)
y_sgd_pred = sgdClassifier.predict(x_test)
accuracy_score(y_test,y_sgd_pred)



0.68817204301075274