In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score,accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [36]:
trainData = pd.read_csv('Data/train_u6lujuX_CVtuZ9i.csv')

In [37]:
trainData.head(2)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [38]:
trainData.shape

(614, 13)

In [39]:
trainData.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [40]:
trainData.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [41]:
from sklearn.preprocessing import LabelEncoder
gen = LabelEncoder()

In [43]:
trainData[trainData['Married']=='Yes']

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,LP002964,Male,Yes,2,Not Graduate,No,3987,1411.0,157.0,360.0,1.0,Rural,Y
608,LP002974,Male,Yes,0,Graduate,No,3232,1950.0,108.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y


In [34]:
#Handling with missing data

In [35]:
trainData.Gender.fillna('Male',inplace =True)

In [10]:
trainData.Married.fillna('Yes',inplace=True)

TypeError: '>=' not supported between instances of 'str' and 'float'

In [11]:
trainData.Credit_History.fillna(trainData.Credit_History.max(),inplace=True)

In [12]:
trainData.LoanAmount.fillna(trainData.LoanAmount.mean(),inplace=True)

In [13]:
trainData.Loan_Amount_Term.fillna(trainData.Loan_Amount_Term.mean(),inplace=True)

In [14]:
trainData.Self_Employed.fillna(trainData.Self_Employed.max(),inplace=True)

TypeError: '>=' not supported between instances of 'str' and 'float'

In [15]:
trainData.Dependents.fillna(0,inplace=True)

In [16]:
#Convert string values to numerical values because to algorithm can understand only numerical value not string values

In [17]:
trainData.Gender.value_counts()
gender_cat = pd.get_dummies(trainData.Gender,prefix='gender').gender_Female

In [18]:
trainData.Married.value_counts()
married_category = pd.get_dummies(trainData.Married,prefix='marriage').marriage_Yes

In [19]:
trainData.Education.value_counts()
graduate_category = pd.get_dummies(trainData.Education,prefix='education').education_Graduate

In [20]:
trainData.Self_Employed.value_counts()
self_emp_category = pd.get_dummies(trainData.Self_Employed,prefix='employed').employed_Yes

In [21]:
loan_status = pd.get_dummies(trainData.Loan_Status,prefix='status').status_Y

In [22]:
property_category = pd.get_dummies(trainData.Property_Area,prefix='property')

In [23]:
trainData.shape

(614, 13)

In [24]:
trainNew = pd.concat([trainData,gender_cat,married_category,graduate_category,self_emp_category,loan_status,property_category],axis=1)

In [25]:
trainNew.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,...,Property_Area,Loan_Status,gender_Female,marriage_Yes,education_Graduate,employed_Yes,status_Y,property_Rural,property_Semiurban,property_Urban
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,...,Urban,Y,0,0,1,0,1,0,0,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,...,Rural,N,0,1,1,0,0,1,0,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,...,Urban,Y,0,1,1,1,1,0,0,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,...,Urban,Y,0,1,0,0,1,0,0,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,...,Urban,Y,0,0,1,0,1,0,0,1


In [26]:
trainNew.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status',
       'gender_Female', 'marriage_Yes', 'education_Graduate', 'employed_Yes',
       'status_Y', 'property_Rural', 'property_Semiurban', 'property_Urban'],
      dtype='object')

In [27]:
feature_columns = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','gender_Female','marriage_Yes','education_Graduate','employed_Yes','property_Rural','property_Semiurban','property_Urban']

In [28]:
X = trainNew[feature_columns]

In [29]:
y =  trainNew['status_Y']
y

0      1
1      0
2      1
3      1
4      1
5      1
6      1
7      0
8      1
9      0
10     1
11     1
12     1
13     0
14     1
15     1
16     1
17     0
18     0
19     1
20     0
21     1
22     0
23     0
24     0
25     1
26     1
27     1
28     0
29     1
      ..
584    0
585    0
586    1
587    1
588    1
589    0
590    1
591    0
592    1
593    1
594    1
595    1
596    0
597    0
598    1
599    1
600    0
601    1
602    1
603    1
604    1
605    0
606    1
607    1
608    1
609    1
610    1
611    1
612    1
613    0
Name: status_Y, Length: 614, dtype: uint8

In [30]:
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.01,random_state=42)



In [31]:
X_train.shape

(607, 12)

In [32]:
X_test.shape

(7, 12)

In [48]:
randForest = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1)
randForest.fit(X_train,y_train)
y_pred_class  = randForest.predict(X_test)
randForestScore = accuracy_score(y_test,y_pred_class)
%time print ("Random forest accuraccy score",randForestScore)

Random forest accuraccy score 1.0
Wall time: 4.06 ms


In [49]:
#Import test data and do real test of our model

In [50]:
randForestNew = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1)
randForestNew.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=25,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [51]:
testData = pd.read_csv('Data/test_Y3wMUE5_7gLdaTN.csv')

In [52]:
testData.shape

(367, 12)

In [53]:
testData.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [54]:
testData.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [55]:
testData.Gender.fillna(testData.Gender.max(),inplace =True)
testData.Married.fillna(testData.Married.max(),inplace=True)
testData.Credit_History.fillna(testData.Credit_History.max(),inplace=True)
testData.LoanAmount.fillna(testData.LoanAmount.mean(),inplace=True)
testData.Loan_Amount_Term.fillna(testData.Loan_Amount_Term.mean(),inplace=True)
testData.Self_Employed.fillna(testData.Self_Employed.max(),inplace=True)
testData.Dependents.fillna(0,inplace=True)

TypeError: '>=' not supported between instances of 'str' and 'float'

In [42]:
gender_cat = pd.get_dummies(testData.Gender,prefix='gender').gender_Female
married_category = pd.get_dummies(testData.Married,prefix='marriage').marriage_Yes
graduate_category = pd.get_dummies(testData.Education,prefix='education').education_Graduate
self_emp_category = pd.get_dummies(testData.Self_Employed,prefix='employed').employed_Yes
property_category = pd.get_dummies(testData.Property_Area,prefix='property')

In [43]:
testDataNew = pd.concat([testData,gender_cat,married_category,graduate_category,self_emp_category,property_category],axis=1)

In [44]:
X_testData = testDataNew[feature_columns]

In [45]:
X_testData.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,gender_Female,marriage_Yes,education_Graduate,employed_Yes,property_Rural,property_Semiurban,property_Urban
0,5720,0,110.0,360.0,1.0,0,1,1,0,0,0,1
1,3076,1500,126.0,360.0,1.0,0,1,1,0,0,0,1
2,5000,1800,208.0,360.0,1.0,0,1,1,0,0,0,1
3,2340,2546,100.0,360.0,,0,1,1,0,0,0,1
4,3276,0,78.0,360.0,1.0,0,0,0,0,0,0,1


In [46]:
y_test_pread_class = randForestNew.predict(X_testData)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [47]:
randForestFormat = ["Y" if i == 1 else "N" for i in y_test_pread_class ]

NameError: name 'y_test_pread_class' is not defined

In [197]:
pd.DataFrame({'Loan_ID':testData.Loan_ID,'Loan_Status':randForestFormat}).to_csv('radom_forest_submission.csv',index=False)

In [None]:
#Solve using logistic regression

In [183]:
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()
logReg.fit(X_train,y_train)
logREg_predict =logReg.predict(X_test)
accuracy_score(y_test,logREg_predict)

1.0

In [184]:
logReg_y_prediction_class = logReg.predict(X_testData)

In [185]:
logRegPredictionFormat = ["Y" if i == 1 else "N" for i in logReg_y_prediction_class ]

In [186]:
#zip(logRegPredictionFormat,logReg_y_prediction_class)

In [187]:
pd.DataFrame({'Loan_ID':testData.Loan_ID,'Loan_Status':logRegPredictionFormat}).to_csv('logReg_submission.csv',index=False)