In [1]:
# Importing all the necessary libraries

import numpy as np
import pandas as pd
from sklearn import metrics, model_selection
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import seaborn as sns
from matplotlib import pyplot as plt

  from numpy.core.umath_tests import inner1d


In [2]:
# Reading/Loading the train/test data

traindata = pd.read_csv('train_data.csv')
testdata = pd.read_csv('test_data.csv')

In [3]:
# Creating a new dataframe for preprocessing
a = traindata.copy()
# Observations
print(a.shape)
print(a.columns)
a.info()

(614, 13)
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
Loan_ID              614 non-null object
Gender               601 non-null object
Married              611 non-null object
Dependents           599 non-null object
Education            614 non-null object
Self_Employed        582 non-null object
ApplicantIncome      614 non-null int64
CoapplicantIncome    614 non-null float64
LoanAmount           592 non-null float64
Loan_Amount_Term     600 non-null float64
Credit_History       564 non-null float64
Property_Area        614 non-null object
Loan_Status          614 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.4+ KB


In [4]:
# Filling the NULL columns with back fill values from the next column

a.fillna(method='bfill', inplace=True)
a.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [5]:
a['Loan_Status'] = np.where(a.Loan_Status =='Y',1,0)

In [15]:
# Splitting X and Y -- dropping the target and irrelevant columns from X

X = a.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = a['Loan_Status']
print(X.shape)

# Creating the dummy variables for the categorical columns

X = pd.get_dummies(X)
print(X.shape)
print(y.shape)
X.columns

(614, 11)
(614, 20)
(614,)


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Female', 'Gender_Male',
       'Married_No', 'Married_Yes', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Education_Graduate',
       'Education_Not Graduate', 'Self_Employed_No', 'Self_Employed_Yes',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')

In [16]:
# Identfying the top 5 columns using Recursive Feature Elimination process

# Feature selection using RandomForestClassifier

logreg = LogisticRegression()
rfe = RFE(logreg, 5)
rfe = rfe.fit(X,y)
z=rfe.support_
m=set(X.columns[z])
X=X[X.columns[z]]
print(X.shape)
m

(614, 5)


{'Credit_History',
 'Dependents_1',
 'Married_No',
 'Property_Area_Rural',
 'Property_Area_Urban'}

In [17]:
# RandomForestClassifier model fitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=321)
rf = RandomForestClassifier(n_estimators=25)
rf.fit(X_train, y_train)
y_pred_rf1 = rf.predict(X_test)
print("Accuracy of test data is ", accuracy_score(y_test,y_pred_rf1))
y_pred_rf2 = rf.predict(X_train)
print("Accuracy of train data is ", accuracy_score(y_train,y_pred_rf2))

Accuracy of test data is  0.9032258064516129
Accuracy of train data is  0.7934782608695652


In [21]:
## EDA with test data file

d = testdata.copy()
print(d.shape)
print(d.columns)
d.isnull().sum()

(367, 12)
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')


Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [22]:
d.fillna(method = "bfill", inplace = True) 
d = d.drop(['Loan_ID'], axis=1)
d = pd.get_dummies(d)
print(d.shape)
d.columns

(367, 20)


Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Female', 'Gender_Male',
       'Married_No', 'Married_Yes', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Education_Graduate',
       'Education_Not Graduate', 'Self_Employed_No', 'Self_Employed_Yes',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')

In [23]:
# Aligning with features selected for train data

test = d[[ 'Credit_History', 'Dependents_1', 'Married_No', 'Property_Area_Rural', 'Property_Area_Urban']].copy()
test.shape

(367, 5)

In [24]:
# Implementing the Logistic Regression model which has the best accuracy score
prediction = rf.predict(test)
prediction

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,

In [25]:
# Creating a Dataframe with the predicted values
submission = pd.DataFrame({"Loan_ID": testdata["Loan_ID"], "Loan_Status": prediction})

In [26]:
# Converting the numerical values back to categorical values
submission["Loan_Status"] = np.where(submission.Loan_Status ==1,"Y","N")

In [27]:
# Creating the test result file
submission.to_csv('submission.csv', index=False)