In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [6]:
# Load training and testing data files
train_data = pd.read_csv('Train_Data.csv')
test_data = pd.read_csv('Test_Data.csv')

In [7]:
# Print the column names of the training dataset
list(train_data)

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

In [8]:
# Print the 10 records of the training dataset
train_data.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [9]:
# Print the data types of the training dataset
train_data.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [10]:
# Describe the all statistical properties of the training dataset
train_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [11]:
# Print which column has missing values in the training data and count of those records
train_data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [12]:
# Actual replacement of the missing values in the training data using mean value (numerical variables)
train_data.fillna(train_data.mean(), inplace=True)
train_data.isnull().sum()

  train_data.fillna(train_data.mean(), inplace=True)


Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [13]:
# Actual replacement of the missing values in the training data using mode value (categorical variables)
train_data.Gender.fillna(train_data.Gender.mode()[0], inplace=True)
train_data.Married.fillna(train_data.Married.mode()[0], inplace=True)
train_data.Dependents.fillna(train_data.Dependents.mode()[0], inplace=True)
train_data.Self_Employed.fillna(train_data.Self_Employed.mode()[0], inplace=True)
train_data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [14]:
# Print which column has missing values in the test data and count of those records
test_data.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [15]:
# Actual replacement of the missing values in the test data using mean value (numerical variables)
test_data.fillna(test_data.mean(), inplace=True)
test_data.isnull().sum()

  test_data.fillna(test_data.mean(), inplace=True)


Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
dtype: int64

In [16]:
# Actual replacement of the missing values in the test data using mode value (categorical variables)
test_data.Gender.fillna(test_data.Gender.mode()[0], inplace=True)
test_data.Dependents.fillna(test_data.Dependents.mode()[0], inplace=True)
test_data.Self_Employed.fillna(test_data.Self_Employed.mode()[0], inplace=True)
test_data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [17]:
# Treatment of outliers
train_data.Loan_Amount_Term=np.log(train_data.Loan_Amount_Term)

In [18]:
# Remove Loan_ID variable as it is Irrelevant
train_data = train_data.drop('Loan_ID', axis=1)
test_data = test_data.drop('Loan_ID', axis=1)

In [19]:
# Create target variable
a = train_data.drop('Loan_Status', 1)
b = train_data.Loan_Status
train_data.head(10)

  a = train_data.drop('Loan_Status', 1)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,5.886104,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,5.886104,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,5.886104,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,5.886104,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,5.886104,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,5.886104,1.0,Urban,Y
6,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,5.886104,1.0,Urban,Y
7,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,5.886104,0.0,Semiurban,N
8,Male,Yes,2,Graduate,No,4006,1526.0,168.0,5.886104,1.0,Urban,Y
9,Male,Yes,1,Graduate,No,12841,10968.0,349.0,5.886104,1.0,Semiurban,N


In [20]:
print(a)

     Gender Married Dependents     Education Self_Employed  ApplicantIncome  \
0      Male      No          0      Graduate            No             5849   
1      Male     Yes          1      Graduate            No             4583   
2      Male     Yes          0      Graduate           Yes             3000   
3      Male     Yes          0  Not Graduate            No             2583   
4      Male      No          0      Graduate            No             6000   
..      ...     ...        ...           ...           ...              ...   
609  Female      No          0      Graduate            No             2900   
610    Male     Yes         3+      Graduate            No             4106   
611    Male     Yes          1      Graduate            No             8072   
612    Male     Yes          2      Graduate            No             7583   
613  Female      No          0      Graduate           Yes             4583   

     CoapplicantIncome  LoanAmount  Loan_Amount_Ter

In [21]:
print(b)

0      Y
1      N
2      Y
3      Y
4      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 614, dtype: object


In [22]:
# Create dummy variables for categorical variables
a = pd.get_dummies(a)
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

In [23]:
print(a)

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0               5849                0.0  146.412162          5.886104   
1               4583             1508.0  128.000000          5.886104   
2               3000                0.0   66.000000          5.886104   
3               2583             2358.0  120.000000          5.886104   
4               6000                0.0  141.000000          5.886104   
..               ...                ...         ...               ...   
609             2900                0.0   71.000000          5.886104   
610             4106                0.0   40.000000          5.192957   
611             8072              240.0  253.000000          5.886104   
612             7583                0.0  187.000000          5.886104   
613             4583                0.0  133.000000          5.886104   

     Credit_History  Gender_Female  Gender_Male  Married_No  Married_Yes  \
0               1.0              0            1

In [24]:
# Split train data for cross validation
a_train, a_cv, b_train, b_cv = train_test_split(a, b, test_size=0.2)

In [25]:
print(b_train)

304    Y
99     Y
579    Y
268    N
382    Y
      ..
178    Y
58     Y
250    N
138    N
235    Y
Name: Loan_Status, Length: 491, dtype: object


In [26]:
# Fit model
log_model = LogisticRegression()
log_model.fit(a_train, b_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
LogisticRegression()

In [28]:
# Predict values for cv data
pred_cv = log_model.predict(a_cv)

# Evaluate accuracy of model
print(accuracy_score(b_cv, pred_cv)) # 77.24 %
matrix = confusion_matrix(b_cv, pred_cv)
print(matrix)

0.7886178861788617
[[19 19]
 [ 7 78]]


In [29]:
# Fit model
dt_model = tree.DecisionTreeClassifier(criterion='gini')
dt_model.fit(a_train, b_train)

In [30]:
# Predict values for cv data
pred_cv1 = dt_model.predict(a_cv)

# Evaluate accuracy of model
print(accuracy_score(b_cv, pred_cv1)) # 66.66 %
matrix1 = confusion_matrix(b_cv, pred_cv1)
print(matrix1)

0.6991869918699187
[[17 21]
 [16 69]]


In [31]:
# Fit model
rf_model = RandomForestClassifier()
rf_model.fit(a_train, b_train)

In [32]:
# Predict values for cv data
pred_cv2 = rf_model.predict(a_cv)

# Evaluate accuracy of model
print(accuracy_score(b_cv, pred_cv2)) # 72.36 %
matrix2 = confusion_matrix(b_cv, pred_cv2)
print(matrix2)

0.7804878048780488
[[18 20]
 [ 7 78]]


In [33]:
# Fit model 
svm_model = svm.SVC()
svm_model.fit(a_train, b_train)

In [34]:
# Predict values for cv data
pred_cv3 = svm_model.predict(a_cv)

# Evaluate accuracy of model
print(accuracy_score(b_cv, pred_cv3)) # 66.66 %
matrix3 = confusion_matrix(b_cv, pred_cv3)
print(matrix3)

0.6829268292682927
[[ 0 38]
 [ 1 84]]


In [35]:
# Fit model 
nb_model = GaussianNB()
nb_model.fit(a_train, b_train)

In [36]:
# Predict values for cv data
pred_cv4 = nb_model.predict(a_cv)

# Evaluate accuracy of model
print(accuracy_score(b_cv, pred_cv4)) # 76.42 %
matrix4 = confusion_matrix(b_cv, pred_cv4)
print(matrix4)

0.8048780487804879
[[19 19]
 [ 5 80]]


In [37]:
# Fit model
kNN_model = KNeighborsClassifier()
kNN_model.fit(a_train, b_train)

In [38]:
# Predict values for cv data
pred_cv5 = kNN_model.predict(a_cv)

# Evaluate accuracy of model
print(accuracy_score(b_cv, pred_cv5)) # 63.41 %
matrix5 = confusion_matrix(b_cv, pred_cv5)
print(matrix5)

0.6747967479674797
[[ 6 32]
 [ 8 77]]


In [39]:
# Fit model
gbm_model = GradientBoostingClassifier()
gbm_model.fit(a_train, b_train)