### Loan dataset for prediction

In [20]:
# required libraries
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [21]:
# Load the loan records from 'train.csv' (resolved relative to the working
# directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='train.csv')
data.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [22]:
# Print a heading and then the frame's column index; the newline separator
# reproduces the output of two consecutive print calls.
print('\n Column Names\n', data.columns, sep='\n')


 Column Names

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [23]:
# Replace the Loan_Status labels with the integer codes assigned by
# LabelEncoder (the rendered preview shows N -> 0 and Y -> 1).
encode = LabelEncoder()
data['Loan_Status'] = encode.fit_transform(data['Loan_Status'])
data.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [25]:
# Remove, in place, every row that contains at least one missing value,
# then preview the first three remaining rows.
data.dropna(axis='index', how='any', inplace=True)
data.head(n=3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1


### One way of splitting data 

In [26]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# repeatable. Features and target stay together in each part.
train, test = train_test_split(data, test_size=0.2, random_state=0)
print(train.shape, test.shape, sep='\n')


(384, 13)
(96, 13)


### Another way of splitting data

In [27]:
# Display the column labels of the cleaned frame.
data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [28]:
# Features: every column except the Loan_ID identifier and the target.
X = data.drop(columns=['Loan_ID', 'Loan_Status'])
# Target: the Loan_Status column.
y = data['Loan_Status']
# 80/20 split of features and target together, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
print(X_train.shape, X_test.shape, sep='\n')


(384, 11)
(96, 11)


In [29]:
# Preview the first three rows of the training features.
X_train.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
481,Male,Yes,3+,Not Graduate,No,3095,0.0,113.0,360.0,1.0,Rural
341,Female,No,0,Graduate,No,2378,0.0,46.0,360.0,1.0,Rural
297,Female,Yes,1,Graduate,No,4666,0.0,135.0,360.0,1.0,Urban


In [30]:
# Preview the first three rows of the test features.
X_test.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
18,Male,Yes,0,Not Graduate,No,4887,0.0,133.0,360.0,1.0,Rural
161,Male,Yes,0,Graduate,No,7933,0.0,275.0,360.0,1.0,Urban
182,Male,Yes,0,Graduate,No,4600,0.0,73.0,180.0,1.0,Semiurban


In [31]:
# One-hot encode the categorical feature columns.
# get_dummies creates one column per category *present in the frame it is
# given*, so encoding the two splits independently can leave X_test with
# fewer or differently ordered columns than the model is trained on.
X_train = pd.get_dummies(X_train)
# Align the test frame to the training columns: dummy columns missing from
# the test split are added filled with 0, columns unseen in training are
# dropped, and the column order matches X_train exactly.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)
X_test.head(3)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
18,4887,0.0,133.0,360.0,1.0,0,1,0,1,1,0,0,0,0,1,1,0,1,0,0
161,7933,0.0,275.0,360.0,1.0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,1
182,4600,0.0,73.0,180.0,1.0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0


In [32]:
# Preview the first three rows of the encoded test features.
# NOTE(review): the previous cell already ends with this same preview;
# X_train.head(3) may have been intended here — confirm.
X_test.head(3)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
18,4887,0.0,133.0,360.0,1.0,0,1,0,1,1,0,0,0,0,1,1,0,1,0,0
161,7933,0.0,275.0,360.0,1.0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,1
182,4600,0.0,73.0,180.0,1.0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0


In [33]:
# Report (rows, columns) for both encoded feature frames.
for label, frame in (
    ('shape of training data : ', X_train),
    ('shape of testing data : ', X_test),
):
    print(label, frame.shape)

shape of training data :  (384, 20)
shape of testing data :  (96, 20)


### Logistic Regression

In [34]:
# Build a logistic-regression classifier with default settings and fit it on
# the training split (fit returns the estimator itself, so the call chains).
model = LogisticRegression().fit(X_train, y_train)
# Class predictions for the held-out test rows.
predict = model.predict(X_test)

In [35]:
# Show the predicted labels, then the fraction that match the true labels.
print('Predicted Values on Test Data', predict)
print(
    'Accuracy Score on test data :',
    accuracy_score(y_true=y_test, y_pred=predict),
)

Predicted Values on Test Data [1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1]
Accuracy Score on test data : 0.760416666667


### Decision tree

In [36]:
# create a Decision Tree classifier object
# random_state is pinned (same seed as the train/test splits) so the fitted
# tree, and therefore the predictions and accuracy below, are identical on
# every run; without it the reported score can change between runs.
cfd = DecisionTreeClassifier(random_state=0)
# train decision tree classifier
# NOTE: this rebinds `model` and `predict`, replacing the logistic-regression
# estimator and its predictions from the earlier cells.
model = cfd.fit(X_train, y_train)
# Predict the response for test dataset
predict = model.predict(X_test)
print('The predicted values are:', predict)
print('Accuracy:', accuracy_score(y_test, predict))

The predicted values are: [1 1 1 1 0 1 1 1 1 0 0 0 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1
 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0 0 0 1 0 0 1 0 0 1 1 1 0 1 1 1 1
 1 1 1 1 1 0 1 1 1 0 0 1 0 1 1 0 1 0 0 0 1 0]
Accuracy: 0.65625


In [38]:
# Turn the predictions and the true labels into plain Python lists,
# keeping the elements exactly as iteration yields them.
predict = [label for label in predict]
y_test = [label for label in y_test]

In [46]:
import seaborn as sns
# Pair each true label with its predicted label, one row per test sample.
# Distinct names are used on purpose: assigning to `data` would replace the
# loan DataFrame with a dict, and assigning to `confusion_matrix` would hide
# the function imported from sklearn.metrics, breaking any cell that is
# re-run or that calls the function afterwards.
comparison = {'Actual_v': y_test,
              'Predicted': predict}
df = pd.DataFrame(comparison, columns=['Actual_v', 'Predicted'])
# Cross-tabulate actual vs. predicted labels; margins=True appends the 'All'
# row and column holding the totals.
confusion_table = pd.crosstab(df['Actual_v'], df['Predicted'],
                              rownames=['Actual'], colnames=['Predicted'],
                              margins=True)
confusion_table


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,18,17,35
1,16,45,61
All,34,62,96


In [47]:
# Aliases for the true labels and the predictions used by the metric calls.
actual, predicted = y_test, predict

# Actual-vs-predicted counts with 'All' totals, built from the comparison frame.
results = pd.crosstab(
    index=df['Actual_v'],
    columns=df['Predicted'],
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

print('\nConfusion Matrix\n:')
print(results)
print('\n----------------------\n')
print('Accuracy Score :', accuracy_score(y_true=actual, y_pred=predicted))
print('\n-----------------------\n')
print('\nReport\n')
# Per-class precision, recall, f1-score and support.
print(classification_report(y_true=actual, y_pred=predicted))


Confusion Matrix
:
Predicted   0   1  All
Actual                
0          18  17   35
1          16  45   61
All        34  62   96

----------------------

Accuracy Score : 0.65625

-----------------------


Report

             precision    recall  f1-score   support

          0       0.53      0.51      0.52        35
          1       0.73      0.74      0.73        61

avg / total       0.65      0.66      0.66        96

