### 1.Load data files(train_u6lujuX_CVtuZ9i.csv and test_Y3wMUE5_7gLdaTN.csv)

In [1]:
import pandas as pd
import numpy as np

In [2]:
Train = pd.read_csv("train_u6lujuX_CVtuZ9i.csv")
Test = pd.read_csv("test_Y3wMUE5_7gLdaTN.csv")

In [3]:
Train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
Train.shape

(614, 13)

In [5]:
Test.shape

(367, 12)

### 2.Types of data columns

In [6]:
Train.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [7]:
Test.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

### DATA CLEANING AND PREPROCESSING

### 3.Find missing values

In [8]:
Train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
Test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

### 4.Impute missing values with mean (numerical variables)

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [12]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [13]:
imputer = imputer.fit(Train.iloc[:, 8:10])
Train.iloc[:, 8:10] = imputer.transform(Train.iloc[:, 8:10])

In [14]:
Train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [15]:
imputer = imputer.fit(Test.iloc[:, 8:10])
Test.iloc[:, 8:10] = imputer.transform(Test.iloc[:, 8:10])

In [16]:
Test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       29
Property_Area         0
dtype: int64

### 5.Impute missing values with mode (categorical variables)

In [17]:
imputer1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [18]:
imputer1 = imputer1.fit(Train.iloc[:, [1,2,3,5,10]])
Train.iloc[:, [1,2,3,5,10]] = imputer1.transform(Train.iloc[:, [1,2,3,5,10]])

In [19]:
Train.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [20]:
imputer1 = imputer1.fit(Test.iloc[:, [1,2,3,5,10]])
Test.iloc[:, [1,2,3,5,10]] = imputer1.transform(Test.iloc[:, [1,2,3,5,10]])

In [21]:
Test.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

### PREDICTIVE MODELLING

### 6.Remove Loan_ID variable - Irrelevant

In [23]:
Train = Train.drop('Loan_ID', axis = 1)

In [25]:
Train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


### 7.Create target variable

In [52]:
dataset = Train

### 8.Build dummy variables for categorical variables

In [53]:
dataset = pd.get_dummies(dataset, drop_first=True)

In [54]:
dataset.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849,0.0,146.412162,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1


In [59]:
dataset.shape

(614, 15)

### 9.Split train data for cross validation

In [65]:
x = dataset.iloc[:,0:15].values
y = dataset.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [78]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x) 
x = scaler.transform(x)

In [79]:
x

array([[ 0.07299082, -0.55448733,  0.        , ..., -0.7820157 ,
         1.42814704,  0.67451931],
       [-0.13441195, -0.03873155, -0.21927331, ..., -0.7820157 ,
        -0.70020801, -1.48253724],
       [-0.39374734, -0.55448733, -0.957641  , ..., -0.7820157 ,
         1.42814704,  0.67451931],
       ...,
       [ 0.43717437, -0.47240418,  1.26937121, ..., -0.7820157 ,
         1.42814704,  0.67451931],
       [ 0.35706382, -0.55448733,  0.4833669 , ..., -0.7820157 ,
         1.42814704,  0.67451931],
       [-0.13441195, -0.55448733, -0.15972753, ...,  1.2787467 ,
        -0.70020801, -1.48253724]])

### (a)LOGISTIC REGRESSION ALGORITHM

### 10.Fit model

In [81]:
# Fit (train) the Logistic Regression classifier
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg_model = logreg.fit(X_train, y_train)

### 11.Predict values for cv data

In [83]:
logreg_prediction = logreg.predict(X_test)

### 12.Print classification report

In [85]:
import sklearn.metrics as met
print(met.confusion_matrix(logreg_prediction, y_test))
print(met.classification_report(logreg_prediction, y_test))

[[ 49   0]
 [  2 134]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        49
           1       1.00      0.99      0.99       136

    accuracy                           0.99       185
   macro avg       0.98      0.99      0.99       185
weighted avg       0.99      0.99      0.99       185



### 13.Evaluate accuracy of model

In [145]:
log_reg_accuracy = 100*met.accuracy_score(logreg_prediction, y_test).round(2)
print("Accuracy {0:.2f}%".format(100*met.accuracy_score(logreg_prediction, y_test)))

Accuracy 98.92%


### (b)DECISION TREE ALGORITHM

### 14.Fit model

In [87]:
from sklearn.tree import DecisionTreeClassifier
decision_classifier = DecisionTreeClassifier()
decision_classifier = decision_classifier.fit(X_train,y_train)

### 15.Predict values for cv data

In [88]:
y_pred = decision_classifier.predict(X_test)

### 16.Evaluate accuracy of model

In [142]:
decision_tree = 100*met.accuracy_score(y_test, y_pred).round(2)
print("Accuracy {0:.2f}%".format(100*met.accuracy_score(y_test, y_pred)))

Accuracy 100.00%


### (c)SUPPORT VECTOR MACHINE (SVM) ALGORITHM

### 17.Fit model

In [109]:
from sklearn.svm import SVC
svm_classifier = SVC()
svm_model = svm_classifier.fit(X_train, y_train)

### 18.Predict values for cv data

In [110]:
svm_prediction = svm_classifier.predict(X_test)

### 19.Evaluate accuracy of model

In [140]:
svm_accuracy = 100*met.accuracy_score(svm_prediction, y_test).round(2)
print("Accuracy {0:.2f}%".format(100*met.accuracy_score(svm_prediction, y_test)))

Accuracy 72.43%


### (d)NAIVE BAYES ALGORITHM

### 20.Fit model

In [123]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)

### 21.Predict values for cv data

In [124]:
y_pred_gnb = gnb.predict(X_test)

### 22.Evaluate accuracy of model

In [133]:
gnb_accuracy = 100*met.accuracy_score(y_pred_gnb, y_test)
print("Accuracy {0:.2f}%".format(100*met.accuracy_score(y_pred_gnb, y_test)))

Accuracy 100.00%


In [135]:
#Select best model in order of accuracy
#Naive Bayes - 
#Logistic Regression - 
#Decision Tree - 
#Support Vector Machine - 


In [147]:
Model_accuracies = {'Naive_Bayes':[gnb_accuracy],
       'Support_VM':[svm_accuracy],
       'Decision_T':[decision_tree],
       'Log_regression':[log_reg_accuracy]}
Accuracies = pd.DataFrame(Model_accuracies)
Accuracies

Unnamed: 0,Naive_Bayes,Support_VM,Decision_T,Log_regression
0,100.0,72.0,100.0,99.0
