In [2]:
# Importing required Packages
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import warnings
# NOTE(review): with no category or message argument this filter matches every
# warning, so anything the libraries below would report (e.g. convergence or
# deprecation warnings) stays hidden — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [4]:
# Location of the training CSV; it is fetched over HTTP each time this cell runs.
# NOTE(review): the URL points at the `master` branch rather than a fixed commit,
# so the file contents could change between runs — consider pinning a commit SHA
# or caching a local copy.
data_link = "https://raw.githubusercontent.com/sahutkarsh/loan-prediction-analytics-vidhya/master/train.csv"

# Read the whole file into a DataFrame; column dtypes are inferred by pandas.
df = pd.read_csv(data_link)

In [5]:
# Preview the first rows to sanity-check the columns and spot missing values.
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
# Loan_ID looks like a per-row identifier (LP001002, LP001003, ...), so it is
# removed before modelling.
# drop(..., errors='ignore') rebinds `df` to a new frame instead of deleting the
# column in place, and is a no-op when the column is already gone, so re-running
# this cell does not raise KeyError.
df = df.drop(columns=['Loan_ID'], errors='ignore')

## Separating Features & Target variable

In [7]:
# Pick the target by name and treat every remaining column as a feature.
# Selecting by name (rather than by position) keeps the split correct even if
# the number or order of columns in `df` changes.
df_target = df['Loan_Status']
df_features = df.drop(columns=['Loan_Status'])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_target,
    test_size=0.3,
    random_state=0,
)

## Pipeline creation

### Separating Numeric & Categorical Features

In [9]:
# Partition the feature columns by dtype: columns with a numeric dtype on one
# side, every other column on the other. Both results are pandas Index objects.
numeric_features = df_features.select_dtypes(include="number").columns
categorical_features = df_features.select_dtypes(exclude="number").columns

In [10]:
# Display the feature columns that were classified as non-numeric.
categorical_features

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

### Feature Transformation Pipeline

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: replace missing values with the literal category
# 'missing', then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see during fit (e.g. 'missing' or a rare level that only occurs in
# the test split) instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own transformer and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

### Model Building Pipeline

In [23]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline

# The tree-based estimators are randomised; seeding them makes the reported
# scores repeatable across kernel restarts.
lr_model = LogisticRegression()
dt_model = DecisionTreeClassifier(random_state=0)
rf_model = RandomForestClassifier(random_state=0)

# make_pipeline stores the step objects it is given without copying them, so
# each pipeline gets its own unfitted copy of the preprocessor. Otherwise all
# three would share one transformer and fitting one pipeline would refit the
# preprocessing used by the others.
clf_lr = make_pipeline(clone(preprocessor), lr_model)
clf_dt = make_pipeline(clone(preprocessor), dt_model)
clf_rf = make_pipeline(clone(preprocessor), rf_model)

# Order matters: later cells look pipelines up by their position in this list.
models = [clf_lr, clf_dt, clf_rf]

### Model Training

In [24]:
# Fit every pipeline in `models` on the training split (X_train, y_train);
# the test split is not touched here.
for model in models:
    model.fit(X_train, y_train)

### Accuracy

In [25]:
# Display names for the pipelines, keyed by their position in `models`
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

# Score each pipeline on the held-out split and print it under its display name.
report_line = "{} Test Accuracy: {}"
for i, model in enumerate(models):
    test_accuracy = model.score(X_test, y_test)
    print(report_line.format(pipe_dict[i], test_accuracy))

Logistic Regression Test Accuracy: 0.8324324324324325
Decision Tree Test Accuracy: 0.7567567567567568
RandomForest Test Accuracy: 0.7837837837837838


## Model Performance Evaluation

In [18]:
# Class predictions from the logistic-regression pipeline for the test rows.
# NOTE(review): this cell's execution count (18) is lower than those of the cells
# that build and fit the pipelines (22-25), so the saved output may predate the
# last refit — restart the kernel and run all cells to confirm.
pred= clf_lr.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compute all three summaries first, then print them in a fixed order.
results = confusion_matrix(y_test, pred)
accuracy = accuracy_score(y_test, pred)
report = classification_report(y_test, pred)

# Header and matrix go on separate lines.
print('Confusion Matrix :', results, sep='\n')
print('Accuracy Score :', accuracy)

print(report)

Confusion Matrix :
[[ 22  29]
 [  2 132]]
Accuracy Score : 0.8324324324324325
              precision    recall  f1-score   support

           N       0.92      0.43      0.59        51
           Y       0.82      0.99      0.89       134

    accuracy                           0.83       185
   macro avg       0.87      0.71      0.74       185
weighted avg       0.85      0.83      0.81       185

