In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlxtend.plotting import plot_decision_regions
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the loan dataset from a CSV file resolved relative to the current working directory.
# NOTE(review): the file name suggests categorical columns were already encoded upstream — confirm.
df = pd.read_csv('Encoded_Loan_Data.csv')

In [5]:
# Display the column labels of the loaded frame to see which features and target are available.
df.columns

Index(['Gender_Encode', 'Married_Encode', 'Education_Encode',
       'Self_Employed_Encode', 'Dependents', 'ApplicantIncome',
       'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area_Encode', 'Loan_Status_Encode'],
      dtype='object')

In [6]:
# Feature matrix: the eleven predictor columns, selected explicitly by label.
X = df.loc[:, [
    'Gender_Encode',
    'Married_Encode',
    'Education_Encode',
    'Self_Employed_Encode',
    'Dependents',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area_Encode',
]]

# Target vector: the encoded loan-status column.
y = df.loc[:, 'Loan_Status_Encode']

In [7]:
# Show the dimensions of the feature matrix and the target vector side by side.
X.shape, y.shape

((598, 11), (598,))

<h3>Training and Testing data</h3>

In [8]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Inspect the dimensions of the four objects produced by the train/test split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((418, 11), (180, 11), (418,), (180,))

<h3>Feature Scaling on Numeric Columns, not on encoded columns.</h3>

In [10]:
# Continuous columns to rescale; the encoded/categorical columns are left untouched.
scale_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

scaler = MinMaxScaler()

# train_test_split returns slices of X; assigning columns into them directly
# triggers pandas' SettingWithCopyWarning and relies on ambiguous view/copy
# semantics. Take explicit copies so the assignments below are well-defined.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then reuse the learned min/max on
# the test rows so no test-set statistics leak into the transformation.
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

<h3>Logistic Regression</h3>

In [11]:
# Build a logistic-regression classifier with scikit-learn's default settings
# and fit it on the training split.
logistic_model = LogisticRegression()
logistic_model.fit(X=X_train, y=y_train)

In [12]:
# Predicted labels for the rows the model was trained on.
y_train_pred = logistic_model.predict(X=X_train)

In [13]:
# Training accuracy as a percentage, using the estimator's built-in score().
round(
    100 * logistic_model.score(X_train, y_train),
    2,
)

81.34

In [14]:
# Training accuracy (manual cross-check): recomputed from the stored
# predictions with accuracy_score instead of the estimator's score().
print(
    'Training data Accuracy Score of Logistic Regression is',
    round(100 * accuracy_score(y_train, y_train_pred), 2),
)

Training data Accuracy Score of Logistic Regression is 81.34


In [15]:
# Test accuracy as a percentage, using the estimator's built-in score().
round(
    100 * logistic_model.score(X_test, y_test),
    2,
)

79.44

In [16]:
# Predicted labels for the held-out test rows.
y_test_pred = logistic_model.predict(X=X_test)

In [17]:
# Test accuracy (manual cross-check) computed from the stored test predictions.
print(
    "Testing data Accuracy Score of Logistic Regression is",
    round(100 * accuracy_score(y_test, y_test_pred), 2),
)

Testing data Accuracy Score of Logistic Regression is 79.44


In [33]:
# Write the DataFrame `df` to CSV, omitting the index column.
# NOTE(review): the MinMax scaling in this notebook is assigned to X_train / X_test,
# not to `df`, so this file presumably contains the unscaled data despite the
# "Scale" in its name — confirm that is intended.
# NOTE(review): this cell's execution count is out of sequence with its neighbours;
# verify the notebook still reproduces under Restart & Run All.
df.to_csv('Final_Scale_Encoded_Loan_data.csv',index=False)

<h1>Accuracy Score</h1>

<h4>Training data Accuracy Score of Logistic Regression is 81.34%</h4>

<h4>Testing data Accuracy Score of Logistic Regression is 79.44%</h4>

In [18]:
# Report the four test-set classification metrics as percentages.
# Each entry pairs the printed label with the scikit-learn function that computes it.
for metric_label, metric_fn in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(metric_label, round(metric_fn(y_test, y_test_pred)*100, 2))

Accuracy : 79.44
Precision: 78.06
Recall   : 97.58
F1-score : 86.74


<h3>Pickling</h3>

In [19]:
import pickle

In [20]:
# Persist the fitted model and the fitted scaler so both can be reloaded together
# at inference time. The `with` blocks guarantee each file is flushed and closed
# even if pickling raises; a bare open(...) passed to pickle.dump leaves the
# handle open until garbage collection.
with open('logistic.pkl', 'wb') as model_file:
    pickle.dump(logistic_model, model_file)

with open('log_scale.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)