## Loan Prediction problem

Here we are trying to predict whether a person will repay the loan amount or not based on multiple parameters.


In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import pandas as pd

In [2]:
clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = GaussianNB()

df=pd.read_csv(r"train_clean_data.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 15 columns):
Unnamed: 0                 517 non-null int64
Loan_ID                    517 non-null object
Dependents                 517 non-null object
ApplicantIncome            517 non-null int64
CoapplicantIncome          517 non-null float64
LoanAmount                 517 non-null float64
Loan_Amount_Term           517 non-null float64
Credit_History             517 non-null float64
gender_Male                517 non-null int64
married_Yes                517 non-null int64
education_Not Graduate     517 non-null int64
property_area_Semiurban    517 non-null int64
property_area_Urban        517 non-null int64
self_employed_Yes          517 non-null int64
Loan_status_Y              517 non-null int64
dtypes: float64(4), int64(9), object(2)
memory usage: 60.7+ KB


## Train test Split

In [4]:
X = df.drop(['Loan_ID','Dependents','Loan_status_Y'],axis =1 )
y = df['Loan_status_Y']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)

## Voting Classifier

We are taking the outputs of multiple classifiers like:
    1. Logistic Regression
    2. Random Forest 
    3. Gausian Naive Bayes
   
Then take the majority votes to decide the final classification.

In [5]:
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],weights=[1,2,4], 
                         voting='soft')
eclf1 = eclf1.fit(X_train, y_train)
prediction = eclf1.predict(X_test)


from sklearn.metrics import classification_report , confusion_matrix

print(classification_report(y_test,prediction))
print(confusion_matrix(y_test,prediction))

              precision    recall  f1-score   support

           0       0.81      0.41      0.55        41
           1       0.84      0.97      0.90       130

    accuracy                           0.84       171
   macro avg       0.82      0.69      0.72       171
weighted avg       0.83      0.84      0.82       171

[[ 17  24]
 [  4 126]]




In [6]:
from sklearn.metrics import classification_report , confusion_matrix
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), 
                                     ('gnb', clf3)], voting='soft')
eclf2 = eclf2.fit(X_train, y_train)
predict = eclf2.predict(X_test)
print(classification_report(y_test, predict))
print('\n')
print(confusion_matrix(y_test, predict))

              precision    recall  f1-score   support

           0       0.86      0.44      0.58        41
           1       0.85      0.98      0.91       130

    accuracy                           0.85       171
   macro avg       0.85      0.71      0.74       171
weighted avg       0.85      0.85      0.83       171



[[ 18  23]
 [  3 127]]


