In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve,roc_auc_score
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report

In [None]:
# Load the loan dataset from a CSV file in the working directory and preview its first rows.
data=pd.read_csv('loan_prediction.csv')
data.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Summary statistics of the raw data, before any imputation or encoding.
data.describe()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Column dtypes and non-null counts.
data.info()

The dataset contains many null values and several object-typed columns, so we need both imputation and encoding.

In [None]:
# Columns whose gaps are filled with their most frequent value.
for mode_col in ('Gender','Married','Dependents','Self_Employed','Loan_Amount_Term','Credit_History'):
    most_common=data[mode_col].mode()[0]
    data[mode_col]=data[mode_col].fillna(most_common)

# LoanAmount is the one column whose gaps are filled with the column mean instead.
loan_amount_mean=data['LoanAmount'].mean()
data['LoanAmount']=data['LoanAmount'].fillna(loan_amount_mean)

In [None]:
# Re-check the per-column missing-value counts after the imputation cell above.
data.isna().sum()

In [None]:
# Label encoder for converting categorical text columns to integer codes.
from sklearn.preprocessing import LabelEncoder
lab_enc=LabelEncoder()

In [None]:
# Columns that are label-encoded in place, target (Loan_Status) included.
categorical_columns=['Gender','Married','Dependents','Education',
                     'Self_Employed','Property_Area','Loan_Status']

# Keep one fitted encoder per column. Refitting a single shared encoder would
# discard every mapping except the last, making it impossible to decode
# predictions or to encode new records the same way later on.
label_encoders={}
for column in categorical_columns:
    lab_enc=LabelEncoder()
    data[column]=lab_enc.fit_transform(data[column])
    label_encoders[column]=lab_enc
# After the loop `lab_enc` is the encoder fitted on Loan_Status, which matches
# the state the original copy-pasted sequence ended in.

In [None]:
# Summary statistics again, now that the categorical columns hold integer codes.
data.describe()

In [None]:
# Drop Loan_ID (presumably a per-row identifier with no predictive value).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
data=data.drop(columns=['Loan_ID'],errors='ignore')
# Preview only the first rows instead of dumping the whole frame.
data.head()

In [None]:
# Distribution of every column, one panel per column.
# The panel count is capped at the grid capacity: the previous guard (p<=13)
# allowed a 13th panel that a 3x4 grid cannot hold and would raise ValueError.
grid_rows,grid_cols=3,4
plt.figure(figsize=(25,25),facecolor='white')
for p,column in enumerate(data.columns[:grid_rows*grid_cols],start=1):
    ax=plt.subplot(grid_rows,grid_cols,p)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept
    # so the figure looks the same. Consider sns.histplot(..., kde=True).
    sns.distplot(data[column])
    plt.xlabel(column,fontsize=20)
plt.tight_layout()

In [None]:
# Separate the predictors from the Loan_Status target column.
x=data.drop('Loan_Status',axis=1)
y=data['Loan_Status']

In [None]:
# Scatter of each predictor against Loan_Status, at most 15 panels on a 4x4 grid.
plt.figure(figsize=(25,25),facecolor='white')
for plotnumber,column in enumerate(x,start=1):
    if plotnumber>15:
        break
    ax=plt.subplot(4,4,plotnumber)
    ax.scatter(x[column],y)
    ax.set_xlabel(column,fontsize=20)
    ax.set_ylabel('Loan_Status',fontsize=20)
plt.tight_layout()

In [None]:
# Absolute pairwise correlations between all columns, drawn as an annotated heatmap.
data_corr=data.corr().abs()
plt.figure(figsize=(15,15))
sns.heatmap(data_corr,annot=True,annot_kws={'size':10})
# plt.show must be *called*; the bare attribute reference did nothing except
# echo the function object as the cell output.
plt.show()

In [None]:
from sklearn.feature_selection import SelectKBest,f_classif

In [None]:
# Score every predictor against the target with the ANOVA F-test (k='all' keeps all of them).
best_features=SelectKBest(score_func=f_classif,k='all')
fit=best_features.fit(x,y)

# Pair each feature name with its score in a single two-column frame.
feature_scores=pd.DataFrame({'Feature_Name':list(x.columns),'Score':fit.scores_})

# Show the nine highest-scoring features.
print(feature_scores.nlargest(9,'Score'))

In [None]:
# Line plot of each predictor against Loan_Status, one panel per predictor.
# The panel count is capped at the grid capacity: the previous guard
# (plotnumber<=13) allowed a 13th panel that a 3x4 grid cannot hold.
grid_rows,grid_cols=3,4
plt.figure(figsize=(15,15),facecolor='white')
for plotnumber,column in enumerate(x.columns[:grid_rows*grid_cols],start=1):
    ax=plt.subplot(grid_rows,grid_cols,plotnumber)
    sns.lineplot(x='Loan_Status',y=column,data=data)
    plt.xlabel('Loan_Status',fontsize=10)
    plt.ylabel(column,fontsize=10)
plt.tight_layout()

In [None]:
# Strip plot of each predictor grouped by the target, one panel per predictor.
# The panel count is capped at the grid capacity: the previous guard
# (plotno<=13) allowed a 13th panel that a 4x3 grid cannot hold.
grid_rows,grid_cols=4,3
plt.figure(figsize=(15,15))
for plotno,column in enumerate(x.columns[:grid_rows*grid_cols],start=1):
    ax=plt.subplot(grid_rows,grid_cols,plotno)
    # x/y are passed by keyword: newer seaborn releases accept them as
    # keyword-only arguments, so the positional form raises TypeError there.
    sns.stripplot(x=y,y=x[column])
plt.show()

Based on the heatmap, strip plots and line plots, we can see that some features have a stronger relationship with the outcome than others.

The features that show a strong relationship with the outcome are: 'Married','Education','CoapplicantIncome','LoanAmount','Property_Area','Loan_Amount_Term','Gender','Dependents'

In [None]:
# The prediction target is the loan decision. The previous cell assigned
# data['Dependents'] here, a column that is also one of the predictors below,
# so every model was trained to reproduce one of its own inputs instead of
# predicting Loan_Status.
y=data['Loan_Status']
# Predictors selected from the exploratory plots above.
x=data[['Married','Education','CoapplicantIncome','LoanAmount','Property_Area','Loan_Amount_Term','Gender','Dependents']]

In [None]:
# Box plot of every selected predictor to spot outliers, one panel per predictor.
# The panel count is capped at the grid capacity: the previous guard
# (plotno<=14) allowed panels 13 and 14, which a 3x4 grid cannot hold.
grid_rows,grid_cols=3,4
plt.figure(figsize=(20,20),facecolor='red')
for plotno,column in enumerate(x.columns[:grid_rows*grid_cols],start=1):
    ax=plt.subplot(grid_rows,grid_cols,plotno)
    sns.boxplot(data=x[column])
    plt.xlabel(column,fontsize=12)
plt.show()

In [None]:
# Per-column first and third quartiles, and the interquartile range between them.
q1,q3=data.quantile(0.25),data.quantile(0.75)
iqr=q3-q1

In [None]:
# Upper outlier fence for LoanAmount: Q3 + 1.5 * IQR.
LA=(q3.LoanAmount + (1.5*iqr.LoanAmount))
is_outlier=data['LoanAmount']>LA
data=data.loc[~is_outlier]
# x and y were sliced from `data` before this filter, so dropping rows from
# `data` alone never reached the models. Realign both to the surviving rows;
# selecting by index labels also keeps this cell safe to re-run.
x=x.loc[data.index]
y=y.loc[data.index]
data.shape

In [None]:
# Split first, then fit the scaler on the training rows only. Fitting it on
# the full data set before splitting leaks the test rows' mean and variance
# into training. With the same inputs and random_state the row assignment is
# the same as before; only the scaling statistics change.
x_train_raw,x_test_raw,y_train,y_test=train_test_split(x,y,test_size=0.17,random_state=333)

scaler=StandardScaler()
x_train=scaler.fit_transform(x_train_raw)
x_test=scaler.transform(x_test_raw)

# Full feature matrix scaled with the training statistics; kept under its
# original name because the cross-validation cell further down reads it.
x_scaled=scaler.transform(x)

In [None]:
def metric_score(clf,x_train,x_test,y_train,y_test,train=True):
    """Print the accuracy of a fitted classifier on one of the two splits.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    x_train, x_test : feature matrices of the training and test splits.
    y_train, y_test : labels matching ``x_train`` and ``x_test``.
    train : if truthy, report on the training split; if equal to ``False``,
        report on the test split and also print a classification report.
        Any other value (for example ``None``) prints nothing.

    Returns
    -------
    None. All results are written to standard output.
    """
    if train:
        train_pred=clf.predict(x_train)
        print('\n----Train result----')
        train_accuracy=accuracy_score(y_train,train_pred)*100
        print(f'Accuracy Score:{train_accuracy:.2f}%')
        return

    if train==False:
        test_pred=clf.predict(x_test)
        print('\n----Test result----')
        test_accuracy=accuracy_score(y_test,test_pred)*100
        print(f'Accuracy Score:{test_accuracy:.2f}%')

        report=classification_report(y_test,test_pred,digits=2)
        print('\n\n Test Classification report \n',report)

In [None]:
# k-nearest-neighbours classifier with default hyperparameters, fitted on the training split.
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)

In [None]:
# KNN: accuracy on the training split, then accuracy plus classification report on the test split.
metric_score(knn,x_train,x_test,y_train,y_test,train=True)
metric_score(knn,x_train,x_test,y_train,y_test,train=False)

In [None]:
# Logistic-regression classifier with default hyperparameters, fitted on the same training split.
from sklearn.linear_model import LogisticRegression
log_reg=LogisticRegression()
log_reg.fit(x_train,y_train)

In [None]:
# Logistic regression: accuracy on the training split, then accuracy plus classification report on the test split.
metric_score(log_reg,x_train,x_test,y_train,y_test,train=True)
metric_score(log_reg,x_train,x_test,y_train,y_test,train=False)

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble of 6 KNN base estimators (k=9). Bootstrap sampling is on,
# max_samples is 0.75, out-of-bag scoring is enabled, and the seed is fixed.
bag_knn=BaggingClassifier(KNeighborsClassifier(n_neighbors=9),
                          n_estimators=6,max_samples=0.75,
                          bootstrap=True,random_state=366,oob_score=True)

In [None]:
# Fit the bagged KNN ensemble on the training split.
bag_knn.fit(x_train,y_train)

In [None]:
# Bagged KNN: accuracy on the training split, then accuracy plus classification report on the test split.
metric_score(bag_knn,x_train,x_test,y_train,y_test,train=True)
metric_score(bag_knn,x_train,x_test,y_train,y_test,train=False)

In [None]:
from sklearn.model_selection import KFold,cross_val_score

In [None]:
# 10-fold cross-validation of the logistic-regression estimator on the scaled feature matrix.
# NOTE(review): x_scaled is scaled once, outside the folds, so each validation
# fold is scaled with statistics that include its own rows. Consider
# cross-validating a scaler+model pipeline on the unscaled x instead.
cross_val_score(log_reg,x_scaled,y,cv=10)

Logistic Regression gives high scores on the train split, on the test split and in cross-validation, so we move forward with it.

In [None]:
# Persist the fitted logistic-regression model. The context manager closes the
# file handle even if pickling fails; the inline open() was never closed.
# NOTE(review): the fitted `scaler` is not saved alongside the model; anything
# loading this file must scale its inputs the same way. Confirm whether the
# scaler should be persisted too.
with open('Loan Prediction','wb') as model_file:
    pickle.dump(log_reg,model_file)