In [None]:
"""
We perform the EDA here based on the early assumptions
EDA includes:
Cleaning the data
Splitting the data
Model the data
Check for better accuracy
"""
import numpy as np
import pandas as pd
import seaborn as sns
import featuretools as ft
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris


In [None]:
# Plotting: matplotlib, with figures rendered inline in the notebook output.
import matplotlib.pyplot as plt
%matplotlib inline
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# pandas / seaborn / scikit-learn further down.
import warnings
warnings.filterwarnings("ignore")

After understanding the problem statement, we now move on to hypothesis generation.

<h4>Hypothesis Generation </h4>

Below are some of the factors which we think can affect Loan Approval: 
    <ol>
    <li> Salary: Applicants with a higher income have a better chance of loan approval.</li>
    <li> Previous History: Applicants who have repaid their earlier debts have a greater chance of loan approval.</li>
    <li> Loan Amount: If the loan amount is low, the chance of loan approval is high.</li>
    <li> Loan Term: A loan for a shorter period and a smaller amount should have a higher chance of approval.</li>
    <li> EMI: The lower the amount to be paid monthly to repay the loan, the higher the chance of approval.</li>
    </ol>

 Now, we are going to upload or read the files/data-sets using pandas. For this we used read_csv.

In [None]:
# Read the raw loan-prediction file into a DataFrame and preview the first rows.
DATA_PATH = "../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv"
df = pd.read_csv(DATA_PATH)
df.head()

To determine the accuracy of our model after training it, we will test it on unseen data, called the test data. Using the function below, we split our dataset into train and test sets.

In [None]:
# Hold out 20% of the rows as unseen test data.
# random_state makes the split (and every figure derived from it) reproducible;
# stratify keeps the approved/unapproved ratio identical in both parts, which
# matters because the target is imbalanced.
train, test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Loan_Status']
)

Thus, using the function above we have successfully divided our dataset. 80% of the total set is train and rest 20% is the test data

In [None]:
train.head()
train.shape

In [None]:
train.describe()

In [None]:
train.head(5)

In [None]:
test.shape

In [None]:
test.head()

In this section we will do univariate analysis. It is the simplest form of analysing data, where we examine each variable individually .

<h3>Univariate Analysis</h3>

We are visualising different types of variables:
<ol>
    <li>Categorical Features</li>
    <li>Ordinal Features</li>
    <li>Numerical Features</li>
</ol>

In [None]:
train['Loan_Status'].value_counts()

In [None]:
gender_count = train['Gender'].value_counts()

In [None]:
train['Married'].value_counts()

In [None]:
df['Dependents'].value_counts()

In [None]:
edu_count = train['Education'].value_counts()
print(edu_count)

In [None]:
self_count = train['Self_Employed'].value_counts()
print(self_count)

In [None]:
credit_count = train['Credit_History'].value_counts()
print(credit_count)

In [None]:
prop_count = train['Property_Area'].value_counts()
print(prop_count)

<h4>Categorical Features</h4>

In [None]:
plt.figure(figsize=(3,5))
sns.barplot(gender_count.index, gender_count.values/491, alpha=0.9 )
plt.title('Gender')

In [None]:
plt.figure(figsize=(3,5))
sns.barplot(train['Married'].value_counts().index, train['Married'].value_counts().values/491, alpha=0.9 )
plt.title('Married')

In [None]:
plt.figure(figsize=(3,5))
sns.barplot(self_count.index, self_count.values/491, alpha=0.9 )
plt.title('Self Employed')

In [None]:
plt.figure(figsize=(3,5))
sns.barplot(credit_count.index, credit_count.values/491, alpha=0.9 )
plt.title('Credit History')

In [None]:
plt.figure(figsize=(3,5))
sns.barplot(train['Loan_Status'].value_counts().index, train['Loan_Status'].value_counts().values/491, alpha=0.9 )
plt.title('Loan Status')

As we can see, our data is <strong>imbalanced</strong>, since there is a roughly 70-30 distribution between approved and unapproved applicants.

<h4>Ordinal Features </h4>

In [None]:
plt.figure(figsize=(3,5))
sns.barplot(train['Dependents'].value_counts().index, train['Dependents'].value_counts().values/491, alpha=0.9 )
plt.title('Dependents')

In [None]:
plt.figure(figsize=(3,5))
sns.barplot(edu_count.index, edu_count.values/491, alpha=0.9 )
plt.title('Education')

In [None]:
plt.figure(figsize=(3,5))
sns.barplot(prop_count.index, prop_count.values/491, alpha=0.9 )
plt.title('Property Area')

<h5> Here we will pause for conclusions from the data we plotted:</h5>

<ol>
<li>Firstly, we can see that almost 70% of the applicants had their loan applications accepted.</li>
<li>Most of the applicants have a good credit history.</li>
<li>Very few who are self employed have applied for the loan.</li>
<li>There is a large gender gap in the applied loan applicants. Most of them are male.</li>
<li>Almost 65% of the applicants are Married.</li>
<li>Less than 10% of the applicants have 3+ dependents.</li>
<li>Graduates make up almost 78% of the applicants.</li>
</ol>

As our main target is Loan Status Variable, we will try to find if Applicant income can exactly separate the Loan Status.
Suppose if we can find that if applicant income is above some X amount then Loan Status is yes .Else it is No. Firstly I am trying to plot the distribution plot based on Loan Status.

<h4>Numerical Features </h4>

In [None]:
sns.set_style=("whitegrid")
sns.FacetGrid(train,hue="Loan_Status",size=4).map(sns.distplot,"ApplicantIncome").add_legend();
plt.show()

In [None]:
sns.set_style=("whitegrid")
sns.FacetGrid(train,hue="Loan_Status",size=4).map(sns.distplot,"CoapplicantIncome").add_legend();
plt.show()

Unfortunately we cannot segregate based on Applicant Income alone. The same is the case with Co-applicant Income, as shown in the graph plotted above. Also, neither distribution is normal; we will try to bring these distributions closer to normal in a later section so that the algorithm can fit better.
We try a different approach and do a scatter plot.

In [None]:
# Scatter of ApplicantIncome against Credit_History, coloured by loan status.
# `sns.set(style=...)` actually applies the style (the original assignment
# `sns.set_style=(...)` only overwrote the function with a string), and
# FacetGrid's `size` argument was renamed to `height`.
sns.set(style="whitegrid")
sns.FacetGrid(train, hue="Loan_Status", height=4).map(plt.scatter, "Credit_History", "ApplicantIncome").add_legend();
plt.show()

After observing the graph above we can say that the people having credit history as zero and with an income of less than
20,000 can be segregated as NO. Although this is not very accurate.

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=train["ApplicantIncome"])

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=train["CoapplicantIncome"])

It can be inferred from the above two graphs that most of the data in the distribution of applicant income and Coapplicant Income is towards the left which means it is not normally distributed and the box plot confirms the presence of a lot of outliers which can be attributed to income disparity in the society.

In [None]:
# ApplicantIncome spread per education level.
sns.set(style="whitegrid")
ax = sns.boxplot(data=train, x="Education", y="ApplicantIncome")


We can see that the graduates with very high incomes appear as outliers, which will be taken care of later.

In [None]:
sns.set_style=("whitegrid")
sns.FacetGrid(train,hue="Loan_Status",size=4).map(sns.distplot,"LoanAmount").add_legend();
plt.show()

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=train["LoanAmount"])

We observe a lot of outliers in this variable, although the distribution is otherwise fairly normal. We will treat the outliers in the next section.


<h6>Now we would like to know how well each feature correlate with Loan Status, therefore we would look at the bivariate analysis.</h6>

<h3>Bivariate Analysis</h3>

<h6>Categorical Independent variable VS target Variable </h6>

In [None]:
ct=pd.crosstab(train.Married, train.Loan_Status)
print(ct)

In [None]:
n = 1
Loan_Status = np.random.choice([True,False], n)
Married = np.random.choice(['Yes','No', ], n)
ct.plot.bar(stacked=True)
plt.legend(title='Loan_status')

plt.show()

We can infer that percentage of married people who have got their loan approved is higher when compared to non- married people.

In [None]:
ct1=pd.crosstab(train.Dependents,train.Loan_Status)

In [None]:
n = 1
Loan_Status = np.random.choice([True,False], n)
Dependents= np.random.choice(['0','1', '2','3+'], n)
ct1.plot.bar(stacked=True)
plt.legend(title='Loan_status')

plt.show()

The percentage of approved loans is higher for applicants with either 0 or 2 dependents.

In [None]:
ct2=pd.crosstab(train.Education,train.Loan_Status)
print(ct2)

In [None]:
n = 1
Loan_Status = np.random.choice([True,False], n)
Education= np.random.choice(['Yes','No'], n)
ct2.plot.bar(stacked=True)
plt.legend(title='Loan_status')

plt.show()

The percentage of approved loans is higher for applicants who are graduates than for those who are not.

In [None]:
ct3=pd.crosstab(train.Self_Employed,train.Loan_Status)
print(ct3)

In [None]:
n = 1
Loan_Status = np.random.choice([True,False], n)
Self_Employed= np.random.choice(['Yes','No'], n)
ct3.plot.bar(stacked=True)
plt.legend(title='Loan_status')

plt.show()

There is hardly any correlation between Loan_Status and Self_Employed applicants.
So in short we can say that it doesn’t matter whether the applicant is self employed or not.

In [None]:
ct4=pd.crosstab(train.Credit_History,train.Loan_Status)
print(ct4)

In [None]:
n = 1
Loan_Status = np.random.choice([True,False], n)
Credit_History= np.random.choice(['0.0','1.0'], n)
ct4.plot.bar(stacked=True)
plt.legend(title='Loan_status')

plt.show()

As observed from the graph above that people with bad credit history are  less likely to get a loan as compared to people
with good credit history.

In [None]:
ct5=pd.crosstab(train.Property_Area,train.Loan_Status)
print(ct5)

In [None]:
n = 1
Loan_Status = np.random.choice([True,False], n)
Property_Area= np.random.choice(['Rural','Urban','Semiurban'], n)
ct5.plot.bar(stacked=True)
plt.legend(title='Loan_status')

plt.show()

Out of the three areas plotted above, it can be inferred that people living in Semiurban places have a higher
chance of getting a loan.

<h6>Numerical Independent variable VS Target Variable </h6>

In [None]:
# Mean ApplicantIncome for each loan status, drawn as a bar chart.
mean_income_by_status = train.groupby('Loan_Status')['ApplicantIncome'].mean()
mean_income_by_status.plot.bar()

Here the y-axis shows the mean applicant income. There is no noticeable difference in the mean applicant income between the two classes, so we make bins for the applicant income variable based on its values and analyse the corresponding loan status for each bin.

In [None]:
train.shape

In [None]:
bins=[0,2500,4000,6000,81000]
group=['Low','Average','High','Very High']
train['Income_bin']=pd.cut(train['ApplicantIncome'],bins,labels=group)

In [None]:
Income_bin=pd.crosstab(train['Income_bin'],train['Loan_Status'])
Income_bin.div(Income_bin.sum(1).astype(float),axis=0).plot(kind="bar",stacked=True)
plt.xlabel('ApplicantIncome') 
P=plt.ylabel('Percentage')

It can be inferred that Applicant Income does not affect the chances of loan approval which contradicts our hypothesis, in which the one with higher applicant income would have higher chance of approval. 

In [None]:
bins=[0,1000,3000,42000]
group=['Low','Average','High']
train['Coapplicant_Income_bin']=pd.cut(train['CoapplicantIncome'],bins,labels=group)

In [None]:
Coapplicant_Income_bin=pd.crosstab(train['Coapplicant_Income_bin'],train['Loan_Status'])
Coapplicant_Income_bin.div(Coapplicant_Income_bin.sum(1).astype(float),axis=0).plot(kind="bar",stacked=True)
plt.xlabel('CoapplicantIncome') 
P=plt.ylabel('Percentage')

It shows that if the coapplicant's income is less, then there is a higher chance of loan approval. But this does not look right . The possible explanation can be that not many applicants have a coapplicant , therefore the coapplicant income for them is 0 and hence the loan approval is not dependent on it.So we will make a new variable to see the combined effect on Loan status.

In [None]:
train['Total_Income']=train['ApplicantIncome'] + train['CoapplicantIncome']

In [None]:
bins=[0,2500,4000,6000,81000]
group=['Low','Average','High','Very High']
train['Total_Income_bin']=pd.cut(train['Total_Income'],bins,labels=group)

In [None]:
Total_Income_bin=pd.crosstab(train['Total_Income_bin'],train['Loan_Status'])
Total_Income_bin.div(Total_Income_bin.sum(1).astype(float),axis=0).plot(kind="bar",stacked=True)
plt.xlabel('TotalIncome') 
P=plt.ylabel('Percentage')

We can see that the proportion of loans getting approved for Low income is lower than when compared to Average, high and Very High total income.

In [None]:
bins=[0,100,200,700]
group=['Low','Average','High']
train['LoanAmount_bin']=pd.cut(train['LoanAmount'],bins,labels=group)

In [None]:
LoanAmount_bin=pd.crosstab(train['LoanAmount_bin'],train['Loan_Status'])
LoanAmount_bin.div(LoanAmount_bin.sum(1).astype(float),axis=0).plot(kind="bar",stacked=True)
plt.xlabel('LoanAmount') 
P=plt.ylabel('Percentage')

The proportion of approved loans is higher for low and average Loan Amount than for High loan amount, which supports our initial hypothesis that the chance of approval is higher for a smaller loan amount.

Now we will drop the bins we made for the exploration part and move on to the next section of data cleaning .

In [None]:
train=train.drop(['Income_bin','Coapplicant_Income_bin','LoanAmount_bin','Total_Income_bin','Total_Income'], axis=1);

In [None]:
train.shape
train.head()

With this, we have completed our Bivariate analysis.

<h5>With this we have completed our exploratory data analysis on the training data and now we move on to data cleaning of both the train data and test data.</h5>

<h3>Data Cleaning</h3>

Checking for missing values in train dataset

In [None]:
# Number of missing values in each column of the training split.
train.isnull().sum()

This suggests that in our training dataset we have 12 missing values in Gender, 3 in married and so on.

In [None]:
# Visual map of missing cells in the training split: one row per record, one
# column per feature (at most the first 30 features).
cols = train.columns[:30]
colours = ['#000099', '#ffff00']  # blue = value present, yellow = value missing
missing_mask = train[cols].isna()
sns.heatmap(missing_mask, cmap=sns.color_palette(colours))

<h6>We have used a function for heatmap here. The heatmap is a way of representing the data in a 2-dimensional form. The data values are represented as colors in the graph. The goal of the heatmap is to provide a colored visual summary of information. Since we have less number of features here, we have used the heatmap data technique to gain a perspective on null data values.</h6>

Now that we have encountered our problem, there are three solutions which we can implement to counter it:

<ol>
    <li>Drop the observation.</li>
    <li>Drop the feature.</li>
    <li>Impute the missing data.</li>
</ol>

We have implemented the third solution. 

In [None]:
# Several columns have missing values. Categorical columns (and the binary
# Credit_History flag) are filled with their most frequent value; the numerical
# columns are handled in the following cells.
#
# The result is assigned back to the frame: `fillna(..., inplace=True)` on a
# column selection operates on a temporary object and is not propagated to the
# parent frame by pandas with copy-on-write enabled.
for col in ['Gender', 'Dependents', 'Married', 'Self_Employed', 'Credit_History']:
    train[col] = train[col].fillna(train[col].mode()[0])

In [None]:
train['LoanAmount'].median()

In [None]:
train['LoanAmount'].fillna(128, inplace =True)

In [None]:
# Median loan term of the training split.
train['Loan_Amount_Term'].median()

In [None]:
# Most frequent loan term of the training split (first mode if there are ties).
train['Loan_Amount_Term'].mode()[0]

The Loan_Amount_Term is a continuous variable here. So instead of directly going with the median, we would like to get an idea of what the data represents by looking at the most frequently occurring value.

But, here we get both the mode and median as 360. So we are choosing 360 to replace which just validates our point. 

In [None]:
train['Loan_Amount_Term'].fillna(360, inplace =True)

In [None]:
train.isnull().sum()

We have successfully imputed all the missing values and now no null values exist in the train dataset.

<h5>Checking for missing  values in test dataset</h5>

In [None]:
test.isnull().sum()

In [None]:
cols = test.columns[:30] # first 30 columns
colours = ['#000099', '#ffff00'] # specify the colours - yellow is missing. blue is not missing.
sns.heatmap(test[cols].isnull(), cmap=sns.color_palette(colours))

We observe that there are a few missing values in the test data set, and we will now impute them the same way we did for the train dataset above.

In [None]:
# We observe that we havefew missing values in different columns. 
# Now we are going to replace the categorical values by most occuring value
# and numerical values by median. Replacing missing values

test['Gender'].fillna(test['Gender'].mode()[0],inplace = True)
test['Dependents'].fillna(test['Dependents'].mode()[0],inplace = True)
test['Married'].fillna(test['Married'].mode()[0],inplace = True)
test['Self_Employed'].fillna(test['Self_Employed'].mode()[0],inplace = True)
test['Credit_History'].fillna(test['Credit_History'].mode()[0],inplace = True)

In [None]:
test['LoanAmount'].median()

In [None]:
test['LoanAmount'].fillna(122.5, inplace =True)

In [None]:
test['Loan_Amount_Term'].median()

In [None]:
test['Loan_Amount_Term'].mode()[0]

In [None]:
test['Loan_Amount_Term'].fillna(360, inplace =True)

In [None]:
test.isnull().sum()

<h5>Checking for outliers in our data</h5>

In [None]:
# Histogram of Loan_Amount_Term in the training split (100 bins).
train['Loan_Amount_Term'].hist(bins=100)

For this graph we can consider, a value between 10  to 170  as an outlier .

In [None]:
# Histogram of Loan_Amount_Term in the held-out split (100 bins).
test['Loan_Amount_Term'].hist(bins=100)

For this graph we can consider, a value between 50  to 170  as an outlier .

In [None]:
# Histogram of LoanAmount in the training split (100 bins).
train['LoanAmount'].hist(bins=100)

For this graph we can consider, a value above 600 as an outlier.

In [None]:
# Histogram of LoanAmount in the held-out split (100 bins).
test['LoanAmount'].hist(bins=100)

We have used a histogram plot to observe any outliers if present in the data, as shown in the graphs above, there are  a few outliers.  


The methods of handling outliers are somewhat similar to missing data. We either drop or adjust or keep them. We are trying to visualize them using log transformation.

In [None]:
sns.set(style="whitegrid")
ax = sns.boxplot(x=train["LoanAmount"])

In [None]:
train['LoanAmount_log'] = np.log(train['LoanAmount'])
train['LoanAmount_log'].hist(bins =20)

Now the distribution looks much closer to normal and the effect of extreme values has subsided significantly.

In [None]:
test['LoanAmount_log'] = np.log(test['LoanAmount'])

In [None]:
#Check for Unique

In [None]:
train.apply(lambda x: len(x.unique()))

In [None]:
test.apply(lambda x: len(x.unique()))

The Loan_ID should not be same for two records, thus we check whether all the records have their unique Loan_ID variable. Since, we can see Loan_ID value is unique for both test and train, we proceed. 


We can say that our data is cleaned and properly structured, so we can move forward with our model building.

In [None]:
train_1 = train.drop('Loan_ID', axis = 1)
test_1 = test.drop('Loan_ID', axis =1)

In [None]:
X_train = train_1.drop('Loan_Status', 1)
y_train = train_1.Loan_Status

In [None]:
X_test = test_1.drop('Loan_Status', 1)
y_test = test_1.Loan_Status

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.head(10)

In [None]:
y_test.head(10)

Since we have a lot of categorical variables which affect Loan Status, we need to convert each of them into numeric data for modeling.



In [None]:
X_train.head(5)

In [None]:
X_train1= pd.get_dummies(X_train)
train_2 = pd.get_dummies(train_1)
test_2 = pd.get_dummies(test_1)
X_test1 = pd.get_dummies(X_test)

In [None]:
# How often each loan term occurs among approved ('Y') and rejected ('N') loans.
is_approved = train_1['Loan_Status'] == 'Y'
is_rejected = train_1['Loan_Status'] == 'N'
approved_term = train_1.loc[is_approved, 'Loan_Amount_Term'].value_counts()
unapproved_term = train_1.loc[is_rejected, 'Loan_Amount_Term'].value_counts()
# One row per outcome, one column per loan term.
df1 = pd.DataFrame([approved_term, unapproved_term])
df1.index = ['Approved', 'Unapproved']
df1.plot(kind='bar', figsize=(10, 8))

In [None]:
# Encode the target as integers (N -> 0, Y -> 1). The name is re-bound instead of
# calling `replace(..., inplace=True)`: in-place edits of a Series taken from a
# frame are not reliably propagated by recent pandas versions. Values that are
# already 0/1 pass through unchanged, so the cell can be re-run safely.
y_train = y_train.replace({'N': 0, 'Y': 1}).astype(int)

In [None]:
data_corr = pd.concat([X_train1, y_train], axis=1)
corr = data_corr.corr()
plt.figure(figsize=(12,10))
sns.heatmap(corr, annot=True);

In [None]:
data_corr.corr()

<h2> Initial Model Building </h2>

In [None]:
y_t = train_1.Loan_Status

In [None]:
train_1['Loan_Status'].replace('N', 0,inplace=True)
train_1['Loan_Status'].replace('Y', 1,inplace=True)

In [None]:
X_train1.head()

In [None]:
y_t.head()

In [None]:
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [None]:
classifier = []
classifier.append(("LogisticReg", LogisticRegression(solver='liblinear', multi_class='ovr')))
classifier.append(("KNN", KNeighborsClassifier()))
classifier.append(("NaiveBayes", GaussianNB()))

In [None]:
seed = 0
results = []
names = []
for name, model in classifier:
    kfold = model_selection.KFold(n_splits=10, random_state=seed ,shuffle=True)
    cv_results = model_selection.cross_val_score(model, X_train1, y_t, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train1,y_t)

In [None]:
y_pred = logreg.predict(X_test1)

In [None]:
test_1['Loan_Status'].replace('N', 0,inplace=True)
test_1['Loan_Status'].replace('Y', 1,inplace=True)

In [None]:
y_te = test_1.Loan_Status

In [None]:
#accuracy_score = accuracy_score(y_te, y_pred)
#print(accuracy_score)

In [None]:
import sklearn.metrics as metrics
import matplotlib.pyplot as plt

# ROC curve of the baseline logistic regression on the held-out split.
# Column 1 of predict_proba is the score of the second class in `logreg.classes_`.
probs = logreg.predict_proba(X_test1)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_te, preds)
roc_auc = metrics.auc(fpr, tpr)

auc_label = 'AUC = %0.2f' % roc_auc
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label=auc_label)
plt.legend(loc='lower right')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()


In [None]:
cm = confusion_matrix(y_te, y_pred)
print(cm)

In [None]:
report = classification_report(y_te, y_pred)
print(report)

In [None]:
"""Naive Bayes Classifier"""
from sklearn.naive_bayes import GaussianNB
NaiveB = GaussianNB()
NaiveB.fit(X_train1, y_t)
y_pred1 = NaiveB.predict(X_test1)
# Creating confusion matrix and calculating the accuracy score
cm_nb = confusion_matrix(y_te, y_pred1)
print(cm_nb)

In [None]:
as_nb = accuracy_score(y_te, y_pred1)
print(as_nb)

In [None]:
""" LOGISTIC REGRESSION """
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 42)
classifier.fit(X_train1, y_t)
y_pred = classifier.predict(X_test1)
# Creating confusion matrix and calculating the accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm_logreg = confusion_matrix(y_te, y_pred)
as_logreg=accuracy_score(y_te, y_pred)

""" K-NEAREST NEIGHBORS """
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 19, metric = 'minkowski', p = 2)
classifier.fit(X_train1, y_t)
y_pred = classifier.predict(X_test1)
# Creating confusion matrix and calculating the accuracy score
cm_knn = confusion_matrix(y_te, y_pred)
as_knn=accuracy_score(y_te, y_pred)

In [None]:
print(cm_knn)

In [None]:
print(as_knn)

In [None]:
report = classification_report(y_te, y_pred)
print(report)

In [None]:
print(cm_logreg)

In [None]:
print(as_logreg)

Out of all the algorithms mentioned above , the best accuracy observed is with logistic regression after implementing the K-fold technique and the least accuracy is of K-nearest neighbours .

In [None]:
# Preview of the one-hot encoded training features before feature engineering.
X_train1.head()

<h2>Feature Engineering </h2>

<h4>Getting new Data Columns</h4>

Instead of evaluating the applicant and co-applicant income separately, we combine it into a new column - Total_Income so that we get a better perspective.  

Total_Income = Applicant Income + Co-applicant Income

In [None]:
X_train1['TotalIncome'] = X_train1['ApplicantIncome'] + X_train1['CoapplicantIncome']

In [None]:
sns.distplot(X_train1['TotalIncome'])

In [None]:
X_train1['TotalIncomeLog'] = np.log(X_train1['TotalIncome'])
sns.distplot(X_train1['TotalIncomeLog'])

In [None]:
X_test1['TotalIncome'] = X_test1['ApplicantIncome'] + X_test1['CoapplicantIncome']
X_test1['TotalIncomeLog'] = np.log(X_test1['TotalIncome'])

In [None]:
X_test1['EMI'] = X_test1['LoanAmount']/ X_test1['Loan_Amount_Term']

In [None]:
X_train1['EMI'] = X_train1['LoanAmount']/ X_train1['Loan_Amount_Term']

In [None]:
sns.distplot(X_train1['EMI'])

In [None]:
X_train1['BalanceIncome'] = X_train1['TotalIncome'] - (X_train1['EMI']*1000)
sns.distplot(X_train1['BalanceIncome'])

In [None]:
X_test1['BalanceIncome'] = X_test1['TotalIncome'] - (X_test1['EMI']*1000)

In [None]:
X_train1 = X_train1.drop(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'], axis = 1)

In [None]:
X_train1.head()

In [None]:
X_test1 = X_test1.drop(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'], axis = 1)

In [None]:
X_test1.head()

<h2> Final Model Building </h2>

In [None]:
classifier = []
classifier.append(("LogisticReg", LogisticRegression(solver='liblinear', multi_class='ovr')))
classifier.append(("KNN", KNeighborsClassifier()))
classifier.append(("NaiveBayes", GaussianNB()))
classifier.append(("SVM Gaussian", SVC(kernel = 'rbf', class_weight='balanced',random_state = 0)))
#classifier.append(("SVM No Kernel", SVC(kernel = 'linear', random_state = 0)))
classifier.append(("Decision Tree", DecisionTreeClassifier(criterion = 'entropy', random_state = 0)))
classifier.append(("Random Forest", RandomForestClassifier(n_estimators = 500, criterion = 'entropy', random_state = 0)))

In [None]:
seed = 0
results = []
names = []
for name, model in classifier:
    kfold = model_selection.KFold(n_splits=10, random_state=seed ,shuffle =True)
    cv_results = model_selection.cross_val_score(model, X_train1, y_t, cv=kfold)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
""" LOGISTIC REGRESSION """
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced', random_state = 13)
logreg.fit(X_train1, y_t)
y_pred = logreg.predict(X_test1)
# Creating confusion matrix and calculating the accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score
cm_logreg = confusion_matrix(y_te, y_pred)
as_logreg=accuracy_score(y_te, y_pred)

In [None]:
print(as_logreg)

In [None]:
from numpy import sqrt
from numpy import argmax
# calculate the fpr and tpr for all thresholds of the classification
probs = logreg.predict_proba(X_test1)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_te, preds)
roc_auc_logreg = metrics.auc(fpr, tpr)
# calculate the g-mean for each threshold
gmeans = sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (threshold[ix], gmeans[ix]))

plt.figure()
plt.title('Receiver Operating Characterstics')
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc_logreg))
plt.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
 
# create the axis of thresholds (scores)
ax2 = plt.gca().twinx()
ax2.plot(fpr, threshold, markeredgecolor='r',linestyle='dashed', color='r')
ax2.scatter(fpr[ix], threshold[ix], marker='o', color='r', label='Best1')
ax2.set_ylabel('Threshold',color='r')
ax2.set_ylim([threshold[-1],threshold[0]])
ax2.set_xlim([fpr[0],fpr[-1]])
 
plt.show()

In [None]:
""" K-NEAREST NEIGHBORS """
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 19, metric = 'minkowski', p = 2)
KNN.fit(X_train1, y_t)
y_pred = KNN.predict(X_test1)
# Creating confusion matrix and calculating the accuracy score
cm_knn = confusion_matrix(y_te, y_pred)
as_knn=accuracy_score(y_te, y_pred)

In [None]:
print(as_knn)

In [None]:
# calculate the fpr and tpr for all thresholds of the classification
probs = KNN.predict_proba(X_test1)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_te, preds)
roc_auc_knn = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc_knn)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
"""Naive Bayes Classifier"""
from sklearn.naive_bayes import GaussianNB
NaiveB = GaussianNB()
NaiveB.fit(X_train1, y_t)
y_pred1 = NaiveB.predict(X_test1)
# Creating confusion matrix and calculating the accuracy score
cm_nb = confusion_matrix(y_te, y_pred1)
as_nb = accuracy_score(y_te, y_pred1)
print(cm_nb)
print(as_nb)

In [None]:
from numpy import sqrt
from numpy import argmax
# calculate the fpr and tpr for all thresholds of the classification
probs = NaiveB.predict_proba(X_test1)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_te, preds)
roc_auc_nb = metrics.auc(fpr, tpr)
# calculate the g-mean for each threshold
gmeans = sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (threshold[ix], gmeans[ix]))

plt.figure()
plt.title('Receiver Operating Characterstics')
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc_nb))
plt.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
 
# create the axis of thresholds (scores)
ax2 = plt.gca().twinx()
ax2.plot(fpr, threshold, markeredgecolor='r',linestyle='dashed', color='r')
ax2.scatter(fpr[ix], threshold[ix], marker='o', color='r', label='Best1')
ax2.set_ylabel('Threshold',color='r')
ax2.set_ylim([threshold[-1],threshold[0]])
ax2.set_xlim([fpr[0],fpr[-1]])
 
plt.show()

In [None]:
""" SVM GAUSSIAN """
from sklearn.svm import SVC
SVCG = SVC(kernel = 'rbf', class_weight='balanced',random_state = 42, probability = True)
SVCG.fit(X_train1, y_t)
y_pred = SVCG.predict(X_test1)
# Creating confusion matrix and calculating the accuracy score
cm_svm_gaussian = confusion_matrix(y_te, y_pred)
as_svm_gaussian = accuracy_score(y_te, y_pred)
print(as_svm_gaussian)

In [None]:
# calculate the fpr and tpr for all thresholds of the classification
probs = SVCG.predict_proba(X_test1)
preds = probs[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_te, preds)
roc_auc_svg = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc_svg)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# """ SVM NO KERNEL """
# from sklearn.svm import SVC
# classifier = SVC(kernel = 'linear', random_state = 0)
# classifier.fit(X_train1, y_t)
# y_pred = classifier.predict(X_test1)
# # Creating confusion matrix and calculating the accuracy score
# cm_svm_nokernel = confusion_matrix(y_te, y_pred)
# as_svm_nokernel = accuracy_score(y_te, y_pred)
# print(as_svm_nokernel)
""" SVM GAUSSIAN """
from sklearn.svm import SVC

# RBF-kernel SVM without class weighting (contrast with SVCG, which passes
# class_weight='balanced'). probability=True enables predict_proba.
SVCGI = SVC(
    kernel='rbf',
    probability=True,
    random_state=0,
).fit(X_train1, y_t)

# Hard predictions on the held-out split, then accuracy and confusion matrix.
y_pred = SVCGI.predict(X_test1)
as_svm_gaussian1 = accuracy_score(y_true=y_te, y_pred=y_pred)
cm_svm_gaussian1 = confusion_matrix(y_true=y_te, y_pred=y_pred)
print(as_svm_gaussian1)

In [None]:
# Scores for the ROC curve: second predict_proba column of the
# unweighted RBF SVM, evaluated on the held-out split.
probs = SVCGI.predict_proba(X_test1)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_te, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve (solid blue) against the chance diagonal (dashed red).
plt.plot(fpr, tpr, 'b', label=f'AUC = {roc_auc:.2f}')
plt.plot([0, 1], [0, 1], 'r--')
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()

In [None]:
""" DECISION TREE CLASSIFICATION """
from sklearn.tree import DecisionTreeClassifier

# Single decision tree split on entropy (information gain), seeded for
# reproducibility.
DTC = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
).fit(X_train1, y_t)

# Hard predictions on the held-out split, then accuracy and confusion matrix.
y_pred = DTC.predict(X_test1)
as_dtc = accuracy_score(y_true=y_te, y_pred=y_pred)
cm_dtc = confusion_matrix(y_true=y_te, y_pred=y_pred)
print(as_dtc)

In [None]:
# Scores for the ROC curve: second predict_proba column of the decision
# tree, evaluated on the held-out split.
probs = DTC.predict_proba(X_test1)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_te, preds)
roc_auc_dt = metrics.auc(fpr, tpr)

# ROC curve (solid blue) against the chance diagonal (dashed red).
plt.plot(fpr, tpr, 'b', label=f'AUC = {roc_auc_dt:.2f}')
plt.plot([0, 1], [0, 1], 'r--')
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()

In [None]:
# Random Forest without grid search (arbitrarily chosen hyperparameter values)

In [None]:
""" RANDOM FOREST CLASSIFIER """
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 42 shallow trees (depth <= 3) split on entropy.
RanForest = RandomForestClassifier(
    n_estimators=42,
    max_depth=3,
    criterion='entropy',
    random_state=42,
).fit(X_train1, y_t)

# Hard predictions on the held-out split, then accuracy and confusion matrix.
y_pred = RanForest.predict(X_test1)
as_rfc = accuracy_score(y_true=y_te, y_pred=y_pred)
cm_rfc = confusion_matrix(y_true=y_te, y_pred=y_pred)
print(as_rfc)

In [None]:
# Random Forest with the n_estimators and max_depth selected by grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values: odd depths 1..19 and tree counts 1, 21, ..., 181.
paramgrid = {
    'max_depth': list(range(1, 20, 2)),
    'n_estimators': list(range(1, 200, 20)),
}

# Exhaustive search over the grid using GridSearchCV's default CV and scoring.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_grid=paramgrid,
)
grid_search.fit(X_train1, y_t)

# Display the refitted best model as the cell output.
grid_search.best_estimator_

In [None]:
""" RANDOM FOREST CLASSIFIER """
from sklearn.ensemble import RandomForestClassifier

# Forest with hard-coded n_estimators=141 / max_depth=5.
# NOTE(review): these presumably come from the grid search's best estimator;
# that search used the default criterion and random_state=1 -- confirm.
# This rebinds RanForest, cm_rfc and as_rfc from the baseline forest.
RanForest = RandomForestClassifier(
    n_estimators=141,
    max_depth=5,
    criterion='entropy',
    random_state=42,
).fit(X_train1, y_t)

# Hard predictions on the held-out split, then accuracy and confusion matrix.
y_pred = RanForest.predict(X_test1)
as_rfc = accuracy_score(y_true=y_te, y_pred=y_pred)
cm_rfc = confusion_matrix(y_true=y_te, y_pred=y_pred)
print(as_rfc)

In [None]:
# Kept as bare names because a later cell calls sqrt/argmax directly.
from numpy import sqrt
from numpy import argmax

# Scores for the ROC curve: second predict_proba column of the random
# forest, evaluated on the held-out split.
probs = RanForest.predict_proba(X_test1)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_te, preds)
roc_auc_rf = metrics.auc(fpr, tpr)

# G-mean = sqrt(TPR * (1 - FPR)); its maximiser is reported as the best
# operating threshold.
gmeans = sqrt(tpr * (1 - fpr))
ix = argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (threshold[ix], gmeans[ix]))

# Left axis: ROC curve, chance diagonal and the best G-mean point.
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc_rf))
plt.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The earlier, misspelled title call was dead code: this one overrode it.
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

# Right axis: decision threshold as a function of FPR.
ax2 = plt.gca().twinx()
# roc_curve's first threshold is a sentinel that is np.inf on
# scikit-learn >= 1.3; matplotlib rejects infinite axis limits, so plot and
# scale using only the finite thresholds. On older versions every threshold
# is finite and this is identical to the previous behaviour.
finite = np.isfinite(threshold)
ax2.plot(fpr[finite], threshold[finite], markeredgecolor='r', linestyle='dashed', color='r')
ax2.scatter(fpr[ix], threshold[ix], marker='o', color='r', label='Best1')
ax2.set_ylabel('Threshold', color='r')
ax2.set_ylim([threshold[finite].min(), threshold[finite].max()])
ax2.set_xlim([fpr[0], fpr[-1]])

plt.show()

In [None]:
# Dump the raw ROC arrays from the most recent curve, one per line.
print(tpr, fpr, threshold, gmeans, sep='\n')

In [None]:
# Rank the classifiers by test-set ROC AUC and report the best one.
auc_scores = {
    'auc_logreg': roc_auc_logreg,
    'auc_knn': roc_auc_knn,
    'auc_svm_gaussian': roc_auc_svg,
    'auc_nb': roc_auc_nb,
    'auc_dtc': roc_auc_dt,
    'auc_rfc': roc_auc_rf,
}

for model_name, auc_value in auc_scores.items():
    print(f"{model_name}={auc_value}")

# Iterate in reverse so that, on a tie, the model listed last wins -- the
# same outcome as the previous running-max loop.
best_model = max(reversed(list(auc_scores)), key=auc_scores.get)
best_auc = auc_scores[best_model]

# The value reported here is an AUC, not an accuracy.
print(f"The best AUROC score in this case is {best_model} with AUC score {best_auc}")

In [None]:
# Rank the classifiers by test-set accuracy and report the best one.
score = {
    'as_logreg': as_logreg,
    'as_knn': as_knn,
    'as_svm_gaussian': as_svm_gaussian,
    'as_nb': as_nb,
    'as_dtc': as_dtc,
    'as_rfc': as_rfc,
}

best_name = best_value = None
for name, value in score.items():
    # ">=" (not ">") so that a later model takes over on a tie.
    if best_value is None or value >= best_value:
        best_name, best_value = name, value
    print(f"{name}={value}")

print(f"The best accuracy score in this case is {best_name} with accuracy score {best_value}")

In [None]:
# Feature importances of the fitted random forest, labelled by training column.
importances = pd.Series(
    data=RanForest.feature_importances_,
    index=X_train1.columns,
)
importances.plot.barh(figsize=(12, 8))

In [None]:
# Peek at the first five rows of the training features.
X_train1.head(n=5)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, LabelEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Two-step pipeline: a scaler followed by an MLP classifier. The step names
# are what the grid-search parameter keys refer to.
pipe = Pipeline([
    ('preprocess', StandardScaler()),
    ('classification', MLPClassifier()),
])

In [None]:
# Seed shared by the MLP candidates in the grid search.
random_state = 42

# Candidate scalers for the pipeline's 'preprocess' step.
preprocess = [
    MinMaxScaler(),
    StandardScaler(),
    RobustScaler(),
]

# Candidate MLPClassifier hyperparameters.
mlp_activation = ['identity', 'tanh', 'relu']
mlp_solver = ['sgd', 'adam']
mlp_alpha = [0.01, 0.1, 1]
mlp_max_iter = [10, 100, 1000]

In [None]:
# One grid: the whole 'preprocess' step is swapped between scalers, while
# 'classification__*' keys address MLPClassifier parameters inside the pipeline.
mlp_param_grid = [
    dict(
        preprocess=preprocess,
        classification__activation=mlp_activation,
        classification__solver=mlp_solver,
        classification__random_state=[random_state],
        classification__max_iter=mlp_max_iter,
        classification__alpha=mlp_alpha,
    )
]

In [None]:
# Shapes of the training features and labels, printed side by side.
print(*(part.shape for part in (X_train1, y_t)))

In [None]:
# Shapes of the test features and labels, printed side by side.
print(*(part.shape for part in (X_test1, y_te)))

In [None]:
# 10-fold stratified cross-validation, shuffled with a fixed seed.
strat_k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search over scaler x MLP hyperparameters, scored by F1 and run
# on all available cores.
mlp_grid = GridSearchCV(
    estimator=pipe,
    param_grid=mlp_param_grid,
    scoring='f1',
    cv=strat_k_fold,
    n_jobs=-1,
    verbose=2,
)
mlp_grid.fit(X_train1, y_t)

# Winning parameter combination and its mean cross-validated F1.
best_params = mlp_grid.best_params_
print(best_params)
print(f'\nBest F1 score for MLP: {mlp_grid.best_score_ * 100:.2f}%')

In [None]:
# Refit a single MLP with hard-coded settings and evaluate it on the test split.
# NOTE(review): the scaler and hyperparameters below presumably mirror the
# grid search's best_params_ -- confirm, or build from best_params instead.
scaler = RobustScaler()

print('\nData preprocessing with {scaler}\n'.format(scaler=scaler))

# Fit the scaler on the training features only, then apply it to both splits.
X_train_scaler = scaler.fit_transform(X_train1)
X_test_scaler = scaler.transform(X_test1)

mlp = MLPClassifier(
    max_iter=1000,
    alpha=1,
    activation='identity',
    solver='sgd',
    random_state=42
)
mlp.fit(X_train_scaler, y_t)

mlp_predict = mlp.predict(X_test_scaler)
# Second predict_proba column, used as the score for ROC/AUC.
mlp_predict_proba = mlp.predict_proba(X_test_scaler)[:, 1]

# Evaluate against y_te / y_t, the labels paired with X_test1 / X_train1
# throughout this notebook (the previous y_test / y_train names are not the
# ones used by the surrounding cells).
print('MLP Accuracy: {:.2f}%'.format(accuracy_score(y_te, mlp_predict) * 100))
print('MLP AUC: {:.2f}%'.format(roc_auc_score(y_te, mlp_predict_proba) * 100))
print('MLP Classification report:\n\n', classification_report(y_te, mlp_predict))
print('MLP Training set score: {:.2f}%'.format(mlp.score(X_train_scaler, y_t) * 100))
print('MLP Testing set score: {:.2f}%'.format(mlp.score(X_test_scaler, y_te) * 100))

In [None]:
# ROC analysis of the MLP's test-set scores.
fpr, tpr, threshold = metrics.roc_curve(y_te, mlp_predict_proba)
roc_auc = metrics.auc(fpr, tpr)

# G-mean = sqrt(TPR * (1 - FPR)); its maximiser is reported as the best
# operating threshold. np.sqrt / np.argmax are used so this cell does not
# depend on bare names imported by an earlier cell.
gmeans = np.sqrt(tpr * (1 - fpr))
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (threshold[ix], gmeans[ix]))

# Left axis: ROC curve, chance diagonal and the best G-mean point.
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % (roc_auc))
plt.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The earlier, misspelled title call was dead code: this one overrode it.
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

# Right axis: decision threshold as a function of FPR.
ax2 = plt.gca().twinx()
# roc_curve's first threshold is a sentinel that is np.inf on
# scikit-learn >= 1.3; matplotlib rejects infinite axis limits, so plot and
# scale using only the finite thresholds. On older versions every threshold
# is finite and this is identical to the previous behaviour.
finite = np.isfinite(threshold)
ax2.plot(fpr[finite], threshold[finite], markeredgecolor='r', linestyle='dashed', color='r')
ax2.scatter(fpr[ix], threshold[ix], marker='o', color='r', label='Best1')
ax2.set_ylabel('Threshold', color='r')
ax2.set_ylim([threshold[finite].min(), threshold[finite].max()])
ax2.set_xlim([fpr[0], fpr[-1]])

plt.show()