The goal of this project is to determine whether a Bank would approve loan to an individual.

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train_loan prediction.csv')

In [3]:
train

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [4]:
train.drop(['Loan_ID','CoapplicantIncome','Dependents'],axis=1,inplace = True)

In [5]:
train

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,Graduate,No,5849,,360.0,1.0,Urban,Y
1,Male,Yes,Graduate,No,4583,128.0,360.0,1.0,Rural,N
2,Male,Yes,Graduate,Yes,3000,66.0,360.0,1.0,Urban,Y
3,Male,Yes,Not Graduate,No,2583,120.0,360.0,1.0,Urban,Y
4,Male,No,Graduate,No,6000,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...
609,Female,No,Graduate,No,2900,71.0,360.0,1.0,Rural,Y
610,Male,Yes,Graduate,No,4106,40.0,180.0,1.0,Rural,Y
611,Male,Yes,Graduate,No,8072,253.0,360.0,1.0,Urban,Y
612,Male,Yes,Graduate,No,7583,187.0,360.0,1.0,Urban,Y


In [6]:
train.dtypes

Gender               object
Married              object
Education            object
Self_Employed        object
ApplicantIncome       int64
LoanAmount          float64
Loan_Amount_Term    float64
Credit_History      float64
Property_Area        object
Loan_Status          object
dtype: object

We need to encode the above columns to numerical digits so that we can run the algorithms.

In [7]:
def print_unique_col_values(df):
       for column in df:
            if df[column].dtypes=='object':
                print(f'{column}: {df[column].unique()}')

In [8]:
print_unique_col_values(train)

Gender: ['Male' 'Female' nan]
Married: ['No' 'Yes' nan]
Education: ['Graduate' 'Not Graduate']
Self_Employed: ['No' 'Yes' nan]
Property_Area: ['Urban' 'Rural' 'Semiurban']
Loan_Status: ['Y' 'N']


In [9]:
train['Gender']= train['Gender'].map({'Male':0, 'Female':1})
train['Married']= train['Married'].map({'No':0, 'Yes':1})
train['Education'] = train['Education'].map({'Graduate':1, 'Not Graduate':0})
train['Self_Employed']= train['Self_Employed'].map({'No':0, 'Yes':1})
train['Loan_Status']= train['Loan_Status'].map({'N':0, 'Y':1})
train['Property_Area']= train['Property_Area'].map({'Urban':2,'Rural':0,'Semiurban':1})

In [10]:
train

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0.0,0.0,1,0.0,5849,,360.0,1.0,2,1
1,0.0,1.0,1,0.0,4583,128.0,360.0,1.0,0,0
2,0.0,1.0,1,1.0,3000,66.0,360.0,1.0,2,1
3,0.0,1.0,0,0.0,2583,120.0,360.0,1.0,2,1
4,0.0,0.0,1,0.0,6000,141.0,360.0,1.0,2,1
...,...,...,...,...,...,...,...,...,...,...
609,1.0,0.0,1,0.0,2900,71.0,360.0,1.0,0,1
610,0.0,1.0,1,0.0,4106,40.0,180.0,1.0,0,1
611,0.0,1.0,1,0.0,8072,253.0,360.0,1.0,2,1
612,0.0,1.0,1,0.0,7583,187.0,360.0,1.0,2,1


In [11]:
train.isnull().sum()

Gender              13
Married              3
Education            0
Self_Employed       32
ApplicantIncome      0
LoanAmount          22
Loan_Amount_Term    14
Credit_History      50
Property_Area        0
Loan_Status          0
dtype: int64

Dataset has missing values, and this can cause problems for our machine learning algorithm.

Data imputation is to use a model to predict the missing values. 
This requires a model to be created for each input variable that has missing values.


" KNNImputer "


is a data transform that is first configured based on the method used to estimate the missing values.

The default distance measure is a Euclidean distance measure that is NaN aware, e.g. will not include NaN values when calculating the distance between members of the training dataset. This is set via the “metric” argument.

k-Nearest Neighbours (kNN) identifies the neighboring points through a measure of distance and the missing values can be estimated using completed values of neighboring observations.

In [12]:
from sklearn.impute import KNNImputer

In [13]:
knn = KNNImputer()

In [14]:
trains = knn.fit_transform(train)

In [15]:
trains

array([[0., 0., 1., ..., 1., 2., 1.],
       [0., 1., 1., ..., 1., 0., 0.],
       [0., 1., 1., ..., 1., 2., 1.],
       ...,
       [0., 1., 1., ..., 1., 2., 1.],
       [0., 1., 1., ..., 1., 2., 1.],
       [1., 0., 1., ..., 0., 1., 0.]])

In [16]:
trains = pd.DataFrame(trains,columns=train.columns)

In [17]:
trains.head(1)  #  Notice the loan amount is filled up.

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0.0,0.0,1.0,0.0,5849.0,179.4,360.0,1.0,2.0,1.0


In [18]:
trains.isnull().sum()

Gender              0
Married             0
Education           0
Self_Employed       0
ApplicantIncome     0
LoanAmount          0
Loan_Amount_Term    0
Credit_History      0
Property_Area       0
Loan_Status         0
dtype: int64

In [19]:
x = trains.iloc[:,:-1]
y = trains.Loan_Status

In [20]:
x

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.0,0.0,1.0,0.0,5849.0,179.4,360.0,1.0,2.0
1,0.0,1.0,1.0,0.0,4583.0,128.0,360.0,1.0,0.0
2,0.0,1.0,1.0,1.0,3000.0,66.0,360.0,1.0,2.0
3,0.0,1.0,0.0,0.0,2583.0,120.0,360.0,1.0,2.0
4,0.0,0.0,1.0,0.0,6000.0,141.0,360.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...
609,1.0,0.0,1.0,0.0,2900.0,71.0,360.0,1.0,0.0
610,0.0,1.0,1.0,0.0,4106.0,40.0,180.0,1.0,0.0
611,0.0,1.0,1.0,0.0,8072.0,253.0,360.0,1.0,2.0
612,0.0,1.0,1.0,0.0,7583.0,187.0,360.0,1.0,2.0


In [21]:
y.value_counts()

1.0    422
0.0    192
Name: Loan_Status, dtype: int64

In [22]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
x_sm, y_sm = smote.fit_resample(x, y)

y_sm.value_counts()

0.0    422
1.0    422
Name: Loan_Status, dtype: int64

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
from sklearn.metrics import classification_report

In [25]:
from sklearn.pipeline import make_pipeline

In [26]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

from sklearn.ensemble import RandomForestClassifier 

model = RandomForestClassifier(max_depth=10, max_features=8, n_estimators=500) 

In [27]:
pipe = make_pipeline(sc,model)

In [28]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_sm,y_sm, test_size = 0.2,random_state=10,stratify=y_sm)

In [29]:
pipe.fit(x_train,y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=10, max_features=8,
                                        n_estimators=500))])

In [30]:
y_pred = pipe.predict(x_test)

In [31]:
print("Accuracy", accuracy_score(y_test, y_pred))

Accuracy 0.8224852071005917


In [32]:
print("Classification Report: \n", classification_report(y_test, y_pred))

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.86      0.77      0.81        84
         1.0       0.80      0.87      0.83        85

    accuracy                           0.82       169
   macro avg       0.83      0.82      0.82       169
weighted avg       0.83      0.82      0.82       169



In [33]:
from sklearn.metrics import confusion_matrix

y_pred = pipe.predict(x_test)
cm = confusion_matrix(y_test,y_pred)

cm

array([[65, 19],
       [11, 74]], dtype=int64)

In [34]:
import pickle 
pickle_out = open("classifier-knn imputated.pkl", mode = "wb") 
pickle.dump(pipe, pickle_out) 
pickle_out.close()