In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import pickle
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier

In [5]:
data = pd.read_csv("data/loan_approval.CSV")
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


## CLEANING DATA

In [6]:
data.isnull().any()

Loan_ID              False
Gender                True
Married               True
Dependents            True
Education            False
Self_Employed         True
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount            True
Loan_Amount_Term      True
Credit_History        True
Property_Area        False
Loan_Status          False
dtype: bool

In [7]:
#counting number of missing values
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
data['Gender'].fillna(data['Gender'].mode()[0],inplace=True)
data['Married'].fillna(data['Married'].mode()[0],inplace=True)
data['Dependents'].fillna(data['Dependents'].mode()[0],inplace=True)
data['Self_Employed'].fillna(data['Self_Employed'].mode()[0],inplace=True)
data['LoanAmount'].fillna(data['LoanAmount'].median(),inplace=True)
data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].median(),inplace=True)
data['Credit_History'].fillna(data['Credit_History'].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Gender'].fillna(data['Gender'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Married'].fillna(data['Married'].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [9]:
data.isnull().any()

Loan_ID              False
Gender               False
Married              False
Dependents           False
Education            False
Self_Employed        False
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount           False
Loan_Amount_Term     False
Credit_History       False
Property_Area        False
Loan_Status          False
dtype: bool

In [10]:
data=data.drop(['Loan_ID','Gender','Married','Self_Employed'],axis=1)
data.head()

Unnamed: 0,Dependents,Education,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,Graduate,5849,0.0,128.0,360.0,1.0,Urban,Y
1,1,Graduate,4583,1508.0,128.0,360.0,1.0,Rural,N
2,0,Graduate,3000,0.0,66.0,360.0,1.0,Urban,Y
3,0,Not Graduate,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,0,Graduate,6000,0.0,141.0,360.0,1.0,Urban,Y


In [10]:
#label encoding
le = LabelEncoder()
data['Education'] = le.fit_transform(data['Education'])
data['Loan_Status'] = le.fit_transform(data['Loan_Status'])
data['Property_Area'] = le.fit_transform(data['Property_Area'])
data['Dependents'] = le.fit_transform(data['Dependents'])

#### Education 
- 0 is graduate
- 1 is not graduate
#### Loan_Status
- 0 is N
- 1 is Y
#### Property_Area
- 0 is rural
- 1 is semiurban
- 2 is urban
#### Dependents
- 0 is 0
- 1 is 1
- 2 is 2
- 3 is 3+

In [12]:
# divide data
x = data.iloc[:,:8].values
y = data.iloc[:,8:].values

In [13]:
#one hot encoding for Property Area and Dependents
one = OneHotEncoder()
dependents = one.fit_transform(x[:,:1]).toarray()
property = one.fit_transform(x[:,7:8]).toarray()

In [14]:
x = np.delete(x,0,axis=1)

In [15]:
x = np.delete(x,6,axis=1)

In [16]:
x = np.concatenate((x,dependents),axis=1)

In [17]:
x = np.concatenate((x,property),axis=1)
x.shape

(614, 13)

##### Education, Applicant Income, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History, dependents(4), property(3)

#### dependants
- 0001 is 0
- 0010 is 1
- 0100 is 2
- 1000 is 3+

#### property
- 001 is rural
- 010 is semiurban
- 100 is urban

In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [19]:
#standard scaling because classification
sc =  StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)


In [None]:
pickle.dump(sc,open("res/scaler.pkl","wb"))

In [None]:
np.save('res/x_train.npy', x_train)
np.save('res/x_test.npy', x_test)

## MODEL BUILDING


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [None]:

def xgboost_cv(x_train, y_train):
    param_grid = {
        'learning_rate': [0.05, 0.1, 0.25, 0.5],
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 100, 200]
    }
    xg = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    grid_search = GridSearchCV(xg, param_grid, cv=5, scoring='accuracy', verbose=1)
    grid_search.fit(x_train, y_train.ravel())
    
    print("Best Parameters for XGBoost:", grid_search.best_params_)
    print("Best Cross-Validation Accuracy:", grid_search.best_score_)
    best_model = grid_search.best_estimator_
    
    return best_model

def KNN_cv(x_train, y_train):
    param_grid = {
        'n_neighbors': [3, 5, 7, 10],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', verbose=1)
    grid_search.fit(x_train, y_train.ravel())
    
    print("Best Parameters for KNN:", grid_search.best_params_)
    print("Best Cross-Validation Accuracy:", grid_search.best_score_)
    best_model = grid_search.best_estimator_
    
    return best_model

def randomForest_cv(x_train, y_train):
    param_grid = {
        'n_estimators': [10, 50, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30]
    }
    rfc = RandomForestClassifier(random_state=0)
    grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', verbose=1)
    grid_search.fit(x_train, y_train.ravel())
    
    print("Best Parameters for Random Forest:", grid_search.best_params_)
    print("Best Cross-Validation Accuracy:", grid_search.best_score_)
    best_model = grid_search.best_estimator_
    
    return best_model

def decisionTree_cv(x_train, y_train):
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30]
    }
    dtc = DecisionTreeClassifier(random_state=0)
    grid_search = GridSearchCV(dtc, param_grid, cv=5, scoring='accuracy', verbose=1)
    grid_search.fit(x_train, y_train.ravel())
    
    print("Best Parameters for Decision Tree:", grid_search.best_params_)
    print("Best Cross-Validation Accuracy:", grid_search.best_score_)
    best_model = grid_search.best_estimator_
    
    return best_model

def evaluate_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Test Accuracy: ", accuracy_score(y_test, y_pred))


In [40]:
# Finding the best model for each classifier with optimized hyperparameters
best_xgboost_model = xgboost_cv(x_train, y_train)
evaluate_model(best_xgboost_model, x_test, y_test)




Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters for XGBoost: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}
Best Cross-Validation Accuracy: 0.8003710575139147
Confusion Matrix:
 [[14 19]
 [ 2 88]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.42      0.57        33
           1       0.82      0.98      0.89        90

    accuracy                           0.83       123
   macro avg       0.85      0.70      0.73       123
weighted avg       0.84      0.83      0.81       123

Test Accuracy:  0.8292682926829268


In [41]:
best_knn_model = KNN_cv(x_train, y_train)
evaluate_model(best_knn_model, x_test, y_test)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 3}
Best Cross-Validation Accuracy: 0.7656977942692229
Confusion Matrix:
 [[14 19]
 [12 78]]
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.42      0.47        33
           1       0.80      0.87      0.83        90

    accuracy                           0.75       123
   macro avg       0.67      0.65      0.65       123
weighted avg       0.73      0.75      0.74       123

Test Accuracy:  0.7479674796747967


In [42]:

best_rf_model = randomForest_cv(x_train, y_train)
evaluate_model(best_rf_model, x_test, y_test)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters for Random Forest: {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.8044526901669761
Confusion Matrix:
 [[14 19]
 [ 4 86]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.42      0.55        33
           1       0.82      0.96      0.88        90

    accuracy                           0.81       123
   macro avg       0.80      0.69      0.72       123
weighted avg       0.81      0.81      0.79       123

Test Accuracy:  0.8130081300813008


In [43]:

best_dt_model = decisionTree_cv(x_train, y_train)
evaluate_model(best_dt_model, x_test, y_test)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 10}
Best Cross-Validation Accuracy: 0.7433106575963719
Confusion Matrix:
 [[19 14]
 [14 76]]
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.58      0.58        33
           1       0.84      0.84      0.84        90

    accuracy                           0.77       123
   macro avg       0.71      0.71      0.71       123
weighted avg       0.77      0.77      0.77       123

Test Accuracy:  0.7723577235772358


In [None]:
pickle.dump(best_xgboost_model,open("res/model.pkl","wb"))