In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
data = pd.read_csv("train_u6lujuX_CVtuZ9i.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
#Check for Null Values
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


## Feature Engineering: 

#### 1) Dealing with Missing Values

In [5]:
data['Education'] = data['Education'].replace(['Graduate', 'Not Graduate'] , [1.0 , 0.0])

mode_value = data['Education'].mode()[0]

# Replace the null values with the mode value
data['Education'].fillna(mode_value, inplace=True)


#dropping unnecessary column
data = data.drop('Loan_ID' , axis = 1)

data['Gender'] = data['Gender'].replace(['Male', 'Female'] , [1.0 , 0.0])
data['Gender'].unique()
mode_value = data['Gender'].mode()[0]
# Replace the null values with the mode value
data['Gender'].fillna(mode_value, inplace=True)

data['Married'] = data['Married'].replace(['No', 'Yes'] , [0.0 , 1.0])
mode_value = data['Married'].mode()[0]
# Replace the null values with the mode value
data['Married'].fillna(mode_value, inplace=True)

data['Self_Employed'] = data['Self_Employed'].replace(['No', 'Yes'] , [0.0 , 1.0])
mode_value = data['Self_Employed'].mode()[0]
# Replace the null values with the mode value
data['Self_Employed'].fillna(mode_value, inplace=True)

data['Property_Area'] = data['Property_Area'].replace(['Urban', 'Rural', 'Semiurban'] , [2.0 , 0.0 , 1.0])
mode_value = data['Property_Area'].mode()[0]
# Replace the null values with the mode value
data['Property_Area'].fillna(mode_value, inplace=True)

data['Loan_Status'] = data['Loan_Status'].replace(['Y', 'N'] , [1.0 , 0.0])
mode_value = data['Loan_Status'].mode()[0]
# Replace the null values with the mode value
data['Loan_Status'].fillna(mode_value, inplace=True)

data['Dependents'] = data['Dependents'].replace(['0', '1', '2', '3+'] , [0.0 , 1.0 , 2.0 , 3.0])
mode_value = data['Dependents'].mode()[0]
# Replace the null values with the mode value
data['Dependents'].fillna(mode_value, inplace=True)

mode_value = data['Credit_History'].mode()[0]
data['Credit_History'].fillna(mode_value , inplace=True)

#imputations in continuous variables
med_value = data['LoanAmount'].median()
data['LoanAmount'].fillna(med_value , inplace=True)
med_value = data['Loan_Amount_Term'].median()
data['Loan_Amount_Term'].fillna(med_value , inplace=True)

data['ApplicantIncome'] = data['ApplicantIncome'].astype(float)

In [6]:
#Seperating response and Predictors

y = data['Loan_Status']
X = data.drop('Loan_Status' , axis=1 , inplace=False)

predictors = list(X.columns)

#### 2) Standardizing the Features


In [7]:
# Standardizing the predictor values

from sklearn.preprocessing import StandardScaler
object= StandardScaler()

# standardization 
X_trans = pd.DataFrame(object.fit_transform(X) , columns=predictors) 

In [8]:
#Splitting the Data into trainig and testing sets

from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.3 , random_state = 42 , stratify = y) 

## Feature Selection

#### 1) Backward Stepwise Elimination Method

In [9]:
##Backward Stepwise Selection


X_train1 = sm.add_constant(X_train)
feature = list(X_train1.columns)
data = pd.DataFrame(X_train1 , columns=feature)
data['target'] = y_train


model = sm.OLS(data['target'] , data.drop(['target'] , axis=1)).fit()
selected_features = feature


while len(selected_features) > 1:
    p_values = model.pvalues[1:]
    max_p_value_idx = np.argmax(p_values)
    if p_values[max_p_value_idx] <= 0.2351:
        break
    feature_to_remove = selected_features[max_p_value_idx + 1]  
    selected_features.remove(feature_to_remove)
    model = sm.OLS(data['target'], data[selected_features]).fit()
    
print("the important features are \n")
print(selected_features)

the important features are 

['const', 'Married', 'Education', 'Credit_History']


#### 2) Using Select k-Best Classifier 

In [11]:
from sklearn.feature_selection import SelectKBest,f_classif #selecting features by the method of selectbestclassifier.
X1=X.columns
fs = SelectKBest(f_classif, k=5)
fs.fit(X,y)
X1 = X1[fs.get_support()]
X1

Index(['Married', 'Education', 'CoapplicantIncome', 'LoanAmount',
       'Credit_History'],
      dtype='object')

## Model Fitting

### 1) Logistic Regression

In [33]:
#fitting a logistic Model
logisticRegr = LogisticRegression(max_iter=4000)
logisticRegr.fit(X_train[X1], y_train)

In [34]:
y_pred = logisticRegr.predict(X_test[X1])

In [35]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print ("Accuracy :\n ",accuracy_score(y_test,y_pred)*100)

print("Confusion Matrix:\n ", confusion_matrix(y_test, y_pred))

print("Report : \n", classification_report(y_test, y_pred))

Accuracy :
  85.4054054054054
Confusion Matrix:
  [[ 33  25]
 [  2 125]]
Report : 
               precision    recall  f1-score   support

         0.0       0.94      0.57      0.71        58
         1.0       0.83      0.98      0.90       127

    accuracy                           0.85       185
   macro avg       0.89      0.78      0.81       185
weighted avg       0.87      0.85      0.84       185



In [36]:
# Calculating the AUC

from sklearn import metrics
auc2 = metrics.roc_auc_score(y_test, y_pred)
print("the auc score is %.3f"%auc2)

the auc score is 0.777


### 2) Random Forest Classifier

In [42]:
#Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
model_2 = RandomForestClassifier(random_state=0)
model_2.fit(X_train[X1] , y_train)
y_pred = model_2.predict(X_test[X1])
print ("Accuracy :\n ",accuracy_score(y_test,y_pred)*100)

print("Confusion Matrix:\n ", confusion_matrix(y_test, y_pred))

print("Report : \n", classification_report(y_test, y_pred))

auc = metrics.roc_auc_score(y_test, y_pred)
print("the auc score is %.3f"%auc)

Accuracy :
  84.86486486486487
Confusion Matrix:
  [[ 32  26]
 [  2 125]]
Report : 
               precision    recall  f1-score   support

         0.0       0.94      0.55      0.70        58
         1.0       0.83      0.98      0.90       127

    accuracy                           0.85       185
   macro avg       0.88      0.77      0.80       185
weighted avg       0.86      0.85      0.84       185

the auc score is 0.768


In [18]:
#Hyper parameter Tuning using RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param={"max_depth": [3,5,7],
       "max_features": randint(1, 11),
              "min_samples_split": randint(2, 11),
              "min_samples_leaf": randint(1, 11),
              "criterion": ["gini", "entropy"]}
rf1_cv=RandomizedSearchCV(model_2,param,cv=5)
rf1_cv.fit(X_train[X1],y_train)

print("Tuned Decision Tree Parameters: {}".format(rf1_cv.best_params_))
print("Best score is {}".format(rf1_cv.best_score_))
#print("Confusion Matrix:\n ", confusion_matrix(y_test, rf_pred1))
#print ("Accuracy :\n ",accuracy_score(y_test,rf_pred1)*100)
#print("Report : \n", classification_report(y_test, rf_pred1)) 

Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 5, 'max_features': 9, 'min_samples_leaf': 3, 'min_samples_split': 7}
Best score is 0.7947469220246238


In [19]:
y_pred = rf1_cv.predict(X_test[X1])

print ("Accuracy :\n ",accuracy_score(y_test,y_pred)*100)

print("Confusion Matrix:\n ", confusion_matrix(y_test, y_pred))

print("Report : \n", classification_report(y_test, y_pred))

auc = metrics.roc_auc_score(y_test, y_pred)

print("the auc score is %.3f"%auc)

Accuracy :
  81.62162162162161
Confusion Matrix:
  [[ 36  22]
 [ 12 115]]
Report : 
               precision    recall  f1-score   support

         0.0       0.75      0.62      0.68        58
         1.0       0.84      0.91      0.87       127

    accuracy                           0.82       185
   macro avg       0.79      0.76      0.78       185
weighted avg       0.81      0.82      0.81       185

the auc score is 0.763


In [40]:
def objective_function(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Create a RandomForestClassifier with the given hyperparameters
    model = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=42
    )

    # Calculate the cross-validation score using the accuracy metric
    scores = cross_val_score(model, X_train[X1], y_train, cv=5, scoring='accuracy')
    
    # Return the average score
    return scores.mean()

In [41]:
param_bounds = {
    'n_estimators': (10, 100),
    'max_depth': (3, 10),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 5)
}

In [42]:
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=param_bounds,
    random_state=42,
)

optimizer.maximize(init_points=5, n_iter=10)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.7971   [0m | [0m5.622    [0m | [0m4.803    [0m | [0m7.856    [0m | [0m63.88    [0m |
| [0m2        [0m | [0m0.7947   [0m | [0m4.092    [0m | [0m1.624    [0m | [0m2.465    [0m | [0m87.96    [0m |
| [0m3        [0m | [0m0.7855   [0m | [0m7.208    [0m | [0m3.832    [0m | [0m2.165    [0m | [0m97.29    [0m |
| [0m4        [0m | [0m0.7691   [0m | [0m8.827    [0m | [0m1.849    [0m | [0m3.455    [0m | [0m26.51    [0m |
| [0m5        [0m | [0m0.7947   [0m | [0m5.13     [0m | [0m3.099    [0m | [0m5.456    [0m | [0m36.21    [0m |
| [0m6        [0m | [0m0.7947   [0m | [0m4.835    [0m | [0m2.965    [0m | [0m5.354    [0m | [0m36.21    [0m |
| [0m7        [0m | [0m0.7947   [0m | [0m5.52     [0m | [0m3.539    [0m | [0m5.823    [0m | [0m72.81    [0m 

In [43]:
best_params = optimizer.max['params']
best_score = optimizer.max['target']

print("Best Hyperparameters:", best_params)
print("Best Score:", -best_score)

Best Hyperparameters: {'max_depth': 5.621780831931537, 'min_samples_leaf': 4.802857225639665, 'min_samples_split': 7.855951534491241, 'n_estimators': 63.8792635777333}
Best Score: -0.7970725034199726


In [44]:
model = RandomForestClassifier(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    min_samples_split=int(best_params['min_samples_split']),
    random_state=42
)

model.fit(X_train[X1], y_train)

In [45]:
y_pred = model.predict(X_test[X1])

print ("Accuracy :\n ",accuracy_score(y_test,y_pred)*100)

print("Confusion Matrix:\n ", confusion_matrix(y_test, y_pred))

print("Report : \n", classification_report(y_test, y_pred))

auc = metrics.roc_auc_score(y_test, y_pred)

print("the auc score is %.3f"%auc)

Accuracy :
  81.62162162162161
Confusion Matrix:
  [[ 33  25]
 [  9 118]]
Report : 
               precision    recall  f1-score   support

         0.0       0.79      0.57      0.66        58
         1.0       0.83      0.93      0.87       127

    accuracy                           0.82       185
   macro avg       0.81      0.75      0.77       185
weighted avg       0.81      0.82      0.81       185

the auc score is 0.749


### 3) Support Vector Machine

In [30]:
#SVM

from sklearn.svm import SVC
svm = SVC(kernel = 'rbf', random_state = 0)
svm.fit(X_train[X1], y_train)
y_pred = svm.predict(X_test[X1])

print ("Accuracy :\n ",accuracy_score(y_test,y_pred)*100)

print("Confusion Matrix:\n ", confusion_matrix(y_test, y_pred))

print("Report : \n", classification_report(y_test, y_pred))

auc = metrics.roc_auc_score(y_test, y_pred)

print("the auc score is %.3f"%auc)

Accuracy :
  68.64864864864865
Confusion Matrix:
  [[  0  58]
 [  0 127]]
Report : 
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        58
         1.0       0.69      1.00      0.81       127

    accuracy                           0.69       185
   macro avg       0.34      0.50      0.41       185
weighted avg       0.47      0.69      0.56       185

the auc score is 0.500


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
#Hyper parameter Tuning using RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True)
  
# fitting the model for grid search
grid.fit(X_train, y_train)

In [33]:
y_pred = grid.predict(X_test)

print ("Accuracy :\n ",accuracy_score(y_test,y_pred)*100)

print("Confusion Matrix:\n ", confusion_matrix(y_test, y_pred))

print("Report : \n", classification_report(y_test, y_pred))

auc = metrics.roc_auc_score(y_test, y_pred)

print("the auc score is %.3f"%auc)

Accuracy :
  67.56756756756756
Confusion Matrix:
  [[  7  51]
 [  9 118]]
Report : 
               precision    recall  f1-score   support

         0.0       0.44      0.12      0.19        58
         1.0       0.70      0.93      0.80       127

    accuracy                           0.68       185
   macro avg       0.57      0.52      0.49       185
weighted avg       0.62      0.68      0.61       185

the auc score is 0.525
