In [34]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Read the loan-sanction training data from the CSV in the working directory.
loan_data = pd.read_csv(filepath_or_buffer='loan_sanction_train.csv')
# Bare last expression so the notebook renders the frame as a table.
loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [3]:
# How often each 'Dependents' category occurs (value_counts skips NaN by default).
loan_data.loc[:, 'Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [4]:
# Split the columns into three groups so each can get its own preprocessing.

# Nominal categoricals: every object-dtype column except the identifier,
# the ordinal 'Dependents' column and the target.
cat_nominal_cols = (
    loan_data
    .select_dtypes(include='object')
    .columns
    .difference(['Loan_ID', 'Dependents', 'Loan_Status'])
)

# 'Dependents' has ordered levels ('0' < '1' < '2' < '3+'), so it is kept apart.
cat_ordinal_col = ['Dependents']

# Every column that is not object dtype is treated as numeric.
num_cols = loan_data.select_dtypes(exclude='object').columns

In [5]:
# Target: the loan decision column.
y = loan_data['Loan_Status']
# Features: everything except the identifier and the target.
x = loan_data.drop(['Loan_ID', 'Loan_Status'], axis='columns')

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=23,
)

In [7]:
# Preprocessing for the ordinal 'Dependents' column.
cat_ordinal_transformer = Pipeline(steps=[
    # Fill missing values with the most common category.
    ('si', SimpleImputer(strategy='most_frequent')),
    # Encode the levels as integers following this explicit order.
    ('oe', OrdinalEncoder(categories=[['0', '1', '2', '3+']])),
    # Scale to unit variance without centring.
    ('ss', StandardScaler(with_mean=False)),
])

In [8]:
# Preprocessing for the nominal (unordered) categorical columns.
cat_nominal_transformer = Pipeline([
    # Fill missing values with the most common category.
    ('si', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category that never appeared in the training
    # split is encoded as all zeros instead of raising at transform time.
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    # with_mean=False keeps the scaler usable on the encoder's sparse output.
    ('ss', StandardScaler(with_mean=False))
])

In [9]:
# Preprocessing for the numeric columns.
num_pipeline = Pipeline(steps=[
    # Fill missing values with the column median.
    ('si', SimpleImputer(strategy='median')),
    # Standardise with the scaler's default settings.
    ('ss', StandardScaler()),
])

In [10]:
# Route each column group through its own pipeline; columns not listed in any
# group are passed through untouched.
trnf_1 = ColumnTransformer(
    transformers=[
        ('cat_ordinal', cat_ordinal_transformer, cat_ordinal_col),
        ('cat_nominal', cat_nominal_transformer, cat_nominal_cols),
        ('num_transformer', num_pipeline, num_cols),
    ],
    remainder='passthrough',
)

In [11]:
# Learn the preprocessing on the training split only, then apply the fitted
# transformer unchanged to the test split.
X_train_new = trnf_1.fit_transform(X=X_train)
X_test_new = trnf_1.transform(X=X_test)

In [12]:
# Turn the target strings into integer class labels.
# The encoder is fitted on the training labels and reused for the test labels.
le = LabelEncoder()
le.fit(y_train)
y_train_new, y_test_new = le.transform(y_train), le.transform(y_test)

In [23]:
def evaluate_model(true, predicted):
    """Score a set of predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Labels predicted by a model, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` as computed by ``accuracy_score`` and ``f1_score``
        with their default settings.
    """
    return accuracy_score(true, predicted), f1_score(true, predicted)

In [26]:
# Candidate classifiers, all with default hyper-parameters.
# The tree-based models draw random numbers while fitting, so they are given
# the same seed as the train/test split; without it their scores (and the
# resulting ranking) change on every run.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision tree": DecisionTreeClassifier(random_state=23),
    "Random Forest": RandomForestClassifier(random_state=23),
    "Support Vector Machine": SVC(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=23),
    "K-Neighbors Classifier": KNeighborsClassifier()
}

# Filled in parallel, one entry per model, for the summary table built later.
model_list = []
F1_score_list = []

# Iterate over (name, estimator) pairs directly instead of rebuilding and
# indexing key/value lists on every pass.
for name, model in models.items():
    model.fit(X_train_new, y_train_new)

    # Predict on both splits so over-fitting shows up as a train/test gap.
    y_train_pred = model.predict(X_train_new)
    y_test_pred = model.predict(X_test_new)

    model_train_accuracy, model_train_f1_score = evaluate_model(y_train_new, y_train_pred)
    model_test_accuracy, model_test_f1_score = evaluate_model(y_test_new, y_test_pred)

    print(name)
    model_list.append(name)

    print("Model Performance for Train set:")
    print(f" Accuracy score: {model_train_accuracy:.2f}")
    print(f" F1 score: {model_train_f1_score:.2f}")

    print("Model Performance for Test set:")
    print(f" Accuracy score: {model_test_accuracy:.2f}")
    print(f" F1 score: {model_test_f1_score:.2f}")

    # Only the test-set F1 is kept for ranking the models.
    F1_score_list.append(model_test_f1_score)

    print("="*35)
    print("\n")

Logistic Regression
Model Performance for Train set:
 Accuracy score: 0.82
 F1 score: 0.89
Model Performance for Test set:
 Accuracy score: 0.77
 F1 score: 0.84


Decision tree
Model Performance for Train set:
 Accuracy score: 1.00
 F1 score: 1.00
Model Performance for Test set:
 Accuracy score: 0.72
 F1 score: 0.78


Random Forest
Model Performance for Train set:
 Accuracy score: 1.00
 F1 score: 1.00
Model Performance for Test set:
 Accuracy score: 0.76
 F1 score: 0.82


Support Vector Machine
Model Performance for Train set:
 Accuracy score: 0.82
 F1 score: 0.89
Model Performance for Test set:
 Accuracy score: 0.77
 F1 score: 0.84


Gradient Boosting
Model Performance for Train set:
 Accuracy score: 0.90
 F1 score: 0.94
Model Performance for Test set:
 Accuracy score: 0.74
 F1 score: 0.81


K-Neighbors Classifier
Model Performance for Train set:
 Accuracy score: 0.82
 F1 score: 0.88
Model Performance for Test set:
 Accuracy score: 0.74
 F1 score: 0.82




In [27]:
# Rank the models by their test-set F1 score, best first.
pd.DataFrame(
    {'Model_name': model_list, 'F1_score': F1_score_list}
).sort_values(by='F1_score', ascending=False)

Unnamed: 0,Model_name,F1_score
3,Support Vector Machine,0.83908
0,Logistic Regression,0.837209
2,Random Forest,0.821429
5,K-Neighbors Classifier,0.816092
4,Gradient Boosting,0.807229
1,Decision tree,0.77707


## Logistic Regression

In [32]:
# Fit a default logistic regression on the preprocessed training data and
# report its test-set F1 score as a percentage.
log_reg = LogisticRegression().fit(X_train_new, y_train_new)
y_test_pred = log_reg.predict(X_test_new)
score = 100 * f1_score(y_test_new, y_test_pred)
print(f"The f1_score of the model is {score}")

The f1_score of the model is 83.72093023255815


## Support Vector Classifier

In [33]:
# Fit a default support vector classifier on the preprocessed training data
# and report its test-set F1 score as a percentage.
classify = SVC().fit(X_train_new, y_train_new)
y_test_pred_1 = classify.predict(X_test_new)
f_score = 100 * f1_score(y_test_new, y_test_pred_1)
print(f"The f1_score of the model is {f_score}")

The f1_score of the model is 83.9080459770115
