In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


In [2]:

# Read the loan dataset from its CSV file into a DataFrame.
DATA_PATH = '/content/loan_prediction.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows as a quick sanity check on the parsed columns.
preview = df.head()
print(preview)


    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   

# Data Preprocessing


In [3]:
# Count the missing entries in each column to see what needs imputing.
missing_counts = df.isna().sum()
print(missing_counts)


Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [4]:
# Fill missing values in the categorical columns with each column's mode
# (its most frequent value).
#
# The filled Series is assigned back to the column rather than calling
# `df[col].fillna(..., inplace=True)`: the in-place form acts on an
# intermediate Series, which triggers a chained-assignment FutureWarning in
# pandas 2.x and leaves `df` unchanged once Copy-on-Write is in effect.
categorical_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
for col in categorical_cols:
    most_frequent = df[col].mode()[0]  # mode() returns a Series; take its first entry
    df[col] = df[col].fillna(most_frequent)

In [5]:
# Fill missing values in the numerical columns with the column mean.
#
# The result is assigned back to the column instead of using
# `df[col].fillna(..., inplace=True)`, which operates on an intermediate
# Series (FutureWarning in pandas 2.x, no effect on `df` under Copy-on-Write).
for col in ['LoanAmount', 'Loan_Amount_Term']:
    df[col] = df[col].fillna(df[col].mean())

In [6]:
# Convert categorical columns to integer codes using Label Encoding.
#
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# code -> original label mapping of every column stays available (for example
# through `inverse_transform`). Refitting one shared encoder column after
# column would keep only the mapping of the last column.
label_encoders = {}
for col in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# `le` is kept for backward compatibility; as before, it is the encoder
# fitted on the target column 'Loan_Status'.
le = label_encoders['Loan_Status']

In [7]:
# The 'Dependents' column contains the string '3+', which cannot be cast to int.
# Map it to '3' first, then convert the whole column to integer type.
dependents = df['Dependents'].replace('3+', '3')
df['Dependents'] = dependents.astype(int)

# Feature Scaling

In [8]:
# Standardize the numerical columns with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test rows contribute to the scaling statistics. Consider fitting
# on the training rows only (e.g. inside a Pipeline) — confirm whether this
# matters for the models used downstream.
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Drop the Loan_ID column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns=['Loan_ID'], errors='ignore')

In [10]:
# Separate the target column from the predictor columns.
TARGET = 'Loan_Status'
y = df[TARGET]
X = df.drop(columns=[TARGET])

In [11]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is not stratified on y — consider `stratify=y` if
# the class proportions should match between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Model Building


In [13]:

# Random Forest Classifier: tune hyperparameters with an exhaustive grid search.
rf_classifier = RandomForestClassifier(random_state=42)

# 2 x 2 x 2 x 2 = 16 candidate parameter combinations.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 5-fold cross-validation over every candidate; n_jobs=-1 runs fits in parallel.
rf_grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
rf_grid_search.fit(X_train, y_train)

# Keep the estimator built with the best-scoring parameter combination.
best_rf_classifier = rf_grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [14]:
# Score the tuned Random Forest on the held-out test set.
y_pred_rf = best_rf_classifier.predict(X_test)

# Apply each metric to the same (truth, prediction) pair, in a fixed order.
rf_accuracy, rf_conf_matrix, rf_class_report = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# XGBoost Classifier


In [15]:
# XGBoost Classifier: tune hyperparameters with an exhaustive grid search.
# `use_label_encoder` is intentionally not passed: the installed xgboost no
# longer accepts it and only emits
# 'Parameters: { "use_label_encoder" } are not used.' for every fit.
xgb_classifier = XGBClassifier(random_state=42, eval_metric='logloss')

# 2 x 2 x 2 x 2 = 16 candidate parameter combinations.
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

# 5-fold cross-validation over every candidate; n_jobs=-1 runs fits in parallel.
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, n_jobs=-1, verbose=1)
xgb_grid_search.fit(X_train, y_train)

# Keep the estimator built with the best-scoring parameter combination.
best_xgb_classifier = xgb_grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


Parameters: { "use_label_encoder" } are not used.



In [16]:
# Score the tuned XGBoost model on the held-out test set.
y_pred_xgb = best_xgb_classifier.predict(X_test)

# Apply each metric to the same (truth, prediction) pair, in a fixed order.
xgb_accuracy, xgb_conf_matrix, xgb_class_report = (
    metric(y_test, y_pred_xgb)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Results


In [18]:
# Print the Random Forest test-set metrics, one section per print call.
rf_report_lines = (
    "Random Forest Classifier:",
    f"Accuracy: {rf_accuracy}",
    f"Confusion Matrix:\n{rf_conf_matrix}",
    f"Classification Report:\n{rf_class_report}\n",
)
for section in rf_report_lines:
    print(section)




Random Forest Classifier:
Accuracy: 0.7783783783783784
Confusion Matrix:
[[ 28  37]
 [  4 116]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.43      0.58        65
           1       0.76      0.97      0.85       120

    accuracy                           0.78       185
   macro avg       0.82      0.70      0.71       185
weighted avg       0.80      0.78      0.75       185




In [19]:
# Print the XGBoost test-set metrics, one section per print call.
xgb_report_lines = (
    "XGBoost Classifier:",
    f"Accuracy: {xgb_accuracy}",
    f"Confusion Matrix:\n{xgb_conf_matrix}",
    f"Classification Report:\n{xgb_class_report}\n",
)
for section in xgb_report_lines:
    print(section)

XGBoost Classifier:
Accuracy: 0.772972972972973
Confusion Matrix:
[[ 28  37]
 [  5 115]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.43      0.57        65
           1       0.76      0.96      0.85       120

    accuracy                           0.77       185
   macro avg       0.80      0.69      0.71       185
weighted avg       0.79      0.77      0.75       185


