In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score,confusion_matrix

In [18]:
# Read the loan approval records from the CSV file (relative path) into a DataFrame
DATA_FILE = "loan_approval_data.csv"
loan_df = pd.read_csv(DATA_FILE)

In [19]:
# Applicant_ID only identifies the row and is not used as a predictor,
# so it is removed before any further preprocessing.
ID_COLUMNS = ['Applicant_ID']
loan_df = loan_df.drop(columns=ID_COLUMNS)

In [20]:
# Group the columns by dtype so each group can receive its own
# missing-value treatment in the next step.

# float64 columns -> handled with a numerical imputation strategy
num_mis_val = loan_df.select_dtypes(include='float64').columns

# object (string) columns -> handled with a categorical strategy (e.g. mode)
# NOTE(review): any object-typed column is selected here, which would
# include the target if it is stored as strings - confirm before imputing.
cat_mis_val = loan_df.select_dtypes(include='object').columns

In [21]:
# Missing-value handling with scikit-learn's SimpleImputer
from sklearn.impute import SimpleImputer

# Rows without a label cannot be used for supervised training or evaluation.
# They are removed rather than filled with the majority class, which would
# fabricate ground-truth labels. .copy() gives an independent frame so the
# column assignments below do not trigger SettingWithCopyWarning.
loan_df = loan_df.dropna(subset=['Loan_Approved']).copy()

# Numerical features: replace NaN with the column mean
num_imp = SimpleImputer(strategy='mean')
loan_df[num_mis_val] = num_imp.fit_transform(loan_df[num_mis_val])

# Categorical features: replace NaN with the most frequent value (mode).
# The target column is excluded so that only input features are imputed;
# errors='ignore' keeps this working when the target is not object-typed.
cat_feature_cols = cat_mis_val.drop('Loan_Approved', errors='ignore')
cat_imp = SimpleImputer(strategy='most_frequent')
loan_df[cat_feature_cols] = cat_imp.fit_transform(loan_df[cat_feature_cols])

# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split, so test rows contribute to the fill values. Consider
# fitting them on the training split only (e.g. inside a Pipeline).


# Feature Encoding

In [22]:
# Convert string-valued columns to integer codes with LabelEncoder.
# (OneHotEncoder is imported in the cell that actually uses it.)
from sklearn.preprocessing import LabelEncoder

# Education_Level gets its own encoder so that its fitted mapping
# (education_le.classes_) is not overwritten when the target is encoded.
# NOTE(review): LabelEncoder assigns codes in sorted label order, which may
# not match the intended ordinal order of the education levels - confirm.
education_le = LabelEncoder()
loan_df['Education_Level'] = education_le.fit_transform(loan_df['Education_Level'])

# Target variable (Loan_Approved) -> integer class labels.
# `le` stays the encoder fitted on the target, so le.classes_ and
# le.inverse_transform() can translate predictions back to the raw labels.
le = LabelEncoder()
loan_df['Loan_Approved'] = le.fit_transform(loan_df['Loan_Approved'])


In [23]:
# One-hot encode the nominal categorical features so that no artificial
# ordering between categories is introduced.
from sklearn.preprocessing import OneHotEncoder

# Encoder configuration:
#   drop='first'            -> omit one indicator per feature to reduce
#                              multicollinearity (dummy variable trap)
#   sparse_output=False     -> return a dense NumPy array
#   handle_unknown='ignore' -> categories unseen during fit are encoded as
#                              all zeros at transform time instead of raising;
#                              combined with drop='first' that is the same
#                              encoding as the dropped category
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# Nominal columns to be expanded into indicator columns
col = ['Employment_Status', 'Marital_Status', 'Loan_Purpose',
       'Property_Area', 'Gender', 'Employer_Category']

# Learn the categories of each column and produce the indicator matrix
encoded = ohe.fit_transform(loan_df[col])

In [24]:
# Wrap the encoded array in a DataFrame that carries the generated
# indicator names and shares the row index of loan_df.
encoded_df = pd.DataFrame(
    data=encoded,
    index=loan_df.index,
    columns=ohe.get_feature_names_out(col),
)

# Swap the raw categorical columns for their one-hot counterparts
# to obtain the final preprocessed dataset.
non_encoded_part = loan_df.drop(columns=col)
loan_df = pd.concat([non_encoded_part, encoded_df], axis=1)

# Train-Test Split + Feature Scaling

In [25]:
# Loan_Approved is the label to predict; every other column is an input feature
TARGET = 'Loan_Approved'
y = loan_df[TARGET]
X = loan_df.drop(columns=[TARGET])

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle
# so the same partition is produced on every run.
# NOTE(review): the split is not stratified, so the class proportions of the
# train and test parts may differ.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Learn the scaling parameters from the training features only, so the
# scaler's statistics are not influenced by the test rows.
scaler = StandardScaler().fit(X_train)

# Apply the identical transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train & Evaluate models

In [27]:
# Gaussian Naive Bayes: models each feature as normally distributed per class.
# Fit on the scaled training data, then predict the scaled test data.
gb_model = GaussianNB().fit(X_train_scaled, y_train)
y_pred = gb_model.predict(X_test_scaled)

# Report each evaluation metric for the test-set predictions
print("Evaluation For Naive Bayes")
for label, metric in (
    ('Precision:', precision_score),
    ('Accuracy:', accuracy_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
    ('cm:', confusion_matrix),
):
    print(label, metric(y_test, y_pred))

Evaluation For Naive Bayes
Precision: 0.8035714285714286
Accuracy: 0.865
Recall: 0.7377049180327869
F1: 0.7692307692307693
cm: [[128  11]
 [ 16  45]]


### Final Model Selection

Multiple machine learning models were trained and evaluated, including
Logistic Regression, KNN, and Naive Bayes.

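Based on comparative performance across accuracy, precision, recall,
and F1-score, Naive Bayes demonstrated the most consistent results
(accuracy 0.865, precision 0.804, recall 0.738, F1 0.769 on the held-out
test set in the run shown above) and was selected as the final model for
deployment.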
