In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score,confusion_matrix

In [2]:
# Load the loan approval dataset
loan_df = pd.read_csv("loan_approval_data.csv")

In [3]:
# Drop unique identifier as it does not contribute to prediction
loan_df.drop(columns=['Applicant_ID'], axis=1)

Unnamed: 0,Applicant_Income,Coapplicant_Income,Employment_Status,Age,Marital_Status,Dependents,Credit_Score,Existing_Loans,DTI_Ratio,Savings,Collateral_Value,Loan_Amount,Loan_Term,Loan_Purpose,Property_Area,Education_Level,Gender,Employer_Category,Loan_Approved
0,17795.0,1387.0,Salaried,51.0,Married,0.0,637.0,4.0,0.53,19403.0,45638.0,16619.0,84.0,Personal,Urban,Not Graduate,Female,Private,No
1,2860.0,2679.0,Salaried,46.0,Married,3.0,621.0,2.0,0.30,2580.0,49272.0,38687.0,,Car,Semiurban,Graduate,,Private,No
2,7390.0,2106.0,Salaried,25.0,Single,2.0,674.0,4.0,0.20,13844.0,6908.0,27943.0,72.0,,Urban,,Female,Government,Yes
3,13964.0,8173.0,Salaried,40.0,Married,2.0,579.0,3.0,0.31,9553.0,10844.0,27819.0,60.0,Business,Rural,Graduate,Female,Government,No
4,13284.0,4223.0,Self-employed,31.0,Single,2.0,721.0,1.0,0.29,9386.0,37629.0,12741.0,72.0,Car,,Graduate,Male,Private,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,9092.0,Salaried,58.0,Married,0.0,557.0,0.0,0.59,5370.0,43563.0,8311.0,72.0,Personal,,Not Graduate,Male,Unemployed,No
996,3279.0,6356.0,Self-employed,58.0,Married,1.0,646.0,3.0,0.19,,18361.0,22563.0,12.0,Business,Urban,Graduate,Female,Government,No
997,15192.0,8433.0,Contract,48.0,Single,1.0,666.0,1.0,0.40,8581.0,41335.0,16203.0,24.0,Home,Rural,Graduate,Male,MNC,No
998,9083.0,7380.0,Unemployed,50.0,Single,1.0,748.0,3.0,0.31,13491.0,8933.0,10290.0,36.0,Personal,Urban,Graduate,Male,Private,Yes


In [4]:
# Identify numerical columns (float type) to handle missing values
# separately using appropriate numerical imputation strategies
num_mis_val = loan_df.select_dtypes(include=['float64']).columns

# Identify categorical columns (object type) for separate
# categorical missing value treatment (e.g., mode imputation)
cat_mis_val = loan_df.select_dtypes(include=['object']).columns

In [5]:
# i learnt this new method of filling the missing value

# Handle missing values using SimpleImputer
# Numerical features are imputed with the mean to preserve overall distribution
from sklearn.impute import SimpleImputer

# Impute missing values in numerical columns
num_imp = SimpleImputer(strategy='mean')
loan_df[num_mis_val] = num_imp.fit_transform(loan_df[num_mis_val])

# Impute missing values in categorical columns
# Most frequent value (mode) is used to retain category consistency
cat_imp = SimpleImputer(strategy='most_frequent')
loan_df[cat_mis_val] = cat_imp.fit_transform(loan_df[cat_mis_val])


# Feature Encoding

In [6]:
# Encode categorical variables into numerical form using LabelEncoder
# This is required as machine learning models cannot work with string values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # new thing

le = LabelEncoder()

# Encode Education_Level (ordinal/binary categorical feature)
loan_df['Education_Level'] = le.fit_transform(loan_df['Education_Level'])

# Encode target variable (Loan_Approved) into binary numerical labels
# This enables supervised learning model training
loan_df['Loan_Approved'] = le.fit_transform(loan_df['Loan_Approved'])


In [7]:
# Apply One-Hot Encoding to nominal categorical features
# OneHotEncoder is preferred here to avoid introducing
# artificial ordinal relationships between categories
from sklearn.preprocessing import OneHotEncoder

# List of categorical columns to be one-hot encoded
col = [
    'Employment_Status',
    'Marital_Status',
    'Loan_Purpose',
    'Property_Area',
    'Gender',
    'Employer_Category'
]

# Initialize OneHotEncoder
# - drop='first' helps reduce multicollinearity (dummy variable trap)
# - sparse_output=False returns a dense NumPy array
# - handle_unknown='ignore' ensures robustness during inference
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore'
)

# Fit encoder on categorical features and transform them
encoded = ohe.fit_transform(loan_df[col])

In [8]:
# Convert encoded NumPy array into a DataFrame with meaningful column names
encoded_df = pd.DataFrame(
    encoded,
    columns=ohe.get_feature_names_out(col),
    index=loan_df.index
)

# Drop original categorical columns and concatenate encoded features
# to form the final preprocessed dataset
loan_df = pd.concat(
    [loan_df.drop(columns=col), encoded_df],
    axis=1
)

# Train-Test-split + Feature Scalling

In [9]:
# Separate input features (X) and target variable (y)
# Loan_Approved is the label we want the model to predict
X = loan_df.drop(columns=['Loan_Approved'],axis=1)
y = loan_df['Loan_Approved']

# Split data into training and testing sets
# Test size of 20% ensures fair evaluation on unseen data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Initialize StandardScaler to normalize feature values
scaler = StandardScaler()

# Fit scaler ONLY on training data to learn scaling parameters
# This prevents data leakage from the test set
X_train_scaled = scaler.fit_transform(X_train)

# Apply the same scaling transformation to test data
X_test_scaled = scaler.transform(X_test)

# Train & Evaluate models

In [11]:
# KNN Model

# Initialize K-Nearest Neighbors classifier
# n_neighbors=9 defines the number of nearest points used for prediction
knn_model = KNeighborsClassifier(n_neighbors=9)

# Train the KNN model on scaled training data
knn_model.fit(X_train_scaled, y_train)

# Generate predictions on scaled test data
y_pred = knn_model.predict(X_test_scaled)

# Evaluation metrics to assess KNN model performance
print("Evaluation For kNN")
print('Precision:', precision_score(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))
print('cm:', confusion_matrix(y_test, y_pred))

Evaluation For kNN
Precision: 0.6222222222222222
Accuracy: 0.75
Recall: 0.45901639344262296
F1: 0.5283018867924528
cm: [[122  17]
 [ 33  28]]
