In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier 


In [2]:
data = pd.read_csv('loan_approval_dataset.csv')

In [3]:
data.columns = data.columns.str.strip()

In [4]:
data = data.drop_duplicates()

In [5]:
X = data.drop(['loan_id', 'loan_status'], axis=1)  # Drop ID and target columns
y = data['loan_status']  # Target: Loan approved ('Yes'/'No')

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Define the numeric and categorical columns
numeric_features = ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 
                    'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']
categorical_features = ['education', 'self_employed']

In [8]:
# Numeric transformer for scaling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing values
    ('scaler', StandardScaler())  # Scale numerical data
])

# Categorical transformer for one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing categorical data
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical data
])

# Combine into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [9]:
# Define the model pipeline
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(random_state=42))])

# Train the model
model_pipeline.fit(X_train, Y_train)

In [10]:
# Make predictions
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
print('Accuracy:', accuracy_score(Y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(Y_test, y_pred))
print('Classification Report:\n', classification_report(Y_test, y_pred))

Accuracy: 0.9836065573770492
Confusion Matrix:
 [[529   7]
 [  7 311]]
Classification Report:
               precision    recall  f1-score   support

    Approved       0.99      0.99      0.99       536
    Rejected       0.98      0.98      0.98       318

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854

