In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load your data (use the same path as Day 1)
import kagglehub
import os

# Download the dataset; the returned path is treated as a local directory below.
path = kagglehub.dataset_download("architsharma01/loan-approval-prediction-dataset")

# os.listdir returns entries in arbitrary order, so sort before taking the
# first CSV to keep the choice deterministic across runs and platforms.
csv_candidates = sorted(f for f in os.listdir(path) if f.endswith(".csv"))
if not csv_candidates:
    # Fail with a descriptive error instead of a bare IndexError on [0].
    raise FileNotFoundError(f"No .csv file found in downloaded dataset directory: {path}")
csv_file = csv_candidates[0]
csv_path = os.path.join(path, csv_file)

df = pd.read_csv(csv_path)
df.columns = df.columns.str.strip()  # Remove leading/trailing whitespace from column names

print(f"✅ Dataset loaded: {df.shape}")
print(f"✅ Columns: {list(df.columns)}")


✅ Dataset loaded: (4269, 13)
✅ Columns: ['loan_id', 'no_of_dependents', 'education', 'self_employed', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value', 'loan_status']


In [2]:
# Impute missing values on a copy so the loaded frame `df` is left untouched.
df_clean = df.copy()
missing_before = df_clean.isnull().sum()
print(f"Missing values: {missing_before.sum()}")

# Object-dtype columns: fill gaps with the most frequent value.
categorical_cols = df_clean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_clean[col].isnull().any():
        modes = df_clean[col].mode()
        # mode() is empty when the whole column is NaN; leave such a column
        # as-is (it is reported by the final count) rather than crash on [0].
        if not modes.empty:
            # Assign the result back: a chained `df[col].fillna(..., inplace=True)`
            # acts on a temporary object and does not update df_clean under
            # pandas Copy-on-Write.
            df_clean[col] = df_clean[col].fillna(modes[0])

# Numeric columns: fill gaps with the column median.
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
for col in numerical_cols:
    if df_clean[col].isnull().any():
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

print(f"✅ Missing values after: {df_clean.isnull().sum().sum()}")


Missing values: 0
✅ Missing values after: 0


In [3]:
# Derived feature: annual income divided by requested loan amount.
if 'income_annum' in df_clean.columns and 'loan_amount' in df_clean.columns:
    income_to_loan = df_clean['income_annum'] / df_clean['loan_amount']
    # Division by a zero loan amount yields +/-inf; map those to 0.
    # The replaced Series is assigned back explicitly: a chained
    # `df[col].replace(..., inplace=True)` acts on a temporary object and does
    # not update df_clean under pandas Copy-on-Write.
    df_clean['income_to_loan_ratio'] = income_to_loan.replace([np.inf, -np.inf], 0)

# Derived feature: sum of the three asset-value columns.
# NOTE(review): 'bank_asset_value' is not included in this total — confirm
# whether that omission is intentional.
asset_cols = ['residential_assets_value', 'commercial_assets_value', 'luxury_assets_value']
# Require every summed column to exist, not just the first one, so a missing
# column skips the feature instead of raising KeyError.
if all(asset_col in df_clean.columns for asset_col in asset_cols):
    df_clean['total_assets'] = (df_clean['residential_assets_value'] +
                               df_clean['commercial_assets_value'] +
                               df_clean['luxury_assets_value'])

print(f"✅ Features engineered. New shape: {df_clean.shape}")


✅ Features engineered. New shape: (4269, 15)


In [4]:
# Integer-encode every object-dtype column on a fresh copy of df_clean,
# keeping each fitted encoder in `label_encoders`, keyed by column name.
target_col = 'loan_status'
df_encoded = df_clean.copy()

# Feature columns to encode: all object-dtype columns except the target.
categorical_cols = df_encoded.select_dtypes(include=['object']).columns.tolist()
if target_col in categorical_cols:
    categorical_cols.remove(target_col)

# Fit one encoder per feature column, then apply each to its own column.
label_encoders = {
    feature: LabelEncoder().fit(df_encoded[feature])
    for feature in categorical_cols
}
for feature, encoder in label_encoders.items():
    df_encoded[feature] = encoder.transform(df_encoded[feature])

# The target gets its own encoder, stored in the same dict after the features.
target_encoder = LabelEncoder()
df_encoded[target_col] = target_encoder.fit_transform(df_encoded[target_col])
label_encoders[target_col] = target_encoder

print("✅ Categorical encoding complete")


✅ Categorical encoding complete


In [5]:
# Drop identifier columns (skipping any that are absent), separate features
# from the target, make a stratified 80/20 split and write the four parts
# to data/processed/ as CSV files.
columns_to_remove = ['loan_id']
df_ml = df_encoded.drop(columns=columns_to_remove, errors='ignore')

X = df_ml.drop(columns=[target_col])
y = df_ml[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

os.makedirs('data/processed', exist_ok=True)
outputs = [
    (X_train, 'data/processed/X_train.csv'),
    (X_test, 'data/processed/X_test.csv'),
    (pd.Series(y_train), 'data/processed/y_train.csv'),
    (pd.Series(y_test), 'data/processed/y_test.csv'),
]
for part, destination in outputs:
    part.to_csv(destination, index=False)

print(f"✅ Data split: Train {X_train.shape}, Test {X_test.shape}")
print("✅ Files saved to data/processed/")


✅ Data split: Train (3415, 13), Test (854, 13)
✅ Files saved to data/processed/
