## Import Libraries & Configuration

In [9]:

import numpy as np
import pandas as pd
import joblib
import os
from zipfile import ZipFile
import warnings

# sklearn preprocessing 
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# model
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.metrics import roc_auc_score

# SMOTE for oversampling the minority class
from imblearn.over_sampling import SMOTE

# Silence library warnings so cell output stays readable.
# NOTE(review): this suppresses every warning category, including
# scikit-learn / imbalanced-learn notices raised later in the notebook —
# confirm that hiding them is intended.
warnings.filterwarnings('ignore')


# Reproducibility: one seed shared by NumPy's global RNG and the estimators below
SEED = 234
np.random.seed(SEED)

# File paths
# Built with os.path.join so the relative path uses the separator of the
# current OS; a hard-coded backslash path only resolves on Windows.
path = os.path.join('..', 'Data', 'Loan_Payback.zip')

print("Libraries loaded successfully.")

Libraries loaded successfully.


## LOAD DATA

In [11]:
# Read the train and test CSV members directly from the zip archive
frames = {}
with ZipFile(path, 'r') as archive:
    for member in ("train.csv", "test.csv"):
        # each member is opened, parsed and closed before the next one is read
        with archive.open(member) as handle:
            frames[member] = pd.read_csv(handle)

train_raw = frames["train.csv"]
test_raw = frames["test.csv"]

# Report the size of each split
for label, frame in (("Train Shape:", train_raw), ("Test Shape:", test_raw)):
    print(label, frame.shape)

# Quick preview of the first rows of both frames
display(train_raw.head(), test_raw.head())

Train Shape: (593994, 13)
Test Shape: (254569, 12)


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,593994,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,593995,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,593996,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,593997,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,593998,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1


## Feature Engineering

In [12]:
def feature_engineering(df):
    """Return a copy of ``df`` extended with derived loan features.

    The input frame is never modified. Each derived column is added only
    when all of the columns it is computed from are present:

    - ``interest_burden``      = interest_rate * debt_to_income_ratio
    - ``loan_income_ratio``    = loan_amount / annual_income
    - ``credit_efficiency``    = credit_score / (annual_income / 1000 + 1)
    - ``monthly_income``       = annual_income / 12
    - ``loan_to_credit``       = loan_amount / (credit_score + 1)
    - ``risk_weighted_amount`` = loan_amount * interest_rate

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding some or all of the source columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by whichever derived
        columns could be computed, in the order listed above.
    """
    enriched = df.copy()

    # (new column, required source columns, formula) — order fixes column order
    recipes = (
        ('interest_burden', ('interest_rate', 'debt_to_income_ratio'),
         lambda d: d['interest_rate'] * d['debt_to_income_ratio']),
        ('loan_income_ratio', ('loan_amount', 'annual_income'),
         lambda d: d['loan_amount'] / d['annual_income']),
        ('credit_efficiency', ('credit_score', 'annual_income'),
         lambda d: d['credit_score'] / (d['annual_income'] / 1000 + 1)),
        ('monthly_income', ('annual_income',),
         lambda d: d['annual_income'] / 12),
        ('loan_to_credit', ('loan_amount', 'credit_score'),
         lambda d: d['loan_amount'] / (d['credit_score'] + 1)),
        ('risk_weighted_amount', ('loan_amount', 'interest_rate'),
         lambda d: d['loan_amount'] * d['interest_rate']),
    )

    for new_col, required, formula in recipes:
        # skip a feature when any of its inputs is missing from the frame
        if all(source in enriched.columns for source in required):
            enriched[new_col] = formula(enriched)

    return enriched
# Run the same feature engineering on both splits so their derived columns match
train_data, test_data = (
    feature_engineering(raw_frame) for raw_frame in (train_raw, test_raw)
)

print("Feature engineering completed.")
train_data.head()



Feature engineering completed.


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back,interest_burden,loan_income_ratio,credit_efficiency,monthly_income,loan_to_credit,risk_weighted_amount
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0,1.14828,0.086094,24.236046,2447.3325,3.430692,34563.5014
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0,2.14472,0.207757,27.522912,1842.335,7.210518,59342.852
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0,0.94672,0.34308,13.724583,4130.516667,24.467842,165970.264
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0,1.0465,0.099929,11.137056,3904.854167,8.768689,75387.928
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0,0.54113,0.477883,25.097465,2124.725,18.29494,124403.0303


## Column group definitions

In [13]:
# Categorical columns whose values have a known order (ordinal-encoded)
ordinal_cols = ['education_level', 'grade_subgrade']

# Categorical columns without an order (one-hot encoded)
categorical_cols = ['gender', 'marital_status', 'employment_status', 'loan_purpose']

# Numeric columns: every remaining non-object column except the target and the id
_not_numeric = set(ordinal_cols) | set(categorical_cols) | {'loan_paid_back', 'id'}
numeric_cols = [
    name
    for name, dtype in train_data.dtypes.items()
    if name not in _not_numeric and dtype != 'object'
]

print("Ordinal columns:", ordinal_cols)
print("One-hot categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

Ordinal columns: ['education_level', 'grade_subgrade']
One-hot categorical columns: ['gender', 'marital_status', 'employment_status', 'loan_purpose']
Numeric columns: ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate', 'interest_burden', 'loan_income_ratio', 'credit_efficiency', 'monthly_income', 'loan_to_credit', 'risk_weighted_amount']


### Preprocessing Pipeline

In [14]:
# ---------- ORDINAL ENCODERS (Custom Order) ----------

# The position of a value in these lists is the integer it is encoded as.
education_order = ["Other", "High School", "Bachelor's", "Master's", "PhD"]

# Grades A1, A2, ..., A5, B1, ..., F5
grade_order = [
    f"{letter}{step}"
    for letter in "ABCDEF"
    for step in range(1, 6)
]

# Values missing from the lists above are encoded as -1 instead of raising
ordinal_encoder = OrdinalEncoder(
    categories=[education_order, grade_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# Alias used when wiring the column transformer
ordinal_transformer = ordinal_encoder

# ---------- ONE-HOT FOR OTHER CATEGORICALS ----------
# Dense output with the first category of each column dropped.
# NOTE(review): with drop='first' and handle_unknown='ignore', an unseen
# category encodes to all zeros, the same as the dropped first category —
# confirm that is acceptable.
onehot_transformer = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='first',
)

# ---------- BUILD COLUMN TRANSFORMER ----------
# One (name, transformer, columns) entry per column group; the numeric group
# is forwarded untouched via "passthrough".
column_steps = [
    ("ord", ordinal_transformer, ordinal_cols),
    ("cat", onehot_transformer, categorical_cols),
    ("num", "passthrough", numeric_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit on the full training feature matrix (target and id removed)
preprocessor.fit(train_data.drop(["loan_paid_back", "id"], axis=1))

# Persist the fitted preprocessor
os.makedirs("artifacts", exist_ok=True)
joblib.dump(preprocessor, "artifacts/preprocessor.pkl")

print("Preprocessing pipeline saved → artifacts/preprocessor.pkl")

Preprocessing pipeline saved → artifacts/preprocessor.pkl


### TRANSFORM → SMOTE → FEATURE SELECTION → MODEL

In [15]:

# PREPROCESS TRAINING FEATURES
# Split the engineered training frame into features (no target, no id) and target
X_raw = train_data.drop(columns=["loan_paid_back", "id"])
y = train_data["loan_paid_back"]

# Encode with the already-fitted column transformer
X_processed = preprocessor.transform(X_raw)

print("Shape after preprocessing:", X_processed.shape)

# APPLY SMOTE ON PROCESSED DATA
# random_state uses the notebook-wide SEED (previously a duplicated literal)
# so changing the seed in the config cell also changes the resampling.
# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, so the one-hot / ordinal columns of synthetic rows can take
# fractional values; SMOTENC targets mixed numeric/categorical data — consider it.
# NOTE(review): the whole training set is resampled here and no hold-out split
# is made in the visible cells; any validation set must be split off before
# this step to avoid evaluating on synthetic or leaked rows.
sm = SMOTE(random_state=SEED)
X_balanced, y_balanced = sm.fit_resample(X_processed, y)

print("Shape after SMOTE:", X_balanced.shape)

# FEATURE SELECTION USING MUTUAL INFO
# k="all" retains every column; the selector is fitted so its mutual-information
# scores are available for ranking later.
selector = SelectKBest(score_func=mutual_info_classif, k="all")
X_selected = selector.fit_transform(X_balanced, y_balanced)

print("Shape after feature selection:", X_selected.shape)

# SAVE SELECTOR + FEATURE NAMES
# Names of the retained columns, taken from the preprocessor's output names
selected_mask = selector.get_support()
feature_names = preprocessor.get_feature_names_out()
selected_features = feature_names[selected_mask]

os.makedirs("artifacts", exist_ok=True)
artifacts_to_save = {
    "artifacts/selector.pkl": selector,
    "artifacts/feature_names.pkl": selected_features,
}
for target_file, artifact in artifacts_to_save.items():
    joblib.dump(artifact, target_file)

print("Saved selector & feature names.")

# TRAIN FINAL RANDOM FOREST MODEL
# random_state uses the notebook-wide SEED (previously a duplicated literal)
# so the forest follows the seed set in the config cell.
best_model = RandomForestClassifier(
    n_estimators=600,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=SEED,
    n_jobs=-1  # use all available CPU cores
)

# Fit on the SMOTE-balanced, selector-transformed training matrix.
# NOTE(review): no validation metric is computed in the visible cells
# (roc_auc_score is imported but unused) — confirm evaluation happens elsewhere.
best_model.fit(X_selected, y_balanced)

# Persist the trained model
joblib.dump(best_model, "artifacts/model.pkl")
print("Model saved → artifacts/model.pkl")


Shape after preprocessing: (593994, 29)
Shape after SMOTE: (948988, 29)
Shape after feature selection: (948988, 29)
Saved selector & feature names.
Model saved → artifacts/model.pkl
