In [None]:
# Core libraries
%pip install pandas numpy joblib scikit-learn  # run this cell if you need to install packages

In [2]:
import os, pandas as pd, numpy as np, joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [3]:
# Raw input CSV and the directory that receives every processed artifact.
RAW = "../Data/loan_data.csv"
OUT_DIR = "../Data/processed"

# Create the output directory up front; an already existing directory is not an error.
os.makedirs(name=OUT_DIR, exist_ok=True)

# Load the raw loan table.
df = pd.read_csv(filepath_or_buffer=RAW)

In [4]:
# Continue with a fresh frame that has no 'id' column (a plain copy when there is none to drop).
df = df.drop(columns=['id']) if 'id' in df.columns else df.copy()

In [5]:
# Split the combined 'grade_subgrade' code into its first character ('grade') and the rest ('subgrade').
if 'grade_subgrade' in df.columns:
    combined = df['grade_subgrade']
    # Stringify only the values that are present: a plain astype(str) turns NaN into the
    # literal 'nan', which would produce a bogus grade 'n' and subgrade 'an' for missing rows.
    gs = combined.astype(str).where(combined.notna())
    df['grade'] = gs.str[0]
    # Nothing after the first character means there is no subgrade -> NaN.
    df['subgrade'] = gs.str[1:].replace('', np.nan)
    # Reassign instead of dropping in place.
    df = df.drop(columns=['grade_subgrade'])

In [6]:
# Income-to-loan ratio; a zero loan amount is replaced by NaN so the ratio is NaN instead of inf.
df['income_to_loan'] = df['annual_income'] / df['loan_amount'].replace(0, np.nan)

# Credit bucket: right-inclusive edges, with include_lowest so the first bucket also covers 0.
# Scores outside the 0-999 range fall in no bucket and become NaN.
bins = [0, 579, 669, 739, 799, 999]
labels = ['Poor','Fair','Good','Very Good','Excellent']
df['credit_bucket'] = pd.cut(df['credit_score'], bins=bins, labels=labels, include_lowest=True)

# Interest bins (quartiles).
# duplicates='drop' merges tied quantile edges, so fewer than four bins can come back; passing
# a fixed list of four labels would then make qcut raise. Bin first, then name the bins that
# actually exist (ir_q1, ir_q2, ...).
# NOTE(review): the quartile edges are computed on the full frame, before the train/test split,
# so held-out rows influence this feature - confirm that is acceptable.
ir_binned = pd.qcut(df['interest_rate'], q=4, duplicates='drop')
ir_labels = [f'ir_q{i}' for i in range(1, len(ir_binned.cat.categories) + 1)]
df['interest_bin'] = ir_binned.cat.rename_categories(ir_labels)

In [7]:
def to_int_safe(x):
    """Convert ``x`` with ``int()``, returning ``np.nan`` when the conversion fails.

    Parameters
    ----------
    x : any
        Value handed to ``int()``.

    Returns
    -------
    int or float
        ``int(x)`` on success; ``np.nan`` when ``int()`` raises ``TypeError``
        (e.g. ``None``), ``ValueError`` (e.g. non-numeric text, NaN) or
        ``OverflowError`` (e.g. infinity).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; a bare ``except`` would also
        # hide KeyboardInterrupt / SystemExit.
        return np.nan
# Numeric form of the subgrade text; anything to_int_safe cannot convert becomes NaN.
# Guarded like the neighbouring cells: 'subgrade' only exists when the raw data had
# a 'grade_subgrade' column, and the feature list already tolerates a missing 'subgrade_num'.
if 'subgrade' in df.columns:
    df['subgrade_num'] = df['subgrade'].apply(to_int_safe)

In [8]:
# Name of the label column.
target = 'loan_paid_back'

# Candidate columns per preprocessing branch; only those present in df are kept,
# in the order listed here.
available = set(df.columns)

numeric_features = [
    col
    for col in (
        'annual_income',
        'debt_to_income_ratio',
        'credit_score',
        'loan_amount',
        'interest_rate',
        'income_to_loan',
        'subgrade_num',
    )
    if col in available
]

ordinal_features = [col for col in ('grade',) if col in available]

ohe_features = [
    col
    for col in (
        'gender',
        'marital_status',
        'education_level',
        'employment_status',
        'loan_purpose',
        'credit_bucket',
        'interest_bin',
    )
    if col in available
]


In [9]:
# Feature matrix (numeric, then ordinal, then one-hot columns) and integer-cast target.
feature_columns = numeric_features + ordinal_features + ohe_features
X = df[feature_columns].copy()
y = df[target].astype(int)

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
# Numeric branch: median imputation followed by standard scaling.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [16]:
# Explicit category order for the ordinal encoder: one inner list per ordinal column.
grade_order = [['A', 'B', 'C', 'D', 'E', 'F']]

# Ordinal branch. Missing grades are filled with the placeholder 'missing'; since that value
# (like any other grade not listed in grade_order) is unknown to the encoder, it is encoded as -1.
grade_encoder = OrdinalEncoder(
    categories=grade_order,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
    dtype=float,
)
ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ord', grade_encoder),
])

# Nominal branch. Missing values become their own 'missing' category; categories not seen
# during fit are ignored at transform time. Dense output (sparse_output=False).
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', one_hot_encoder),
])

# Route each column group to its branch; columns not listed are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, numeric_features),
        ('ord', ord_pipe, ordinal_features),
        ('cat', cat_pipe, ohe_features),
    ],
    remainder='drop',
)

In [17]:
# Fit the preprocessor on the training split and transform it in the same call,
# then apply the already fitted preprocessor to the test split (no refit on test data).
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)

In [18]:
# feature names (approx)
ohe_cols = preprocessor.named_transformers_['cat'].named_steps['ohe'].get_feature_names_out(ohe_features).tolist() if len(ohe_features)>0 else []
ord_cols = ordinal_features
feature_names = numeric_features + ord_cols + ohe_cols

In [19]:
# Transformed feature matrices -> CSV with named columns, no index column.
for file_name, matrix in (
    ("X_train_preprocessed.csv", X_train_trans),
    ("X_test_preprocessed.csv", X_test_trans),
):
    pd.DataFrame(matrix, columns=feature_names).to_csv(os.path.join(OUT_DIR, file_name), index=False)

# Targets -> CSV, no index column.
for file_name, target_values in (("y_train.csv", y_train), ("y_test.csv", y_test)):
    target_values.to_csv(os.path.join(OUT_DIR, file_name), index=False)

# Persist the fitted preprocessor; kept as the last expression so the cell displays joblib's return value.
joblib.dump(preprocessor, os.path.join(OUT_DIR, "preprocessor_finsecure.pkl"))

['../Data/processed\\preprocessor_finsecure.pkl']

In [20]:
# Report where the artifacts were written and the shapes of the transformed splits.
print("Saved preprocessor and preprocessed datasets to:", OUT_DIR)
train_shape, test_shape = X_train_trans.shape, X_test_trans.shape
print("X_train shape:", train_shape, "X_test shape:", test_shape)

Saved preprocessor and preprocessed datasets to: ../Data/processed
X_train shape: (214491, 42) X_test shape: (53623, 42)
