In [10]:
import pandas as pd
import numpy as np

# Load the cleaned train / test splits written by the preprocessing step.
train = pd.read_csv(filepath_or_buffer="../data/processed/train_clean.csv")
test = pd.read_csv(filepath_or_buffer="../data/processed/test_clean.csv")

In [11]:
# Separate the "default" target from the predictor columns of each split.
y_train, y_test = train["default"], test["default"]
X_train, X_test = train.drop("default", axis=1), test.drop("default", axis=1)


In [12]:
# Partition the predictors by dtype: 64-bit numeric columns get quantile-binned
# later on, object (string) columns are used as-is.
num_cols = list(X_train.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(X_train.select_dtypes(include=["object"]).columns)


In [13]:
def bin_numerical(series, n_bins=5):
    """Discretise a numeric series into quantile-based interval bins.

    Parameters
    ----------
    series : pd.Series
        Numeric values to bin.
    n_bins : int, default 5
        Number of quantiles requested. Fewer bins are returned when several
        quantile edges coincide, because duplicate edges are dropped.

    Returns
    -------
    pd.Series
        Categorical series of intervals, as produced by ``pd.qcut``.
    """
    quantile_bins = pd.qcut(series, n_bins, duplicates="drop")
    return quantile_bins


In [14]:
def woe(df, feature, target, eps=0.0001):
    """Compute the Weight of Evidence (WoE) for every level of ``feature``.

    For each level the WoE is ``ln((dist_good + eps) / (dist_bad + eps))``,
    where ``dist_good`` / ``dist_bad`` are the level's share of all rows with
    ``target == 0`` / ``target == 1`` respectively.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding both the feature and the target column.
    feature : str
        Name of the (categorical or binned) feature column.
    target : str
        Name of the binary target column; 0 is counted as "good", 1 as "bad".
    eps : float, default 0.0001
        Smoothing constant added to both distributions so that a level with
        no goods or no bads still yields a finite logarithm.

    Returns
    -------
    dict
        Mapping ``level -> WoE`` for every level present in the cross-tab.
        Rows whose feature or target is missing are not counted.
    """
    counts = pd.crosstab(df[feature], df[target])

    # A class that never occurs has no column in the cross-tab; treat it as
    # zero counts for every level.
    good = pd.Series(counts.get(0, 0), index=counts.index, dtype="float64")
    bad = pd.Series(counts.get(1, 0), index=counts.index, dtype="float64")

    good_total = good.sum()
    bad_total = bad.sum()

    # Guard the normalisation itself: with an absent class the total is 0 and
    # 0/0 would silently turn every WoE into NaN. An all-zero distribution
    # keeps the result finite via ``eps``.
    dist_good = good / good_total if good_total > 0 else good * 0.0
    dist_bad = bad / bad_total if bad_total > 0 else bad * 0.0

    woe_values = np.log((dist_good + eps) / (dist_bad + eps))
    return woe_values.to_dict()


In [15]:
def _bin_like_train(values, train_bins):
    """Assign ``values`` to the interval bins already fitted on training data.

    ``train_bins`` is the categorical interval series produced for the
    training column. The inner right edges of its intervals are reused and
    the outermost edges are opened to -inf / +inf, so values outside the
    training range fall into the first / last training bin instead of
    becoming NaN. Missing values stay missing.

    Edges come from the interval labels, which pandas rounds for display, so
    a value sitting within rounding distance of an edge may land in the
    neighbouring bin.

    Returns a categorical Series with the same dtype as ``train_bins`` and the
    index of ``values``, so its entries match the keys of a WoE dict built
    from the training bins.
    """
    intervals = train_bins.cat.categories
    if len(intervals) == 0:
        # No usable training bins: nothing can be assigned.
        codes = np.full(len(values), -1)
    else:
        edges = np.concatenate(([-np.inf], intervals.right[:-1], [np.inf]))
        # labels=False yields the positional bin number (NaN for missing).
        codes = pd.cut(values, bins=edges, labels=False).fillna(-1).astype(int).to_numpy()
    binned = pd.Categorical.from_codes(codes, dtype=train_bins.dtype)
    return pd.Series(binned, index=values.index)


train_woe = pd.DataFrame()
test_woe = pd.DataFrame()

# Numerical features: bins and WoE values are fitted on train only and then
# applied unchanged to test. Re-running qcut on test would produce different
# intervals that do not match the keys of woe_dict (mapping to NaN) and would
# derive the encoding from test data.
for col in num_cols:
    bin_col = f"{col}_bin"
    train[bin_col] = bin_numerical(train[col])
    woe_dict = woe(train, bin_col, "default")
    train_woe[col] = train[bin_col].map(woe_dict)
    test[bin_col] = _bin_like_train(test[col], train[bin_col])
    test_woe[col] = test[bin_col].map(woe_dict)

# Categorical features: WoE is fitted on train; levels that only occur in
# test have no entry in woe_dict and therefore map to NaN.
for col in cat_cols:
    woe_dict = woe(train, col, "default")
    train_woe[col] = train[col].map(woe_dict)
    test_woe[col] = test[col].map(woe_dict)


In [16]:
# Append the target to each encoded frame; .values assigns by position,
# ignoring the index of the target series.
train_woe["default"], test_woe["default"] = y_train.values, y_test.values


In [17]:
# Persist the WoE-encoded splits (without the index column) for the modelling step.
train_woe.to_csv(path_or_buf="../data/processed/train_woe.csv", index=False)
test_woe.to_csv(path_or_buf="../data/processed/test_woe.csv", index=False)
