In [5]:
# Make project root visible so we can import src modules
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

project_root


'c:\\Users\\User\\Desktop\\KAIM\\Week_4\\credit-risk-model'

In [12]:
# Feature_Engineering.py - Complete Task 3

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer



In [8]:
# ============================================================
# Load raw data
# ============================================================
X, y = load_data()
print("Raw data shape:", X.shape)
print("Target distribution:\n", y.value_counts())

2025-12-30 00:21:35,339 - INFO - Loading data from C:\Users\User\Desktop\KAIM\Week_4\credit-risk-model\Data\data.csv...
2025-12-30 00:21:38,602 - INFO - Feature DataFrame shape: (3742, 13)
2025-12-30 00:21:38,608 - INFO - Target distribution: 0 (low risk)=3550, 1 (high risk)=192


Raw data shape: (3742, 13)
Target distribution:
 total_amount
0    3550
1     192
Name: count, dtype: int64


In [14]:
# ============================================================
# Load data
# ============================================================
DATA_PATH = r"C:\Users\User\Desktop\KAIM\Week_4\credit-risk-model\Data\data.csv"
df = pd.read_csv(DATA_PATH, parse_dates=['TransactionStartTime'])

# ============================================================
# Aggregate Features per Customer
# ============================================================
agg_features = (
    df.groupby("CustomerId")
      .agg(
          total_amount=pd.NamedAgg(column="Amount", aggfunc="sum"),
          avg_amount=pd.NamedAgg(column="Amount", aggfunc="mean"),
          transaction_count=pd.NamedAgg(column="Amount", aggfunc="count"),
          std_amount=pd.NamedAgg(column="Amount", aggfunc="std"),
          total_value=pd.NamedAgg(column="Value", aggfunc="sum"),
          avg_value=pd.NamedAgg(column="Value", aggfunc="mean"),
          std_value=pd.NamedAgg(column="Value", aggfunc="std")
      )
      .reset_index()
)

df = df.merge(agg_features, on="CustomerId", how="left")



In [15]:
# ============================================================
# Time-based Features
# ============================================================
df['transaction_hour'] = df['TransactionStartTime'].dt.hour
df['transaction_day'] = df['TransactionStartTime'].dt.day
df['transaction_month'] = df['TransactionStartTime'].dt.month
df['transaction_year'] = df['TransactionStartTime'].dt.year



In [16]:
# ============================================================
# Handle Missing Values
# ============================================================
numeric_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Impute numeric columns with median
num_imputer = SimpleImputer(strategy='median')
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

# Impute categorical columns with mode
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])



In [19]:
# ============================================================
# WoE and IV
# ============================================================

def calc_woe_iv(df, feature, target):
    eps = 0.0001
    grouped = df.groupby(feature)[target].agg(['count','sum'])
    grouped['good'] = grouped['count'] - grouped['sum']
    grouped['bad'] = grouped['sum']
    grouped['dist_good'] = grouped['good'] / grouped['good'].sum()
    grouped['dist_bad'] = grouped['bad'] / grouped['bad'].sum()
    grouped['woe'] = np.log((grouped['dist_good'] + eps) / (grouped['dist_bad'] + eps))
    grouped['iv'] = (grouped['dist_good'] - grouped['dist_bad']) * grouped['woe']
    iv = grouped['iv'].sum()
    woe_dict = grouped['woe'].to_dict()
    return woe_dict, iv

target_col = 'FraudResult'  # 

woe_cols = []
iv_dict = {}

for col in categorical_cols:
    woe_dict, iv = calc_woe_iv(df, col, target_col)
    df[col + "_woe"] = df[col].map(woe_dict)
    woe_cols.append(col + "_woe")
    iv_dict[col] = iv

print("IV values per categorical feature:")
print(iv_dict)



IV values per categorical feature:
{'TransactionId': np.float64(4.066381783677512), 'BatchId': np.float64(4.146530053337502), 'AccountId': np.float64(6.845183723260969), 'SubscriptionId': np.float64(6.844113209456131), 'CustomerId': np.float64(6.65362707926726), 'CurrencyCode': np.float64(0.0), 'ProviderId': np.float64(3.3227451830919335), 'ProductId': np.float64(3.868466779726453), 'ProductCategory': np.float64(1.063636562098841), 'ChannelId': np.float64(1.2231102812260919)}


In [20]:

# One-Hot Encoding for remaining non-WoE categorical columns
remaining_cat_cols = list(set(categorical_cols) - set([c.replace("_woe","") for c in woe_cols]))
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # updated parameter
ohe_cols = ohe.fit_transform(df[remaining_cat_cols])
ohe_df = pd.DataFrame(ohe_cols, columns=ohe.get_feature_names_out(remaining_cat_cols))
df = pd.concat([df.reset_index(drop=True), ohe_df.reset_index(drop=True)], axis=1)



In [21]:
# ============================================================
# Normalize/Standardize numeric features
# ============================================================
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(df[numeric_cols])
scaled_numeric_df = pd.DataFrame(scaled_numeric, columns=numeric_cols)
df.update(scaled_numeric_df)



In [23]:
# ============================================================
# Final clean dataset ready for modeling
# ============================================================
# Select columns for modeling
model_cols = numeric_cols + woe_cols + list(ohe_df.columns)
df_model = df[model_cols + [target_col]]


df_model.to_csv(r"C:\Users\User\Desktop\KAIM\Week_4\credit-risk-model\Data\cleaned_data.csv", index=False)

print("Feature engineering complete. Dataset ready for modeling.")


Feature engineering complete. Dataset ready for modeling.


In [24]:
# ============================================================
# Task 4 - Proxy Target Variable Engineering
# ============================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from datetime import datetime, timedelta


In [25]:
# ============================================================
# 1. Calculate RFM Metrics
# ============================================================

# Set a snapshot date for recency calculation
snapshot_date = df['TransactionStartTime'].max() + pd.Timedelta(days=1)

# Ensure TransactionStartTime is datetime
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Aggregate RFM per customer
rfm = df.groupby('CustomerId').agg(
    recency=('TransactionStartTime', lambda x: (snapshot_date - x.max()).days),
    frequency=('TransactionId', 'count'),
    monetary=('Amount', 'sum')
).reset_index()

In [26]:
# ============================================================
# 2. Scale RFM Features
# ============================================================

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['recency', 'frequency', 'monetary']])

# ============================================================
# 3. K-Means Clustering
# ============================================================

kmeans = KMeans(n_clusters=3, random_state=42)
rfm['cluster'] = kmeans.fit_predict(rfm_scaled)

[WinError 2] The system cannot find the file specified
  File "c:\Users\User\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\User\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\User\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [27]:
# ============================================================
# 4. Identify High-Risk Cluster
# ============================================================

# Compute mean RFM values per cluster
cluster_summary = rfm.groupby('cluster')[['recency', 'frequency', 'monetary']].mean()
# High-risk cluster typically has:
# - high recency (long time since last transaction)
# - low frequency
# - low monetary
high_risk_cluster = cluster_summary.sort_values(
    by=['recency', 'frequency', 'monetary'],
    ascending=[False, True, True]
).index[0]

# Assign binary target
rfm['is_high_risk'] = (rfm['cluster'] == high_risk_cluster).astype(int)

# ============================================================
# 5. Merge Target Variable Back Into Main DataFrame
# ============================================================

# Keep only CustomerId and is_high_risk
target_df = rfm[['CustomerId', 'is_high_risk']]

# Merge into main processed dataset
df = df.merge(target_df, on='CustomerId', how='left')

# ============================================================
# 6. Optional: Save processed dataset
# ============================================================

df.to_csv("processed_data_with_target.csv", index=False)

print("Proxy target variable 'is_high_risk' created and merged successfully.")

Proxy target variable 'is_high_risk' created and merged successfully.
