### PART A — SIMPLE FEATURE ENGINEERING

In [1]:
# Load dataset
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',None)
df = pd.read_csv("../data/processed/credit_default.csv")

# PAY_X (delay) features: per-row summaries over the six repayment-status columns
pay_cols = ['PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']

df['max_delay'] = df[pay_cols].max(axis=1)
df['avg_delay'] = df[pay_cols].mean(axis=1)
# Number of months with a positive status code. Vectorised comparison + sum gives
# the same integers as a row-wise apply/lambda without a Python call per row.
df['delay_count'] = (df[pay_cols] > 0).sum(axis=1)

# BILL_AMT features
bill_cols = [f'BILL_AMT{i}' for i in range(1,7)]

df['total_bill_6m'] = df[bill_cols].sum(axis=1)
df['avg_bill_6m'] = df[bill_cols].mean(axis=1)
# Simple difference between the first and last bill columns (not a fitted slope)
df['bill_trend'] = df['BILL_AMT1'] - df['BILL_AMT6']

# PAY_AMT features
pay_amt_cols = [f'PAY_AMT{i}' for i in range(1,7)]

df['total_pay_6m'] = df[pay_amt_cols].sum(axis=1)
df['avg_pay_6m'] = df[pay_amt_cols].mean(axis=1)

# Ratios
# A denominator that is exactly zero would put +/-inf into the frame (and into the
# saved CSV); those rows are set to NaN instead. Every other row is unchanged.
# Note: utilization here is the *sum* of six bills over the limit, so values
# well above 1 are expected.
limit_bal = df['LIMIT_BAL']
df['utilization_ratio'] = (df['total_bill_6m'] / limit_bal).where(limit_bal != 0)

# The +1 only protects against a bill total of exactly 0. If bill totals can be
# negative (e.g. credit balances -- TODO confirm for this dataset), a total of -1
# makes the denominator 0, so the zero case is masked explicitly as well.
payment_denominator = df['total_bill_6m'] + 1
df['payment_ratio'] = (df['total_pay_6m'] / payment_denominator).where(payment_denominator != 0)


# Drop unusable column
df = df.drop(columns=['ID'])

df.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,max_delay,avg_delay,delay_count,total_bill_6m,avg_bill_6m,bill_trend,total_pay_6m,avg_pay_6m,utilization_ratio,payment_ratio
0,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1,2,-0.333333,2,7704,1284.0,3913,689,114.833333,0.3852,0.089422
1,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1,2,0.5,2,17077,2846.166667,-579,5000,833.333333,0.142308,0.292774
2,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0,0,0.0,0,101653,16942.166667,13690,11018,1836.333333,1.129478,0.108387
3,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0,0,0.0,0,231334,38555.666667,17443,8388,1398.0,4.62668,0.036259
4,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0,0,-0.333333,0,109339,18223.166667,-10514,59049,9841.5,2.18678,0.540049


In [8]:
# Write the Part A feature table to disk (row index omitted), then confirm.
df.to_csv(
    path_or_buf="../data/processed/credit_default_engineered.csv",
    index=False,
)
print("Saved engineered dataset successfully!")

Saved engineered dataset successfully!


### PART B — ADVANCED FEATURE ENGINEERING

In [4]:
import numpy as np
import pandas as pd

def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, with NaN where the denominator is exactly 0.

    Plain pandas division yields +/-inf for x / 0; masking those rows keeps
    infinities out of the engineered features. Rows with a non-zero
    denominator are returned unchanged.
    """
    return (numerator / denominator).where(denominator != 0)


df_fe = pd.read_csv("../data/processed/credit_default.csv")
# 1. Payment delay features (how late the person pays)
pay_cols = ['PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']

df_fe['avg_delay'] = df_fe[pay_cols].mean(axis=1)
df_fe['max_delay'] = df_fe[pay_cols].max(axis=1)
df_fe['min_delay'] = df_fe[pay_cols].min(axis=1)
df_fe['num_delays'] = (df_fe[pay_cols] > 0).sum(axis=1)
df_fe['num_severe_delays'] = (df_fe[pay_cols] >= 2).sum(axis=1)

# 2. Bill amount behavior (spending pattern)
bill_cols = ['BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6']

df_fe['avg_bill'] = df_fe[bill_cols].mean(axis=1)
df_fe['max_bill'] = df_fe[bill_cols].max(axis=1)
df_fe['min_bill'] = df_fe[bill_cols].min(axis=1)
df_fe['bill_std'] = df_fe[bill_cols].std(axis=1)
# NOTE(review): if BILL_AMT1 is the most recent month and BILL_AMT6 the oldest
# (as in the public UCI credit-default data -- TODO confirm for this file), then
# bill_growth and the fitted slope below are positive when balances are *falling*
# over time. Part A's bill_trend (BILL_AMT1 - BILL_AMT6) has the opposite sign.
df_fe['bill_growth'] = df_fe['BILL_AMT6'] - df_fe['BILL_AMT1']

# Slope (trend) of bills (linear regression on bill amounts).
# polyfit fits every customer at once: x is the column position 0..5, y has one
# column per customer, and row 0 of the result holds the degree-1 coefficients.
bill_matrix = df_fe[bill_cols].values
df_fe['bill_trend'] = np.polyfit(np.arange(len(bill_cols)), bill_matrix.T, 1)[0]

# 3. Payment amount behavior (how much they repay)
pay_amt_cols = ['PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']

df_fe['avg_pay_amt'] = df_fe[pay_amt_cols].mean(axis=1)
df_fe['max_pay_amt'] = df_fe[pay_amt_cols].max(axis=1)
df_fe['min_pay_amt'] = df_fe[pay_amt_cols].min(axis=1)
df_fe['pay_amt_std'] = df_fe[pay_amt_cols].std(axis=1)
df_fe['pay_amt_growth'] = df_fe['PAY_AMT6'] - df_fe['PAY_AMT1']

# 4. Utilization & ratios (key credit risk features)
# The +1 offsets only guard against a value of exactly 0; a value of -1 still
# produces a zero denominator, so each ratio goes through _safe_ratio.
df_fe['utilization_ratio'] = _safe_ratio(df_fe['avg_bill'], df_fe['LIMIT_BAL'] + 1)
df_fe['repayment_ratio'] = _safe_ratio(df_fe['avg_pay_amt'], df_fe['avg_bill'] + 1)
# Despite the name, no income is involved: this is credit limit over the BILL_AMT1 bill.
df_fe['income_to_limit'] = _safe_ratio(df_fe['LIMIT_BAL'], df_fe['BILL_AMT1'] + 1)

# 5. Volatility (Risky customers have unstable behavior)
# Same row-wise standard deviations as bill_std / pay_amt_std, reused rather than
# recomputed. The duplicate columns are kept so the output schema is unchanged.
df_fe['bill_volatility'] = df_fe['bill_std']
df_fe['payment_volatility'] = df_fe['pay_amt_std']

# 6. Debt acceleration (fast-increasing debt)
# Difference between the change across the last two bill columns and the change
# across the first two.
df_fe['debt_acceleration'] = (df_fe['BILL_AMT6'] - df_fe['BILL_AMT5']) - (df_fe['BILL_AMT2'] - df_fe['BILL_AMT1'])

# 7. Total features
df_fe['total_bill_6m'] = df_fe[bill_cols].sum(axis=1)
df_fe['total_pay_6m'] = df_fe[pay_amt_cols].sum(axis=1)
df_fe['difference_bill_pay'] = df_fe['total_bill_6m'] - df_fe['total_pay_6m']

# 8. Remove column
df_fe = df_fe.drop(columns=['ID'])

print("Advanced FE Completed. New Shape:", df_fe.shape)


Advanced FE Completed. New Shape: (30000, 49)


In [5]:
# Persist the Part B feature table (row index omitted) and report where it went.
df_fe.to_csv(
    path_or_buf="../data/processed/credit_final_engineered.csv",
    index=False,
)
print("Saved at data/processed/credit_final_engineered.csv")


Saved at data/processed/credit_final_engineered.csv
