## Data Preprocessing & Feature Engineering

### Objective
This notebook prepares the credit card dataset for machine learning by:

- Cleaning and renaming columns  
- Encoding categorical variables  
- Creating meaningful financial risk features  
- Performing trainâ€“test split with stratification  
- Scaling numerical features  
- Saving processed data for model training  

In [18]:
# Importing Module
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [19]:
# Loading Dataset 
file_path="../data/credit_card_default_dataset.csv"

df=pd.read_csv(file_path,header=1)
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [20]:
df=df.rename(columns={'default payment next month':'Default'})

In [21]:
df.drop('ID',axis=1,inplace=True)

In [22]:
cols=['SEX','EDUCATION','MARRIAGE']
df=pd.get_dummies(df,columns=cols,drop_first=True,dtype=int)


In [25]:
# Average Bill Per Customer
bill_cols=[f'BILL_AMT{i}' for i in range(1,7)]
df['AVG_BILL_AMT']=df[bill_cols].mean(axis=1)
df['AVG_BILL_AMT']

0          1284.000000
1          2846.166667
2         16942.166667
3         38555.666667
4         18223.166667
             ...      
29995    120891.500000
29996      3530.333333
29997     11749.333333
29998     44435.166667
29999     38479.000000
Name: AVG_BILL_AMT, Length: 30000, dtype: float64

In [26]:
#Credit Utility Ratio
df['CREDIT_UTILITY']=df['AVG_BILL_AMT']/df['LIMIT_BAL']
df["CREDIT_UTILITY"]

0        0.064200
1        0.023718
2        0.188246
3        0.771113
4        0.364463
           ...   
29995    0.549507
29996    0.023536
29997    0.391644
29998    0.555440
29999    0.769580
Name: CREDIT_UTILITY, Length: 30000, dtype: float64

In [28]:
# Average Payment Amount
pay_col=[f'PAY_AMT{i}' for i in range(1,7)]
df['AVG_PAY_AMT']=df[pay_col].mean(axis=1)
df['AVG_PAY_AMT']

0          114.833333
1          833.333333
2         1836.333333
3         1398.000000
4         9841.500000
             ...     
29995     7091.666667
29996     2415.000000
29997     5216.666667
29998    24530.166667
29999     1384.666667
Name: AVG_PAY_AMT, Length: 30000, dtype: float64

In [33]:
# Average Payment Delay
pay_delay_col=["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
df['AVG_PAY_DELAY']=df[pay_delay_col].clip(lower=0).mean(axis=1)
df['AVG_PAY_DELAY']

0        0.666667
1        0.666667
2        0.000000
3        0.000000
4        0.000000
           ...   
29995    0.000000
29996    0.000000
29997    1.500000
29998    0.166667
29999    0.000000
Name: AVG_PAY_DELAY, Length: 30000, dtype: float64

In [34]:
# Payment-to-bill Ratio
df['PAYMENT_TO_BILL']=df['AVG_PAY_AMT']/(df['AVG_BILL_AMT']+1)
df['PAYMENT_TO_BILL']

0        0.089364
1        0.292689
2        0.108382
3        0.036258
4        0.540025
           ...   
29995    0.058661
29996    0.683878
29997    0.443959
29998    0.552032
29999    0.035984
Name: PAYMENT_TO_BILL, Length: 30000, dtype: float64

In [36]:
# Separating Dependent and Independent Columns
X=df.drop(columns=['Default'])
Y=df['Default']

In [40]:
# Train-Test Split
X_Train,X_Test,Y_Train,Y_Test=train_test_split(X,Y,test_size=0.2,random_state=42,stratify=Y)
print("Train shape:", X_Train.shape)
print("Test shape:", X_Test.shape)

Train shape: (24000, 35)
Test shape: (6000, 35)


In [41]:
# Scaling The DataFrame
scaler=StandardScaler()
X_Train_Scaled=scaler.fit(X_Train)
X_Test_Scaled=scaler.fit(X_Test)

In [42]:
# Saving The Process Data
joblib.dump((X_Train_Scaled, X_Test_Scaled, Y_Train, Y_Test),"data/processed_data.pkl")
joblib.dump(scaler, "models/scaler.pkl")

print("Processed data and scaler saved.")


FileNotFoundError: [Errno 2] No such file or directory: 'data/processed_data.pkl'