In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load the cleaned transactions table that every later cell extends in place.
clean_data_path = '../data/creditcard_clean.csv'
df = pd.read_csv(clean_data_path)

# Feature Engineering

In [None]:
# Time based features
# `txn_hour` and `txn_day` are elapsed whole hours / days derived from
# `txn_timestamp_sec` by integer division; they are offsets from the timestamp
# origin, not calendar values.
df["txn_hour"] = (df["txn_timestamp_sec"] // 3600).astype(int)
df["txn_day"] = (df["txn_hour"] // 24).astype(int)
df["hour_of_day"] = df["txn_hour"] % 24

# Night = 12 AM to 5 AM (hours 0-5 inclusive, so 05:00-05:59 counts as night).
# NOTE(review): assumes timestamp 0 falls on midnight — confirm against the data source.
df["is_night"] = df["hour_of_day"].isin([0, 1, 2, 3, 4, 5]).astype(int)

# Weekend simulation: in every 7-day cycle, days 5 and 6 are the weekend.
# The day index must be reduced modulo 7 first; comparing the raw elapsed-day
# count against [5, 6] would only ever flag the 6th and 7th day of the dataset.
# NOTE(review): the real weekday of day 0 is not known here, so this is a
# synthetic weekly pattern rather than a true calendar weekend.
df["is_weekend"] = (df["txn_day"] % 7).isin([5, 6]).astype(int)


In [6]:
# Amount based features
txn_amount = df["ach_amount"]

# log(1 + amount) compresses the long right tail of the raw amounts.
df["log_amount"] = np.log1p(txn_amount)

# Standardise the amount against the mean and (sample) standard deviation
# of the whole column.
amount_mean = txn_amount.mean()
amount_std = txn_amount.std()
df["zscore_amount"] = (txn_amount - amount_mean) / amount_std

# Decile bucket of the amount as an integer code; duplicate bin edges are
# dropped, so fewer than 10 distinct bins are possible.
df["amount_bin"] = pd.qcut(txn_amount, q=10, labels=False, duplicates="drop")


In [7]:
# Embedding V1-V28 features
# Column names embedding_feature_1 ... embedding_feature_28.
embedding_cols = ["embedding_feature_%d" % idx for idx in range(1, 29)]

# Row-wise summary statistics across the 28 embedding columns.
embedding_block = df[embedding_cols]
for stat_name in ("mean", "std", "max", "min"):
    df["embedding_" + stat_name] = getattr(embedding_block, stat_name)(axis=1)

# Sum of absolute values across the embedding columns, per row.
df["embedding_abs_sum"] = embedding_block.abs().sum(axis=1)


In [8]:
# Customer behavior features
# Order rows by customer and then by time so that the per-customer windows
# and diffs below run chronologically.
df = df.sort_values(["customer_id", "txn_timestamp_sec"])


def _rolling_24h_stats(customer_txns):
    """Trailing-24h transaction count and amount sum for one customer.

    Parameters
    ----------
    customer_txns : pandas.DataFrame
        Rows of a single customer with columns ``txn_timestamp_sec`` and
        ``ach_amount``, already sorted by ``txn_timestamp_sec``.

    Returns
    -------
    pandas.DataFrame
        Columns ``count_last_24h`` and ``amount_sum_last_24h``, indexed like
        ``customer_txns``. Each window covers the 24 hours up to and including
        the row's own timestamp, so the current transaction is always counted.
    """
    # Time-offset rolling windows need a datetime-like index; the seconds are
    # only used as offsets, so the epoch chosen by to_datetime is irrelevant.
    txn_times = pd.to_datetime(customer_txns["txn_timestamp_sec"].to_numpy(), unit="s")
    by_time = pd.DataFrame(
        {
            "txn_timestamp_sec": customer_txns["txn_timestamp_sec"].to_numpy(),
            "ach_amount": customer_txns["ach_amount"].to_numpy(),
        },
        index=txn_times,
    )
    window = by_time.rolling("24h")
    return pd.DataFrame(
        {
            "count_last_24h": window["txn_timestamp_sec"].count().to_numpy(),
            "amount_sum_last_24h": window["ach_amount"].sum().to_numpy(),
        },
        index=customer_txns.index,
    )


# Rolling 24h window (1 day), per customer.
# A fixed-size row window (e.g. the last 50 rows) is not a 24-hour window, so
# the window is defined by elapsed time instead.
rolling_24h = (
    df.groupby("customer_id", group_keys=False)[["txn_timestamp_sec", "ach_amount"]]
    .apply(_rolling_24h_stats)
)
# Series assignment aligns on the row index; this relies on df having a unique
# index (true for the default index produced by read_csv).
df["count_last_24h"] = rolling_24h["count_last_24h"]
df["amount_sum_last_24h"] = rolling_24h["amount_sum_last_24h"]

# Average amount per customer
# NOTE(review): the mean is taken over all of the customer's rows, including
# transactions later than the current one — confirm this look-ahead is acceptable.
df["avg_amount_per_customer"] = df.groupby("customer_id")["ach_amount"].transform("mean")

# Frequency per customer
# Despite the column name, this is the gap since the customer's previous
# transaction, in the units of txn_timestamp_sec; the first transaction of
# each customer gets 0.
df["txn_frequency"] = df.groupby("customer_id")["txn_timestamp_sec"].transform(
    lambda x: x.diff().fillna(0)
)


In [9]:
# Encoding categorical fields
# One-hot encode the two categorical columns; drop_first removes the first
# level of each so the remaining indicator columns are not collinear.
# Re-running this cell fails because the source columns are consumed.
categorical_cols = ["txn_type", "channel"]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [10]:
# Final feature list
# get_dummies names its outputs "<source column>_<level>", so the one-hot
# columns are selected by prefix. A substring test would also pick up any
# unrelated column that merely contains "txn_type_" or "channel_".
ohe_prefixes = ("txn_type_", "channel_")
ohe_cols = [col for col in df.columns if col.startswith(ohe_prefixes)]

feature_cols = (
    embedding_cols +
    [
        "ach_amount", "log_amount", "zscore_amount", "amount_bin",
        "txn_hour", "txn_day", "hour_of_day", "is_night", "is_weekend",
        "embedding_mean", "embedding_std", "embedding_max", "embedding_min", "embedding_abs_sum",
        "count_last_24h", "amount_sum_last_24h", "avg_amount_per_customer", "txn_frequency"
    ] +
    # Add OHE categorical columns:
    ohe_cols
)


In [11]:
# Name of the column intended as the prediction target; it is only defined
# here and is not referenced again in the visible cells.
target_col: str = "fraud_label"


In [12]:
# Persist the engineered table without the row index. The whole frame is
# written, not only the columns listed in feature_cols.
features_path = "../data/creditcard_features.csv"
df.to_csv(features_path, index=False)