In [6]:
import numpy as np
from pathlib import Path
import pandas as pd
OUT_DIR = Path(r"C:\BFSI\model_splits")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# parameters: fractions for fraud distribution
fraud_train_frac = 0.80
fraud_val_frac   = 0.10  # fraud_test_frac = 0.10 implicitly

# ensure df sorted by timestamp
df_sorted = df.sort_values("Transaction_Timestamp").reset_index(drop=True)

# extract fraud timestamps
if "isFraud" not in df_sorted.columns:
    raise ValueError("isFraud column not found")

fraud_ts = df_sorted.loc[df_sorted["isFraud"] == 1, "Transaction_Timestamp"]

if len(fraud_ts) < 3:
    raise ValueError("Too few fraud records to split across train/val/test. Consider stratified split or collect more data.")

# compute fraud-quantile cutoffs (these are timestamps)
cut_train = fraud_ts.quantile(fraud_train_frac)
cut_val   = fraud_ts.quantile(fraud_train_frac + fraud_val_frac)

print("Fraud-based cutoffs:", cut_train, cut_val)

# build splits by timestamp using those cutoffs (temporal, but ensuring fraud presence)
train_df = df_sorted[df_sorted["Transaction_Timestamp"] <= cut_train].copy()
val_df   = df_sorted[(df_sorted["Transaction_Timestamp"] > cut_train) & (df_sorted["Transaction_Timestamp"] <= cut_val)].copy()
test_df  = df_sorted[df_sorted["Transaction_Timestamp"] > cut_val].copy()

print("Sizes after fraud-based temporal split:", train_df.shape, val_df.shape, test_df.shape)
print("Fraud counts in splits:", train_df["isFraud"].sum(), val_df["isFraud"].sum(), test_df["isFraud"].sum())

# If any split has zero frauds (rare), fallback to simple approach below or adjust fractions
if val_df["isFraud"].sum() == 0 or test_df["isFraud"].sum() == 0:
    print("Warning: one of the splits still has zero frauds. Consider lowering fraud_train_frac or using stratified split.")

# Prepare X/y and drop ID columns
id_cols = [c for c in ["Transaction_ID","User_ID","Merchant_ID","Device_ID"] if c in df_sorted.columns]
target = "isFraud"

X_train = train_df.drop(columns=id_cols + [target])
y_train = train_df[target].astype(int)
X_val   = val_df.drop(columns=id_cols + [target])
y_val   = val_df[target].astype(int)
X_test  = test_df.drop(columns=id_cols + [target])
y_test  = test_df[target].astype(int)

# Save as CSV for reproducibility
X_train.to_csv(OUT_DIR / "X_train_fraudcut.csv", index=False)
y_train.to_csv(OUT_DIR / "y_train_fraudcut.csv", index=False)
X_val.to_csv(OUT_DIR / "X_val_fraudcut.csv", index=False)
y_val.to_csv(OUT_DIR / "y_val_fraudcut.csv", index=False)
X_test.to_csv(OUT_DIR / "X_test_fraudcut.csv", index=False)
y_test.to_csv(OUT_DIR / "y_test_fraudcut.csv", index=False)

print("Saved splits to", OUT_DIR)


Fraud-based cutoffs: 2024-01-28 18:39:12 2024-02-01 05:59:06
Sizes after fraud-based temporal split: (40000, 19) (5000, 19) (55000, 19)
Fraud counts in splits: 40000 5000 5000
Saved splits to C:\BFSI\model_splits
