In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# For handling imbalanced data
from imblearn.over_sampling import SMOTE
from collections import Counter

# Utilities
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the data
df = pd.read_csv('../data/Bank_Transaction_Fraud_Detection.csv')

print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")
print(f"Fraud cases: {df['Is_Fraud'].sum():,} ({df['Is_Fraud'].mean()*100:.2f}%)")

# Display first few rows
df.head()

Dataset loaded: 200,000 rows, 24 columns
Fraud cases: 10,088 (5.04%)


Unnamed: 0,Customer_ID,Customer_Name,Gender,Age,State,City,Bank_Branch,Account_Type,Transaction_ID,Transaction_Date,Transaction_Time,Transaction_Amount,Merchant_ID,Transaction_Type,Merchant_Category,Account_Balance,Transaction_Device,Transaction_Location,Device_Type,Is_Fraud,Transaction_Currency,Customer_Contact,Transaction_Description,Customer_Email
0,d5f6ec07-d69e-4f47-b9b4-7c58ff17c19e,Osha Tella,Male,60,Kerala,Thiruvananthapuram,Thiruvananthapuram Branch,Savings,4fa3208f-9e23-42dc-b330-844829d0c12c,23-01-2025,16:04:07,32415.45,214e03c5-5c34-40d1-a66c-f440aa2bbd02,Transfer,Restaurant,74557.27,Voice Assistant,"Thiruvananthapuram, Kerala",POS,0,INR,+9198579XXXXXX,Bitcoin transaction,oshaXXXXX@XXXXX.com
1,7c14ad51-781a-4db9-b7bd-67439c175262,Hredhaan Khosla,Female,51,Maharashtra,Nashik,Nashik Branch,Business,c9de0c06-2c4c-40a9-97ed-3c7b8f97c79c,11-01-2025,17:14:53,43622.6,f9e3f11f-28d3-4199-b0ca-f225a155ede6,Bill Payment,Restaurant,74622.66,POS Mobile Device,"Nashik, Maharashtra",Desktop,0,INR,+9191074XXXXXX,Grocery delivery,hredhaanXXXX@XXXXXX.com
2,3a73a0e5-d4da-45aa-85f3-528413900a35,Ekani Nazareth,Male,20,Bihar,Bhagalpur,Bhagalpur Branch,Savings,e41c55f9-c016-4ff3-872b-cae72467c75c,25-01-2025,03:09:52,63062.56,97977d83-5486-4510-af1c-8dada3e1cfa0,Bill Payment,Groceries,66817.99,ATM,"Bhagalpur, Bihar",Desktop,0,INR,+9197745XXXXXX,Mutual fund investment,ekaniXXX@XXXXXX.com
3,7902f4ef-9050-4a79-857d-9c2ea3181940,Yamini Ramachandran,Female,57,Tamil Nadu,Chennai,Chennai Branch,Business,7f7ee11b-ff2c-45a3-802a-49bc47c02ecb,19-01-2025,12:27:02,14000.72,f45cd6b3-5092-44d0-8afb-490894605184,Debit,Entertainment,58177.08,POS Mobile App,"Chennai, Tamil Nadu",Mobile,0,INR,+9195889XXXXXX,Food delivery,yaminiXXXXX@XXXXXXX.com
4,3a4bba70-d9a9-4c5f-8b92-1735fd8c19e9,Kritika Rege,Female,43,Punjab,Amritsar,Amritsar Branch,Savings,f8e6ac6f-81a1-4985-bf12-f60967d852ef,30-01-2025,18:30:46,18335.16,70dd77dd-3b00-4b2c-8ebc-cfb8af5f6741,Transfer,Entertainment,16108.56,Virtual Card,"Amritsar, Punjab",Mobile,0,INR,+9195316XXXXXX,Debt repayment,kritikaXXXX@XXXXXX.com


In [3]:
print("="*80)
print("FEATURE ENGINEERING: TIME-BASED FEATURES")
print("="*80)

# Parse datetime
df['Transaction_DateTime'] = pd.to_datetime(df['Transaction_Date'] + ' ' + df['Transaction_Time'], 
                                             format='%d-%m-%Y %H:%M:%S')

# Extract time components
df['Hour'] = df['Transaction_DateTime'].dt.hour
df['Day_of_Week'] = df['Transaction_DateTime'].dt.dayofweek  # 0=Monday, 6=Sunday
df['Day'] = df['Transaction_DateTime'].dt.day
df['Month'] = df['Transaction_DateTime'].dt.month
df['Year'] = df['Transaction_DateTime'].dt.year

# Create time-based categories
df['Is_Weekend'] = df['Day_of_Week'].apply(lambda x: 1 if x >= 5 else 0)
df['Is_Night'] = df['Hour'].apply(lambda x: 1 if x >= 22 or x <= 6 else 0)
df['Is_Business_Hours'] = df['Hour'].apply(lambda x: 1 if 9 <= x <= 17 else 0)

print("Time-based features created:")
print("  ‚Ä¢ Hour (0-23)")
print("  ‚Ä¢ Day_of_Week (0=Mon, 6=Sun)")
print("  ‚Ä¢ Day, Month, Year")
print("  ‚Ä¢ Is_Weekend (0/1)")
print("  ‚Ä¢ Is_Night (0/1)")
print("  ‚Ä¢ Is_Business_Hours (0/1)")

FEATURE ENGINEERING: TIME-BASED FEATURES
Time-based features created:
  ‚Ä¢ Hour (0-23)
  ‚Ä¢ Day_of_Week (0=Mon, 6=Sun)
  ‚Ä¢ Day, Month, Year
  ‚Ä¢ Is_Weekend (0/1)
  ‚Ä¢ Is_Night (0/1)
  ‚Ä¢ Is_Business_Hours (0/1)


In [4]:
print("="*80)
print("FEATURE ENGINEERING: TRANSACTION-BASED FEATURES")
print("="*80)

# Transaction to Balance Ratio
df['Transaction_to_Balance_Ratio'] = df['Transaction_Amount'] / df['Account_Balance']

# Amount Category (using fixed thresholds - no leakage)
df['Amount_Category'] = pd.cut(df['Transaction_Amount'], 
                                bins=[0, 25000, 50000, 75000, 100000],
                                labels=['Low', 'Medium', 'High', 'Very High'])

print("Transaction features created:")
print(f"  ‚Ä¢ Transaction_to_Balance_Ratio")
print(f"  ‚Ä¢ Amount_Category (Low/Medium/High/Very High)")
print("\nNote: Is_High_Value and Is_Low_Balance will be created AFTER train-test split")
print("      to prevent data leakage from quantile calculations")

# Display sample
print("\nSample of new features:")
print(df[['Transaction_Amount', 'Account_Balance', 'Transaction_to_Balance_Ratio', 
          'Amount_Category']].head(10))

FEATURE ENGINEERING: TRANSACTION-BASED FEATURES
Transaction features created:
  ‚Ä¢ Transaction_to_Balance_Ratio
  ‚Ä¢ Amount_Category (Low/Medium/High/Very High)

Note: Is_High_Value and Is_Low_Balance will be created AFTER train-test split
      to prevent data leakage from quantile calculations

Sample of new features:
   Transaction_Amount  Account_Balance  Transaction_to_Balance_Ratio  \
0            32415.45         74557.27                      0.434772   
1            43622.60         74622.66                      0.584576   
2            63062.56         66817.99                      0.943796   
3            14000.72         58177.08                      0.240657   
4            18335.16         16108.56                      1.138225   
5             9711.15         61258.85                      0.158526   
6            94677.01         36313.61                      2.607205   
7            67704.28         16948.73                      3.994652   
8            72953.45       

In [5]:
print("="*80)
print("FEATURE SELECTION")
print("="*80)

# Features to DROP (not useful for modeling)
drop_features = [
    'Customer_ID',           # Unique identifier
    'Customer_Name',         # Personal info
    'Transaction_ID',        # Unique identifier
    'Merchant_ID',           # Too many unique values
    'Transaction_Date',      # Already extracted features
    'Transaction_Time',      # Already extracted features
    'Transaction_DateTime',  # Already extracted features
    'Transaction_Location',  # Too many unique values (can use State/City instead)
    'Customer_Contact',      # Personal info
    'Customer_Email',        # Personal info
    'Transaction_Currency',  # All same (INR)
    'Bank_Branch',          # High cardinality, use State/City instead
]

# Categorical features (need encoding)
categorical_features = [
    'Gender',
    'State',
    'City', 
    'Account_Type',
    'Transaction_Type',
    'Merchant_Category',
    'Transaction_Device',
    'Device_Type',
    'Transaction_Description',
    'Amount_Category'
]

# Numerical features
numerical_features = [
    'Age',
    'Transaction_Amount',
    'Account_Balance',
    'Hour',
    'Day_of_Week',
    'Day',
    'Month',
    'Transaction_to_Balance_Ratio',
    'Is_Weekend',
    'Is_Night',
    'Is_Business_Hours',
    'Is_High_Value',
    'Is_Low_Balance'
]

# Target variable
target = 'Is_Fraud'

print(f"Features to DROP: {len(drop_features)}")
print(f"Categorical features: {len(categorical_features)}")
print(f"Numerical features: {len(numerical_features)}")
print(f"Target: {target}")
print(f"\nTotal features for modeling: {len(categorical_features) + len(numerical_features)}")

FEATURE SELECTION
Features to DROP: 12
Categorical features: 10
Numerical features: 13
Target: Is_Fraud

Total features for modeling: 23


In [6]:
print("="*80)
print("HANDLING HIGH CARDINALITY FEATURES")
print("="*80)

# Check cardinality of categorical features
print("Unique values in categorical features:")
for col in categorical_features:
    print(f"  ‚Ä¢ {col}: {df[col].nunique()} unique values")

# Identify high cardinality features (we'll encode these AFTER train/test split)
high_cardinality_features = ['State', 'City', 'Transaction_Description']

print(f"\nHigh cardinality features identified: {high_cardinality_features}")
print("These will be frequency-encoded AFTER train/test split to prevent data leakage")

# Remove original high cardinality features from categorical list
# (They'll be encoded later)
categorical_features = [f for f in categorical_features if f not in high_cardinality_features]

print(f"\n‚úì Categorical features for one-hot encoding: {len(categorical_features)}")
print(f"‚úì Features to frequency-encode later: {len(high_cardinality_features)}")

HANDLING HIGH CARDINALITY FEATURES
Unique values in categorical features:
  ‚Ä¢ Gender: 2 unique values
  ‚Ä¢ State: 34 unique values
  ‚Ä¢ City: 145 unique values
  ‚Ä¢ Account_Type: 3 unique values
  ‚Ä¢ Transaction_Type: 5 unique values
  ‚Ä¢ Merchant_Category: 6 unique values
  ‚Ä¢ Transaction_Device: 20 unique values
  ‚Ä¢ Device_Type: 4 unique values
  ‚Ä¢ Transaction_Description: 172 unique values
  ‚Ä¢ Amount_Category: 4 unique values

High cardinality features identified: ['State', 'City', 'Transaction_Description']
These will be frequency-encoded AFTER train/test split to prevent data leakage

‚úì Categorical features for one-hot encoding: 7
‚úì Features to frequency-encode later: 3


In [7]:
print("="*80)
print("ENCODING CATEGORICAL VARIABLES")
print("="*80)

# Create a copy for preprocessing
df_processed = df.copy()

# One-Hot Encoding for low cardinality categorical features ONLY
print(f"\nCategorical features to encode: {categorical_features}")
print(f"Number of features: {len(categorical_features)}")

# Apply One-Hot Encoding (State, City, Transaction_Description will stay as-is for now)
df_encoded = pd.get_dummies(df_processed, columns=categorical_features, drop_first=True, dtype=int)

print(f"\n‚úì One-Hot Encoding applied")
print(f"Shape before encoding: {df_processed.shape}")
print(f"Shape after encoding: {df_encoded.shape}")
print(f"New features created: {df_encoded.shape[1] - df_processed.shape[1]}")

# Verify high cardinality features are still present
print(f"\n‚úì High cardinality features preserved for later encoding:")
for col in high_cardinality_features:
    if col in df_encoded.columns:
        print(f"  ‚Ä¢ {col}: Present")

ENCODING CATEGORICAL VARIABLES

Categorical features to encode: ['Gender', 'Account_Type', 'Transaction_Type', 'Merchant_Category', 'Transaction_Device', 'Device_Type', 'Amount_Category']
Number of features: 7

‚úì One-Hot Encoding applied
Shape before encoding: (200000, 35)
Shape after encoding: (200000, 65)
New features created: 30

‚úì High cardinality features preserved for later encoding:
  ‚Ä¢ State: Present
  ‚Ä¢ City: Present
  ‚Ä¢ Transaction_Description: Present


In [8]:
print("="*80)
print("PREPARING FEATURES AND TARGET")
print("="*80)

# Drop unnecessary columns
df_model = df_encoded.drop(columns=drop_features, errors='ignore')

# Separate features (X) and target (y)
# NOTE: State, City, Transaction_Description are still in X at this point
X = df_model.drop(columns=[target])
y = df_model[target]

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"\nTarget distribution:")
print(y.value_counts())
print(f"Fraud percentage: {y.mean()*100:.2f}%")

# Verify high cardinality features are present
print(f"\n‚úì High cardinality features still in X (will be encoded after split):")
for col in high_cardinality_features:
    if col in X.columns:
        print(f"  ‚Ä¢ {col}: {X[col].nunique()} unique values")

print(f"\nTotal features: {X.shape[1]}")
print("Next step: Train-test split, then frequency encoding")

PREPARING FEATURES AND TARGET
Features (X) shape: (200000, 52)
Target (y) shape: (200000,)

Target distribution:
Is_Fraud
0    189912
1     10088
Name: count, dtype: int64
Fraud percentage: 5.04%

‚úì High cardinality features still in X (will be encoded after split):
  ‚Ä¢ State: 34 unique values
  ‚Ä¢ City: 145 unique values
  ‚Ä¢ Transaction_Description: 172 unique values

Total features: 52
Next step: Train-test split, then frequency encoding


In [9]:
print("="*80)
print("TRAIN-TEST SPLIT (Hold-out Test Set)")
print("="*80)

# Split: 80% for K-Fold CV, 20% held-out for final testing
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, 
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Total samples: {len(X):,}")
print(f"\nData for K-Fold CV: {X_train_full.shape[0]:,} samples ({X_train_full.shape[0]/len(X)*100:.0f}%)")
print(f"Hold-out test set: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.0f}%)")

print(f"\n{'='*50}")
print("K-Fold CV Data (Training Portion):")
print(f"{'='*50}")
print(f"Total samples: {len(y_train_full):,}")
print(y_train_full.value_counts())
print(f"Fraud rate: {y_train_full.mean()*100:.2f}%")

print(f"\n{'='*50}")
print("Hold-out Test Set (Final Evaluation):")
print(f"{'='*50}")
print(f"Total samples: {len(y_test):,}")
print(y_test.value_counts())
print(f"Fraud rate: {y_test.mean()*100:.2f}%")

print(f"\n{'='*50}")
print("Stratification Verification:")
print(f"{'='*50}")
print(f"Original fraud rate:    {y.mean()*100:.2f}%")
print(f"K-Fold data fraud rate: {y_train_full.mean()*100:.2f}%")
print(f"Test set fraud rate:    {y_test.mean()*100:.2f}%")
print("\n‚úì Stratification successful - fraud rates match!")

TRAIN-TEST SPLIT (Hold-out Test Set)
Total samples: 200,000

Data for K-Fold CV: 160,000 samples (80%)
Hold-out test set: 40,000 samples (20%)

K-Fold CV Data (Training Portion):
Total samples: 160,000
Is_Fraud
0    151930
1      8070
Name: count, dtype: int64
Fraud rate: 5.04%

Hold-out Test Set (Final Evaluation):
Total samples: 40,000
Is_Fraud
0    37982
1     2018
Name: count, dtype: int64
Fraud rate: 5.04%

Stratification Verification:
Original fraud rate:    5.04%
K-Fold data fraud rate: 5.04%
Test set fraud rate:    5.04%

‚úì Stratification successful - fraud rates match!


In [10]:
print("="*80)
print("FREQUENCY ENCODING (NO DATA LEAKAGE)")
print("="*80)

# Define function for frequency encoding
def compute_frequency_map(series):
    """Compute frequency map from a pandas Series"""
    return series.value_counts(normalize=True).to_dict()

# Apply frequency encoding to high cardinality features
# CRITICAL: Compute frequencies ONLY on training data, then apply to test
print("\nComputing frequency maps on TRAINING data only...")

frequency_maps = {}
for col in high_cardinality_features:
    # Compute frequency on TRAINING data only
    frequency_maps[col] = compute_frequency_map(X_train_full[col])
    
    # Apply to both train and test
    X_train_full[f'{col}_Frequency'] = X_train_full[col].map(frequency_maps[col]).fillna(0)
    X_test[f'{col}_Frequency'] = X_test[col].map(frequency_maps[col]).fillna(0)
    
    print(f"\n{col}:")
    print(f"  ‚Ä¢ Unique values in training: {X_train_full[col].nunique()}")
    print(f"  ‚Ä¢ Frequency range: {X_train_full[f'{col}_Frequency'].min():.6f} to {X_train_full[f'{col}_Frequency'].max():.6f}")
    print(f"  ‚Ä¢ Unseen values in test: {X_test[col].map(frequency_maps[col]).isna().sum()} (filled with 0)")

# Now drop the original categorical columns
print(f"\n{'='*80}")
print("Removing original high cardinality columns...")
X_train_full = X_train_full.drop(columns=high_cardinality_features)
X_test = X_test.drop(columns=high_cardinality_features)

print(f"\n‚úì Frequency encoding complete (NO data leakage)")
print(f"Training set shape: {X_train_full.shape}")
print(f"Test set shape: {X_test.shape}")

# Update feature names list
feature_names = X_train_full.columns.tolist()
print(f"\nFinal feature count: {len(feature_names)}")

FREQUENCY ENCODING (NO DATA LEAKAGE)

Computing frequency maps on TRAINING data only...

State:
  ‚Ä¢ Unique values in training: 34
  ‚Ä¢ Frequency range: 0.028600 to 0.030081
  ‚Ä¢ Unseen values in test: 0 (filled with 0)

City:
  ‚Ä¢ Unique values in training: 145
  ‚Ä¢ Frequency range: 0.005450 to 0.040725
  ‚Ä¢ Unseen values in test: 0 (filled with 0)

Transaction_Description:
  ‚Ä¢ Unique values in training: 172
  ‚Ä¢ Frequency range: 0.005256 to 0.006506
  ‚Ä¢ Unseen values in test: 0 (filled with 0)

Removing original high cardinality columns...

‚úì Frequency encoding complete (NO data leakage)
Training set shape: (160000, 52)
Test set shape: (40000, 52)

Final feature count: 52


In [11]:
print("="*80)
print("CREATING QUANTILE-BASED FEATURES (NO DATA LEAKAGE)")
print("="*80)

# Compute quantiles on TRAINING data only
transaction_75th = X_train_full['Transaction_Amount'].quantile(0.75)
balance_25th = X_train_full['Account_Balance'].quantile(0.25)

print(f"Quantiles computed on training data only:")
print(f"  ‚Ä¢ 75th percentile Transaction Amount: ‚Çπ{transaction_75th:,.2f}")
print(f"  ‚Ä¢ 25th percentile Account Balance: ‚Çπ{balance_25th:,.2f}")

# Apply to both train and test using the training-derived thresholds
X_train_full['Is_High_Value'] = (X_train_full['Transaction_Amount'] > transaction_75th).astype(int)
X_train_full['Is_Low_Balance'] = (X_train_full['Account_Balance'] < balance_25th).astype(int)

X_test['Is_High_Value'] = (X_test['Transaction_Amount'] > transaction_75th).astype(int)
X_test['Is_Low_Balance'] = (X_test['Account_Balance'] < balance_25th).astype(int)

print(f"\n‚úì Features created using training-set thresholds")
print(f"\nTraining set:")
print(f"  High value transactions: {X_train_full['Is_High_Value'].sum():,} ({X_train_full['Is_High_Value'].mean()*100:.1f}%)")
print(f"  Low balance accounts: {X_train_full['Is_Low_Balance'].sum():,} ({X_train_full['Is_Low_Balance'].mean()*100:.1f}%)")

print(f"\nTest set:")
print(f"  High value transactions: {X_test['Is_High_Value'].sum():,} ({X_test['Is_High_Value'].mean()*100:.1f}%)")
print(f"  Low balance accounts: {X_test['Is_Low_Balance'].sum():,} ({X_test['Is_Low_Balance'].mean()*100:.1f}%)")

# Update feature count
feature_names = X_train_full.columns.tolist()
print(f"\nFinal feature count: {len(feature_names)}")
print(f"Training shape: {X_train_full.shape}")
print(f"Test shape: {X_test.shape}")

CREATING QUANTILE-BASED FEATURES (NO DATA LEAKAGE)
Quantiles computed on training data only:
  ‚Ä¢ 75th percentile Transaction Amount: ‚Çπ74,379.05
  ‚Ä¢ 25th percentile Account Balance: ‚Çπ28,726.97

‚úì Features created using training-set thresholds

Training set:
  High value transactions: 40,000 (25.0%)
  Low balance accounts: 40,000 (25.0%)

Test set:
  High value transactions: 9,875 (24.7%)
  Low balance accounts: 9,968 (24.9%)

Final feature count: 54
Training shape: (160000, 54)
Test shape: (40000, 54)


In [12]:
print("="*80)
print("STRATIFIED K-FOLD CROSS-VALIDATION STRATEGY")
print("="*80)

from sklearn.model_selection import StratifiedKFold

# Initialize Stratified K-Fold
n_splits = 5  # We'll use 5 folds
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

print(f"Strategy: {n_splits}-Fold Stratified Cross-Validation")
print(f"\nHow it works:")
print(f"  1. Split {X_train_full.shape[0]:,} samples into {n_splits} folds")
print(f"  2. Each fold maintains ~{y_train_full.mean()*100:.2f}% fraud rate")
print(f"  3. For each fold:")
print(f"     ‚Ä¢ Train on {n_splits-1} folds (~{X_train_full.shape[0]*(n_splits-1)/n_splits:.0f} samples)")
print(f"     ‚Ä¢ Validate on 1 fold (~{X_train_full.shape[0]/n_splits:.0f} samples)")
print(f"     ‚Ä¢ Apply SMOTE to training folds only")
print(f"     ‚Ä¢ Scale features on training folds, apply to validation")
print(f"     ‚Ä¢ Train model and evaluate on validation fold")
print(f"  4. Average performance across all {n_splits} folds")
print(f"  5. Final model trained on all {X_train_full.shape[0]:,} samples")
print(f"  6. Final evaluation on hold-out test set ({X_test.shape[0]:,} samples)")

# Demonstrate the splits
print(f"\n{'='*80}")
print("FOLD BREAKDOWN - Verification of Stratification:")
print(f"{'='*80}")

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_full, y_train_full), 1):
    y_train_fold = y_train_full.iloc[train_idx]
    y_val_fold = y_train_full.iloc[val_idx]
    
    fraud_train = y_train_fold.sum()
    fraud_val = y_val_fold.sum()
    
    print(f"\nFold {fold_idx}:")
    print(f"  Training:   {len(train_idx):>7,} samples | Fraud: {fraud_train:>5,} ({y_train_fold.mean()*100:>5.2f}%)")
    print(f"  Validation: {len(val_idx):>7,} samples | Fraud: {fraud_val:>5,} ({y_val_fold.mean()*100:>5.2f}%)")

print(f"\n{'='*80}")
print("‚úì All folds maintain consistent ~5.04% fraud rate!")
print(f"{'='*80}")

STRATIFIED K-FOLD CROSS-VALIDATION STRATEGY
Strategy: 5-Fold Stratified Cross-Validation

How it works:
  1. Split 160,000 samples into 5 folds
  2. Each fold maintains ~5.04% fraud rate
  3. For each fold:
     ‚Ä¢ Train on 4 folds (~128000 samples)
     ‚Ä¢ Validate on 1 fold (~32000 samples)
     ‚Ä¢ Apply SMOTE to training folds only
     ‚Ä¢ Scale features on training folds, apply to validation
     ‚Ä¢ Train model and evaluate on validation fold
  4. Average performance across all 5 folds
  5. Final model trained on all 160,000 samples
  6. Final evaluation on hold-out test set (40,000 samples)

FOLD BREAKDOWN - Verification of Stratification:

Fold 1:
  Training:   128,000 samples | Fraud: 6,456 ( 5.04%)
  Validation:  32,000 samples | Fraud: 1,614 ( 5.04%)

Fold 2:
  Training:   128,000 samples | Fraud: 6,456 ( 5.04%)
  Validation:  32,000 samples | Fraud: 1,614 ( 5.04%)

Fold 3:
  Training:   128,000 samples | Fraud: 6,456 ( 5.04%)
  Validation:  32,000 samples | Fraud: 1,614 

In [13]:
print("="*80)
print("SAVING PREPROCESSED DATA")
print("="*80)

import joblib
import os

# Create directory for preprocessed data
os.makedirs('../data/preprocessed', exist_ok=True)

# Fit scaler on full training data (for deployment)
scaler = StandardScaler()
scaler.fit(X_train_full)

# Scale the hold-out test set
X_test_scaled = scaler.transform(X_test)

# Save the data
print("Saving files...")

# Training data for K-Fold CV (unscaled - we'll scale inside each fold)
np.save('../data/preprocessed/X_train_full.npy', X_train_full.values)
np.save('../data/preprocessed/y_train_full.npy', y_train_full.values)

# Hold-out test set
np.save('../data/preprocessed/X_test.npy', X_test.values)
np.save('../data/preprocessed/X_test_scaled.npy', X_test_scaled)
np.save('../data/preprocessed/y_test.npy', y_test.values)

# Save the scaler for deployment
joblib.dump(scaler, '../data/preprocessed/scaler.pkl')

# Save feature names
with open('../data/preprocessed/feature_names.txt', 'w') as f:
    for col in feature_names:
        f.write(f"{col}\n")

# Save preprocessing parameters (including thresholds for deployment)
params = {
    'n_folds': 5,
    'test_size': 0.2,
    'random_state': 42,
    'smote_strategy': 0.5,
    'total_features': len(feature_names),
    'feature_names': feature_names,
    'transaction_75th': transaction_75th,  # Save for deployment
    'balance_25th': balance_25th,          # Save for deployment
    'frequency_maps': frequency_maps        # Save for deployment
}
joblib.dump(params, '../data/preprocessed/preprocessing_params.pkl')

print("\n‚úì All files saved successfully!")
print("\nSaved files in '../data/preprocessed/':")
print("  1. X_train_full.npy         - Training data for K-Fold CV (160,000 samples)")
print("  2. y_train_full.npy         - Training labels")
print("  3. X_test.npy               - Hold-out test set (40,000 samples)")
print("  4. X_test_scaled.npy        - Scaled hold-out test set")
print("  5. y_test.npy               - Test labels")
print("  6. scaler.pkl               - Fitted StandardScaler")
print("  7. feature_names.txt        - List of all 54 features")
print("  8. preprocessing_params.pkl - All preprocessing parameters + thresholds + frequency maps")

print(f"\n‚úì Preprocessing complete with NO DATA LEAKAGE!")

SAVING PREPROCESSED DATA
Saving files...

‚úì All files saved successfully!

Saved files in '../data/preprocessed/':
  1. X_train_full.npy         - Training data for K-Fold CV (160,000 samples)
  2. y_train_full.npy         - Training labels
  3. X_test.npy               - Hold-out test set (40,000 samples)
  4. X_test_scaled.npy        - Scaled hold-out test set
  5. y_test.npy               - Test labels
  6. scaler.pkl               - Fitted StandardScaler
  7. feature_names.txt        - List of all 54 features
  8. preprocessing_params.pkl - All preprocessing parameters + thresholds + frequency maps

‚úì Preprocessing complete with NO DATA LEAKAGE!


In [14]:
print("="*80)
print("PREPROCESSING COMPLETE - FINAL SUMMARY")
print("="*80)

print("\nüìä ORIGINAL DATA:")
print(f"  ‚Ä¢ Total samples: {len(df):,}")
print(f"  ‚Ä¢ Original features: {df.shape[1]}")
print(f"  ‚Ä¢ Fraud cases: {df['Is_Fraud'].sum():,} ({df['Is_Fraud'].mean()*100:.2f}%)")

print("\nüîß FEATURE ENGINEERING:")
print(f"  ‚Ä¢ Time-based features created: 6")
print(f"    (Hour, Day, Month, Is_Weekend, Is_Night, Is_Business_Hours)")
print(f"  ‚Ä¢ Transaction features created: 4")
print(f"    (Transaction_to_Balance_Ratio, Is_High_Value, Is_Low_Balance, Amount_Category)")
print(f"  ‚Ä¢ Frequency encoding applied: 3")
print(f"    (State_Frequency, City_Frequency, Transaction_Description_Frequency)")

print("\nüìù ENCODING:")
print(f"  ‚Ä¢ Categorical features one-hot encoded: 7")
print(f"  ‚Ä¢ Total features after encoding: {X.shape[1]}")

print("\n‚úÇÔ∏è DATA SPLIT:")
print(f"  ‚Ä¢ K-Fold CV data (80%): {len(X_train_full):,} samples")
print(f"    - Fraud: {y_train_full.sum():,} ({y_train_full.mean()*100:.2f}%)")
print(f"  ‚Ä¢ Hold-out test set (20%): {len(X_test):,} samples")
print(f"    - Fraud: {y_test.sum():,} ({y_test.mean()*100:.2f}%)")

print("\nüîÑ CROSS-VALIDATION STRATEGY:")
print(f"  ‚Ä¢ Method: {n_splits}-Fold Stratified Cross-Validation")
print(f"  ‚Ä¢ Training per fold: {X_train_full.shape[0]*(n_splits-1)//n_splits:,} samples")
print(f"  ‚Ä¢ Validation per fold: {X_train_full.shape[0]//n_splits:,} samples")
print(f"  ‚Ä¢ SMOTE strategy: 0.5 (increase fraud to 50% of non-fraud)")
print(f"  ‚Ä¢ Scaling: StandardScaler (applied inside each fold)")

print("\nüíæ SAVED FILES:")
print(f"  ‚Ä¢ Location: ../data/preprocessed/")
print(f"  ‚Ä¢ Total size: ~100 MB")
print(f"  ‚Ä¢ Files: 8 (data arrays, scaler, parameters, feature names)")

print("\n‚úÖ PREPROCESSING COMPLETE!")
print("\n" + "="*80)
print("READY FOR MODEL TRAINING!")
print("="*80)

PREPROCESSING COMPLETE - FINAL SUMMARY

üìä ORIGINAL DATA:
  ‚Ä¢ Total samples: 200,000
  ‚Ä¢ Original features: 35
  ‚Ä¢ Fraud cases: 10,088 (5.04%)

üîß FEATURE ENGINEERING:
  ‚Ä¢ Time-based features created: 6
    (Hour, Day, Month, Is_Weekend, Is_Night, Is_Business_Hours)
  ‚Ä¢ Transaction features created: 4
    (Transaction_to_Balance_Ratio, Is_High_Value, Is_Low_Balance, Amount_Category)
  ‚Ä¢ Frequency encoding applied: 3
    (State_Frequency, City_Frequency, Transaction_Description_Frequency)

üìù ENCODING:
  ‚Ä¢ Categorical features one-hot encoded: 7
  ‚Ä¢ Total features after encoding: 52

‚úÇÔ∏è DATA SPLIT:
  ‚Ä¢ K-Fold CV data (80%): 160,000 samples
    - Fraud: 8,070 (5.04%)
  ‚Ä¢ Hold-out test set (20%): 40,000 samples
    - Fraud: 2,018 (5.04%)

üîÑ CROSS-VALIDATION STRATEGY:
  ‚Ä¢ Method: 5-Fold Stratified Cross-Validation
  ‚Ä¢ Training per fold: 128,000 samples
  ‚Ä¢ Validation per fold: 32,000 samples
  ‚Ä¢ SMOTE strategy: 0.5 (increase fraud to 50% of non-frau