In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# For handling imbalanced data
from imblearn.over_sampling import SMOTE
from collections import Counter

# Utilities
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the data
df = pd.read_csv('../data/Bank_Transaction_Fraud_Detection.csv')

print(f"Dataset loaded: {df.shape[0]:,} rows, {df.shape[1]} columns")
print(f"Fraud cases: {df['Is_Fraud'].sum():,} ({df['Is_Fraud'].mean()*100:.2f}%)")

# Display first few rows
df.head()

Dataset loaded: 200,000 rows, 24 columns
Fraud cases: 10,088 (5.04%)


Unnamed: 0,Customer_ID,Customer_Name,Gender,Age,State,City,Bank_Branch,Account_Type,Transaction_ID,Transaction_Date,Transaction_Time,Transaction_Amount,Merchant_ID,Transaction_Type,Merchant_Category,Account_Balance,Transaction_Device,Transaction_Location,Device_Type,Is_Fraud,Transaction_Currency,Customer_Contact,Transaction_Description,Customer_Email
0,d5f6ec07-d69e-4f47-b9b4-7c58ff17c19e,Osha Tella,Male,60,Kerala,Thiruvananthapuram,Thiruvananthapuram Branch,Savings,4fa3208f-9e23-42dc-b330-844829d0c12c,23-01-2025,16:04:07,32415.45,214e03c5-5c34-40d1-a66c-f440aa2bbd02,Transfer,Restaurant,74557.27,Voice Assistant,"Thiruvananthapuram, Kerala",POS,0,INR,+9198579XXXXXX,Bitcoin transaction,oshaXXXXX@XXXXX.com
1,7c14ad51-781a-4db9-b7bd-67439c175262,Hredhaan Khosla,Female,51,Maharashtra,Nashik,Nashik Branch,Business,c9de0c06-2c4c-40a9-97ed-3c7b8f97c79c,11-01-2025,17:14:53,43622.6,f9e3f11f-28d3-4199-b0ca-f225a155ede6,Bill Payment,Restaurant,74622.66,POS Mobile Device,"Nashik, Maharashtra",Desktop,0,INR,+9191074XXXXXX,Grocery delivery,hredhaanXXXX@XXXXXX.com
2,3a73a0e5-d4da-45aa-85f3-528413900a35,Ekani Nazareth,Male,20,Bihar,Bhagalpur,Bhagalpur Branch,Savings,e41c55f9-c016-4ff3-872b-cae72467c75c,25-01-2025,03:09:52,63062.56,97977d83-5486-4510-af1c-8dada3e1cfa0,Bill Payment,Groceries,66817.99,ATM,"Bhagalpur, Bihar",Desktop,0,INR,+9197745XXXXXX,Mutual fund investment,ekaniXXX@XXXXXX.com
3,7902f4ef-9050-4a79-857d-9c2ea3181940,Yamini Ramachandran,Female,57,Tamil Nadu,Chennai,Chennai Branch,Business,7f7ee11b-ff2c-45a3-802a-49bc47c02ecb,19-01-2025,12:27:02,14000.72,f45cd6b3-5092-44d0-8afb-490894605184,Debit,Entertainment,58177.08,POS Mobile App,"Chennai, Tamil Nadu",Mobile,0,INR,+9195889XXXXXX,Food delivery,yaminiXXXXX@XXXXXXX.com
4,3a4bba70-d9a9-4c5f-8b92-1735fd8c19e9,Kritika Rege,Female,43,Punjab,Amritsar,Amritsar Branch,Savings,f8e6ac6f-81a1-4985-bf12-f60967d852ef,30-01-2025,18:30:46,18335.16,70dd77dd-3b00-4b2c-8ebc-cfb8af5f6741,Transfer,Entertainment,16108.56,Virtual Card,"Amritsar, Punjab",Mobile,0,INR,+9195316XXXXXX,Debt repayment,kritikaXXXX@XXXXXX.com


In [3]:
print("="*80)
print("FEATURE ENGINEERING: TIME-BASED FEATURES")
print("="*80)

# Parse datetime
df['Transaction_DateTime'] = pd.to_datetime(df['Transaction_Date'] + ' ' + df['Transaction_Time'], 
                                             format='%d-%m-%Y %H:%M:%S')

# Extract time components
df['Hour'] = df['Transaction_DateTime'].dt.hour
df['Day_of_Week'] = df['Transaction_DateTime'].dt.dayofweek  # 0=Monday, 6=Sunday
df['Day'] = df['Transaction_DateTime'].dt.day
df['Month'] = df['Transaction_DateTime'].dt.month
df['Year'] = df['Transaction_DateTime'].dt.year

# Create time-based categories
df['Is_Weekend'] = df['Day_of_Week'].apply(lambda x: 1 if x >= 5 else 0)
df['Is_Night'] = df['Hour'].apply(lambda x: 1 if x >= 22 or x <= 6 else 0)
df['Is_Business_Hours'] = df['Hour'].apply(lambda x: 1 if 9 <= x <= 17 else 0)

print("Time-based features created:")
print("  ‚Ä¢ Hour (0-23)")
print("  ‚Ä¢ Day_of_Week (0=Mon, 6=Sun)")
print("  ‚Ä¢ Day, Month, Year")
print("  ‚Ä¢ Is_Weekend (0/1)")
print("  ‚Ä¢ Is_Night (0/1)")
print("  ‚Ä¢ Is_Business_Hours (0/1)")

FEATURE ENGINEERING: TIME-BASED FEATURES
Time-based features created:
  ‚Ä¢ Hour (0-23)
  ‚Ä¢ Day_of_Week (0=Mon, 6=Sun)
  ‚Ä¢ Day, Month, Year
  ‚Ä¢ Is_Weekend (0/1)
  ‚Ä¢ Is_Night (0/1)
  ‚Ä¢ Is_Business_Hours (0/1)


In [4]:
print("="*80)
print("FEATURE ENGINEERING: TRANSACTION-BASED FEATURES")
print("="*80)

# Transaction to Balance Ratio
df['Transaction_to_Balance_Ratio'] = df['Transaction_Amount'] / df['Account_Balance']

# High value transaction flag (above 75th percentile)
transaction_75th = df['Transaction_Amount'].quantile(0.75)
df['Is_High_Value'] = (df['Transaction_Amount'] > transaction_75th).astype(int)

# Low balance flag (below 25th percentile)
balance_25th = df['Account_Balance'].quantile(0.25)
df['Is_Low_Balance'] = (df['Account_Balance'] < balance_25th).astype(int)

# Transaction amount bins
df['Amount_Category'] = pd.cut(df['Transaction_Amount'], 
                                bins=[0, 25000, 50000, 75000, 100000],
                                labels=['Low', 'Medium', 'High', 'Very High'])

print("Transaction features created:")
print(f"  ‚Ä¢ Transaction_to_Balance_Ratio")
print(f"  ‚Ä¢ Is_High_Value (>{transaction_75th:.0f})")
print(f"  ‚Ä¢ Is_Low_Balance (<{balance_25th:.0f})")
print(f"  ‚Ä¢ Amount_Category (Low/Medium/High/Very High)")

# Display sample
print("\nSample of new features:")
print(df[['Transaction_Amount', 'Account_Balance', 'Transaction_to_Balance_Ratio', 
          'Is_High_Value', 'Is_Low_Balance', 'Amount_Category']].head(10))

FEATURE ENGINEERING: TRANSACTION-BASED FEATURES
Transaction features created:
  ‚Ä¢ Transaction_to_Balance_Ratio
  ‚Ä¢ Is_High_Value (>74315)
  ‚Ä¢ Is_Low_Balance (<28742)
  ‚Ä¢ Amount_Category (Low/Medium/High/Very High)

Sample of new features:
   Transaction_Amount  Account_Balance  Transaction_to_Balance_Ratio  \
0            32415.45         74557.27                      0.434772   
1            43622.60         74622.66                      0.584576   
2            63062.56         66817.99                      0.943796   
3            14000.72         58177.08                      0.240657   
4            18335.16         16108.56                      1.138225   
5             9711.15         61258.85                      0.158526   
6            94677.01         36313.61                      2.607205   
7            67704.28         16948.73                      3.994652   
8            72953.45         18138.71                      4.021976   
9             5689.02         658

In [5]:
print("="*80)
print("FEATURE SELECTION")
print("="*80)

# Features to DROP (not useful for modeling)
drop_features = [
    'Customer_ID',           # Unique identifier
    'Customer_Name',         # Personal info
    'Transaction_ID',        # Unique identifier
    'Merchant_ID',           # Too many unique values
    'Transaction_Date',      # Already extracted features
    'Transaction_Time',      # Already extracted features
    'Transaction_DateTime',  # Already extracted features
    'Transaction_Location',  # Too many unique values (can use State/City instead)
    'Customer_Contact',      # Personal info
    'Customer_Email',        # Personal info
    'Transaction_Currency',  # All same (INR)
    'Bank_Branch',          # High cardinality, use State/City instead
]

# Categorical features (need encoding)
categorical_features = [
    'Gender',
    'State',
    'City', 
    'Account_Type',
    'Transaction_Type',
    'Merchant_Category',
    'Transaction_Device',
    'Device_Type',
    'Transaction_Description',
    'Amount_Category'
]

# Numerical features
numerical_features = [
    'Age',
    'Transaction_Amount',
    'Account_Balance',
    'Hour',
    'Day_of_Week',
    'Day',
    'Month',
    'Transaction_to_Balance_Ratio',
    'Is_Weekend',
    'Is_Night',
    'Is_Business_Hours',
    'Is_High_Value',
    'Is_Low_Balance'
]

# Target variable
target = 'Is_Fraud'

print(f"Features to DROP: {len(drop_features)}")
print(f"Categorical features: {len(categorical_features)}")
print(f"Numerical features: {len(numerical_features)}")
print(f"Target: {target}")
print(f"\nTotal features for modeling: {len(categorical_features) + len(numerical_features)}")

FEATURE SELECTION
Features to DROP: 12
Categorical features: 10
Numerical features: 13
Target: Is_Fraud

Total features for modeling: 23


In [6]:
print("="*80)
print("HANDLING HIGH CARDINALITY FEATURES")
print("="*80)

# Check cardinality of categorical features
print("Unique values in categorical features:")
for col in categorical_features:
    print(f"  ‚Ä¢ {col}: {df[col].nunique()} unique values")

# For State and City - too many unique values, let's use frequency encoding
# This converts categories to their frequency (how often they appear)

def frequency_encoding(df, column):
    """Convert categorical column to frequency encoding"""
    freq_encoding = df[column].value_counts(normalize=True).to_dict()
    return df[column].map(freq_encoding)

# Apply frequency encoding to high cardinality features
high_cardinality_features = ['State', 'City', 'Transaction_Description']

for col in high_cardinality_features:
    df[f'{col}_Frequency'] = frequency_encoding(df, col)
    print(f"\n{col} - Top 5 most frequent:")
    print(df[col].value_counts().head())

# Remove original high cardinality features from categorical list
categorical_features = [f for f in categorical_features if f not in high_cardinality_features]

# Add frequency encoded features to numerical list
numerical_features.extend([f'{col}_Frequency' for col in high_cardinality_features])

print(f"\n‚úì High cardinality features converted to frequency encoding")
print(f"Updated categorical features: {len(categorical_features)}")
print(f"Updated numerical features: {len(numerical_features)}")

HANDLING HIGH CARDINALITY FEATURES
Unique values in categorical features:
  ‚Ä¢ Gender: 2 unique values
  ‚Ä¢ State: 34 unique values
  ‚Ä¢ City: 145 unique values
  ‚Ä¢ Account_Type: 3 unique values
  ‚Ä¢ Transaction_Type: 5 unique values
  ‚Ä¢ Merchant_Category: 6 unique values
  ‚Ä¢ Transaction_Device: 20 unique values
  ‚Ä¢ Device_Type: 4 unique values
  ‚Ä¢ Transaction_Description: 172 unique values
  ‚Ä¢ Amount_Category: 4 unique values

State - Top 5 most frequent:
State
Nagaland         6031
Meghalaya        6003
Uttar Pradesh    6002
Uttarakhand      5985
Lakshadweep      5954
Name: count, dtype: int64

City - Top 5 most frequent:
City
Chandigarh     8135
Kavaratti      5954
Udaipur        2681
Daman          2022
Car Nicobar    1956
Name: count, dtype: int64

Transaction_Description - Top 5 most frequent:
Transaction_Description
Sports ticket            1268
Home appliance repair    1257
Taxi fare                1248
Seminar registration     1246
Taxi booking             1240

In [7]:
print("="*80)
print("ENCODING CATEGORICAL VARIABLES")
print("="*80)

# Create a copy for preprocessing
df_processed = df.copy()

# One-Hot Encoding for categorical features
print(f"\nCategorical features to encode: {categorical_features}")
print(f"Number of features: {len(categorical_features)}")

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df_processed, columns=categorical_features, drop_first=True, dtype=int)

print(f"\n‚úì One-Hot Encoding applied")
print(f"Shape before encoding: {df_processed.shape}")
print(f"Shape after encoding: {df_encoded.shape}")
print(f"New features created: {df_encoded.shape[1] - df_processed.shape[1]}")

# Display sample of encoded columns
print("\nSample of new encoded columns (first 5 rows, showing some encoded features):")
encoded_cols = [col for col in df_encoded.columns if any(cat in col for cat in categorical_features)]
print(df_encoded[encoded_cols[:10]].head())

ENCODING CATEGORICAL VARIABLES

Categorical features to encode: ['Gender', 'Account_Type', 'Transaction_Type', 'Merchant_Category', 'Transaction_Device', 'Device_Type', 'Amount_Category']
Number of features: 7

‚úì One-Hot Encoding applied
Shape before encoding: (200000, 40)
Shape after encoding: (200000, 70)
New features created: 30

Sample of new encoded columns (first 5 rows, showing some encoded features):
   Gender_Male  Account_Type_Checking  Account_Type_Savings  \
0            1                      0                     1   
1            0                      0                     0   
2            1                      0                     1   
3            0                      0                     0   
4            0                      0                     1   

   Transaction_Type_Credit  Transaction_Type_Debit  Transaction_Type_Transfer  \
0                        0                       0                          1   
1                        0                   

In [8]:
print("="*80)
print("PREPARING FEATURES AND TARGET")
print("="*80)

# Drop unnecessary columns
df_model = df_encoded.drop(columns=drop_features, errors='ignore')

# Separate features (X) and target (y)
X = df_model.drop(columns=[target])
y = df_model[target]

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"\nTarget distribution:")
print(y.value_counts())
print(f"Fraud percentage: {y.mean()*100:.2f}%")

# Display feature names
print(f"\nTotal features for modeling: {X.shape[1]}")
print("\nFeature categories breakdown:")

# Count different types of features
time_features = [col for col in X.columns if any(word in col.lower() for word in ['hour', 'day', 'month', 'weekend', 'night', 'business'])]
transaction_features = [col for col in X.columns if any(word in col.lower() for word in ['amount', 'balance', 'ratio', 'high', 'low'])]
encoded_features = [col for col in X.columns if '_' in col and col not in time_features + transaction_features]
frequency_features = [col for col in X.columns if 'frequency' in col.lower()]

print(f"  ‚Ä¢ Time-based features: {len(time_features)}")
print(f"  ‚Ä¢ Transaction features: {len(transaction_features)}")
print(f"  ‚Ä¢ Frequency encoded features: {len(frequency_features)}")
print(f"  ‚Ä¢ One-hot encoded features: {len(encoded_features)}")
print(f"  ‚Ä¢ Other features: {X.shape[1] - len(time_features) - len(transaction_features) - len(frequency_features) - len(encoded_features)}")

print("\nFirst 15 feature names:")
for i, col in enumerate(X.columns[:57], 1):
    print(f"  {i}. {col}")

PREPARING FEATURES AND TARGET
Features (X) shape: (200000, 57)
Target (y) shape: (200000,)

Target distribution:
Is_Fraud
0    189912
1     10088
Name: count, dtype: int64
Fraud percentage: 5.04%

Total features for modeling: 57

Feature categories breakdown:
  ‚Ä¢ Time-based features: 7
  ‚Ä¢ Transaction features: 8
  ‚Ä¢ Frequency encoded features: 3
  ‚Ä¢ One-hot encoded features: 38
  ‚Ä¢ Other features: 1

First 15 feature names:
  1. Age
  2. State
  3. City
  4. Transaction_Amount
  5. Account_Balance
  6. Transaction_Description
  7. Hour
  8. Day_of_Week
  9. Day
  10. Month
  11. Year
  12. Is_Weekend
  13. Is_Night
  14. Is_Business_Hours
  15. Transaction_to_Balance_Ratio
  16. Is_High_Value
  17. Is_Low_Balance
  18. State_Frequency
  19. City_Frequency
  20. Transaction_Description_Frequency
  21. Gender_Male
  22. Account_Type_Checking
  23. Account_Type_Savings
  24. Transaction_Type_Credit
  25. Transaction_Type_Debit
  26. Transaction_Type_Transfer
  27. Transaction_T

In [9]:
print("="*80)
print("REMOVING ORIGINAL HIGH CARDINALITY FEATURES")
print("="*80)

# These were already converted to frequency encoding, so we should drop the originals
high_cardinality_originals = ['State', 'City', 'Transaction_Description']

print(f"Removing original high cardinality features: {high_cardinality_originals}")
print(f"(We already have their frequency-encoded versions)")

# Check if they exist and remove them
for col in high_cardinality_originals:
    if col in X.columns:
        X = X.drop(columns=[col])
        print(f"  ‚úì Removed {col}")

print(f"\nUpdated feature count: {X.shape[1]}")
print(f"\nFirst 20 features after cleanup:")
for i, col in enumerate(X.columns[:20], 1):
    print(f"  {i}. {col}")

REMOVING ORIGINAL HIGH CARDINALITY FEATURES
Removing original high cardinality features: ['State', 'City', 'Transaction_Description']
(We already have their frequency-encoded versions)
  ‚úì Removed State
  ‚úì Removed City
  ‚úì Removed Transaction_Description

Updated feature count: 54

First 20 features after cleanup:
  1. Age
  2. Transaction_Amount
  3. Account_Balance
  4. Hour
  5. Day_of_Week
  6. Day
  7. Month
  8. Year
  9. Is_Weekend
  10. Is_Night
  11. Is_Business_Hours
  12. Transaction_to_Balance_Ratio
  13. Is_High_Value
  14. Is_Low_Balance
  15. State_Frequency
  16. City_Frequency
  17. Transaction_Description_Frequency
  18. Gender_Male
  19. Account_Type_Checking
  20. Account_Type_Savings


In [10]:
print("="*80)
print("TRAIN-TEST SPLIT (Hold-out Test Set)")
print("="*80)

# Split: 80% for K-Fold CV, 20% held-out for final testing
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, 
    test_size=0.2,           # 20% for final testing
    random_state=42,         
    stratify=y               # CRITICAL: Maintain fraud ratio
)

print(f"Total samples: {len(X):,}")
print(f"\nData for K-Fold CV: {X_train_full.shape[0]:,} samples ({X_train_full.shape[0]/len(X)*100:.0f}%)")
print(f"Hold-out test set: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.0f}%)")

print(f"\n{'='*50}")
print("K-Fold CV Data (Training Portion):")
print(f"{'='*50}")
print(f"Total samples: {len(y_train_full):,}")
print(y_train_full.value_counts())
print(f"Fraud rate: {y_train_full.mean()*100:.2f}%")

print(f"\n{'='*50}")
print("Hold-out Test Set (Final Evaluation):")
print(f"{'='*50}")
print(f"Total samples: {len(y_test):,}")
print(y_test.value_counts())
print(f"Fraud rate: {y_test.mean()*100:.2f}%")

print(f"\n{'='*50}")
print("Stratification Verification:")
print(f"{'='*50}")
print(f"Original fraud rate:    {y.mean()*100:.2f}%")
print(f"K-Fold data fraud rate: {y_train_full.mean()*100:.2f}%")
print(f"Test set fraud rate:    {y_test.mean()*100:.2f}%")
print("\n‚úì Stratification successful - fraud rates match!")

print("\nüìå Important Notes:")
print("  ‚Ä¢ The hold-out test set will NOT be touched until final evaluation")
print("  ‚Ä¢ K-Fold CV will be performed on the training portion (160,000 samples)")
print("  ‚Ä¢ SMOTE and scaling will be applied INSIDE each fold")

TRAIN-TEST SPLIT (Hold-out Test Set)
Total samples: 200,000

Data for K-Fold CV: 160,000 samples (80%)
Hold-out test set: 40,000 samples (20%)

K-Fold CV Data (Training Portion):
Total samples: 160,000
Is_Fraud
0    151930
1      8070
Name: count, dtype: int64
Fraud rate: 5.04%

Hold-out Test Set (Final Evaluation):
Total samples: 40,000
Is_Fraud
0    37982
1     2018
Name: count, dtype: int64
Fraud rate: 5.04%

Stratification Verification:
Original fraud rate:    5.04%
K-Fold data fraud rate: 5.04%
Test set fraud rate:    5.04%

‚úì Stratification successful - fraud rates match!

üìå Important Notes:
  ‚Ä¢ The hold-out test set will NOT be touched until final evaluation
  ‚Ä¢ K-Fold CV will be performed on the training portion (160,000 samples)
  ‚Ä¢ SMOTE and scaling will be applied INSIDE each fold


In [11]:
print("="*80)
print("STRATIFIED K-FOLD CROSS-VALIDATION STRATEGY")
print("="*80)

from sklearn.model_selection import StratifiedKFold

# Initialize Stratified K-Fold
n_splits = 5  # We'll use 5 folds
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

print(f"Strategy: {n_splits}-Fold Stratified Cross-Validation")
print(f"\nHow it works:")
print(f"  1. Split {X_train_full.shape[0]:,} samples into {n_splits} folds")
print(f"  2. Each fold maintains ~{y_train_full.mean()*100:.2f}% fraud rate")
print(f"  3. For each fold:")
print(f"     ‚Ä¢ Train on {n_splits-1} folds (~{X_train_full.shape[0]*(n_splits-1)/n_splits:.0f} samples)")
print(f"     ‚Ä¢ Validate on 1 fold (~{X_train_full.shape[0]/n_splits:.0f} samples)")
print(f"     ‚Ä¢ Apply SMOTE to training folds only")
print(f"     ‚Ä¢ Scale features on training folds, apply to validation")
print(f"     ‚Ä¢ Train model and evaluate on validation fold")
print(f"  4. Average performance across all {n_splits} folds")
print(f"  5. Final model trained on all {X_train_full.shape[0]:,} samples")
print(f"  6. Final evaluation on hold-out test set ({X_test.shape[0]:,} samples)")

# Demonstrate the splits
print(f"\n{'='*80}")
print("FOLD BREAKDOWN - Verification of Stratification:")
print(f"{'='*80}")

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_full, y_train_full), 1):
    y_train_fold = y_train_full.iloc[train_idx]
    y_val_fold = y_train_full.iloc[val_idx]
    
    fraud_train = y_train_fold.sum()
    fraud_val = y_val_fold.sum()
    
    print(f"\nFold {fold_idx}:")
    print(f"  Training:   {len(train_idx):>7,} samples | Fraud: {fraud_train:>5,} ({y_train_fold.mean()*100:>5.2f}%)")
    print(f"  Validation: {len(val_idx):>7,} samples | Fraud: {fraud_val:>5,} ({y_val_fold.mean()*100:>5.2f}%)")

print(f"\n{'='*80}")
print("‚úì All folds maintain consistent ~5.04% fraud rate!")
print(f"{'='*80}")

STRATIFIED K-FOLD CROSS-VALIDATION STRATEGY
Strategy: 5-Fold Stratified Cross-Validation

How it works:
  1. Split 160,000 samples into 5 folds
  2. Each fold maintains ~5.04% fraud rate
  3. For each fold:
     ‚Ä¢ Train on 4 folds (~128000 samples)
     ‚Ä¢ Validate on 1 fold (~32000 samples)
     ‚Ä¢ Apply SMOTE to training folds only
     ‚Ä¢ Scale features on training folds, apply to validation
     ‚Ä¢ Train model and evaluate on validation fold
  4. Average performance across all 5 folds
  5. Final model trained on all 160,000 samples
  6. Final evaluation on hold-out test set (40,000 samples)

FOLD BREAKDOWN - Verification of Stratification:

Fold 1:
  Training:   128,000 samples | Fraud: 6,456 ( 5.04%)
  Validation:  32,000 samples | Fraud: 1,614 ( 5.04%)

Fold 2:
  Training:   128,000 samples | Fraud: 6,456 ( 5.04%)
  Validation:  32,000 samples | Fraud: 1,614 ( 5.04%)

Fold 3:
  Training:   128,000 samples | Fraud: 6,456 ( 5.04%)
  Validation:  32,000 samples | Fraud: 1,614 

In [12]:
print("="*80)
print("SAVING PREPROCESSED DATA")
print("="*80)

import joblib
import os

# Create directory for preprocessed data
os.makedirs('../data/preprocessed', exist_ok=True)

# Fit scaler on full training data (we'll refit inside CV, but save this for deployment)
scaler = StandardScaler()
scaler.fit(X_train_full)

# Scale the hold-out test set
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

# Save the data
print("Saving files...")

# Training data for K-Fold CV (unscaled - we'll scale inside each fold)
np.save('../data/preprocessed/X_train_full.npy', X_train_full.values)
np.save('../data/preprocessed/y_train_full.npy', y_train_full.values)

# Hold-out test set
np.save('../data/preprocessed/X_test.npy', X_test.values)
np.save('../data/preprocessed/X_test_scaled.npy', X_test_scaled.values)
np.save('../data/preprocessed/y_test.npy', y_test.values)

# Save the scaler for deployment
joblib.dump(scaler, '../data/preprocessed/scaler.pkl')

# Save feature names
with open('../data/preprocessed/feature_names.txt', 'w') as f:
    for col in X.columns:
        f.write(f"{col}\n")

# Save preprocessing parameters
params = {
    'n_folds': n_splits,
    'test_size': 0.2,
    'random_state': 42,
    'smote_strategy': 0.5,
    'total_features': X.shape[1],
    'feature_names': list(X.columns)
}
joblib.dump(params, '../data/preprocessed/preprocessing_params.pkl')

print("\n‚úì All files saved successfully!")
print("\nSaved files in '../data/preprocessed/':")
print("  1. X_train_full.npy         - Training data for K-Fold CV (160,000 samples)")
print("  2. y_train_full.npy         - Training labels")
print("  3. X_test.npy               - Hold-out test set (40,000 samples)")
print("  4. X_test_scaled.npy        - Scaled hold-out test set")
print("  5. y_test.npy               - Test labels")
print("  6. scaler.pkl               - Fitted StandardScaler")
print("  7. feature_names.txt        - List of all 54 features")
print("  8. preprocessing_params.pkl - All preprocessing parameters")

print(f"\nFile sizes:")
import os
for filename in os.listdir('../data/preprocessed/'):
    filepath = f'../data/preprocessed/{filename}'
    size = os.path.getsize(filepath) / (1024 * 1024)  # Convert to MB
    print(f"  ‚Ä¢ {filename}: {size:.2f} MB")

SAVING PREPROCESSED DATA
Saving files...

‚úì All files saved successfully!

Saved files in '../data/preprocessed/':
  1. X_train_full.npy         - Training data for K-Fold CV (160,000 samples)
  2. y_train_full.npy         - Training labels
  3. X_test.npy               - Hold-out test set (40,000 samples)
  4. X_test_scaled.npy        - Scaled hold-out test set
  5. y_test.npy               - Test labels
  6. scaler.pkl               - Fitted StandardScaler
  7. feature_names.txt        - List of all 54 features
  8. preprocessing_params.pkl - All preprocessing parameters

File sizes:
  ‚Ä¢ feature_names.txt: 0.00 MB
  ‚Ä¢ preprocessing_params.pkl: 0.00 MB
  ‚Ä¢ scaler.pkl: 0.00 MB
  ‚Ä¢ X_test.npy: 16.48 MB
  ‚Ä¢ X_test_scaled.npy: 16.48 MB
  ‚Ä¢ X_train_full.npy: 65.92 MB
  ‚Ä¢ y_test.npy: 0.31 MB
  ‚Ä¢ y_train_full.npy: 1.22 MB


In [14]:
print("="*80)
print("PREPROCESSING COMPLETE - FINAL SUMMARY")
print("="*80)

print("\nüìä ORIGINAL DATA:")
print(f"  ‚Ä¢ Total samples: {len(df):,}")
print(f"  ‚Ä¢ Original features: {df.shape[1]}")
print(f"  ‚Ä¢ Fraud cases: {df['Is_Fraud'].sum():,} ({df['Is_Fraud'].mean()*100:.2f}%)")

print("\nüîß FEATURE ENGINEERING:")
print(f"  ‚Ä¢ Time-based features created: 6")
print(f"    (Hour, Day, Month, Is_Weekend, Is_Night, Is_Business_Hours)")
print(f"  ‚Ä¢ Transaction features created: 4")
print(f"    (Transaction_to_Balance_Ratio, Is_High_Value, Is_Low_Balance, Amount_Category)")
print(f"  ‚Ä¢ Frequency encoding applied: 3")
print(f"    (State_Frequency, City_Frequency, Transaction_Description_Frequency)")

print("\nüìù ENCODING:")
print(f"  ‚Ä¢ Categorical features one-hot encoded: 7")
print(f"  ‚Ä¢ Total features after encoding: {X.shape[1]}")

print("\n‚úÇÔ∏è DATA SPLIT:")
print(f"  ‚Ä¢ K-Fold CV data (80%): {len(X_train_full):,} samples")
print(f"    - Fraud: {y_train_full.sum():,} ({y_train_full.mean()*100:.2f}%)")
print(f"  ‚Ä¢ Hold-out test set (20%): {len(X_test):,} samples")
print(f"    - Fraud: {y_test.sum():,} ({y_test.mean()*100:.2f}%)")

print("\nüîÑ CROSS-VALIDATION STRATEGY:")
print(f"  ‚Ä¢ Method: {n_splits}-Fold Stratified Cross-Validation")
print(f"  ‚Ä¢ Training per fold: {X_train_full.shape[0]*(n_splits-1)//n_splits:,} samples")
print(f"  ‚Ä¢ Validation per fold: {X_train_full.shape[0]//n_splits:,} samples")
print(f"  ‚Ä¢ SMOTE strategy: 0.5 (increase fraud to 50% of non-fraud)")
print(f"  ‚Ä¢ Scaling: StandardScaler (applied inside each fold)")

print("\nüíæ SAVED FILES:")
print(f"  ‚Ä¢ Location: ../data/preprocessed/")
print(f"  ‚Ä¢ Total size: ~100 MB")
print(f"  ‚Ä¢ Files: 8 (data arrays, scaler, parameters, feature names)")

print("\n‚úÖ PREPROCESSING COMPLETE!")
print("\n" + "="*80)
print("READY FOR MODEL TRAINING!")
print("="*80)

PREPROCESSING COMPLETE - FINAL SUMMARY

üìä ORIGINAL DATA:
  ‚Ä¢ Total samples: 200,000
  ‚Ä¢ Original features: 40
  ‚Ä¢ Fraud cases: 10,088 (5.04%)

üîß FEATURE ENGINEERING:
  ‚Ä¢ Time-based features created: 6
    (Hour, Day, Month, Is_Weekend, Is_Night, Is_Business_Hours)
  ‚Ä¢ Transaction features created: 4
    (Transaction_to_Balance_Ratio, Is_High_Value, Is_Low_Balance, Amount_Category)
  ‚Ä¢ Frequency encoding applied: 3
    (State_Frequency, City_Frequency, Transaction_Description_Frequency)

üìù ENCODING:
  ‚Ä¢ Categorical features one-hot encoded: 7
  ‚Ä¢ Total features after encoding: 54

‚úÇÔ∏è DATA SPLIT:
  ‚Ä¢ K-Fold CV data (80%): 160,000 samples
    - Fraud: 8,070 (5.04%)
  ‚Ä¢ Hold-out test set (20%): 40,000 samples
    - Fraud: 2,018 (5.04%)

üîÑ CROSS-VALIDATION STRATEGY:
  ‚Ä¢ Method: 5-Fold Stratified Cross-Validation
  ‚Ä¢ Training per fold: 128,000 samples
  ‚Ä¢ Validation per fold: 32,000 samples
  ‚Ä¢ SMOTE strategy: 0.5 (increase fraud to 50% of non-frau