Preprocessing Notebook for Credit Card Fraud Dataset

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
import joblib
import json
from pathlib import Path

# Create the processed-data output directory up front so later cells can write
# into it; exist_ok makes re-running this cell a no-op.
Path('../data/processed').mkdir(exist_ok=True, parents=True)

In [3]:
# Read the raw transactions CSV into a DataFrame, then preview the first rows
# as the cell's displayed output.
df = pd.read_csv(filepath_or_buffer='../data/raw/creditcard.csv')
df.head(n=5)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Add a log(1 + x) transformed copy of Amount. Normalization is deliberately
# postponed until after the train/val/test split to avoid data leakage.
df['Amount_log1p'] = df['Amount'].pipe(np.log1p)

In [5]:
# Stratified 70/15/15 train/val/test split (stratifying on Class keeps the fraud
# ratio the same in every split), followed by standardising the log-amount with
# a scaler that is fitted on the training rows only.

feature_cols = ['V' + str(i) for i in range(1, 29)] + ['Amount_log1p']  # log1p amount, not normalized yet
X, y = df[feature_cols], df['Class']

# First cut: 70% train, 30% held out (still un-normalized features)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Second cut: halve the held-out part -> 15% validation, 15% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Fit the scaler on the training split ONLY ...
scaler = StandardScaler()
X_train_scaled = X_train.assign(
    Amount_normalized=scaler.fit_transform(X_train[['Amount_log1p']])
)

# ... and reuse the already-fitted scaler, unchanged, for validation and test
X_val_scaled = X_val.assign(
    Amount_normalized=scaler.transform(X_val[['Amount_log1p']])
)
X_test_scaled = X_test.assign(
    Amount_normalized=scaler.transform(X_test[['Amount_log1p']])
)

# Final model inputs: V1-V28 plus Amount_normalized (Amount_log1p is dropped)
feature_cols_final = ['V' + str(i) for i in range(1, 29)] + ['Amount_normalized']
X_train_final = X_train_scaled.loc[:, feature_cols_final]
X_val_final = X_val_scaled.loc[:, feature_cols_final]
X_test_final = X_test_scaled.loc[:, feature_cols_final]

# Re-attach the labels column-wise
train_df = pd.concat([X_train_final, y_train], axis='columns')
val_df = pd.concat([X_val_final, y_val], axis='columns')
test_df = pd.concat([X_test_final, y_test], axis='columns')

# Write each split without the row index
train_df.to_csv('../data/processed/train.csv', index=False)
val_df.to_csv('../data/processed/val.csv', index=False)
test_df.to_csv('../data/processed/test.csv', index=False)

# Persist the training-fitted scaler so the inference pipeline can apply the same transform
joblib.dump(scaler, '../data/processed/amount_scaler.pkl')

print(f"Train: {len(train_df):,} | Val: {len(val_df):,} | Test: {len(test_df):,}")

Train: 199,364 | Val: 42,721 | Test: 42,722


In [6]:
# Generate synthetic streaming fields (user_id, device_id, ip, merchant_id): the
# original data lacks these identifiers and they are needed for behavioral features.
#
# NOTE: the np.random.randint calls below must stay in this exact order
# (legit user/device/ip/merchant, then fraud user/device/ip/merchant); reordering
# them changes which values each column receives for the same seed.

np.random.seed(42)
df_stream = df.copy()

# Entity counts: sizes of the ID pools that legitimate traffic is drawn from
N_USERS = 10000
N_DEVICES = 5000
N_IPS = 3000
N_MERCHANTS = 500

# Fraud is drawn from smaller pools so that it concentrates on fewer entities
N_FRAUD_USERS = 500
N_FRAUD_DEVICES = 200
N_FRAUD_IPS = N_IPS // 2

legit_mask = df_stream['Class'] == 0
fraud_mask = df_stream['Class'] == 1

# Rows that are neither 0 nor 1 would keep NaN IDs and make the astype(int)
# conversions below fail with an unhelpful error, so reject them explicitly.
n_unlabelled = int((~(legit_mask | fraud_mask)).sum())
if n_unlabelled:
    raise ValueError(
        f"{n_unlabelled} rows have a Class value other than 0 or 1; "
        "cannot assign synthetic IDs"
    )

# Legitimate transactions: spread across the full user/device/IP/merchant pools
n_legit = legit_mask.sum()
df_stream.loc[legit_mask, 'user_id'] = np.random.randint(0, N_USERS, n_legit)
df_stream.loc[legit_mask, 'device_id'] = np.random.randint(0, N_DEVICES, n_legit)
df_stream.loc[legit_mask, 'ip_int'] = np.random.randint(0, N_IPS, n_legit)
df_stream.loc[legit_mask, 'merchant_id'] = np.random.randint(0, N_MERCHANTS, n_legit)

# Fraud: concentrated on the reduced pools (merchants use the full pool)
n_fraud = fraud_mask.sum()
df_stream.loc[fraud_mask, 'user_id'] = np.random.randint(0, N_FRAUD_USERS, n_fraud)
df_stream.loc[fraud_mask, 'device_id'] = np.random.randint(0, N_FRAUD_DEVICES, n_fraud)
df_stream.loc[fraud_mask, 'ip_int'] = np.random.randint(0, N_FRAUD_IPS, n_fraud)
df_stream.loc[fraud_mask, 'merchant_id'] = np.random.randint(0, N_MERCHANTS, n_fraud)

# Convert the numeric IDs to string IDs; ip_int is split into the last two
# octets of a 192.168.x.y address
df_stream['user_id'] = 'user_' + df_stream['user_id'].astype(int).astype(str)
df_stream['device_id'] = 'dev_' + df_stream['device_id'].astype(int).astype(str)
df_stream['ip'] = '192.168.' + (df_stream['ip_int'] // 256).astype(int).astype(str) + '.' + (df_stream['ip_int'] % 256).astype(int).astype(str)
df_stream['merchant_id'] = 'merch_' + df_stream['merchant_id'].astype(int).astype(str)

print(f"Synthetic IDs created: {N_USERS:,} users, {N_DEVICES:,} devices")

Synthetic IDs created: 10,000 users, 5,000 devices


In [7]:
# Turn the Time offsets (seconds) into ISO timestamps and attach event ids,
# country, currency and a lower-case amount column.

# Dataset is from September 2013, European cardholders
start_time = datetime(2013, 9, 1, 0, 0, 0)  # Sept 1, 2013 (actual dataset period)


def _offset_to_iso(seconds):
    """Return start_time shifted by `seconds`, ISO-formatted with a trailing 'Z'."""
    return (start_time + timedelta(seconds=seconds)).isoformat() + 'Z'


df_stream['ts'] = df_stream['Time'].map(_offset_to_iso)
df_stream['event_id'] = 'evt_' + df_stream.index.astype(str)

# European country distribution (European cardholders); the probabilities are
# positionally matched to the country codes
country_codes = ['FR', 'DE', 'IT', 'ES', 'GB', 'NL', 'BE']
country_probs = [0.25, 0.20, 0.15, 0.15, 0.10, 0.10, 0.05]
df_stream['country'] = np.random.choice(country_codes, size=len(df_stream), p=country_probs)

# Currency mapping: EUR everywhere except GB, which uses GBP
currency_map = {**dict.fromkeys(country_codes, 'EUR'), 'GB': 'GBP'}
df_stream['currency'] = df_stream['country'].map(currency_map)
df_stream['amount'] = df_stream['Amount']

print(f"Timestamps: {df_stream['ts'].iloc[0]} to {df_stream['ts'].iloc[-1]}")

Timestamps: 2013-09-01T00:00:00Z to 2013-09-02T23:59:52Z


In [8]:
# Save streaming events: the full event log plus a smaller sample taken from
# the first rows of the frame.

stream_cols = [
    'event_id', 'ts', 'user_id', 'amount', 'currency',
    'country', 'device_id', 'ip', 'merchant_id', 'Class'
]
STREAM_SAMPLE_SIZE = 10000  # maximum number of rows written to the sample file

# Project the columns once and reuse the result for both files
stream_events = df_stream[stream_cols]
# head() returns fewer rows when the frame is shorter than the requested size,
# so the count reported below is taken from the actual sample
stream_sample = stream_events.head(STREAM_SAMPLE_SIZE)

stream_events.to_csv('../data/processed/streaming_events.csv', index=False)
stream_sample.to_csv('../data/processed/streaming_events_sample.csv', index=False)

print(f"Saved: streaming_events.csv ({len(stream_events):,} events)")
print(f"Saved: streaming_events_sample.csv ({len(stream_sample):,} events)")

Saved: streaming_events.csv (284,807 events)
Saved: streaming_events_sample.csv (10,000 events)


In [9]:
# Save Metadata: a JSON summary of the dataset, split sizes, model features,
# preprocessing steps and the synthetic entity pool sizes.
feature_cols_final = ['V%d' % i for i in range(1, 29)] + ['Amount_normalized']  # final feature list

class_labels = df['Class']

metadata = dict(
    dataset=dict(
        total_samples=len(df),
        fraud_samples=int(class_labels.sum()),
        fraud_rate=float(class_labels.mean()),
    ),
    splits=dict(
        train_size=len(train_df),
        val_size=len(val_df),
        test_size=len(test_df),
    ),
    features=dict(
        model_features=feature_cols_final,  # final feature list
        feature_count=len(feature_cols_final),
    ),
    preprocessing=dict(
        amount_transformation='log1p + StandardScaler',
        scaler_path='data/processed/amount_scaler.pkl',
        scaler_fit_data='training_set_only',  # important: scaler never saw val/test rows
    ),
    synthetic=dict(
        n_users=N_USERS,
        n_devices=N_DEVICES,
        n_ips=N_IPS,
        n_merchants=N_MERCHANTS,
    ),
)

# Serialise first, then write the finished text in a single call
with open('../data/processed/preprocessing_metadata.json', 'w') as fh:
    fh.write(json.dumps(metadata, indent=2))