In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

# Read the transaction log and the IP-range -> country lookup table from the raw data folder
df = pd.read_csv('data/raw/Fraud_Data.csv')
ip_country = pd.read_csv('data/raw/IpAddress_to_Country.csv')

# Re-apply the cleaning steps: parse both timestamp columns into datetimes
for ts_col in ('signup_time', 'purchase_time'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Normalise the label column to integers.
# NOTE(review): presumably a stray letter 'o' stands in for 0 in the raw labels — confirm against the EDA notebook.
df['class'] = df['class'].replace('o', 0).astype(int)

# Derived time features
purchase_ts = df['purchase_time']
signup_to_purchase = purchase_ts - df['signup_time']
df['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600  # seconds -> hours
df['hour_of_day'] = purchase_ts.dt.hour
df['day_of_week'] = purchase_ts.dt.dayofweek

# IP → Country (reuse function from EDA)
def ip_to_int(ip):
    """Convert an IPv4 address to its integer value.

    Parameters
    ----------
    ip : str, int or float
        Either a dotted-quad string such as ``'192.168.0.1'`` or an address
        that is already numeric (an int/float, or a string holding a plain
        number). Numeric values are truncated toward zero.

    Returns
    -------
    int
        For a dotted quad ``a.b.c.d``: ``(a << 24) + (b << 16) + (c << 8) + d``.
        For numeric input: ``int(ip)``.

    Raises
    ------
    ValueError
        If a string is neither a four-part dotted quad nor a plain number,
        or if a numeric value cannot be converted (e.g. NaN).
    """
    if isinstance(ip, str):
        parts = ip.strip().split('.')
        if len(parts) == 4:
            first, second, third, fourth = (int(part) for part in parts)
            return (first << 24) + (second << 16) + (third << 8) + fourth
        # Not a dotted quad: the address may be stored as a number rendered
        # as text (e.g. '732758368.79972'), so fall back to numeric parsing.
        try:
            return int(float(ip))
        except ValueError:
            raise ValueError(f"Not a valid IPv4 address: {ip!r}") from None
    # Non-string input (Python or NumPy int/float) is already numeric;
    # calling .split() on it would raise AttributeError.
    return int(ip)

df['ip_int'] = df['ip_address'].apply(ip_to_int)


def get_country(ip_int):
    """Return the country whose IP range contains ``ip_int``, or 'Unknown'.

    Performs a full scan of the module-level ``ip_country`` table and returns
    the first matching row's country. Fine for a single lookup; for a whole
    column use ``lookup_countries`` below, which avoids one table scan per row.
    """
    match = ip_country[(ip_country['lower_bound'] <= ip_int) & (ip_country['upper_bound'] >= ip_int)]
    return match['country'].iloc[0] if len(match) > 0 else 'Unknown'


def lookup_countries(ip_ints, ranges):
    """Vectorised IP -> country lookup via binary search.

    Parameters
    ----------
    ip_ints : array-like of int
        Integer IP addresses to resolve.
    ranges : pandas.DataFrame
        Table with 'lower_bound', 'upper_bound' and 'country' columns.

    Returns
    -------
    numpy.ndarray
        One country name per input address, 'Unknown' where no range matches.

    Notes
    -----
    Assumes the ranges do not overlap (TODO confirm for this lookup table).
    With non-overlapping ranges the result equals calling ``get_country`` on
    every address; with overlapping ranges only the range having the greatest
    lower bound <= address is considered.
    """
    values = np.asarray(ip_ints)
    if len(ranges) == 0:
        return np.full(len(values), 'Unknown', dtype=object)

    ordered = ranges.sort_values('lower_bound', kind='stable')
    lower = ordered['lower_bound'].to_numpy()
    upper = ordered['upper_bound'].to_numpy()
    names = ordered['country'].to_numpy()

    # Index of the last range whose lower bound is <= the address (-1 if none).
    idx = np.searchsorted(lower, values, side='right') - 1
    candidate = np.clip(idx, 0, None)  # keep indexing valid where idx == -1
    hit = (idx >= 0) & (upper[candidate] >= values)
    return np.where(hit, names[candidate], 'Unknown')


# One binary search per transaction instead of one full table scan per transaction.
df['country'] = lookup_countries(df['ip_int'], ip_country)

print("✅ Data reloaded and features re-engineered.")

In [None]:
# Select features for modeling
features = ['purchase_value', 'time_since_signup', 'hour_of_day', 'day_of_week', 'age', 'country']
X = df[features].copy()
y = df['class'].copy()

# Scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' lets categories that appear only in the test split
# be encoded as all-zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['purchase_value', 'time_since_signup', 'age']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['country', 'day_of_week', 'hour_of_day'])
    ]
)

# Split FIRST (stratified to preserve class distribution) so that no test row
# influences the scaler statistics or the encoder's category list.
# The partition depends only on the row count, y and random_state, so it is
# the same set of rows as splitting the already-transformed matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the preprocessor on the training rows only, then apply it to the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full feature matrix encoded with the train-fitted preprocessor; kept so any
# later cell that still refers to X_processed continues to work.
X_processed = preprocessor.transform(X)

# Apply SMOTE ONLY to training data (the test set keeps its original class balance)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"Original train class distribution: {np.bincount(y_train)}")
print(f"SMOTE train class distribution: {np.bincount(y_train_smote)}")

In [None]:
# Create processed data folder
import os
os.makedirs('data/processed', exist_ok=True)


def _as_dense_array(data):
    """Return ``data`` as a plain NumPy array, densifying sparse matrices.

    np.save on a SciPy sparse matrix stores a pickled 0-d object array, which
    np.load refuses to read unless allow_pickle=True is passed. Objects that
    expose ``toarray()`` are densified; anything else goes through np.asarray.
    Densifying a wide one-hot matrix can be memory-heavy — consider
    scipy.sparse.save_npz if that becomes a problem.
    """
    return data.toarray() if hasattr(data, 'toarray') else np.asarray(data)


# Save processed data as ordinary numeric .npy files (loadable with a default np.load)
np.save('data/processed/X_train_smote.npy', _as_dense_array(X_train_smote))
np.save('data/processed/y_train_smote.npy', _as_dense_array(y_train_smote))
np.save('data/processed/X_test.npy', _as_dense_array(X_test))
np.save('data/processed/y_test.npy', _as_dense_array(y_test))

# Save the fitted preprocessor so the same transformation can be re-applied later
import joblib
joblib.dump(preprocessor, 'data/processed/preprocessor.pkl')

print("✅ Processed data saved to data/processed/")