import pandas as pd
import numpy as np
from datetime import timedelta
from src.preprocess import prepare_features

# Read the two pre-processed inputs: the geo-enriched fraud data and the
# processed credit-card data. Paths are relative to the notebook's directory.
df_fraud, df_cc = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/processed/fraud_geo.parquet',
        '../data/processed/creditcard_processed.parquet',
    )
)

In [None]:
# Feature engineering for the fraud dataset. Each statement below adds one or
# more columns to df_fraud; purchase_time and signup_time must be datetime
# columns for the .dt accessors used here to work.

# time_since_signup: hours elapsed between signup and purchase
df_fraud['time_since_signup'] = (
    (df_fraud['purchase_time'] - df_fraud['signup_time']).dt.total_seconds() / 3600
)

# hour_of_day (0-23) and day_of_week (0 = Monday ... 6 = Sunday)
df_fraud['hour_of_day'] = df_fraud['purchase_time'].dt.hour
df_fraud['day_of_week'] = df_fraud['purchase_time'].dt.dayofweek

# Cyclical encoding so that 23:00 and 00:00 (and Sunday/Monday) end up close
# to each other in feature space.
df_fraud['hour_sin'] = np.sin(2 * np.pi * df_fraud['hour_of_day'] / 24)
df_fraud['hour_cos'] = np.cos(2 * np.pi * df_fraud['hour_of_day'] / 24)
df_fraud['day_sin'] = np.sin(2 * np.pi * df_fraud['day_of_week'] / 7)
df_fraud['day_cos'] = np.cos(2 * np.pi * df_fraud['day_of_week'] / 7)

# Transaction frequency (per user): number of rows sharing the same user_id
df_fraud['user_txn_count'] = df_fraud.groupby('user_id')['user_id'].transform('count')

# Velocity: number of earlier transactions by the same user in the hour before
# each purchase. closed='left' excludes the current transaction from its own
# window. Time-based rolling needs purchase_time sorted within each user.
df_fraud = df_fraud.sort_values(['user_id', 'purchase_time'])
# Only the columns needed for the count are passed to rolling(): counting over
# the whole frame would also try to aggregate string/datetime columns, which
# recent pandas versions reject. timedelta is used instead of the '1H' alias,
# which is deprecated in newer pandas.
df_fraud['txn_velocity_1h'] = (
    df_fraud[['user_id', 'purchase_time']]
    .assign(_txn=1.0)
    .groupby('user_id')
    .rolling(timedelta(hours=1), on='purchase_time', closed='left')
    .count()['_txn']
    .reset_index(level=0, drop=True)  # drop the user_id level, keep the row index
)
# An empty look-back window (a user's first transaction) or a row that was not
# part of any group yields NaN; treat both as "no prior transactions" so that
# downstream steps such as SMOTE do not receive missing values.
df_fraud['txn_velocity_1h'] = df_fraud['txn_velocity_1h'].fillna(0)

# High-risk country flag.
# NOTE(review): the original note said "top 10 fraud rates" but only three
# values are listed, and they are country names matched against a column
# called country_code — confirm the column holds names rather than ISO codes.
high_risk = ['Nigeria', 'India', 'Unknown']  # From EDA; adjust
df_fraud['is_high_risk_country'] = df_fraud['country_code'].isin(high_risk).astype(int)

# Age bins: (0, 25], (25, 40], (40, 60], (60, 100]. Ages of 0 or above 100 fall
# outside every interval and become NaN.
df_fraud['age_group'] = pd.cut(
    df_fraud['age'],
    bins=[0, 25, 40, 60, 100],
    labels=['Young', 'Adult', 'Middle', 'Senior'],
)

# Quick sanity check of the new numeric features
print(df_fraud[['time_since_signup', 'hour_of_day', 'user_txn_count', 'is_high_risk_country']].describe())

In [None]:
# NOTE(review): `sns` and `plt` are used here but are not imported anywhere in
# the visible source — confirm seaborn and matplotlib.pyplot are in scope.

# Distribution of time_since_signup for each class, on a log-scaled y-axis
_violin_ax = sns.violinplot(x='class', y='time_since_signup', data=df_fraud)
_violin_ax.set_title('Time Since Signup by Class')
_violin_ax.set_yscale('log')
plt.show()

# Transaction counts per purchase hour, split by class
_count_ax = sns.countplot(x='hour_of_day', hue='class', data=df_fraud)
_count_ax.set_title('Fraud by Hour of Day')
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Demo of the imbalance handling on Fraud_Data (the real pipeline belongs in
# the modeling step). Target is the 'class' column; everything else is a
# candidate feature.
y = df_fraud['class']
X = df_fraud.drop(columns='class')

# 80/20 split, stratified so both folds keep the original class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Oversample the training fold only; SMOTE is fed the numeric columns alone
# for this demo. The test fold is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(
    X_train.select_dtypes(include=[np.number]),
    y_train,
)

# Class balance before and after resampling
print(f"Original train fraud %: {y_train.mean():.2%}")
print(f"SMOTE train fraud %: {y_train_res.mean():.2%}")
print(f"Test unchanged: {y_test.mean():.2%}")

In [None]:
# Columns handed to prepare_features: numeric ones first, then categorical
num_cols = [
    'purchase_value',
    'age',
    'time_since_signup',
    'hour_sin',
    'hour_cos',
    'day_sin',
    'day_cos',
    'user_txn_count',
    'txn_velocity_1h',
]
cat_cols = ['source', 'browser', 'sex', 'country_code', 'age_group']

# Keep only the model inputs plus the target, then encode/scale them.
# NOTE(review): prepare_features comes from src.preprocess and is not visible
# here — presumably it returns a dense 2-D array; confirm.
df_features = df_fraud[[*num_cols, *cat_cols, 'class']]
X_transformed = prepare_features(df_features, num_cols, cat_cols)

# Wrap the transformed matrix in a frame with generic feat_<i> column names and
# re-attach the target positionally (row order is the same as df_fraud).
df_fraud_eng = pd.DataFrame(
    X_transformed,
    columns=[f'feat_{i}' for i in range(X_transformed.shape[1])],
).assign(**{'class': df_fraud['class'].values})

df_fraud_eng.to_parquet('../data/processed/fraud_engineered.parquet')
print("Fraud engineered saved.")

In [None]:
# Credit-card data: derive an hour-of-day value from the Time column (taken
# modulo one day's worth of seconds, then converted to hours) and encode it
# cyclically.
df_cc['time_hour'] = (df_cc['Time'] % (24 * 3600)) / 3600
_hour_angle = 2 * np.pi * df_cc['time_hour'] / 24
df_cc['time_hour_sin'] = np.sin(_hour_angle)
df_cc['time_hour_cos'] = np.cos(_hour_angle)

# Drop the raw Time and Amount columns before saving.
# NOTE(review): the original note says log_amount is kept — it is not created
# in the visible source, so presumably it already exists in the input; confirm.
df_cc_eng = df_cc.drop(columns=['Time', 'Amount'])
df_cc_eng.to_parquet('../data/processed/creditcard_engineered.parquet')
print("Credit engineered saved.")

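- SMOTE was chosen to balance the classes without discarding any majority-class rows; it is applied only to the training split, so the test set keeps the original class distribution.
- Features such as time_since_signup capture rushed fraud (median of about 2 hours for fraud vs. about 45 days for legitimate users).
- Geolocation adds country-level risk flags, which should help separate fraudulent from legitimate transactions.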

Imbalance Handling Demo

In [None]:
# Stratified 80/20 split of the engineered features (demo only)
y_temp = df_fraud_eng['class']
X_temp = df_fraud_eng.drop(columns='class')
X_tr, X_te, y_tr, y_te = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2,
    random_state=42,
    stratify=y_temp,
)

print(f"Train fraud: {y_tr.mean():.2%}, Test: {y_te.mean():.2%}")

# Oversample the minority class on the training fold only; every column of the
# engineered frame is assumed to be numeric at this point.
smote = SMOTE(k_neighbors=5, random_state=42)
X_tr_res, y_tr_res = smote.fit_resample(X_tr, y_tr)

print(f"Post-SMOTE train fraud: {y_tr_res.mean():.2%} (balanced)")
print(f"New train shape: {X_tr_res.shape}")

# Side-by-side class counts before and after resampling.
# NOTE(review): `plt` is not imported in the visible source — confirm it is in scope.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
pd.Series(y_tr).value_counts().plot.bar(ax=ax[0], title='Train Before SMOTE')
pd.Series(y_tr_res).value_counts().plot.bar(ax=ax[1], title='Train After SMOTE')
fig.tight_layout()
plt.show()

In [None]:
# Final write of both engineered datasets, without the DataFrame index.
# These overwrite the files written earlier in the notebook at the same paths.
df_fraud_eng.to_parquet('../data/processed/fraud_engineered.parquet', index=False)
df_cc_eng.to_parquet('../data/processed/creditcard_engineered.parquet', index=False)
# The dash in this message was mis-encoded ("â€”"); restored to an em dash.
print("Saved engineered Parquets—ready for modeling!")
print("Note: For creditcard full run, reload without subsample.")