In [10]:
import pandas as pd
import logging
import sys
sys.path.append("../") 

from src.advanced.fraud_feature_engineering import (
    add_transaction_amount_flags,
    add_time_to_purchase,
    save_advanced_featured_data
)

from src.advanced.fraud_risk_scoring import (
    calculate_transaction_risk,
    assign_risk_score,
)

from src.advanced.fraud_clustering_analysis import(
    perform_clustering_analysis,
)

from src.advanced.outlier_detection import (
    detect_outliers_iforest,
)

from src.advanced.eda_grouped_visuals import (
    plot_fraud_rate_by_category,
)

logging.basicConfig(level=logging.INFO)

# Step 1 - read the parquet file holding the basic-feature dataset.
fraud_df = pd.read_parquet("../data/processed/fraud_data_basic_features.parquet")

# Step 2 - advanced feature engineering followed by risk scoring.
# Each step receives fraud_df and its result is bound back to fraud_df,
# so the order of this tuple is the order of execution.
enrichment_steps = (
    add_transaction_amount_flags,
    add_time_to_purchase,
    calculate_transaction_risk,
    assign_risk_score,
)
for enrich in enrichment_steps:
    fraud_df = enrich(fraud_df)
logging.info("Calculated risk score features")

# Step 3 - clustering analysis.
fraud_df = perform_clustering_analysis(fraud_df)
logging.info("Completed fraud clustering analysis")

# Step 4 - outlier detection.
fraud_df = detect_outliers_iforest(fraud_df)
logging.info("Completed outlier detection")

# Step 5 - persist the enriched dataset (the index is not written).
fraud_df.to_parquet("../data/processed/fraud_data_advanced_enriched.parquet", index=False)
logging.info("Saved enriched fraud dataset")

# Step 6 - one fraud-rate plot per categorical column, saved via save=True.
for category in ('country', 'source', 'browser'):
    plot_fraud_rate_by_category(fraud_df, category_col=category, target_col='class', save=True)
logging.info("Saved fraud rate EDA plots")

INFO:root:Added high_value_transaction with threshold 100.
INFO:root:Added time_to_purchase feature.
INFO:root:Calculated risk score features
INFO:root:Added 'cluster' column with 5 clusters.
INFO:root:Completed fraud clustering analysis
INFO:root:Outlier detection completed. Found 1512 outliers.
INFO:root:Completed outlier detection
INFO:root:Saved enriched fraud dataset
INFO:root:Saved fraud rate EDA plots


In [11]:
# Preprocessing & balancing helpers (defined in src/transformers.py)
from src.transformers import get_preprocessor, apply_balancing
# Numeric feature columns (handed to get_preprocessor later in the notebook).
numeric_cols = [
    'purchase_value', 'age', 'time_since_signup',
    'user_transaction_count', 'device_transaction_count',
    'time_to_purchase', 'high_value_transaction',
]

# Categorical feature columns (handed to get_preprocessor later in the notebook).
categorical_cols = [
    'source',
    'browser',
    'sex',
    'country',
]


In [12]:
# Build the feature matrix, fit the preprocessor, balance the classes with
# SMOTE, and persist both the fitted preprocessor and the balanced arrays.
# Requires fraud_df, numeric_cols, categorical_cols, get_preprocessor and
# apply_balancing from the earlier cells.
from pathlib import Path

import joblib
import numpy as np

# Output locations. Created up front so the dumps below do not fail on a
# fresh checkout where these directories do not exist yet.
MODELS_DIR = Path('../models')
PROCESSED_DIR = Path('../data/processed')
for out_dir in (MODELS_DIR, PROCESSED_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

X = fraud_df[numeric_cols + categorical_cols]
y = fraud_df['class']

# Get preprocessor
preprocessor = get_preprocessor(numeric_cols, categorical_cols)

# Fit preprocessor and transform X
# NOTE(review): the preprocessor is fit, and SMOTE is applied, on the full
# dataset. If a train/test split is taken from these balanced arrays later,
# the held-out rows will have influenced the fit and synthetic samples may
# land in the test set - confirm how downstream notebooks split the data.
X_transformed = preprocessor.fit_transform(X)

# Apply balancing
X_balanced, y_balanced = apply_balancing(X_transformed, y, strategy='smote')

# Save preprocessor
joblib.dump(preprocessor, MODELS_DIR / 'preprocessor.pkl')

# Save balanced dataset. Arrays are passed positionally, so each archive
# stores its array under NumPy's default key 'arr_0'.
# NOTE(review): if X_balanced is a scipy sparse matrix this would be saved
# as a pickled object array - confirm what apply_balancing returns.
np.savez_compressed(PROCESSED_DIR / 'X_balanced.npz', X_balanced)
np.savez_compressed(PROCESSED_DIR / 'y_balanced.npz', y_balanced)

print(" Preprocessor and balanced data saved successfully.")

INFO:root:Original class distribution: {0: 136961, 1: 14151}
INFO:root:Balanced class distribution: {0: 136961, 1: 136961}


 Preprocessor and balanced data saved successfully.


| Type                    | Columns                                                                                                                   |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| **Original Features**   | `user_id`, `signup_time`, `purchase_time`, `purchase_value`, `device_id`, `source`, `browser`, `sex`, `age`, `ip_address` |
| **Engineered Features** | `time_since_signup`, `hour_of_day`, `day_of_week`, `user_transaction_count`, `device_transaction_count`, `country`        |
| **Advanced Features**   | `high_value_transaction`, `time_to_purchase`, `transaction_risk`, `risk_score_label`, `cluster`, `outlier`                |
| **Target**              | `class`                                                                                                                   |
