In [None]:
import pandas as pd
import logging
import sys
sys.path.append("../") 

from src.advanced.fraud_feature_engineering import (
    add_transaction_amount_flags,
    add_time_to_purchase,
    save_advanced_featured_data
)

from src.advanced.fraud_risk_scoring import (
    calculate_transaction_risk,
    assign_risk_score,
)

from src.advanced.fraud_clustering_analysis import(
    perform_clustering_analysis,
)

from src.advanced.outlier_detection import (
    detect_outliers_iforest,
)

from src.advanced.eda_grouped_visuals import (
    plot_fraud_rate_by_category,
)

# Emit INFO-level log records so progress messages show up in the cell output.
logging.basicConfig(level=logging.INFO)

# Step 1: load the fraud dataset that already carries the basic features.
fraud_df = pd.read_parquet("../data/processed/fraud_data_basic_features.parquet")

# Step 2: advanced feature engineering, then risk scoring. Every helper takes
# the current frame and returns the frame handed to the next helper, so the
# order of this tuple is the order of execution.
for enrich_step in (
    add_transaction_amount_flags,
    add_time_to_purchase,
    calculate_transaction_risk,
    assign_risk_score,
):
    fraud_df = enrich_step(fraud_df)
logging.info("Calculated risk score features")

# Persist an intermediate snapshot before the clustering and
# outlier-detection steps run.
save_advanced_featured_data(
    fraud_df,
    "../data/processed/fraud_data_advanced_features.parquet",
)

# Step 3: clustering analysis.
fraud_df = perform_clustering_analysis(fraud_df)
logging.info("Completed fraud clustering analysis")

# Step 4: outlier detection.
fraud_df = detect_outliers_iforest(fraud_df)
logging.info("Completed outlier detection")

# Step 5: save the fully enriched dataset (row index is not written).
fraud_df.to_parquet(
    "../data/processed/fraud_data_advanced_enriched.parquet",
    index=False,
)
logging.info("Saved enriched fraud dataset")

# Step 6: one fraud-rate plot per categorical column, written out via save=True.
for category_col in ("country", "source", "browser"):
    plot_fraud_rate_by_category(
        fraud_df,
        category_col=category_col,
        target_col="class",
        save=True,
    )

logging.info("Saved fraud rate EDA plots")

INFO:root:Added high_value_transaction with threshold 100.
INFO:root:Added time_to_purchase feature.
INFO:root:Calculated risk score features
INFO:root:Added 'cluster' column with 5 clusters.
INFO:root:Completed fraud clustering analysis
INFO:root:Outlier detection completed. Found 1512 outliers.
INFO:root:Completed outlier detection
INFO:root:Saved enriched fraud dataset
INFO:root:Saved fraud rate EDA plots


In [None]:
from src.transformers import get_preprocessor, apply_balancing, fit_transform_to_df
import pandas as pd
import numpy as np
import joblib

# Load the enriched fraud dataset.
fraud_df = pd.read_parquet('../data/processed/fraud_data_advanced_enriched.parquet')

# Columns fed to the preprocessor, split by how they are treated.
fraud_numeric_cols = [
    'purchase_value', 'age', 'time_since_signup',
    'user_transaction_count', 'device_transaction_count',
    'time_to_purchase', 'high_value_transaction'
]
fraud_categorical_cols = ['source', 'browser', 'sex', 'country']

# Separate features and target.
Xf = fraud_df[fraud_numeric_cols + fraud_categorical_cols]
yf = fraud_df['class']

# Fit the preprocessor and get the transformed features as a DataFrame.
preprocessor_fraud = get_preprocessor(fraud_numeric_cols, fraud_categorical_cols)
Xf_trans_df = fit_transform_to_df(preprocessor_fraud, Xf)

# Save the transformed feature names (before balancing). They are written as a
# plain string array rather than a pandas Index: an Index is stored by np.save
# as a pickled object array, which can only be read back with
# np.load(..., allow_pickle=True).
np.save(
    '../models/feature_names_fraud.npy',
    np.asarray(Xf_trans_df.columns, dtype=str),
)

# Balance the classes.
# NOTE(review): balancing (and the preprocessor fit above) use the full
# dataset. If a train/test split is made downstream from the balanced files,
# resampled rows can end up in the evaluation set -- confirm how these
# artifacts are consumed.
Xf_bal, yf_bal = apply_balancing(Xf_trans_df, yf, strategy='smote')
Xf_bal_df = pd.DataFrame(Xf_bal, columns=Xf_trans_df.columns)

# Remove constant / all-NaN columns after balancing. The saved feature names
# and the pickled preprocessor describe the unfiltered columns, and the .npz
# below stores no column names, so any drop is reported instead of happening
# silently.
fraud_keep_mask = (Xf_bal_df.nunique() > 1) & (~Xf_bal_df.isna().all())
fraud_dropped_cols = Xf_bal_df.columns[~fraud_keep_mask].tolist()
if fraud_dropped_cols:
    print(
        f"WARNING: dropped {len(fraud_dropped_cols)} constant/all-NaN column(s) "
        f"after balancing: {fraud_dropped_cols}. feature_names_fraud.npy and "
        "preprocessor_fraud.pkl still describe the unfiltered columns."
    )
Xf_bal_df = Xf_bal_df.loc[:, fraud_keep_mask]

# Save the preprocessor and the balanced data (each array is stored under
# numpy's default positional key).
joblib.dump(preprocessor_fraud, '../models/preprocessor_fraud.pkl')
np.savez_compressed('../data/processed/Xf_balanced.npz', Xf_bal_df)
np.savez_compressed('../data/processed/yf_balanced.npz', yf_bal)


INFO:root:Original class distribution: {0: 136961, 1: 14151}
INFO:root:Balanced class distribution: {0: 136961, 1: 136961}


| Type                    | Columns                                                                                                                   |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| **Original Features**   | `user_id`, `signup_time`, `purchase_time`, `purchase_value`, `device_id`, `source`, `browser`, `sex`, `age`, `ip_address` |
| **Engineered Features** | `time_since_signup`, `hour_of_day`, `day_of_week`, `user_transaction_count`, `device_transaction_count`, `country`        |
| **Advanced Features**   | `high_value_transaction`, `time_to_purchase`, `transaction_risk`, `risk_score_label`, `cluster`, `outlier`                |
| **Target**              | `class`                                                                                                                   |


In [None]:
from src.transformers import get_preprocessor, apply_balancing, fit_transform_to_df
import pandas as pd
import numpy as np
import joblib

# Load the cleaned credit card dataset.
cc_df = pd.read_parquet('../data/processed/creditcard_data_cleaned.parquet')

# Columns fed to the preprocessor: V1..V28 plus Amount, all numeric.
cc_numeric_cols = [f'V{i}' for i in range(1, 29)] + ['Amount']
cc_categorical_cols = []  # No categorical features

# Separate features and target.
Xcc = cc_df[cc_numeric_cols]
ycc = cc_df['Class']

# Fit the preprocessor and get the transformed features as a DataFrame.
preprocessor_cc = get_preprocessor(cc_numeric_cols, cc_categorical_cols)
Xcc_trans_df = fit_transform_to_df(preprocessor_cc, Xcc)

# Save the transformed feature names (before balancing). They are written as a
# plain string array rather than a pandas Index: an Index is stored by np.save
# as a pickled object array, which can only be read back with
# np.load(..., allow_pickle=True).
np.save(
    '../models/feature_names_creditcard.npy',
    np.asarray(Xcc_trans_df.columns, dtype=str),
)

# Balance the classes.
# NOTE(review): balancing (and the preprocessor fit above) use the full
# dataset. If a train/test split is made downstream from the balanced files,
# resampled rows can end up in the evaluation set -- confirm how these
# artifacts are consumed.
Xcc_bal, ycc_bal = apply_balancing(Xcc_trans_df, ycc, strategy='smote')
Xcc_bal_df = pd.DataFrame(Xcc_bal, columns=Xcc_trans_df.columns)

# Remove constant / all-NaN columns after balancing. The saved feature names
# and the pickled preprocessor describe the unfiltered columns, and the .npz
# below stores no column names, so any drop is reported instead of happening
# silently.
cc_keep_mask = (Xcc_bal_df.nunique() > 1) & (~Xcc_bal_df.isna().all())
cc_dropped_cols = Xcc_bal_df.columns[~cc_keep_mask].tolist()
if cc_dropped_cols:
    print(
        f"WARNING: dropped {len(cc_dropped_cols)} constant/all-NaN column(s) "
        f"after balancing: {cc_dropped_cols}. feature_names_creditcard.npy and "
        "preprocessor_creditcard.pkl still describe the unfiltered columns."
    )
Xcc_bal_df = Xcc_bal_df.loc[:, cc_keep_mask]

# Save the preprocessor and the balanced data (each array is stored under
# numpy's default positional key).
joblib.dump(preprocessor_cc, '../models/preprocessor_creditcard.pkl')
np.savez_compressed('../data/processed/Xcc_balanced.npz', Xcc_bal_df)
np.savez_compressed('../data/processed/ycc_balanced.npz', ycc_bal)

print(" Credit Card Preprocessor & Cleaned Balanced Data Saved Successfully.")


INFO:root:Original class distribution: {0: 283253, 1: 473}
INFO:root:Balanced class distribution: {0: 283253, 1: 283253}
