In [7]:
import sys
import os

sys.path.append(os.path.abspath(".."))

import pandas as pd
from src.feature_engineering import add_time_features, add_freq_velocity, get_preprocessor
from utils.helpers import validate_data
import numpy as np

# Audit Cleaned Data

In [11]:
# Define columns to parse as dates if present
DATE_COLS = ['signup_time', 'purchase_time']

def audit_all_cleaned_data(data_dir ='D:/Projects - AI & ML/fraud-detection-project/data/processed'):
    print(f"\n📁 Auditing all cleaned datasets in: {data_dir}")
    print("=" * 60)

    for filename in os.listdir(data_dir):
        if filename.endswith(".csv"):
            path = os.path.join(data_dir, filename)
            try:
                # Try parsing date columns if they exist
                df_sample = pd.read_csv(path, nrows=1)
                parse_cols = [col for col in DATE_COLS if col in df_sample.columns]
                df = pd.read_csv(path, parse_dates=parse_cols)

                # Run audit
                validate_data(df, name=filename)

            except Exception as e:
                print(f"❌ Failed to audit {filename}: {e}")

    print("\n✅ All audits complete.\n")

audit_all_cleaned_data()


📁 Auditing all cleaned datasets in: D:/Projects - AI & ML/fraud-detection-project/data/processed


  df = pd.read_csv(path, parse_dates=parse_cols)



🔍 Auditing: combined.csv
Shape: (434838, 42)
Columns: ['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'class', 'ip_int', 'country', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

Missing Values:
user_id           283726
signup_time       283726
purchase_time     283726
purchase_value    283726
device_id         283726
source            283726
browser           283726
sex               283726
age               283726
class                  0
ip_int            283726
country           284360
Time              151112
V1                151112
V2                151112
V3                151112
V4                151112
V5                151112
V6                151112
V7                151112
V8                151112
V9                151112
V10               151112
V11     

# Feature engineering (example on fraud)

In [None]:
fraud = add_time_features(fraud)
fraud = add_freq_velocity(fraud)

# Select features & target
X = fraud[[
    'purchase_value','age','time_since_signup','time_since_prev',
    'source','browser','sex','country','hour_of_day','day_of_week'
]]
y = fraud['class']

preproc = get_preprocessor(
    num_feats=['purchase_value','age','time_since_signup','time_since_prev'],
    cat_feats=['source','browser','sex','country','hour_of_day','day_of_week']
)

X_proc = preproc.fit_transform(X)