In [1]:
import os
import sys

# Make the parent of the current working directory importable so the
# project-local `src` package resolves from this notebook.
# NOTE: the path is resolved relative to the kernel's working directory, so
# the notebook is expected to be run from its own folder.
PROJECT_ROOT = os.path.abspath("..")
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from src.data_preprocessing import preprocess_raw_data, get_preprocessing_pipeline

In [2]:
# Cleaning / preprocessing: load the raw transactions, clean them with the
# project helper, and persist the cleaned table for the later cells.
import os
import pandas as pd
from src.data_preprocessing import preprocess_raw_data, get_preprocessing_pipeline

# Step 1: Load raw data
df_raw = pd.read_csv('../data/raw/data.csv')

# Step 2: Clean data (drops, feature extraction — performed inside preprocess_raw_data)
df_cleaned = preprocess_raw_data(df_raw)

# Step 3: Define features for the pipeline
# NOTE(review): these lists are not used in this cell; presumably they are
# meant for get_preprocessing_pipeline — confirm against its signature.
numeric_features = ['Amount', 'txn_year', 'txn_month', 'txn_dayofweek', 'txn_hour']
categorical_features = ['ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy']

# Save cleaned data. Create the output directory first: to_csv does not
# create missing parent directories and would raise OSError on a fresh checkout.
os.makedirs('../data/processed', exist_ok=True)
df_cleaned.to_csv('../data/processed/cleaned_transactions.csv', index=False)

### `engineer_features(df)`:

* Adds **datetime features** (`txn_year`, `txn_month`, etc.)
* Applies `CustomerAggregationTransformer()` to compute per-customer stats:
  * `Amount_sum`, `Amount_mean`, `Amount_std`, `Amount_count`
  * Frequency of `txn_month`, `txn_hour`, etc.
  * Mode of `ProductCategory`, `ProviderId`, etc.

### `build_feature_pipeline()`:

* Defines a `sklearn.pipeline.Pipeline` with:
  * `SimpleImputer` → `StandardScaler` for numerical features
  * `SimpleImputer` → `OneHotEncoder` for categorical features
* Wraps everything in a `ColumnTransformer`

In [5]:
#  Feature Engineering 
# Reload the cleaned transactions from disk, derive features with the project
# helper, then fit the feature pipeline on the result.
# Relies on `pd` having been imported by an earlier cell.
from src.feature_engineering import engineer_features, build_feature_pipeline
# Reads the same CSV path the cleaning cell writes, rather than reusing the
# in-memory frame.
# NOTE(review): a CSV round-trip does not preserve non-text dtypes (e.g.
# datetimes) — confirm engineer_features re-parses what it needs.
df_cleaned = pd.read_csv('../data/processed/cleaned_transactions.csv')
df_features = engineer_features(df_cleaned)
pipeline = build_feature_pipeline()
# X holds the pipeline's fit_transform output; it is not used or saved in the
# cells visible here.
X = pipeline.fit_transform(df_features)

In [6]:
# Save the engineered features to a CSV.
# This writes `df_features` (the frame returned by engineer_features), not the
# pipeline-transformed matrix `X`; index=False omits the row index column.
df_features.to_csv('../data/processed/feature_engineered.csv', index=False)