# Feature Engineering

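This notebook demonstrates feature engineering for the TailWagg pet retail dataset using the project's `src` utilities, including:
- Rolling average calculations (30- and 90-day averages of units sold per product)
- Trend calculations (trend labels derived from the 30- and 90-day rolling averages)
- Data preprocessing functions (loading the daily metrics data)
- Feature creation utilities (net profit margin), with the result saved to `data/interim/featured_data.csv`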


## Setup


In [1]:
# Add project root to Python path so the `src` package can be imported
import sys
import os


def resolve_project_root(start_dir):
    """Return the project root directory for a given working directory.

    When the final path component of ``start_dir`` is exactly ``notebooks``,
    the parent directory is returned; otherwise ``start_dir`` is returned
    unchanged. The whole last component is compared (not a string suffix) so
    that directories such as ``old_notebooks`` are not mistaken for the
    notebooks folder.

    Args:
        start_dir: Directory path to resolve, typically ``os.getcwd()``.

    Returns:
        The path to treat as the project root.
    """
    normalized = os.path.normpath(start_dir)  # drops any trailing separator
    if os.path.basename(normalized) == 'notebooks':
        return os.path.dirname(normalized)
    return start_dir


# Get the project root directory (parent of notebooks directory)
current_dir = os.getcwd()
project_root = resolve_project_root(current_dir)

# Insert at the front of sys.path only once, so re-running the cell is harmless
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print(f"Project root: {project_root}")
print(f"Python path includes: {project_root in sys.path}")


Project root: /Users/paulrodriguez/Documents/Documents - Paul’s MacBook Pro/Data Analyst School/_DataCamp/github/tailwagg
Python path includes: True


In [2]:
# Import TailWagg utilities
from src.utils.database import get_database_engine, test_connection
from src.dataset import load_daily_metrics
from src.features import calculate_rolling_averages, calculate_trend_labels, calculate_net_profit_margin
from src.utils.validation import validate_environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Validate the environment, then obtain the database engine
validate_environment()
engine = get_database_engine()

# Report the outcome of the connection test
connection_ok = test_connection(engine)
print(
    "✅ Connected to PostgreSQL successfully!"
    if connection_ok
    else "❌ Failed to connect to PostgreSQL"
)


✅ Connected to PostgreSQL successfully!


## Rolling Average Calculations


In [3]:
# Load the daily metrics data used to demonstrate feature engineering
df = load_daily_metrics()
print(f"Loaded {len(df)} rows of daily metrics data")

# Rolling-window sizes passed to the TailWagg rolling-average utility
rolling_windows = [30, 90]

# Calculate rolling averages of units sold, grouped by product and ordered by date column
df_with_rolling = calculate_rolling_averages(
    df,
    group_col='product_id',
    date_col='order_date',
    value_col='total_units_sold',
    windows=rolling_windows,
)

# Collect every column whose name contains 'rolling' for the summary line
rolling_columns = [name for name in df_with_rolling.columns if 'rolling' in name]

print("✅ Rolling averages calculated successfully!")
print(f"New columns added: {rolling_columns}")


Loaded 216425 rows of daily metrics data
✅ Rolling averages calculated successfully!
New columns added: ['rolling_30d_avg_sales', 'rolling_90d_avg_sales']


## Trend Calculations, Net Profit Margin, and Export


In [4]:
# Calculate trend labels using the TailWagg utility, passing the 30-day and
# 90-day rolling average columns as the short and long inputs
df_with_trends = calculate_trend_labels(
    df_with_rolling,
    short_col='rolling_30d_avg_sales',
    long_col='rolling_90d_avg_sales'
)

print("✅ Trend labels calculated successfully!")
print("Trend distribution:")
print(df_with_trends['trend_label'].value_counts())

# Calculate net profit margin using the TailWagg utility
# NOTE(review): the inputs are the gross_profit / gross_revenue columns while the
# result is labeled "net" profit margin — confirm this naming is intended.
df_with_margins = calculate_net_profit_margin(
    df_with_trends,
    profit_col='gross_profit',
    revenue_col='gross_revenue'
)

print("✅ Net profit margins calculated successfully!")
print(f"Average net profit margin: {df_with_margins['net_profit_margin'].mean():.2%}")

# Save feature-engineered data for use in final insights notebook.
# The output directory is anchored to `project_root` (computed in the setup cell,
# which also imports `os`) instead of the current working directory, so the file
# lands in <project_root>/data/interim even when the kernel runs from notebooks/.
interim_dir = os.path.join(project_root, 'data', 'interim')
os.makedirs(interim_dir, exist_ok=True)
featured_data_path = os.path.join(interim_dir, 'featured_data.csv')
df_with_margins.to_csv(featured_data_path, index=False)
print("\n✅ Feature-engineered data saved to data/interim/featured_data.csv")


✅ Trend labels calculated successfully!
Trend distribution:
trend_label
Plateau      98554
Growing      60641
Declining    57230
Name: count, dtype: int64
✅ Net profit margins calculated successfully!
Average net profit margin: 19.53%

✅ Feature-engineered data saved to data/interim/featured_data.csv
