In [None]:
# Import libraries
# Third-party analysis / plotting stack used by the cells below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every Python warning for the rest of the kernel session so cell
# output stays compact.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# feature-engineering helpers; comment it out when debugging.
warnings.filterwarnings('ignore')

# Add src to path
# Appending '..' puts the parent of the current working directory on the import
# path so that the `src` package imported below can be resolved.
# NOTE(review): assumes the kernel's working directory is a direct child of the
# project root — confirm.
import sys
sys.path.append('..')

# Project configuration: the CSV path read in the load cell, the CSV path
# written in the save cell, and the names of the target and timestamp columns.
from src.config import CLEANED_DATA_PATH, FEATURED_DATA_PATH, TARGET_COLUMN, DATE_COLUMN
# Individual feature-engineering steps, applied one cell at a time below.
# feature_engineering_pipeline is imported but not called in the cells visible here.
from src.feature_engineering import (
    add_temporal_features, add_cyclical_features, add_lag_features,
    add_rolling_features, add_diff_features, add_holiday_features,
    add_weather_features, add_interaction_features, feature_engineering_pipeline
)
# Project helper used to write the final DataFrame to disk.
from src.utils import save_csv

print("Libraries imported successfully!")

## 3.1 Load Cleaned Data

In [None]:
# Read the cleaned dataset; the timestamp column is parsed into datetimes
# at load time so later steps can use it directly.
df = pd.read_csv(
    CLEANED_DATA_PATH,
    parse_dates=[DATE_COLUMN],
)

# Quick sanity report: dimensions and the column names present before
# any feature engineering is applied.
print(
    f"Loaded data shape: {df.shape}",
    f"Original columns: {list(df.columns)}",
    sep="\n",
)

# Preview of the first rows (rendered as the cell output).
df.head(5)

## 3.2 Temporal Features

In [None]:
# Derive the temporal columns from the timestamp column.
df = add_temporal_features(df, DATE_COLUMN)

# Columns this cell expects the temporal step to have produced; indexing
# with this list raises a KeyError if any of them is missing.
temporal_cols = [
    'hour',
    'day_of_week',
    'day_of_month',
    'month',
    'year',
    'week_of_year',
    'quarter',
    'is_weekend',
    'is_rush_hour',
    'season',
]

# Preview of the first ten rows of the new columns.
df.loc[:, temporal_cols].head(10)

## 3.3 Cyclical Features

In [None]:
# Add cyclical encoding
df = add_cyclical_features(df)

# Visualize the sin/cos encoding.
# One row per distinct hour / weekday is plotted so that every value shows up
# on its circle. Taking just the first 100 rows would, with hourly data, cover
# only about four days and leave part of the day-of-week circle empty.
hour_points = df.drop_duplicates(subset='hour').sort_values('hour')
day_points = df.drop_duplicates(subset='day_of_week').sort_values('day_of_week')

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Hour sin/cos
hour_scatter = axes[0].scatter(
    hour_points['hour_sin'], hour_points['hour_cos'],
    c=hour_points['hour'], cmap='hsv',
)
axes[0].set_xlabel('hour_sin')
axes[0].set_ylabel('hour_cos')
axes[0].set_title('Cyclical Encoding of Hour')
axes[0].set_aspect('equal')  # keep the unit circle circular
fig.colorbar(hour_scatter, ax=axes[0], label='hour')

# Day sin/cos
day_scatter = axes[1].scatter(
    day_points['day_sin'], day_points['day_cos'],
    c=day_points['day_of_week'], cmap='hsv',
)
axes[1].set_xlabel('day_sin')
axes[1].set_ylabel('day_cos')
axes[1].set_title('Cyclical Encoding of Day of Week')
axes[1].set_aspect('equal')
fig.colorbar(day_scatter, ax=axes[1], label='day_of_week')

plt.tight_layout()
plt.show()

## 3.4 Lag Features

In [None]:
# Lag offsets to generate (168 = 1 week).
lag_periods = [1, 2, 3, 6, 12, 24, 168]
df = add_lag_features(df, TARGET_COLUMN, lags=lag_periods)

# Names this cell expects the lag step to have created, one per offset.
lag_cols = [f'{TARGET_COLUMN}_lag_{offset}h' for offset in lag_periods]

# Show the last 10 of the first 200 rows: far enough into the data that
# the longest (168-step) lag has values to look back on.
df.loc[:, [TARGET_COLUMN, *lag_cols]].head(200).tail(10)

## 3.5 Rolling Statistics

In [None]:
# Compute rolling-window statistics of the target for each window size.
df = add_rolling_features(df, TARGET_COLUMN, windows=[3, 6, 12, 24])

# Columns previewed below, grouped by statistic.
# NOTE(review): 'rolling_std_12h' is not part of this preview although 12 is
# one of the requested windows — confirm whether add_rolling_features creates it.
rolling_cols = (
    [f'rolling_mean_{w}h' for w in (3, 6, 12, 24)]
    + [f'rolling_std_{w}h' for w in (3, 6, 24)]
    + ['rolling_min_24h', 'rolling_max_24h', 'ewm_mean']
)

# Rows 40-49 of the first 50, next to the raw target for comparison.
df.loc[:, [TARGET_COLUMN, *rolling_cols]].head(50).tail(10)

## 3.6 Difference Features

In [None]:
# Append difference / percentage-change columns for the target.
df = add_diff_features(df, TARGET_COLUMN)

# Columns this cell expects from the difference step.
diff_cols = [
    'diff_1h',
    'diff_24h',
    'pct_change_1h',
    'pct_change_24h',
]

# Last 10 of the first 50 rows, shown alongside the raw target.
df.loc[:, [TARGET_COLUMN, *diff_cols]].head(50).tail(10)

## 3.7 Holiday Features

In [None]:
# Build holiday-related columns from the raw 'holiday' column.
df = add_holiday_features(df, 'holiday')

# Report how many rows fall into each value of the 'is_holiday' column.
print(
    "Holiday distribution:",
    df['is_holiday'].value_counts(),
    sep="\n",
)

## 3.8 Weather Features

In [None]:
# Append the weather-derived columns.
df = add_weather_features(df)

# Columns this cell expects from the weather step.
weather_cols = [
    'temp_celsius',
    'temp_category',
    'is_rainy',
    'is_snowy',
    'cloud_category',
    'weather_encoded',
]

# Summary statistics for the new columns.
# NOTE(review): on a mixed-dtype frame describe() reports numeric columns only
# by default; if the *_category columns are non-numeric they will not appear
# in this output — confirm their dtype.
df.loc[:, weather_cols].describe()

## 3.9 Interaction Features

In [None]:
# Append the interaction columns.
df = add_interaction_features(df)

# Columns this cell expects from the interaction step.
interaction_cols = [
    'hour_weekend',
    'hour_holiday',
    'rush_rain',
    'temp_rush',
]

# Preview of the first ten rows.
df.loc[:, interaction_cols].head(10)

## 3.10 Handle NaN Values from Feature Engineering

In [None]:
# Count missing values per column and keep only the columns that have any.
nan_counts = df.isna().sum()
nan_cols = nan_counts.loc[nan_counts.gt(0)]

# Report how many columns are affected, followed by the per-column counts.
print(f"Columns with NaN values: {nan_cols.size}")
print(nan_cols)

In [None]:
# Drop rows that are unusable after feature engineering.
#   * NaN: produced at the start of the series by lag and rolling features.
#   * +/-inf: a percentage change is infinite when the earlier value is 0, and
#     dropna() on its own leaves such values in place. They are converted to
#     NaN first so the same drop removes them.
#     NOTE(review): harmless if add_diff_features already sanitises these.
initial_len = len(df)

nan_cells_before = int(df.isna().sum().sum())
df = df.replace([np.inf, -np.inf], np.nan)
inf_cells = int(df.isna().sum().sum()) - nan_cells_before

df = df.dropna().reset_index(drop=True)

# Fail here rather than silently writing an empty feature file later on
# (e.g. if some column is missing in every row).
assert len(df) > 0, "All rows were dropped; inspect the NaN counts per column."

print(f"Replaced {inf_cells} infinite values with NaN")
print(f"Dropped {initial_len - len(df)} rows with NaN values")
print(f"Final shape: {df.shape}")

## 3.11 Feature Summary

In [None]:
# Print a fixed-format summary of the engineered dataset.
banner = "=" * 60
n_samples, n_features = df.shape

# (label, count) pairs in display order; only the lag count is derived
# from the configuration used earlier, the rest are fixed figures.
category_counts = [
    ("Original features", 9),
    ("Temporal features", 10),
    ("Cyclical features", 6),
    ("Lag features", len(lag_periods)),
    ("Rolling features", 10),
    ("Difference features", 4),
    ("Holiday features", 2),
    ("Weather features", 6),
    ("Interaction features", 4),
]

print(banner)
print("FEATURE ENGINEERING SUMMARY")
print(banner)
print(f"Total features: {n_features}")
print(f"Total samples: {n_samples:,}")
print("\nFeature categories:")
for label, count in category_counts:
    print(f"  - {label}: {count}")
print(banner)

In [None]:
# Print every column name with a 1-based, right-aligned position number.
print("\nAll columns:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:3d}. {column_name}")

## 3.12 Save Featured Data

In [None]:
# Persist the engineered dataset through the project's CSV helper,
# passing index=False as the original call does.
save_csv(
    df,
    FEATURED_DATA_PATH,
    index=False,
)

# Confirm where the file was written.
print(f"\nFeatured data saved to: {FEATURED_DATA_PATH}")

## Summary

**Feature Engineering completed:**
1. ✅ Temporal features (hour, day, month, etc.)
2. ✅ Cyclical encoding (sin/cos)
3. ✅ Lag features (t-1, t-2, ..., t-168)
4. ✅ Rolling statistics (mean, std, min, max, exponentially weighted mean)
5. ✅ Difference features (diff, pct_change)
6. ✅ Holiday features
7. ✅ Weather features
8. ✅ Interaction features
9. ✅ Saved featured data

**Next step:** Feature Selection (04_Feature_Selection.ipynb)