# Ohrid Water Demand - Feature Engineering

This notebook demonstrates comprehensive feature engineering for water demand prediction in Ohrid, North Macedonia.

## Objectives
- Create temporal features (cyclical hour/day/month encodings, season, peak-hour flags)
- Engineer weather-based features
- Generate tourism-specific features
- Build lag and rolling window features
- Create interaction features
- Rank features by absolute correlation with the target as a first check of feature importance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import sys
sys.path.append('../src')

from data_collectors.ohrid_synthetic_generator import OhridWaterDemandGenerator

# Notebook banner: title line followed by a 60-character '=' rule.
print("Feature Engineering for Ohrid Water Demand Research", "=" * 60, sep="\n")

## 1. Load Base Dataset

In [None]:
# Read the synthetic demand CSV, parse its 'timestamp' column to datetimes,
# and make that column the DataFrame index.
df = pd.read_csv('../data/raw/ohrid_synthetic_water_demand.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp')

# Report the size and time span of the loaded data.
print(
    f"Dataset shape: {df.shape}",
    f"Date range: {df.index.min()} to {df.index.max()}",
    sep="\n",
)
# Last expression of the cell: the notebook renders the first rows.
df.head()

## 2. Temporal Feature Engineering

In [None]:
# Cyclical (sin/cos) encodings: each (output prefix, source column, period)
# triple yields a '<prefix>_sin' and '<prefix>_cos' column.
for prefix, source, period in (('hour', 'hour', 24),
                               ('day', 'day_of_week', 7),
                               ('month', 'month', 12)):
    angle = 2 * np.pi * df[source] / period
    df[f'{prefix}_sin'] = np.sin(angle)
    df[f'{prefix}_cos'] = np.cos(angle)

# Season label derived from the calendar month (Dec-Feb winter, Mar-May
# spring, Jun-Aug summer, Sep-Nov autumn).
season_by_month = {
    month: season
    for season, months in (('winter', (12, 1, 2)),
                           ('spring', (3, 4, 5)),
                           ('summer', (6, 7, 8)),
                           ('autumn', (9, 10, 11)))
    for month in months
}
df['season'] = df['month'].map(season_by_month)

# 0/1 flags for fixed hour-of-day bands (bounds inclusive):
# morning 6-8, evening 18-21, night 23-5 (wraps past midnight).
hour = df['hour']
df['is_morning_peak'] = ((hour <= 8) & (hour >= 6)).astype(int)
df['is_evening_peak'] = ((hour <= 21) & (hour >= 18)).astype(int)
df['is_night_minimum'] = ((hour <= 5) | (hour >= 23)).astype(int)

print("Temporal features created:")
temporal_features = [
    'hour_sin', 'hour_cos',
    'day_sin', 'day_cos',
    'month_sin', 'month_cos',
    'season',
    'is_morning_peak', 'is_evening_peak', 'is_night_minimum',
]
print(temporal_features)

## 3. Weather Feature Engineering

In [None]:
# Threshold-based 0/1 weather indicators built from the raw temperature,
# precipitation and humidity columns.
temperature = df['temperature']
precipitation = df['precipitation']
humidity = df['humidity']

# Temperature bands: hot (> 25), cold (< 5), comfortable (18..25 inclusive).
df['temp_above_25'] = (temperature > 25).astype(int)
df['temp_below_5'] = (temperature < 5).astype(int)
df['temp_comfort_zone'] = ((temperature <= 25) & (temperature >= 18)).astype(int)

# Rain flags: any measurable rain (> 0.1) and heavy rain (> 5.0).
df['is_raining'] = (precipitation > 0.1).astype(int)
df['heavy_rain'] = (precipitation > 5.0).astype(int)

# Humidity extremes: dry (< 40) and humid (> 80).
df['humidity_low'] = (humidity < 40).astype(int)
df['humidity_high'] = (humidity > 80).astype(int)

# Weather comfort index in [0, 1]: weighted sum of comfortable temperature
# (0.4), no rain (0.3) and humidity within 40..70 inclusive (0.3).
not_raining = 1 - df['is_raining']
humidity_moderate = ((humidity >= 40) & (humidity <= 70)).astype(int)
df['weather_comfort'] = (
    df['temp_comfort_zone'] * 0.4 + not_raining * 0.3 + humidity_moderate * 0.3
)

print("Weather features created:")
weather_features = [
    'temp_above_25', 'temp_below_5', 'temp_comfort_zone',
    'is_raining', 'heavy_rain',
    'humidity_low', 'humidity_high',
    'weather_comfort',
]
print(weather_features)

## 4. Tourism Feature Engineering

In [None]:
# Tourism intensity as three mutually exclusive 0/1 bands of the estimated
# tourist count: low (< 1000), medium (1000..2999), high (>= 3000).
tourists = df['tourists_estimated']
df['tourism_low'] = (tourists < 1000).astype(int)
df['tourism_medium'] = ((tourists < 3000) & (tourists >= 1000)).astype(int)
df['tourism_high'] = (tourists >= 3000).astype(int)

# Tourism pressure index: estimated tourists relative to the population column.
df['tourism_pressure'] = tourists / df['population']

# 1 only when a row is both a weekend and inside the tourist season.
weekend_in_season = df['is_weekend'] & df['is_tourist_season']
df['weekend_tourism_boost'] = weekend_in_season.astype(int)

# Tourism multiplier, kept only where the festival flag is set.
df['festival_tourism_impact'] = df['is_festival_period'] * df['tourism_multiplier']

print("Tourism features created:")
tourism_features = [
    'tourism_low', 'tourism_medium', 'tourism_high',
    'tourism_pressure',
    'weekend_tourism_boost',
    'festival_tourism_impact',
]
print(tourism_features)

## 5. Lag and Rolling Features

In [None]:
# Lag and rolling-window features of the target.
#
# Every feature built here uses only rows strictly before the current one,
# so none of them contains the value being predicted. Shifts and windows are
# counted in rows; the "h" suffixes and hour comments hold only if the frame
# has one row per hour in chronological order (assumed — TODO confirm).
target_col = 'water_demand_m3_per_hour'

# NOTE(review): 7/14/30 look like day counts; at hourly resolution a weekly
# lag would be 168 rows. Kept as-is because the column names depend on them.
lag_periods = [1, 2, 7, 14, 30]  # hours

for lag in lag_periods:
    df[f'{target_col}_lag_{lag}h'] = df[target_col].shift(lag)

# Rolling window features
rolling_windows = [24, 168, 720]  # 1 day, 1 week, 1 month in hours

# Shift by one row before rolling: a plain .rolling() window ends on (and
# includes) the current row, which would leak the target into its own
# features. After the shift each window covers the previous `window` rows;
# the first row has no history and stays NaN.
target_history = df[target_col].shift(1)

for window in rolling_windows:
    # Build the rolling object once and reuse it for all four statistics.
    rolled = target_history.rolling(window=window, min_periods=1)
    df[f'{target_col}_rolling_mean_{window}h'] = rolled.mean()
    df[f'{target_col}_rolling_std_{window}h'] = rolled.std()
    df[f'{target_col}_rolling_min_{window}h'] = rolled.min()
    df[f'{target_col}_rolling_max_{window}h'] = rolled.max()

# Lag features for weather
df['temperature_lag_1h'] = df['temperature'].shift(1)
df['precipitation_lag_1h'] = df['precipitation'].shift(1)

print("Lag and rolling features created:")
# Collected by name pattern so the list tracks lag_periods / rolling_windows.
lag_features = [col for col in df.columns if 'lag_' in col or 'rolling_' in col]
print(f"Total lag/rolling features: {len(lag_features)}")

## 6. Interaction Features

In [None]:
# Pairwise interaction features: each new column is the element-wise product
# of the two listed columns. Insertion order determines column order.
interaction_specs = {
    # Weather × Tourism
    'temp_tourism_interaction': ('temperature', 'tourism_pressure'),
    'rain_tourism_interaction': ('is_raining', 'tourists_estimated'),
    # Time × Season
    # NOTE(review): 'hour_season_interaction' multiplies by tourism_multiplier
    # while its sibling uses seasonal_multiplier — confirm this is intended.
    'hour_season_interaction': ('hour', 'tourism_multiplier'),
    'weekend_season_interaction': ('is_weekend', 'seasonal_multiplier'),
    # Weather × Time
    'temp_hour_interaction': ('temperature', 'hour'),
    'rain_weekend_interaction': ('is_raining', 'is_weekend'),
}

for feature_name, (left_col, right_col) in interaction_specs.items():
    df[feature_name] = df[left_col] * df[right_col]

print("Interaction features created:")
interaction_features = list(interaction_specs)
print(interaction_features)

## 7. Feature Summary and Export

In [None]:
# Summarise the engineered feature set, write it to CSV, and rank numeric
# features by absolute correlation with the target.
import os

# All feature names created by the preceding cells, in creation order.
all_new_features = (
    temporal_features
    + weather_features
    + tourism_features
    + lag_features
    + interaction_features
)

print("\nFeature Engineering Summary:")
print(f"Original features: {df.shape[1] - len(all_new_features)}")
print(f"New engineered features: {len(all_new_features)}")
print(f"Total features: {df.shape[1]}")

# Remove any features with all NaN values
df_clean = df.dropna(axis=1, how='all')

# Save engineered features. to_csv does not create missing directories, so
# make sure the target folder exists first (no-op when it already does).
output_path = '../data/features/ohrid_features_complete.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_clean.to_csv(output_path)
print(f"\nFeatures saved to: {output_path}")
print(f"Final dataset shape: {df_clean.shape}")

# Absolute correlation of every numeric column with the target. The target's
# own entry (always 1.0) is dropped so it does not occupy the top slot of
# the feature ranking.
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
correlations = (
    df_clean[numeric_cols]
    .corr()[target_col]
    .drop(labels=target_col)
    .abs()
    .sort_values(ascending=False)
)

print("\nTop 15 features by correlation with water demand:")
print(correlations.head(15))