In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load cleaned weekly data
df = pd.read_csv('../data/avocado_weekly.csv')
df['date'] = pd.to_datetime(df['date'])

# Sort by region and date
df = df.sort_values(['region', 'date']).reset_index(drop=True)

# Temporal features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['week'] = df['date'].dt.isocalendar().week
df['dayofweek'] = df['date'].dt.dayofweek
df['is_month_start'] = df['date'].dt.is_month_start.astype(int)
df['is_month_end'] = df['date'].dt.is_month_end.astype(int)

# Cyclical encoding for month
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)

# Lag features
for lag in [1, 4, 8, 12]:  # 1, 4, 8, 12 weeks
    df[f'price_lag_{lag}'] = df.groupby('region')['AveragePrice'].shift(lag)

# Rolling statistics
for window in [4, 8, 12]:  # 4, 8, 12 weeks
    df[f'price_roll_mean_{window}'] = df.groupby('region')['AveragePrice'].transform(
        lambda x: x.rolling(window=window, min_periods=1).mean())
    df[f'price_roll_std_{window}'] = df.groupby('region')['AveragePrice'].transform(
        lambda x: x.rolling(window=window, min_periods=1).std())

# Volume features
df['volume_lag_1'] = df.groupby('region')['Total Volume'].shift(1)
df['volume_roll_mean_4'] = df.groupby('region')['Total Volume'].transform(
    lambda x: x.rolling(window=4, min_periods=1).mean())

# Encode categorical variables
le = LabelEncoder()
df['region_encoded'] = le.fit_transform(df['region'])
df['type_encoded'] = le.fit_transform(df['type'])

# Drop rows with NaN values created by lag features
df = df.dropna()

# Save features
df.to_csv('../data/avocado_features.csv', index=False)

print(f"Final dataset shape: {df.shape}")
print("Features:", df.columns.tolist())

Final dataset shape: (17601, 34)
Features: ['region', 'type', 'date', 'AveragePrice', 'Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'year', 'month', 'week', 'dayofweek', 'is_month_start', 'is_month_end', 'month_sin', 'month_cos', 'price_lag_1', 'price_lag_4', 'price_lag_8', 'price_lag_12', 'price_roll_mean_4', 'price_roll_std_4', 'price_roll_mean_8', 'price_roll_std_8', 'price_roll_mean_12', 'price_roll_std_12', 'volume_lag_1', 'volume_roll_mean_4', 'region_encoded', 'type_encoded']
