## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: seaborn white-grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

print("Libraries loaded successfully!")

## Load Cleaned Data

In [None]:
# Read the cleaned daily sales file, parse the Date column, and put the rows
# in chronological order with a fresh 0..n-1 index.
df_daily = (
    pd.read_csv('../data/sales_daily.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
    .reset_index(drop=True)
)

first_date, last_date = df_daily['Date'].min(), df_daily['Date'].max()

print(f"Data shape: {df_daily.shape}")
print(f"\nDate range: {first_date} to {last_date}")
print("\nFirst few rows:")
print(df_daily.head(10))

## 1. Temporal Features

In [None]:
# Derive calendar components from the Date column via the .dt accessor
date_accessor = df_daily['Date'].dt

# Column name -> .dt attribute; insertion order fixes the column order
calendar_attributes = {
    'Year': 'year',
    'Month': 'month',
    'Quarter': 'quarter',
    'DayOfWeek': 'dayofweek',  # 0=Monday, 6=Sunday
    'DayOfMonth': 'day',
}
for column_name, attribute in calendar_attributes.items():
    df_daily[column_name] = getattr(date_accessor, attribute)

df_daily['WeekOfYear'] = date_accessor.isocalendar().week
df_daily['DayOfYear'] = date_accessor.dayofyear

# Human-readable day and month labels
df_daily['DayName'] = date_accessor.day_name()
df_daily['MonthName'] = date_accessor.month_name()

# 1 for Saturday/Sunday (DayOfWeek 5 or 6), otherwise 0
df_daily['IsWeekend'] = (df_daily['DayOfWeek'] >= 5).astype(int)

preview_columns = ['Date', 'Year', 'Month', 'Quarter', 'DayOfWeek', 'DayName', 'IsWeekend']
print("Temporal Features Created:")
print(df_daily[preview_columns].head(15))

## 2. Seasonal Indicators

In [None]:
# Define seasonal periods
def get_season(month):
    """Return the season label for a month number.

    Dec-Feb map to 'Winter', Mar-May to 'Spring', Jun-Aug to 'Summer';
    every other value (including 9, 10, 11) maps to 'Fall'.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    # Anything not listed above falls through to 'Fall'
    return season_by_month.get(month, 'Fall')

# Season label per row, derived from the month number
df_daily['Season'] = df_daily['Month'].apply(get_season)

# Cyclical (sine/cosine) encoding so the ends of each cycle sit next to each
# other: month uses a period of 12, day-of-week a period of 7.
month_angle = 2 * np.pi * df_daily['Month'] / 12
df_daily['Month_sin'] = np.sin(month_angle)
df_daily['Month_cos'] = np.cos(month_angle)

weekday_angle = 2 * np.pi * df_daily['DayOfWeek'] / 7
df_daily['DayOfWeek_sin'] = np.sin(weekday_angle)
df_daily['DayOfWeek_cos'] = np.cos(weekday_angle)

seasonal_preview = ['Date', 'Season', 'Month_sin', 'Month_cos', 'DayOfWeek_sin', 'DayOfWeek_cos']
print("Seasonal Features Created:")
print(df_daily[seasonal_preview].head(15))

## 3. Holiday Indicators

In [None]:
# Define holiday periods for retail business
def is_holiday_season(date, window_days=7):
    """Flag dates that fall within ``window_days`` of a listed retail holiday.

    The distance is measured in calendar days, so the window spans month and
    year boundaries (e.g. June 28 is within 7 days of July 4, and November 3
    is within 7 days of October 31).

    Parameters
    ----------
    date : datetime.date, datetime.datetime or pandas.Timestamp
        Date to test. Missing values (None / NaT) yield 0.
    window_days : int, default 7
        Half-width of the window around each holiday, in days.

    Returns
    -------
    int
        1 if ``date`` lies inside any holiday window, otherwise 0.
    """
    if pd.isna(date):
        return 0

    # US holidays and retail peak dates as (month, day)
    holidays = [
        (1, 1),      # New Year's Day
        (2, 14),     # Valentine's Day
        (7, 4),      # Independence Day
        (10, 31),    # Halloween
        (11, 27),    # Thanksgiving (approximate)
        (12, 25),    # Christmas
    ]

    target_ordinal = date.toordinal()
    for h_month, h_day in holidays:
        # Check the holiday in the adjacent years too, so that a window
        # around Jan 1 / Dec 25 can reach across the year boundary.
        for year in (date.year - 1, date.year, date.year + 1):
            # year, month and day are replaced together, so a Feb 29 input
            # never produces an invalid intermediate date.
            holiday_ordinal = date.replace(year=year, month=h_month, day=h_day).toordinal()
            if abs(target_ordinal - holiday_ordinal) <= window_days:
                return 1
    return 0

# 1 when the date is inside a holiday window, else 0
df_daily['Holiday_Season'] = df_daily['Date'].apply(is_holiday_season)

# Q4 (Oct-Dec) is critical retail season
df_daily['IsQ4'] = ((df_daily['Month'] >= 10) & (df_daily['Month'] <= 12)).astype(int)

# Black Friday / Cyber Monday flag.
# Black Friday is the day after US Thanksgiving (the fourth Thursday of
# November); Cyber Monday is three days later. The years come from the data
# itself, so every year present in the dataset is covered.
df_daily['IsBlackFriday'] = 0
for year in sorted(df_daily['Date'].dt.year.dropna().unique()):
    nov_first = pd.Timestamp(year=int(year), month=11, day=1)
    # Days until the first Thursday (weekday 3), then three more weeks
    thanksgiving = nov_first + timedelta(days=(3 - nov_first.weekday()) % 7 + 21)
    black_friday = thanksgiving + timedelta(days=1)
    cyber_monday = black_friday + timedelta(days=3)
    for event_date in (black_friday, cyber_monday):
        df_daily.loc[df_daily['Date'] == event_date, 'IsBlackFriday'] = 1

print("Holiday Features Created:")
print(df_daily[['Date', 'Holiday_Season', 'IsQ4', 'IsBlackFriday']].head(20))
print(f"\nHoliday Season days: {df_daily['Holiday_Season'].sum()}")
print(f"Q4 days: {df_daily['IsQ4'].sum()}")
print(f"Black Friday/Cyber Monday days: {df_daily['IsBlackFriday'].sum()}")

## 4. Lagged Features (Previous Period Sales)

In [None]:
# Lagged copies of Total_Sales: the value from N rows earlier
# (1 = previous day, 7 = previous week, 30 = previous month, 365 = previous year)
daily_sales = df_daily['Total_Sales']
for lag_days in (1, 7, 30, 365):
    df_daily[f'Sales_Lag{lag_days}'] = daily_sales.shift(lag_days)

# Year-over-year growth in percent; rows without a 365-row lag get 0
prior_year_sales = df_daily['Sales_Lag365']
yoy_growth_pct = (daily_sales - prior_year_sales) / prior_year_sales * 100
df_daily['YoY_Growth'] = yoy_growth_pct.fillna(0)

lag_preview = ['Date', 'Total_Sales', 'Sales_Lag1', 'Sales_Lag7', 'Sales_Lag30', 'YoY_Growth']
print("Lagged Features Created:")
print(df_daily[lag_preview].iloc[370:385])

## 5. Moving Averages and Trends

In [None]:
total_sales = df_daily['Total_Sales']

# Simple moving averages over 7, 14 and 30 rows; min_periods=1 means the
# first rows average over whatever history is available instead of being NaN
for window in (7, 14, 30):
    df_daily[f'MA{window}'] = total_sales.rolling(window=window, min_periods=1).mean()

# Exponential moving averages with spans of 7 and 30
for span in (7, 30):
    df_daily[f'EMA{span}'] = total_sales.ewm(span=span, adjust=False).mean()

# Trend indicator: percentage gap between the 7-day and 30-day moving averages
short_ma, long_ma = df_daily['MA7'], df_daily['MA30']
df_daily['Trend_Indicator'] = (short_ma - long_ma) / long_ma * 100

# Volatility: rolling 7-row standard deviation of day-over-day returns
daily_returns = total_sales.pct_change()
df_daily['Volatility'] = daily_returns.rolling(window=7).std()

trend_preview = ['Date', 'Total_Sales', 'MA7', 'MA30', 'EMA7', 'Trend_Indicator', 'Volatility']
print("Moving Averages and Trends Created:")
print(df_daily[trend_preview].iloc[30:45])

## 6. Velocity and Momentum Indicators

In [None]:
sales_series = df_daily['Total_Sales']

# Day-over-day change, in percent
df_daily['Pct_Change'] = sales_series.pct_change() * 100

# 7-day momentum: absolute difference versus the value 7 rows earlier
df_daily['Momentum_7d'] = sales_series - sales_series.shift(7)

# Rate of change over 7 and 30 rows, in percent
for horizon in (7, 30):
    df_daily[f'ROC_{horizon}d'] = sales_series.pct_change(periods=horizon) * 100

momentum_preview = ['Date', 'Total_Sales', 'Pct_Change', 'Momentum_7d', 'ROC_7d', 'ROC_30d']
print("Velocity and Momentum Indicators Created:")
print(df_daily[momentum_preview].iloc[30:45])

## 7. Aggregate Features

In [None]:
# Monthly and quarterly Total_Sales statistics, broadcast back onto each day.
# NOTE: each statistic is computed over the whole month/quarter, so a given
# row's value also reflects days later in the same period.

# --- Monthly aggregates -----------------------------------------------------
monthly_agg = (
    df_daily.groupby(df_daily['Date'].dt.to_period('M'))['Total_Sales']
    .agg(['mean', 'std', 'min', 'max'])
    .reset_index()
)
monthly_agg.columns = ['YearMonth', 'Monthly_Avg_Sales', 'Monthly_Std_Sales', 'Monthly_Min_Sales', 'Monthly_Max_Sales']
monthly_agg['YearMonth'] = monthly_agg['YearMonth'].astype(str)

# Add monthly features to daily data. Columns from a previous run of this
# cell are dropped first so re-running does not create _x/_y duplicates.
monthly_feature_cols = [c for c in monthly_agg.columns if c != 'YearMonth']
df_daily['YearMonth'] = df_daily['Date'].dt.to_period('M').astype(str)
df_daily = (
    df_daily.drop(columns=monthly_feature_cols, errors='ignore')
    .merge(monthly_agg, on='YearMonth', how='left')
)

# --- Quarterly aggregates ---------------------------------------------------
# Group by the existing Year/Quarter columns. Grouping by two Series that are
# both named 'Date' makes reset_index() fail with
# "cannot insert Date, already exists".
quarterly_agg = (
    df_daily.groupby(['Year', 'Quarter'])['Total_Sales']
    .agg(['mean', 'sum'])
    .reset_index()
)
quarterly_agg.columns = ['Year', 'Quarter', 'Quarterly_Avg_Sales', 'Quarterly_Sum_Sales']
quarterly_feature_cols = ['Quarterly_Avg_Sales', 'Quarterly_Sum_Sales']
df_daily = (
    df_daily.drop(columns=quarterly_feature_cols, errors='ignore')
    .merge(quarterly_agg, on=['Year', 'Quarter'], how='left')
)

print("Aggregate Features Created:")
print(df_daily[['Date', 'Monthly_Avg_Sales', 'Monthly_Std_Sales', 'Quarterly_Avg_Sales']].head(10))

## 8. Feature Summary and Visualization

In [None]:
banner = "=" * 70
print("\n" + banner)
print("FEATURE ENGINEERING SUMMARY")
print(banner)

# Every column except Date is counted
column_count_excluding_date = df_daily.shape[1] - 1
print(f"\nTotal Features Created: {column_count_excluding_date}")

# Feature names grouped by the section that created them
feature_categories = {
    'Temporal': ['Year', 'Month', 'Quarter', 'DayOfWeek', 'WeekOfYear', 'DayOfMonth', 'DayOfYear'],
    'Seasonal': ['Season', 'Month_sin', 'Month_cos', 'DayOfWeek_sin', 'DayOfWeek_cos'],
    'Holiday': ['Holiday_Season', 'IsQ4', 'IsBlackFriday'],
    'Categorical': ['DayName', 'MonthName', 'IsWeekend'],
    'Lagged': ['Sales_Lag1', 'Sales_Lag7', 'Sales_Lag30', 'Sales_Lag365', 'YoY_Growth'],
    'Trend/MA': ['MA7', 'MA14', 'MA30', 'EMA7', 'EMA30', 'Trend_Indicator', 'Volatility'],
    'Momentum': ['Pct_Change', 'Momentum_7d', 'ROC_7d', 'ROC_30d'],
    'Aggregate': ['YearMonth', 'Monthly_Avg_Sales', 'Monthly_Std_Sales', 'Monthly_Min_Sales',
                  'Monthly_Max_Sales', 'Quarterly_Avg_Sales', 'Quarterly_Sum_Sales'],
}

# Report only the names that actually exist as columns in df_daily
existing_columns = set(df_daily.columns)
for group_name, group_features in feature_categories.items():
    present = [name for name in group_features if name in existing_columns]
    print(f"\n{group_name} Features ({len(present)}):")
    for name in present:
        print(f"  • {name}")

## 9. Feature Correlation Analysis

In [None]:
# Select numeric features for correlation
numeric_cols = df_daily.select_dtypes(include=[np.number]).columns.tolist()

# Correlation of every numeric column with the target, most positive first
corr_with_target = df_daily[numeric_cols].corr()['Total_Sales'].sort_values(ascending=False)

print("\nCorrelation with Total_Sales:")
print(corr_with_target.head(20))
print("\n...")
print(corr_with_target.tail(10))

# Visualize top correlations.
# Rank by absolute correlation so strong negative relationships are shown
# too (taking the head of a descending sort only ever picks the most positive
# ones). NaN correlations, e.g. from constant columns, are excluded.
fig, ax = plt.subplots(figsize=(10, 8))

feature_corr = corr_with_target[corr_with_target.index != 'Total_Sales'].dropna()
strongest = feature_corr.abs().sort_values(ascending=False).head(15).index
top_features = feature_corr.loc[strongest]

# Green bars for positive correlation, red for zero or negative
colors = ['green' if x > 0 else 'red' for x in top_features.values]
ax.barh(range(len(top_features)), top_features.values, color=colors, edgecolor='black')
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features.index)
ax.set_xlabel('Correlation with Total_Sales', fontsize=12, fontweight='bold')
ax.set_title('Top 15 Features Correlated with Sales', fontsize=14, fontweight='bold')
ax.axvline(x=0, color='black', linewidth=0.8)
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 10. Handle Missing Values

In [None]:
# Check for missing values
missing_before = df_daily.isnull().sum()
print("Missing values before imputation:")
print(missing_before[missing_before > 0])

# Back-fill first (covers the leading NaNs left by lags and rolling windows),
# then forward-fill anything still missing at the end.
# DataFrame.bfill()/ffill() replace the deprecated fillna(method=...) form.
# NOTE: back-filling copies later observations into earlier rows; keep that
# in mind if the first rows are used for model training.
df_daily = df_daily.bfill().ffill()

# For any remaining NaN (columns that are entirely empty), use 0
df_daily = df_daily.fillna(0)

print("\nMissing values after imputation:")
print(df_daily.isnull().sum().sum())

## 11. Save Engineered Features

In [None]:
# Write the engineered dataset to CSV without the index column
output_path = '../data/sales_with_features.csv'
df_daily.to_csv(output_path, index=False)

print("✓ Engineered features saved to 'sales_with_features.csv'")
print(f"\nFinal dataset shape: {df_daily.shape}")
print(f"Total features: {df_daily.shape[1]}")

# Numbered list of every column in the saved file
print("\nColumns in dataset:")
for position, column_name in enumerate(df_daily.columns, start=1):
    print(f"{position:2d}. {column_name}")

## 12. Feature Statistics

In [None]:
separator = "=" * 70
print("\n" + separator)
print("FEATURE ENGINEERING COMPLETE")
print(separator)

row_count = len(df_daily)
feature_count = df_daily.shape[1] - 1  # Exclude Date
range_start, range_end = df_daily['Date'].min(), df_daily['Date'].max()

print("\nDataset Information:")
print(f"Rows: {row_count:,}")
print(f"Features: {feature_count}")
print(f"Date Range: {range_start} to {range_end}")

print("\nNumerical Features Summary:")
print(df_daily.describe().round(2))

# Value counts for the label/indicator columns that are present
print("\nCategorical Features Summary:")
categorical = ['Season', 'DayName', 'IsWeekend', 'Holiday_Season', 'IsQ4']
present_categoricals = [name for name in categorical if name in df_daily.columns]
for name in present_categoricals:
    print(f"\n{name}:")
    print(df_daily[name].value_counts())