In [1]:
# Milestone 2: Feature Engineering & Data Wrangling
# Azure Demand Forecasting Project

import pandas as pd
import numpy as np

print("Starting Milestone 2: Feature Engineering & Data Wrangling\n")

# ---------------------------------------------------------------------------
# Load the Milestone 1 output.
# The timestamp column is converted to datetime so the .dt accessor and
# min()/max().date() calls below work, then rows are ordered chronologically.
# ---------------------------------------------------------------------------
df = pd.read_csv("azure_demand_cleaned.csv")
df = df.assign(timestamp=pd.to_datetime(df["timestamp"])).sort_values(by="timestamp")

# Quick overview: size, covered period, and available columns.
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"Date Range: {df['timestamp'].min().date()} to {df['timestamp'].max().date()}\n")

print("Current columns:")
print(list(df.columns))
print("\n")

# Group-wise shift/rolling operate on row order, so every
# (region, service_type) series must be in time order before they run.
series_sort_keys = ["region", "service_type", "timestamp"]
df = df.sort_values(by=series_sort_keys)

# Time-based features
print("Creating time-based features...")

# ISO-8601 week number taken from the timestamp, stored as a plain integer.
# Under ISO rules the first days of January can belong to week 52/53 of the
# previous ISO year, so this value is not always aligned with a calendar year.
iso_calendar = df["timestamp"].dt.isocalendar()
df["week_of_year"] = iso_calendar["week"].astype(int)

print("  - week_of_year added")
print("\n")

# Lag features - past usage patterns
print("Creating lag features...")

# Output column -> number of rows to look back within one
# (region, service_type) series. Shifting inside the group keeps values from
# one series from spilling into the next; the first `n` rows of each series
# get NaN.
# NOTE(review): the labels printed below call these "days ago". That is only
# true if each series has exactly one row per day with no gaps — confirm
# against the Milestone 1 output.
lag_columns = {
    "lag_1": 1,
    "lag_7": 7,
    "lag_30": 30,
}

for lag_column, rows_back in lag_columns.items():
    usage_by_series = df.groupby(["region", "service_type"], observed=True)["usage_units"]
    df[lag_column] = usage_by_series.shift(rows_back)

print("  - lag_1 (1 day ago)")
print("  - lag_7 (7 days ago)")
print("  - lag_30 (30 days ago)")
print("\n")

# Rolling statistics - trends and patterns
print("Creating rolling statistics...")

# Output column -> statistic computed over a 7-row trailing window of
# usage_units. The window ends at (and includes) the current row, and a full
# window is required, so the first 6 rows of every series are NaN.
# NOTE(review): because the current row's usage_units is inside the window,
# these features contain the value being forecast if usage_units is the
# target — confirm whether a shift(1) before rolling is intended.
rolling_stats = {
    "rolling_mean_7": lambda usage: usage.rolling(window=7).mean(),
    "rolling_std_7": lambda usage: usage.rolling(window=7).std(),
}

for stat_column, window_stat in rolling_stats.items():
    usage_by_series = df.groupby(["region", "service_type"], observed=True)["usage_units"]
    df[stat_column] = usage_by_series.transform(window_stat)

print("  - rolling_mean_7 (7-day average)")
print("  - rolling_std_7 (7-day volatility)")
print("\n")

# Capacity stress metrics
print("Creating capacity metrics...")

# Share of provisioned capacity that is actually consumed.
# NOTE(review): a zero provisioned_capacity yields inf (or NaN for 0/0) here;
# confirm that upstream cleaning rules this out.
utilization = df["usage_units"].div(df["provisioned_capacity"])
df["capacity_utilization"] = utilization

# Binary indicators stored as 0/1 integers. Rows with utilization between
# 0.75 and 0.9 (inclusive) get 0 for both flags.
df["over_provisioned_flag"] = utilization.lt(0.75).astype(int)
df["high_stress_flag"] = utilization.gt(0.9).astype(int)

print("  - capacity_utilization")
print("  - over_provisioned_flag (utilization < 75%)")
print("  - high_stress_flag (utilization > 90%)")
print("\n")

# Check missing values from lag/rolling features
print("Missing values after feature creation:")
missing_per_column = df.isna().sum()
print(missing_per_column)
print("\n")

# Remove every row that has a NaN in any column. The lag and rolling features
# introduce NaNs at the start of each series, but note that the check is not
# limited to those columns: a NaN in an original column also drops the row.
initial_rows = df.shape[0]
df = df.dropna(axis=0, how="any")
rows_dropped = initial_rows - df.shape[0]

print(f"Rows dropped due to NaN: {rows_dropped}")
print(f"Final dataset shape: {df.shape[0]} rows, {df.shape[1]} columns\n")

# Summary of new features
# Engineered columns grouped by reporting category. Every count printed below
# is derived from this mapping and from df.columns, so the summary cannot
# drift from the data (the previous hard-coded "17 original / 10 new" did not
# add up to the reported total).
new_features_by_category = {
    "Time features": ["week_of_year"],
    "Lag features": ["lag_1", "lag_7", "lag_30"],
    "Rolling statistics": ["rolling_mean_7", "rolling_std_7"],
    "Capacity metrics": [
        "capacity_utilization",
        "over_provisioned_flag",
        "high_stress_flag",
    ],
}

# Count only engineered columns that actually exist in the frame.
created_per_category = {
    category: sum(column in df.columns for column in columns)
    for category, columns in new_features_by_category.items()
}
new_feature_count = sum(created_per_category.values())
original_feature_count = df.shape[1] - new_feature_count

print("Feature Engineering Summary:")
print(f"  Original features: {original_feature_count}")
print(f"  New features created: {new_feature_count}")
print(f"  Total features: {df.shape[1]}")
print("\n")

print("New feature categories:")
for category, created_count in created_per_category.items():
    print(f"  - {category}: {created_count}")
print("\n")

# Basic statistics
# Build the report lines first, then emit them one print call per line so the
# output matches a sequence of individual print statements.
statistics_report = [
    "Dataset Statistics:",
    f"  Total records: {len(df):,}",
    f"  Date range: {df['timestamp'].min().date()} to {df['timestamp'].max().date()}",
    f"  Regions: {df['region'].nunique()}",
    f"  Service types: {df['service_type'].nunique()}",
    "\n",
]
for report_line in statistics_report:
    print(report_line)

# Save feature-engineered dataset
# The row index is omitted from the CSV; only the columns are written.
df.to_csv(path_or_buf="azure_demand_feature_engineered.csv", index=False)
print("Feature-engineered dataset saved: azure_demand_feature_engineered.csv")

print("\nMilestone 2 Complete!")

Starting Milestone 2: Feature Engineering & Data Wrangling

Dataset loaded: 5200 rows, 16 columns
Date Range: 2023-04-01 to 2025-03-31

Current columns:
['timestamp', 'region', 'service_type', 'usage_units', 'provisioned_capacity', 'cost_usd', 'availability_pct', 'is_holiday', 'utilization_pct', 'cost_per_unit', 'buffer_capacity', 'year', 'month', 'quarter', 'day_of_week', 'is_weekend']


Creating time-based features...
  - week_of_year added


Creating lag features...
  - lag_1 (1 day ago)
  - lag_7 (7 days ago)
  - lag_30 (30 days ago)


Creating rolling statistics...
  - rolling_mean_7 (7-day average)
  - rolling_std_7 (7-day volatility)


Creating capacity metrics...
  - capacity_utilization
  - over_provisioned_flag (utilization < 75%)
  - high_stress_flag (utilization > 90%)


Missing values after feature creation:
timestamp                  0
region                     0
service_type               0
usage_units                0
provisioned_capacity       0
cost_usd              