In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
from datetime import datetime
import warnings
# NOTE(review): with no category/module filter this suppresses every warning
# for the rest of the kernel session, including library deprecation warnings.
# Consider narrowing it to the specific categories that are known to be noisy.
warnings.filterwarnings('ignore')

In [3]:
# Load the processed insights CSV and make sure `date` ends up as datetime:
# read_csv is asked to parse it, and to_datetime is applied again on top.
df = (
    pd.read_csv("../../Data/Processed/insights.csv", parse_dates=["date"])
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

In [None]:
def create_features(df, target_columns=['usage_cpu', 'usage_storage', 'users_active'],
                   lags=[1, 7, 14], rolling_windows=[7, 14], fourier_k=3):
    """Build calendar, lag, rolling and Fourier features per region/resource series.

    The input frame is copied first, so the caller's ``df`` is not modified.
    Rows are grouped into series by the (region, resource_type) pair; lag and
    rolling features are computed strictly inside each series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``region``, ``resource_type`` and every column
        listed in ``target_columns``.
    target_columns : list of str
        Columns that receive lag and rolling features.
    lags : list of int
        Row offsets used for the lag features. These are row shifts within a
        series, not calendar offsets (presumably one row per day per series;
        TODO confirm against the data).
    rolling_windows : list of int
        Window sizes (in rows) for the rolling mean / std features.
    fourier_k : int
        Number of sine/cosine harmonics of the day-of-week cycle to add.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by region, resource_type and date (original
        index labels are kept), with ``region`` and ``resource_type`` dropped
        and the encoded ids, ``unique_id`` and all feature columns added.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.

    Notes
    -----
    The list defaults are only iterated, never mutated, so sharing them
    between calls is safe.
    """
    # Fail early with one clear message instead of a KeyError deep inside pandas.
    required = ["date", "region", "resource_type", *target_columns]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"create_features: missing required column(s): {missing}")

    df = df.copy()

    # Convert date
    df['date'] = pd.to_datetime(df['date'])

    # Sort by region, resource_type, and date so shifts/rolls follow time order
    df = df.sort_values(["region", "resource_type", "date"])

    # Numerically encode region and resource_type
    df['region_encoded'] = df['region'].astype('category').cat.codes
    df['resource_type_encoded'] = df['resource_type'].astype('category').cat.codes

    # One id per (region, resource_type) series, built from the encoded values
    df['unique_id'] = df['region_encoded'].astype(str) + '_' + df['resource_type_encoded'].astype(str)

    # Basic time features
    df["dayofweek"] = df["date"].dt.dayofweek
    df["month"] = df["date"].dt.month
    df["weekofyear"] = df["date"].dt.isocalendar().week.astype(int)
    df["dayofmonth"] = df["date"].dt.day
    df["quarter"] = df["date"].dt.quarter
    df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)

    # Create features for each target column
    for target_col in target_columns:
        per_series = df.groupby("unique_id")[target_col]

        # Lag features
        for lag in lags:
            df[f"{target_col}_lag_{lag}"] = per_series.shift(lag)

        # Rolling features. The shift(1) keeps the current row out of its own
        # window. Shift AND roll both run inside transform so a window never
        # spans two series, and the result keeps df's index so it lands on the
        # correct rows.
        for window in rolling_windows:
            df[f"{target_col}_roll_mean_{window}"] = per_series.transform(
                lambda s, w=window: s.shift(1).rolling(window=w, min_periods=1).mean()
            )
            df[f"{target_col}_roll_std_{window}"] = per_series.transform(
                lambda s, w=window: s.shift(1).rolling(window=w, min_periods=1).std()
            )

    # Fourier features for the weekly cycle
    for k in range(1, fourier_k + 1):
        angle = 2 * np.pi * k * df["dayofweek"] / 7
        df[f"sin_week_{k}"] = np.sin(angle)
        df[f"cos_week_{k}"] = np.cos(angle)

    # Fill NaNs in every numeric column with that column's per-series mean.
    # NOTE(review): for lag/rolling columns the series mean includes later
    # rows, so the filled warm-up values are not strictly leakage-free.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    df[numeric_cols] = df.groupby("unique_id")[numeric_cols].transform(
        lambda g: g.fillna(g.mean())
    )

    # Second pass: back-fill then forward-fill within each series.
    # .bfill()/.ffill() replace the deprecated fillna(method=...) spelling.
    # NOTE(review): after the mean fill, a value can only still be NaN when a
    # whole series is NaN for that column, which this pass cannot repair.
    df[numeric_cols] = df.groupby("unique_id")[numeric_cols].transform(
        lambda g: g.bfill().ffill()
    )

    return df.drop(columns=['region', 'resource_type'])

In [None]:
def features(filename, output_file='enhanced_features.csv'):
    """Load a CSV, build features with ``create_features``, report and save them.

    Parameters
    ----------
    filename : str or path-like
        CSV to read. It must provide ``date``, ``region``, ``resource_type``
        and the four target columns passed to ``create_features`` below.
    output_file : str or path-like, optional
        Where the enhanced dataset is written (CSV, no index column).
        Defaults to ``'enhanced_features.csv'`` in the working directory,
        which is the path that was previously hard-coded.

    Returns
    -------
    None
        Results are printed and written to ``output_file``.
    """
    # Load the data
    df = pd.read_csv(filename)

    print("Original data shape:", df.shape)
    print("Original columns:", df.columns.tolist())
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    # Combine region and resource_type row-wise before counting distinct pairs
    combo_keys = df['region'].astype(str) + '_' + df['resource_type'].astype(str)
    print(f"Unique region-resource combinations: {len(combo_keys.unique())}")

    # Create features
    print("\nCreating features...")
    df_features = create_features(
        df,
        target_columns=['usage_cpu', 'usage_storage', 'users_active', 'storage_efficiency'],
        lags=[1, 7, 14],
        rolling_windows=[7, 14],
        fourier_k=3
    )

    print(f"Enhanced data shape: {df_features.shape}")

    # Check for NaNs
    nan_counts = df_features.isnull().sum()
    total_nans = nan_counts.sum()

    if total_nans > 0:
        print(f"⚠️  Found {total_nans} NaN values:")
        print(nan_counts[nan_counts > 0])
    else:
        print("✅ No NaN values - all rows preserved!")

    # Show feature summary
    print("\n=== FEATURES CREATED ===")
    all_columns = df_features.columns

    # Time features
    calendar_names = ['dayofweek', 'month', 'weekofyear', 'dayofmonth', 'quarter', 'is_weekend']
    time_features = [col for col in all_columns if col in calendar_names]
    print(f"Time features: {time_features}")

    # Lag features
    lag_features = [col for col in all_columns if '_lag_' in col]
    print(f"Lag features ({len(lag_features)}): {lag_features}")

    # Rolling features
    rolling_features = [col for col in all_columns if '_roll_' in col]
    print(f"Rolling features ({len(rolling_features)}): {rolling_features}")

    # Fourier features
    fourier_features = [col for col in all_columns if 'sin_week_' in col or 'cos_week_' in col]
    print(f"Fourier features: {fourier_features}")

    # Sample of enhanced data (only the columns that actually exist)
    print("\n=== SAMPLE DATA ===")
    sample_cols = ['date', 'unique_id', 'usage_cpu', 'usage_cpu_lag_1', 'usage_cpu_roll_mean_7', 'dayofweek', 'is_weekend']
    available_cols = [col for col in sample_cols if col in all_columns]
    print(df_features[available_cols].head(10))

    # Save the enhanced dataset
    df_features.to_csv(output_file, index=False)
    print(f"\n✅ Enhanced dataset saved to '{output_file}'")
    print(f"📊 {df_features.shape[0]} rows × {df_features.shape[1]} features")

In [None]:
# Re-read the insights CSV from an absolute path and show it as the cell output.
# NOTE(review): this rebinds `df`, replacing the frame loaded earlier in the
# notebook from a relative path with `date` parsed; here `date` is not parsed.
# NOTE(review): "/content/" looks like a Colab-specific location — confirm and
# consider a single configurable data path; `df.head()` would keep output small.
df = pd.read_csv("/content/insights.csv")
df

In [None]:
# Run the feature pipeline on the CSV: prints a summary of the generated
# features and writes the enhanced dataset to disk.
features("/content/insights.csv")
