In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import pickle

# Time Series Libraries
try:
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.statespace.sarimax import SARIMAX
except ImportError:
    print("Warning: statsmodels not available - ARIMA/SARIMA models will be skipped")

# Deep Learning Libraries
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
    from tensorflow.keras.callbacks import EarlyStopping
    from tensorflow.keras.optimizers import Adam
    from sklearn.preprocessing import MinMaxScaler
    tf.random.set_seed(42)
except ImportError:
    print("Warning: TensorFlow not available - LSTM/GRU models will be skipped")

from datetime import datetime, timedelta
import os

In [2]:


class ForecastingPipeline:
    def __init__(self, data_path):
        self.data_path = data_path
        self.models = {}
        self.results = {}
        self.target_columns = ['usage_cpu', 'usage_storage', 'users_active']
        self.feature_columns = []

        # Create directories for saving models
        os.makedirs('models', exist_ok=True)
        os.makedirs('results', exist_ok=True)
        os.makedirs('top_models', exist_ok=True)

        self.hyperparameter_results = {}  # Store hyperparameter search results

    def load_and_prepare_data(self):
        """Load and prepare data for training"""
        print("Loading and preparing data...")
        self.df = pd.read_csv(self.data_path)
        self.df['date'] = pd.to_datetime(self.df['date'])
        self.df = self.df.sort_values(['unique_id', 'date'])

        # Define feature columns (exclude targets and non-feature columns)
        exclude_cols = ['date', 'unique_id'] + self.target_columns
        # Only select numeric columns for features
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        self.feature_columns = [col for col in numeric_cols if col not in exclude_cols]

        print(f"Data loaded: {self.df.shape}")
        print(f"Feature columns: {len(self.feature_columns)}")
        print(f"Target columns: {self.target_columns}")

        return self.df

    def create_train_val_test_split(self, df, train_ratio=0.7, val_ratio=0.2):
        """Create train/validation/test splits by unique_id"""
        train_data, val_data, test_data = [], [], []

        for unique_id in df['unique_id'].unique():
            group_data = df[df['unique_id'] == unique_id].sort_values('date')
            n = len(group_data)

            train_end = int(n * train_ratio)
            val_end = int(n * (train_ratio + val_ratio))

            train_data.append(group_data.iloc[:train_end])
            val_data.append(group_data.iloc[train_end:val_end])
            test_data.append(group_data.iloc[val_end:])

        train_df = pd.concat(train_data, ignore_index=True)
        val_df = pd.concat(val_data, ignore_index=True)
        test_df = pd.concat(test_data, ignore_index=True)

        print(f"Train: {len(train_df)} samples")
        print(f"Validation: {len(val_df)} samples")
        print(f"Test: {len(test_df)} samples")

        return train_df, val_df, test_df

    def calculate_metrics(self, y_true, y_pred, model_name, target):
        """Calculate evaluation metrics"""
        # Handle any NaN or infinite values
        mask = ~(np.isnan(y_true) | np.isnan(y_pred) | np.isinf(y_true) | np.isinf(y_pred))
        y_true_clean = y_true[mask]
        y_pred_clean = y_pred[mask]

        if len(y_true_clean) == 0:
            return {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan, 'Bias': np.nan}

        mae = mean_absolute_error(y_true_clean, y_pred_clean)
        rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))

        # MAPE - handle division by zero
        mape = np.mean(np.abs((y_true_clean - y_pred_clean) / np.where(y_true_clean != 0, y_true_clean, 1))) * 100

        # Forecast Bias
        bias = np.mean(y_pred_clean - y_true_clean)

        return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'Bias': bias}

    def aggregate_predictions_by_date(self, df, predictions, target):
        """Aggregate predictions by date (key improvement from standalone scripts)"""
        temp_df = df.copy()
        temp_df['predictions'] = predictions

        # Aggregate by date: mean of actuals and predictions
        agg_df = temp_df.groupby('date').agg({
            target: 'mean',
            'predictions': 'mean'
        }).reset_index()

        return agg_df[target].values, agg_df['predictions'].values

    def train_xgboost_grid_search(self, train_df, val_df, test_df):
        """Train XGBoost with comprehensive hyperparameter grid search"""
        print("\nTraining XGBoost with Grid Search (this may take a while)...")

        # Prepare features
        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        # Define hyperparameter grid
        param_grid = {
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 4, 5, 6],
            'n_estimators': [200, 500, 800, 1000],
            'subsample': [0.6, 0.7, 0.8, 0.9],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
            'reg_alpha': [0, 0.1, 0.5, 1.0],
            'reg_lambda': [1.0, 2.0, 3.0, 5.0],
            'gamma': [0, 0.1, 0.2, 0.5],
            'min_child_weight': [1, 3, 5, 7]
        }

        from itertools import product
        import random

        # Generate all possible combinations
        keys = list(param_grid.keys())
        combinations = list(product(*[param_grid[key] for key in keys]))

        # Randomly sample 50 combinations to make it manageable
        if len(combinations) > 50:
            combinations = random.sample(combinations, 50)

        for target in self.target_columns:
            print(f"\n  Grid Search for {target}...")
            print(f"  Testing {len(combinations)} parameter combinations...")

            y_train = train_df[target].fillna(train_df[target].mean())
            y_val = val_df[target].fillna(val_df[target].mean())
            y_test = test_df[target].fillna(test_df[target].mean())

            best_mae = float('inf')
            best_params = None
            best_model = None

            for i, combo in enumerate(combinations):
                if i % 10 == 0:
                    print(f"    Progress: {i}/{len(combinations)}")

                # Create parameter dictionary
                params = dict(zip(keys, combo))
                params.update({
                    'objective': 'reg:squarederror',
                    'random_state': 42,
                    'early_stopping_rounds': 20
                })

                try:
                    # Train model
                    model = xgb.XGBRegressor(**params)
                    model.fit(X_train, y_train,
                             eval_set=[(X_val, y_val)],
                             verbose=False)

                    # Predict
                    val_pred = model.predict(X_val)
                    test_pred = model.predict(X_test)

                    # Aggregate by date (KEY IMPROVEMENT)
                    val_actual_agg, val_pred_agg = self.aggregate_predictions_by_date(val_df, val_pred, target)
                    test_actual_agg, test_pred_agg = self.aggregate_predictions_by_date(test_df, test_pred, target)

                    # Calculate metrics on aggregated data
                    val_metrics = self.calculate_metrics(val_actual_agg, val_pred_agg, 'XGBoost_GridSearch', target)
                    test_metrics = self.calculate_metrics(test_actual_agg, test_pred_agg, 'XGBoost_GridSearch', target)

                    # Check if this is the best model
                    if val_metrics['MAE'] < best_mae:
                        best_mae = val_metrics['MAE']
                        best_params = params.copy()
                        best_model = model
                        best_val_metrics = val_metrics
                        best_test_metrics = test_metrics

                except Exception as e:
                    continue

            # Store best model and results
            if best_model is not None:
                model_key = f'XGBoost_GridSearch_{target}'
                self.models[model_key] = best_model

                # Save model
                joblib.dump(best_model, f'models/{model_key}.pkl')

                # Store in results
                self.results[model_key] = {
                    'validation': best_val_metrics,
                    'test': best_test_metrics,
                    'best_params': best_params
                }

                # Store hyperparameter results
                self.hyperparameter_results[target] = {
                    'best_params': best_params,
                    'best_metrics': {
                        'validation': best_val_metrics,
                        'test': best_test_metrics
                    },
                    'total_combinations_tested': len(combinations)
                }

                print(f"    Best MAE for {target}: {best_mae:.4f}")
                print(f"    Best params: {best_params}")

        # Save best hyperparameter results
        import json
        with open('results/best_hyperparameter_results.json', 'w') as f:
            json_results = {}
            for target, data in self.hyperparameter_results.items():
                json_results[target] = {
                    'best_params': data['best_params'],
                    'best_val_mae': data['best_metrics']['validation']['MAE'],
                    'best_test_mae': data['best_metrics']['test']['MAE'],
                    'total_combinations_tested': data['total_combinations_tested']
                }
            json.dump(json_results, f, indent=2)

    def train_xgboost_models(self, train_df, val_df, test_df):
        """Train XGBoost models with default parameters (for comparison)"""
        print("\nTraining XGBoost models with default parameters...")

        # Prepare features
        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        for target in self.target_columns:
            print(f"  Training XGBoost for {target}...")

            y_train = train_df[target].fillna(train_df[target].mean())
            y_val = val_df[target].fillna(val_df[target].mean())
            y_test = test_df[target].fillna(test_df[target].mean())

            # XGBoost with default parameters
            model = xgb.XGBRegressor(
                objective='reg:squarederror',
                random_state=42,
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                early_stopping_rounds=20
            )

            model.fit(X_train, y_train,
                     eval_set=[(X_val, y_val)],
                     verbose=False)

            # Predictions
            val_pred = model.predict(X_val)
            test_pred = model.predict(X_test)

            # Aggregate by date (KEY IMPROVEMENT)
            val_actual_agg, val_pred_agg = self.aggregate_predictions_by_date(val_df, val_pred, target)
            test_actual_agg, test_pred_agg = self.aggregate_predictions_by_date(test_df, test_pred, target)

            # Store model and results
            model_key = f'XGBoost_Default_{target}'
            self.models[model_key] = model

            # Save model
            joblib.dump(model, f'models/{model_key}.pkl')

            # Calculate metrics
            val_metrics = self.calculate_metrics(val_actual_agg, val_pred_agg, 'XGBoost_Default', target)
            test_metrics = self.calculate_metrics(test_actual_agg, test_pred_agg, 'XGBoost_Default', target)

            self.results[model_key] = {
                'validation': val_metrics,
                'test': test_metrics
            }

    def train_lightgbm_grid_search(self, train_df, val_df, test_df):
        """Train LightGBM with hyperparameter grid search"""
        print("\nTraining LightGBM with Grid Search...")

        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        # Define hyperparameter grid for LightGBM
        param_grid = {
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7, -1],
            'n_estimators': [200, 500, 800],
            'num_leaves': [31, 50, 100, 127],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'reg_alpha': [0, 0.1, 0.5, 1.0],
            'reg_lambda': [0, 0.1, 0.5, 1.0]
        }

        from itertools import product
        import random

        keys = list(param_grid.keys())
        combinations = list(product(*[param_grid[key] for key in keys]))

        # Limit to 50 combinations
        if len(combinations) > 50:
            combinations = random.sample(combinations, 50)

        for target in self.target_columns:
            print(f"\n  Grid Search for LightGBM - {target}...")
            print(f"  Testing {len(combinations)} parameter combinations...")

            y_train = train_df[target].fillna(train_df[target].mean())
            y_val = val_df[target].fillna(val_df[target].mean())
            y_test = test_df[target].fillna(test_df[target].mean())

            best_mae = float('inf')
            best_params = None
            best_model = None

            for i, combo in enumerate(combinations):
                if i % 10 == 0:
                    print(f"    Progress: {i}/{len(combinations)}")

                params = dict(zip(keys, combo))
                params.update({
                    'objective': 'regression',
                    'random_state': 42,
                    'verbose': -1
                })

                try:
                    model = lgb.LGBMRegressor(**params)
                    model.fit(X_train, y_train,
                             eval_set=[(X_val, y_val)],
                             callbacks=[lgb.early_stopping(20, verbose=False)])

                    # Predict and aggregate
                    val_pred = model.predict(X_val)
                    val_actual_agg, val_pred_agg = self.aggregate_predictions_by_date(val_df, val_pred, target)

                    val_metrics = self.calculate_metrics(val_actual_agg, val_pred_agg, 'LightGBM_GridSearch', target)

                    if val_metrics['MAE'] < best_mae:
                        best_mae = val_metrics['MAE']
                        best_params = params.copy()
                        best_model = model

                except Exception as e:
                    continue

            # Store best model
            if best_model is not None:
                test_pred = best_model.predict(X_test)
                val_pred = best_model.predict(X_val)

                val_actual_agg, val_pred_agg = self.aggregate_predictions_by_date(val_df, val_pred, target)
                test_actual_agg, test_pred_agg = self.aggregate_predictions_by_date(test_df, test_pred, target)

                val_metrics = self.calculate_metrics(val_actual_agg, val_pred_agg, 'LightGBM_GridSearch', target)
                test_metrics = self.calculate_metrics(test_actual_agg, test_pred_agg, 'LightGBM_GridSearch', target)

                model_key = f'LightGBM_GridSearch_{target}'
                self.models[model_key] = best_model

                joblib.dump(best_model, f'models/{model_key}.pkl')

                self.results[model_key] = {
                    'validation': val_metrics,
                    'test': test_metrics,
                    'best_params': best_params
                }

                print(f"    Best MAE: {best_mae:.4f}")

    def train_lightgbm_models(self, train_df, val_df, test_df):
        """Train LightGBM models with default parameters"""
        print("\nTraining LightGBM models with default parameters...")

        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        for target in self.target_columns:
            print(f"  Training LightGBM for {target}...")

            y_train = train_df[target].fillna(train_df[target].mean())
            y_val = val_df[target].fillna(val_df[target].mean())
            y_test = test_df[target].fillna(test_df[target].mean())

            # LightGBM with default parameters
            model = lgb.LGBMRegressor(
                objective='regression',
                random_state=42,
                verbose=-1
            )

            model.fit(X_train, y_train)

            # Predictions
            val_pred = model.predict(X_val)
            test_pred = model.predict(X_test)

            # Aggregate by date
            val_actual_agg, val_pred_agg = self.aggregate_predictions_by_date(val_df, val_pred, target)
            test_actual_agg, test_pred_agg = self.aggregate_predictions_by_date(test_df, test_pred, target)

            # Store model and results
            model_key = f'LightGBM_Default_{target}'
            self.models[model_key] = model

            joblib.dump(model, f'models/{model_key}.pkl')

            val_metrics = self.calculate_metrics(val_actual_agg, val_pred_agg, 'LightGBM_Default', target)
            test_metrics = self.calculate_metrics(test_actual_agg, test_pred_agg, 'LightGBM_Default', target)

            self.results[model_key] = {
                'validation': val_metrics,
                'test': test_metrics
            }

    def train_traditional_ml_grid_search(self, train_df, val_df, test_df):
        """Train traditional ML models with hyperparameter grid search (NO LINEAR MODELS)"""
        print("\nTraining Traditional ML models with Grid Search...")

        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        # Define hyperparameter grids - REMOVED LINEAR MODELS
        ml_param_grids = {
            'RandomForest': {
                'n_estimators': [100, 200, 300, 500],
                'max_depth': [3, 5, 10, 15, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', 0.5, 0.8]
            },
            'GradientBoosting': {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.05, 0.1, 0.2],
                'max_depth': [3, 4, 5, 6],
                'subsample': [0.6, 0.8, 1.0],
                'min_samples_split': [2, 5, 10]
            }
        }

        from itertools import product
        import random

        for model_name, param_grid in ml_param_grids.items():
            print(f"\n  Grid Search for {model_name}...")

            keys = list(param_grid.keys())
            combinations = list(product(*[param_grid[key] for key in keys]))

            if len(combinations) > 30:
                combinations = random.sample(combinations, 30)

            for target in self.target_columns:
                print(f"    {model_name} for {target}...")

                y_train = train_df[target].fillna(train_df[target].mean())
                y_val = val_df[target].fillna(val_df[target].mean())
                y_test = test_df[target].fillna(test_df[target].mean())

                best_mae = float('inf')
                best_params = None
                best_model = None

                for combo in combinations:
                    params = dict(zip(keys, combo))
                    params['random_state'] = 42

                    try:
                        if model_name == 'RandomForest':
                            model = RandomForestRegressor(**params)
                        elif model_name == 'GradientBoosting':
                            model = GradientBoostingRegressor(**params)

                        model.fit(X_train, y_train)

                        # Predict and aggregate
                        val_pred = model.predict(X_val)
                        val_actual_agg, val_pred_agg = self.aggregate_predictions_by_date(val_df, val_pred, target)
                        val_metrics = self.calculate_metrics(val_actual_agg, val_pred_agg, model_name, target)

                        if val_metrics['MAE'] < best_mae:
                            best_mae = val_metrics['MAE']
                            best_params = params.copy()
                            best_model = model

                    except Exception as e:
                        continue

                # Store best model
                if best_model is not None:
                    test_pred = best_model.predict(X_test)
                    val_pred = best_model.predict(X_val)

                    val_actual_agg, val_pred_agg = self.aggregate_predictions_by_date(val_df, val_pred, target)
                    test_actual_agg, test_pred_agg = self.aggregate_predictions_by_date(test_df, test_pred, target)

                    val_metrics = self.calculate_metrics(val_actual_agg, val_pred_agg, model_name, target)
                    test_metrics = self.calculate_metrics(test_actual_agg, test_pred_agg, model_name, target)

                    model_key = f'{model_name}_GridSearch_{target}'
                    self.models[model_key] = best_model

                    joblib.dump(best_model, f'models/{model_key}.pkl')

                    self.results[model_key] = {
                        'validation': val_metrics,
                        'test': test_metrics,
                        'best_params': best_params
                    }

                    print(f"      Best MAE: {best_mae:.4f}")

    def train_traditional_ml_models(self, train_df, val_df, test_df):
        """Train traditional ML models with default parameters (NO LINEAR MODELS)"""
        print("\nTraining traditional ML models with default parameters...")

        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        # Define models - REMOVED LINEAR MODELS
        ml_models = {
            'RandomForest_Default': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
            'GradientBoosting_Default': GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=6)
        }

        for model_name, model in ml_models.items():
            for target in self.target_columns:
                print(f"  Training {model_name} for {target}...")

                y_train = train_df[target].fillna(train_df[target].mean())
                y_val = val_df[target].fillna(val_df[target].mean())
                y_test = test_df[target].fillna(test_df[target].mean())

                # Train model
                model.fit(X_train, y_train)

                # Predictions
                val_pred = model.predict(X_val)
                test_pred = model.predict(X_test)

                # Aggregate by date
                val_actual_agg, val_pred_agg = self.aggregate_predictions_by_date(val_df, val_pred, target)
                test_actual_agg, test_pred_agg = self.aggregate_predictions_by_date(test_df, test_pred, target)

                # Store model and results
                model_key = f'{model_name}_{target}'
                self.models[model_key] = model

                joblib.dump(model, f'models/{model_key}.pkl')

                val_metrics = self.calculate_metrics(val_actual_agg, val_pred_agg, model_name, target)
                test_metrics = self.calculate_metrics(test_actual_agg, test_pred_agg, model_name, target)

                self.results[model_key] = {
                    'validation': val_metrics,
                    'test': test_metrics
                }

    def prepare_lstm_data(self, df, target_col, sequence_length=7):
        """Prepare data for LSTM/GRU models - aggregate by date first"""
        # Aggregate by date
        agg_df = df.groupby('date').agg({
            target_col: 'mean'
        }).reset_index()
        agg_df = agg_df.sort_values('date')

        # Scale data
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(agg_df[target_col].values.reshape(-1, 1))

        # Create sequences
        sequences = []
        targets = []

        for i in range(sequence_length, len(scaled_data)):
            sequences.append(scaled_data[i-sequence_length:i])
            targets.append(scaled_data[i])

        return np.array(sequences), np.array(targets), scaler

    def train_deep_learning_grid_search(self, train_df, val_df, test_df):
        """Train deep learning models with hyperparameter grid search"""
        try:
            print("\nTraining Deep Learning models with Grid Search...")

            dl_param_grids = {
                'LSTM': {
                    'units': [32, 50, 64],
                    'dropout': [0.1, 0.2],
                    'batch_size': [16, 32],
                    'learning_rate': [0.001, 0.01]
                },
                'GRU': {
                    'units': [32, 50, 64],
                    'dropout': [0.1, 0.2],
                    'batch_size': [16, 32],
                    'learning_rate': [0.001, 0.01]
                }
            }

            from itertools import product
            import random

            for target in self.target_columns:
                print(f"  Preparing sequences for {target}...")

                # Prepare sequences with aggregation
                X_train, y_train, train_scaler = self.prepare_lstm_data(train_df, target)
                X_val, y_val, val_scaler = self.prepare_lstm_data(val_df, target)
                X_test, y_test, test_scaler = self.prepare_lstm_data(test_df, target)

                if len(X_train) < 10 or len(X_val) < 5:
                    print(f"    Insufficient data for {target}")
                    continue

                for model_type in ['LSTM', 'GRU']:
                    print(f"    Grid Search for {model_type} - {target}...")

                    param_grid = dl_param_grids[model_type]
                    keys = list(param_grid.keys())
                    combinations = list(product(*[param_grid[key] for key in keys]))

                    if len(combinations) > 15:
                        combinations = random.sample(combinations, 15)

                    best_mae = float('inf')
                    best_params = None
                    best_model = None

                    for combo in combinations:
                        params = dict(zip(keys, combo))

                        try:
                            model = Sequential()

                            if model_type == 'LSTM':
                                model.add(LSTM(params['units'], input_shape=(X_train.shape[1], X_train.shape[2])))
                            else:  # GRU
                                model.add(GRU(params['units'], input_shape=(X_train.shape[1], X_train.shape[2])))

                            model.add(Dropout(params['dropout']))
                            model.add(Dense(1))

                            optimizer = Adam(learning_rate=params['learning_rate'])
                            model.compile(optimizer=optimizer, loss='mse')

                            early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
                            model.fit(X_train, y_train,
                                    validation_data=(X_val, y_val),
                                    epochs=30, batch_size=params['batch_size'],
                                    callbacks=[early_stop], verbose=0)

                            # Evaluate
                            val_pred_scaled = model.predict(X_val, verbose=0)
                            val_pred = val_scaler.inverse_transform(val_pred_scaled).flatten()
                            val_actual = val_scaler.inverse_transform(y_val).flatten()

                            val_metrics = self.calculate_metrics(val_actual, val_pred, model_type, target)

                            if val_metrics['MAE'] < best_mae:
                                best_mae = val_metrics['MAE']
                                best_params = params.copy()
                                best_model = model
                                best_scaler = val_scaler

                        except Exception as e:
                            continue

                    # Store best model
                    if best_model is not None:
                        test_pred_scaled = best_model.predict(X_test, verbose=0)
                        test_pred = test_scaler.inverse_transform(test_pred_scaled).flatten()
                        test_actual = test_scaler.inverse_transform(y_test).flatten()

                        val_pred_scaled = best_model.predict(X_val, verbose=0)
                        val_pred = val_scaler.inverse_transform(val_pred_scaled).flatten()
                        val_actual = val_scaler.inverse_transform(y_val).flatten()

                        val_metrics = self.calculate_metrics(val_actual, val_pred, model_type, target)
                        test_metrics = self.calculate_metrics(test_actual, test_pred, model_type, target)

                        model_key = f'{model_type}_GridSearch_{target}'
                        self.models[model_key] = best_model

                        best_model.save(f'models/{model_key}.h5')

                        self.results[model_key] = {
                            'validation': val_metrics,
                            'test': test_metrics,
                            'best_params': best_params
                        }

                        print(f"      Best MAE: {best_mae:.4f}")

        except Exception as e:
            print(f"Error in deep learning grid search: {e}")

    def train_deep_learning_models(self, train_df, val_df, test_df):
        """Train LSTM and GRU models with default parameters"""
        try:
            print("\nTraining Deep Learning models...")

            for target in self.target_columns:
                print(f"  Preparing data for {target}...")

                # Prepare sequences with aggregation
                X_train, y_train, train_scaler = self.prepare_lstm_data(train_df, target)
                X_val, y_val, val_scaler = self.prepare_lstm_data(val_df, target)
                X_test, y_test, test_scaler = self.prepare_lstm_data(test_df, target)

                if len(X_train) < 10 or len(X_val) < 5 or len(X_test) < 5:
                    print(f"    Insufficient data for {target}, skipping deep learning models")
                    continue

                # LSTM Model
                print(f"    Training LSTM for {target}...")
                lstm_model = Sequential([
                    LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])),
                    Dropout(0.2),
                    Dense(1)
                ])

                lstm_model.compile(optimizer=Adam(), loss='mse')

                early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
                lstm_model.fit(X_train, y_train,
                              validation_data=(X_val, y_val),
                              epochs=50, batch_size=16,
                              callbacks=[early_stop], verbose=0)

                # LSTM Predictions
                val_pred_scaled = lstm_model.predict(X_val, verbose=0)
                test_pred_scaled = lstm_model.predict(X_test, verbose=0)

                val_pred = val_scaler.inverse_transform(val_pred_scaled).flatten()
                test_pred = test_scaler.inverse_transform(test_pred_scaled).flatten()
                val_actual = val_scaler.inverse_transform(y_val).flatten()
                test_actual = test_scaler.inverse_transform(y_test).flatten()

                # Store LSTM results
                model_key = f'LSTM_{target}'
                self.models[model_key] = lstm_model
                lstm_model.save(f'models/{model_key}.h5')

                val_metrics = self.calculate_metrics(val_actual, val_pred, 'LSTM', target)
                test_metrics = self.calculate_metrics(test_actual, test_pred, 'LSTM', target)

                self.results[model_key] = {
                    'validation': val_metrics,
                    'test': test_metrics
                }

                # GRU Model
                print(f"    Training GRU for {target}...")
                gru_model = Sequential([
                    GRU(50, input_shape=(X_train.shape[1], X_train.shape[2])),
                    Dropout(0.2),
                    Dense(1)
                ])

                gru_model.compile(optimizer=Adam(), loss='mse')
                gru_model.fit(X_train, y_train,
                             validation_data=(X_val, y_val),
                             epochs=50, batch_size=16,
                             callbacks=[early_stop], verbose=0)

                # GRU Predictions
                val_pred_scaled = gru_model.predict(X_val, verbose=0)
                test_pred_scaled = gru_model.predict(X_test, verbose=0)

                val_pred = val_scaler.inverse_transform(val_pred_scaled).flatten()
                test_pred = test_scaler.inverse_transform(test_pred_scaled).flatten()
                val_actual = val_scaler.inverse_transform(y_val).flatten()
                test_actual = test_scaler.inverse_transform(y_test).flatten()

                # Store GRU results
                model_key = f'GRU_{target}'
                self.models[model_key] = gru_model
                gru_model.save(f'models/{model_key}.h5')

                val_metrics = self.calculate_metrics(val_actual, val_pred, 'GRU', target)
                test_metrics = self.calculate_metrics(test_actual, test_pred, 'GRU', target)

                self.results[model_key] = {
                    'validation': val_metrics,
                    'test': test_metrics
                }

        except Exception as e:
            print(f"Error in deep learning training: {e}")

    def train_time_series_models(self, train_df, val_df, test_df):
        """Train ARIMA and SARIMA models - aggregate by date approach"""
        try:
            print("\nTraining Time Series models...")

            for target in self.target_columns:
                print(f"  Training ARIMA/SARIMA for {target}...")

                # Aggregate by date
                train_ts = train_df.groupby('date')[target].mean().reset_index()
                val_ts = val_df.groupby('date')[target].mean().reset_index()
                test_ts = test_df.groupby('date')[target].mean().reset_index()

                train_ts['date'] = pd.to_datetime(train_ts['date'])
                val_ts['date'] = pd.to_datetime(val_ts['date'])
                test_ts['date'] = pd.to_datetime(test_ts['date'])

                train_ts = train_ts.sort_values('date').set_index('date')[target]
                val_ts = val_ts.sort_values('date').set_index('date')[target]
                test_ts = test_ts.sort_values('date').set_index('date')[target]

                if len(train_ts) < 20:
                    print(f"    Insufficient data for ARIMA {target}, skipping...")
                    continue

                # ARIMA - Try different orders
                print(f"    Fitting ARIMA for {target}...")
                arima_orders = [(1,1,1), (2,1,1), (1,1,2), (2,1,2), (0,1,1), (1,0,1)]
                best_aic = float('inf')
                best_arima = None
                best_arima_order = None

                for order in arima_orders:
                    try:
                        arima_model = ARIMA(train_ts, order=order)
                        arima_fitted = arima_model.fit()

                        if arima_fitted.aic < best_aic:
                            best_aic = arima_fitted.aic
                            best_arima = arima_fitted
                            best_arima_order = order
                    except:
                        continue

                if best_arima is not None:
                    try:
                        # Forecast
                        val_forecast = best_arima.forecast(steps=len(val_ts))
                        test_forecast = best_arima.forecast(steps=len(test_ts))

                        # Store model
                        model_key = f'ARIMA_{target}'
                        self.models[model_key] = best_arima

                        with open(f'models/{model_key}.pkl', 'wb') as f:
                            pickle.dump(best_arima, f)

                        # Calculate metrics
                        val_metrics = self.calculate_metrics(val_ts.values, val_forecast, 'ARIMA', target)
                        test_metrics = self.calculate_metrics(test_ts.values, test_forecast, 'ARIMA', target)

                        self.results[model_key] = {
                            'validation': val_metrics,
                            'test': test_metrics,
                            'best_params': {'order': best_arima_order, 'aic': best_aic}
                        }

                        print(f"    ARIMA - Best order: {best_arima_order}, AIC: {best_aic:.2f}")

                    except Exception as e:
                        print(f"    ARIMA forecasting failed: {e}")

                # SARIMA
                print(f"    Fitting SARIMA for {target}...")
                try:
                    sarima_model = SARIMAX(train_ts, order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))
                    sarima_fitted = sarima_model.fit(disp=False)

                    # Forecast
                    val_forecast = sarima_fitted.forecast(steps=len(val_ts))
                    test_forecast = sarima_fitted.forecast(steps=len(test_ts))

                    # Store model
                    model_key = f'SARIMA_{target}'
                    self.models[model_key] = sarima_fitted

                    with open(f'models/{model_key}.pkl', 'wb') as f:
                        pickle.dump(sarima_fitted, f)

                    # Calculate metrics
                    val_metrics = self.calculate_metrics(val_ts.values, val_forecast, 'SARIMA', target)
                    test_metrics = self.calculate_metrics(test_ts.values, test_forecast, 'SARIMA', target)

                    self.results[model_key] = {
                        'validation': val_metrics,
                        'test': test_metrics,
                        'best_params': {'order': (1,1,1), 'seasonal_order': (1,1,1,7)}
                    }

                    print(f"    SARIMA - Training complete")

                except Exception as e:
                    print(f"    SARIMA training failed: {e}")

        except Exception as e:
            print(f"Error in time series training: {e}")

    def create_comparison_table(self):
        """Create comparison table of all models"""
        print("\nCreating comparison table...")

        comparison_data = []

        for model_key, metrics in self.results.items():
            # Parse model name and target
            if model_key.count('_') >= 2:
                parts = model_key.split('_')
                if len(parts) >= 3:
                    target = None
                    for i in range(len(parts)-1, 0, -1):
                        potential_target = '_'.join(parts[i:])
                        if potential_target in self.target_columns:
                            target = potential_target
                            model_name = '_'.join(parts[:i])
                            break

                    if target is None:
                        model_name, target = model_key.rsplit('_', 1)
                else:
                    model_name, target = model_key.rsplit('_', 1)
            else:
                model_name, target = model_key.rsplit('_', 1)

            if 'validation' in metrics and 'test' in metrics:
                row = {
                    'Model': model_name,
                    'Target': target,
                    'Val_MAE': metrics['validation']['MAE'],
                    'Val_RMSE': metrics['validation']['RMSE'],
                    'Val_MAPE': metrics['validation']['MAPE'],
                    'Val_Bias': metrics['validation']['Bias'],
                    'Test_MAE': metrics['test']['MAE'],
                    'Test_RMSE': metrics['test']['RMSE'],
                    'Test_MAPE': metrics['test']['MAPE'],
                    'Test_Bias': metrics['test']['Bias']
                }

                if 'best_params' in metrics:
                    params_str = str(metrics['best_params'])[:100] + "..." if len(str(metrics['best_params'])) > 100 else str(metrics['best_params'])
                    row['Hyperparameters'] = params_str
                else:
                    row['Hyperparameters'] = 'Default'

                comparison_data.append(row)

        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.round(4)

        # Save comparison table
        comparison_df.to_csv('results/model_comparison.csv', index=False)

        print("\n" + "="*80)
        print("MODEL COMPARISON TABLE")
        print("="*80)
        print(comparison_df.to_string(index=False))

        return comparison_df

    def get_top_models_summary(self, comparison_df):
        """Get top 1 model for each target with detailed info"""
        print("\n" + "="*60)
        print("TOP 1 MODEL FOR EACH TARGET")
        print("="*60)

        top_models_data = []

        for target in self.target_columns:
            target_results = comparison_df[comparison_df['Target'] == target].copy()

            if len(target_results) > 0:
                # Find best model based on validation MAE
                best_idx = target_results['Val_MAE'].idxmin()
                best_row = target_results.loc[best_idx]

                model_name = best_row['Model']
                model_key = f"{model_name}_{target}"

                # Get parameters if available
                params = "Default"
                if 'Hyperparameters' in best_row and pd.notna(best_row['Hyperparameters']):
                    params = best_row['Hyperparameters']
                elif model_key in self.results and 'best_params' in self.results[model_key]:
                    params = str(self.results[model_key]['best_params'])
                elif target in self.hyperparameter_results:
                    params = str(self.hyperparameter_results[target]['best_params'])

                top_model_info = {
                    'Target': target,
                    'Best_Model': model_name,
                    'Val_MAE': best_row['Val_MAE'],
                    'Val_RMSE': best_row['Val_RMSE'],
                    'Val_MAPE': best_row['Val_MAPE'],
                    'Val_Bias': best_row['Val_Bias'],
                    'Test_MAE': best_row['Test_MAE'],
                    'Test_RMSE': best_row['Test_RMSE'],
                    'Test_MAPE': best_row['Test_MAPE'],
                    'Test_Bias': best_row['Test_Bias'],
                    'Hyperparameters': params
                }

                top_models_data.append(top_model_info)

                # Copy best model to top_models directory
                try:
                    import shutil
                    for ext in ['.pkl', '.h5']:
                        source_file = f'models/{model_key}{ext}'
                        if os.path.exists(source_file):
                            dest_file = f'top_models/{target}_best_model{ext}'
                            shutil.copy2(source_file, dest_file)
                            print(f"✅ Moved {model_name} for {target} to top_models/")
                            break
                    else:
                        print(f"⚠️  Model file not found for {model_key}")

                except Exception as e:
                    print(f"⚠️  Could not copy model for {target}: {e}")

        # Create DataFrame and save
        if top_models_data:
            top_models_df = pd.DataFrame(top_models_data)
            top_models_df = top_models_df.round(4)

            top_models_df.to_csv('results/top_models_summary.csv', index=False)

            print("\n" + "="*60)
            print("TOP MODELS SUMMARY")
            print("="*60)
            print(top_models_df.to_string(index=False))
        else:
            print("No top models found!")
            top_models_df = pd.DataFrame()

        return top_models_df

    def run_complete_pipeline(self):
        """Run the complete ML pipeline"""
        print("="*80)
        print("STARTING COMPLETE ML FORECASTING PIPELINE")
        print("="*80)

        # Load data
        df = self.load_and_prepare_data()

        # Create splits
        train_df, val_df, test_df = self.create_train_val_test_split(df)

        # Train all models
        self.train_xgboost_grid_search(train_df, val_df, test_df)
        self.train_xgboost_models(train_df, val_df, test_df)
        self.train_lightgbm_grid_search(train_df, val_df, test_df)
        self.train_lightgbm_models(train_df, val_df, test_df)
        self.train_traditional_ml_grid_search(train_df, val_df, test_df)
        self.train_traditional_ml_models(train_df, val_df, test_df)
        self.train_deep_learning_grid_search(train_df, val_df, test_df)
        self.train_deep_learning_models(train_df, val_df, test_df)
        self.train_time_series_models(train_df, val_df, test_df)

        # Create comparison table
        comparison_df = self.create_comparison_table()

        # Get top models summary
        top_models_df = self.get_top_models_summary(comparison_df)

        print(f"\n✅ Pipeline complete! All models saved in 'models/' directory")
        print(f"✅ Top models moved to 'top_models/' directory")
        print(f"✅ Results saved in 'results/' directory")

        return comparison_df, top_models_df


# Example usage:
# pipeline = ForecastingPipeline('data/processed/train.csv')
# comparison_df, top_models_df = pipeline.run_complete_pipeline()

In [3]:
pipeline = ForecastingPipeline('/content/enhanced_features.csv')
comparison_df, top_models_df = pipeline.run_complete_pipeline()

STARTING COMPLETE ML FORECASTING PIPELINE
Loading and preparing data...
Data loaded: (1080, 52)
Feature columns: 47
Target columns: ['usage_cpu', 'usage_storage', 'users_active']
Train: 744 samples
Validation: 216 samples
Test: 120 samples

Training XGBoost with Grid Search (this may take a while)...

  Grid Search for usage_cpu...
  Testing 50 parameter combinations...
    Progress: 0/50
    Progress: 10/50
    Progress: 20/50
    Progress: 30/50
    Progress: 40/50
    Best MAE for usage_cpu: 2.8605
    Best params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_alpha': 1.0, 'reg_lambda': 3.0, 'gamma': 0, 'min_child_weight': 7, 'objective': 'reg:squarederror', 'random_state': 42, 'early_stopping_rounds': 20}

  Grid Search for usage_storage...
  Testing 50 parameter combinations...
    Progress: 0/50
    Progress: 10/50
    Progress: 20/50
    Progress: 30/50
    Progress: 40/50
    Best MAE for usage_storage: 2.4067
    Be



      Best MAE: 3.5999
    Grid Search for GRU - usage_cpu...




      Best MAE: 3.5211
  Preparing sequences for usage_storage...
    Grid Search for LSTM - usage_storage...




      Best MAE: 81.4685
    Grid Search for GRU - usage_storage...




      Best MAE: 82.8246
  Preparing sequences for users_active...
    Grid Search for LSTM - users_active...




      Best MAE: 34.0843
    Grid Search for GRU - users_active...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


      Best MAE: 32.5961

Training Deep Learning models...
  Preparing data for usage_cpu...
    Insufficient data for usage_cpu, skipping deep learning models
  Preparing data for usage_storage...
    Insufficient data for usage_storage, skipping deep learning models
  Preparing data for users_active...
    Insufficient data for users_active, skipping deep learning models

Training Time Series models...
  Training ARIMA/SARIMA for usage_cpu...
    Fitting ARIMA for usage_cpu...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


    ARIMA - Best order: (2, 1, 1), AIC: 355.23
    Fitting SARIMA for usage_cpu...
    SARIMA - Training complete
  Training ARIMA/SARIMA for usage_storage...
    Fitting ARIMA for usage_storage...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


    ARIMA - Best order: (0, 1, 1), AIC: 777.95
    Fitting SARIMA for usage_storage...
    SARIMA - Training complete
  Training ARIMA/SARIMA for users_active...
    Fitting ARIMA for users_active...


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


    ARIMA - Best order: (0, 1, 1), AIC: 560.01
    Fitting SARIMA for users_active...
    SARIMA - Training complete

Creating comparison table...

MODEL COMPARISON TABLE
                      Model        Target  Val_MAE  Val_RMSE  Val_MAPE  Val_Bias  Test_MAE  Test_RMSE  Test_MAPE  Test_Bias                                                                                         Hyperparameters
         XGBoost_GridSearch     usage_cpu   2.8605    3.6093    3.8862    0.7494    4.4276     4.8492     6.0059     0.9195 {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.9, 'colsample_bytree': 0....
         XGBoost_GridSearch usage_storage   2.4067    2.9738    0.2016   -1.5095    3.6985     4.5832     0.2925    -1.3985 {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.8, 'colsample_bytree': 0...
         XGBoost_GridSearch  users_active  24.0366   30.3922    7.4034    5.6447   23.1678    25.1671     6.3983   -11.3608 {'learning_rate': 0.1

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


✅ Moved GradientBoosting_GridSearch for usage_cpu to top_models/
✅ Moved GradientBoosting_GridSearch for usage_storage to top_models/
✅ Moved XGBoost_GridSearch for users_active to top_models/

TOP MODELS SUMMARY
       Target                  Best_Model  Val_MAE  Val_RMSE  Val_MAPE  Val_Bias  Test_MAE  Test_RMSE  Test_MAPE  Test_Bias                                                                                         Hyperparameters
    usage_cpu GradientBoosting_GridSearch   2.7387    3.6283    3.6939    0.8166    5.8135     6.5139     7.9223     1.4115 {'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 6, 'subsample': 1.0, 'min_samples_split': 5...
usage_storage GradientBoosting_GridSearch   0.5440    0.6377    0.0449   -0.2857    0.8500     1.1127     0.0691    -0.0472 {'n_estimators': 300, 'learning_rate': 0.1, 'max_depth': 6, 'subsample': 0.6, 'min_samples_split': 2...
 users_active          XGBoost_GridSearch  24.0366   30.3922    7.4034    5.6447   23.1678    25.1671  

In [20]:
model_cpu = joblib.load('/content/top_models/usage_cpu_best_model.pkl')
model_storage = joblib.load('/content/top_models/usage_storage_best_model.pkl')
model_users = joblib.load('/content/top_models/users_active_best_model.pkl')
encoded_insights = pd.read_csv('/content/enhanced_features.csv')


In [21]:
def forecast_next_30_days(model, target_col, df, variability_factor=0.25, seed=42):


    np.random.seed(seed)
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])
    hist_df = df.copy()
    last_date = hist_df["date"].max()
    forecasts = []

    z_score = 1.96

    for i in range(30):
        next_date = last_date + timedelta(days=i + 1)
        new_row = hist_df.iloc[-1:].copy()
        new_row["date"] = next_date
        new_row["month"] = next_date.month
        new_row["dayofweek"] = next_date.dayofweek
        new_row["dayofmonth"] = next_date.day
        new_row["quarter"] = (next_date.month - 1) // 3 + 1
        new_row["is_weekend"] = 1 if next_date.weekday() >= 5 else 0

        for lag in [1, 7, 14]:
            if len(hist_df) >= lag:
                prev_val = hist_df[target_col].iloc[-lag]
                pct_change = np.random.normal(0, variability_factor)
                new_row[f"{target_col}_lag_{lag}"] = prev_val * (1 + pct_change)
            else:
                new_row[f"{target_col}_lag_{lag}"] = hist_df[target_col].mean()

        for win in [7, 14]:
            if len(hist_df) >= win:
                prev_window = hist_df[target_col].iloc[-win:]
                pct_changes = np.random.normal(0, variability_factor, size=win)
                adjusted_window = prev_window * (1 + pct_changes)
                new_row[f"{target_col}_roll_mean_{win}"] = adjusted_window.mean()
                new_row[f"{target_col}_roll_std_{win}"] = adjusted_window.std(ddof=0)
            else:
                new_row[f"{target_col}_roll_mean_{win}"] = hist_df[target_col].mean()
                new_row[f"{target_col}_roll_std_{win}"] = hist_df[target_col].std(ddof=0)

        feature_cols = [col for col in hist_df.columns if col not in ["date", "usage_cpu", "usage_storage", "users_active", "unique_id"]]
        X_next = new_row[feature_cols]

        pred = model.predict(X_next)[0]

        std_dev = new_row[[f"{target_col}_roll_std_7", f"{target_col}_roll_std_14"]].mean(axis=1).values[0]


        lower = pred - z_score * std_dev
        upper = pred + z_score * std_dev

        if target_col == "usage_cpu":
            upper = min(upper, 100)
        lower = max(lower, 0)

        forecasts.append({
            "date": next_date.strftime("%Y-%m-%d"),
            "predicted": float(pred),
            "lower_95": float(lower),
            "upper_95": float(upper)
        })

        new_row[target_col] = pred
        hist_df = pd.concat([hist_df, new_row], ignore_index=True)

    return forecasts

In [22]:
results = forecast_next_30_days(model_cpu, "usage_cpu", encoded_insights, variability_factor=0.25)
results

[{'date': '2023-04-01',
  'predicted': 73.275505612054,
  'lower_95': 37.47420468615455,
  'upper_95': 100.0},
 {'date': '2023-04-02',
  'predicted': 78.06984721465562,
  'lower_95': 40.35778392183062,
  'upper_95': 100.0},
 {'date': '2023-04-03',
  'predicted': 71.39372336767073,
  'lower_95': 39.73636395732156,
  'upper_95': 100.0},
 {'date': '2023-04-04',
  'predicted': 79.93056298003795,
  'lower_95': 47.6131479813415,
  'upper_95': 100.0},
 {'date': '2023-04-05',
  'predicted': 75.82876590187132,
  'lower_95': 40.38249661730409,
  'upper_95': 100.0},
 {'date': '2023-04-06',
  'predicted': 63.96898872703402,
  'lower_95': 21.69153840562565,
  'upper_95': 100.0},
 {'date': '2023-04-07',
  'predicted': 80.56755324530471,
  'lower_95': 46.24219684360594,
  'upper_95': 100.0},
 {'date': '2023-04-08',
  'predicted': 74.50981959854893,
  'lower_95': 32.86756925527948,
  'upper_95': 100.0},
 {'date': '2023-04-09',
  'predicted': 75.7527043855809,
  'lower_95': 33.512926597847326,
  'upper

In [23]:
results = forecast_next_30_days(model_storage, "usage_storage", encoded_insights, variability_factor=0.25)
results

[{'date': '2023-04-01',
  'predicted': 1273.6056395238088,
  'lower_95': 329.840698985024,
  'upper_95': 2217.3705800625935},
 {'date': '2023-04-02',
  'predicted': 1273.594704158244,
  'lower_95': 641.0903182588185,
  'upper_95': 1906.0990900576694},
 {'date': '2023-04-03',
  'predicted': 1272.132539671274,
  'lower_95': 441.45446063951545,
  'upper_95': 2102.8106187030326},
 {'date': '2023-04-04',
  'predicted': 1274.2975471575312,
  'lower_95': 441.04059131627116,
  'upper_95': 2107.554502998791},
 {'date': '2023-04-05',
  'predicted': 1274.0630567506264,
  'lower_95': 378.77365261334023,
  'upper_95': 2169.3524608879125},
 {'date': '2023-04-06',
  'predicted': 1274.4072825422975,
  'lower_95': 472.0406329585013,
  'upper_95': 2076.7739321260938},
 {'date': '2023-04-07',
  'predicted': 1274.0247148475491,
  'lower_95': 702.4408416804628,
  'upper_95': 1845.6085880146356},
 {'date': '2023-04-08',
  'predicted': 1272.6245744331325,
  'lower_95': 764.7044878929365,
  'upper_95': 1780.5

In [24]:
results = forecast_next_30_days(model_users, "users_active", encoded_insights, variability_factor=0.25)
results

[{'date': '2023-04-01',
  'predicted': 351.6036376953125,
  'lower_95': 113.34440270982671,
  'upper_95': 589.8628726807983},
 {'date': '2023-04-02',
  'predicted': 350.11480712890625,
  'lower_95': 102.64370697571917,
  'upper_95': 597.5859072820933},
 {'date': '2023-04-03',
  'predicted': 351.6036376953125,
  'lower_95': 174.10950907865924,
  'upper_95': 529.0977663119658},
 {'date': '2023-04-04',
  'predicted': 350.11480712890625,
  'lower_95': 124.37542951055889,
  'upper_95': 575.8541847472536},
 {'date': '2023-04-05',
  'predicted': 351.6036376953125,
  'lower_95': 200.8697316905907,
  'upper_95': 502.3375437000343},
 {'date': '2023-04-06',
  'predicted': 351.6036376953125,
  'lower_95': 120.19746454279587,
  'upper_95': 583.0098108478292},
 {'date': '2023-04-07',
  'predicted': 351.6036376953125,
  'lower_95': 157.14934876936286,
  'upper_95': 546.0579266212621},
 {'date': '2023-04-08',
  'predicted': 351.6036376953125,
  'lower_95': 147.71390433461116,
  'upper_95': 555.4933710