In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import pickle

# Time Series Libraries
try:
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.statespace.sarimax import SARIMAX
except ImportError:
    print("Warning: statsmodels not available - ARIMA/SARIMA models will be skipped")

# Deep Learning Libraries
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
    from tensorflow.keras.callbacks import EarlyStopping
    tf.random.set_seed(42)
except ImportError:
    print("Warning: TensorFlow not available - LSTM/GRU models will be skipped")

from datetime import datetime, timedelta
import os

class ForecastingPipeline:
    def __init__(self, data_path):
        self.data_path = data_path
        self.models = {}
        self.results = {}
        self.target_columns = ['usage_cpu', 'usage_storage', 'users_active']
        self.feature_columns = []

        # Create directories for saving models
        os.makedirs('models', exist_ok=True)
        os.makedirs('results', exist_ok=True)
        os.makedirs('top_models', exist_ok=True)

        self.hyperparameter_results = {}  # Store hyperparameter search results

    def load_and_prepare_data(self):
        """Load and prepare data for training"""
        print("Loading and preparing data...")
        self.df = pd.read_csv(self.data_path)
        self.df['date'] = pd.to_datetime(self.df['date'])
        self.df = self.df.sort_values(['unique_id', 'date'])

        # Define feature columns (exclude targets and non-feature columns)
        exclude_cols = ['date', 'unique_id'] + self.target_columns
        # Only select numeric columns for features
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        self.feature_columns = [col for col in numeric_cols if col not in exclude_cols]

        print(f"Data loaded: {self.df.shape}")
        print(f"Feature columns: {len(self.feature_columns)}")
        print(f"Target columns: {self.target_columns}")

        return self.df

    def create_train_val_test_split(self, df, train_ratio=0.7, val_ratio=0.2):
        """Split every ``unique_id`` series chronologically into train/val/test.

        Each group is ordered by ``date``; the first ``train_ratio`` share of its
        rows goes to train, the next ``val_ratio`` share to validation and the
        remainder to test. Groups are processed in order of first appearance.

        Args:
            df: Frame with at least ``unique_id`` and ``date`` columns.
            train_ratio: Fraction of each group assigned to training.
            val_ratio: Fraction of each group assigned to validation.

        Returns:
            ``(train_df, val_df, test_df)`` with fresh 0..n-1 indexes. For an
            input without any groups, three empty frames with ``df``'s columns.

        Raises:
            ValueError: If a ratio lies outside [0, 1] or the two ratios sum to
                more than 1.
        """
        if not (0 <= train_ratio <= 1 and 0 <= val_ratio <= 1):
            raise ValueError("train_ratio and val_ratio must each lie in [0, 1]")

        # Guard against binary floating-point error: 0.7 + 0.2 evaluates to
        # 0.8999999999999999, so a plain int(n * ratio) can land one row short
        # of the intended cut (e.g. 89 instead of 90 for n = 100).
        eps = 1e-9
        if train_ratio + val_ratio > 1 + eps:
            raise ValueError("train_ratio + val_ratio must not exceed 1")

        train_parts, val_parts, test_parts = [], [], []

        # sort=False keeps groups in order of first appearance; a single
        # groupby pass avoids re-scanning the whole frame once per id.
        for _, group in df.groupby('unique_id', sort=False):
            ordered = group.sort_values('date')
            n = len(ordered)

            train_end = int(n * train_ratio + eps)
            val_end = int(n * (train_ratio + val_ratio) + eps)

            train_parts.append(ordered.iloc[:train_end])
            val_parts.append(ordered.iloc[train_end:val_end])
            test_parts.append(ordered.iloc[val_end:])

        def _stack(parts):
            # pd.concat raises on an empty list, so fall back to an empty frame
            # that still carries the input's columns.
            if parts:
                return pd.concat(parts, ignore_index=True)
            return df.iloc[0:0].reset_index(drop=True)

        train_df = _stack(train_parts)
        val_df = _stack(val_parts)
        test_df = _stack(test_parts)

        print(f"Train: {len(train_df)} samples")
        print(f"Validation: {len(val_df)} samples")
        print(f"Test: {len(test_df)} samples")

        return train_df, val_df, test_df

    def calculate_metrics(self, y_true, y_pred, model_name, target):
        """Compute MAE, RMSE, MAPE and forecast bias for one set of predictions.

        Inputs may be lists, pandas Series or NumPy arrays of any shape; both are
        flattened to 1-D float arrays first, so an ``(n, 1)`` prediction column
        lines up with an ``(n,)`` target vector. Pairs in which either value is
        NaN or infinite are dropped before scoring.

        Args:
            y_true: Observed values.
            y_pred: Predicted values, same number of elements as ``y_true``.
            model_name: Unused; kept so existing call sites stay valid.
            target: Unused; kept so existing call sites stay valid.

        Returns:
            Dict with float entries ``'MAE'``, ``'RMSE'``, ``'MAPE'`` (percent)
            and ``'Bias'`` (mean of prediction minus truth). All four are NaN
            when no finite pair remains.

        Raises:
            ValueError: If the two inputs hold a different number of elements.
        """
        actual = np.asarray(y_true, dtype=float).ravel()
        forecast = np.asarray(y_pred, dtype=float).ravel()

        if actual.shape != forecast.shape:
            raise ValueError(
                f"y_true and y_pred must have the same number of elements, "
                f"got {actual.size} and {forecast.size}"
            )

        # Keep only pairs where both sides are finite numbers.
        keep = np.isfinite(actual) & np.isfinite(forecast)
        actual = actual[keep]
        forecast = forecast[keep]

        if actual.size == 0:
            return {'MAE': np.nan, 'RMSE': np.nan, 'MAPE': np.nan, 'Bias': np.nan}

        errors = forecast - actual

        mae = float(np.mean(np.abs(errors)))
        rmse = float(np.sqrt(np.mean(errors ** 2)))

        # MAPE: where the true value is 0 the denominator is replaced by 1, so
        # those rows contribute their absolute error instead of dividing by zero.
        denominator = np.where(actual != 0, actual, 1)
        mape = float(np.mean(np.abs(errors / denominator)) * 100)

        # Positive bias means the model over-forecasts on average.
        bias = float(np.mean(errors))

        return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 'Bias': bias}

    def train_xgboost_grid_search(self, train_df, val_df, test_df):
        """Tune one XGBoost regressor per target with a seeded random search.

        The full grid is not evaluated: when it holds more than 50 combinations,
        50 of them are drawn with a fixed seed so repeated runs try the same
        candidates. For each target the combination with the lowest validation
        MAE wins; that model is kept in ``self.models``, dumped to
        ``models/XGBoost_GridSearch_<target>.pkl`` and its metrics are stored in
        ``self.results`` and ``self.hyperparameter_results``. A summary of
        ``self.hyperparameter_results`` is written to
        ``results/best_hyperparameter_results.json`` at the end.

        Args:
            train_df: Training rows; must contain the feature and target columns.
            val_df: Validation rows, used for early stopping and model selection.
            test_df: Test rows, scored for reporting only.
        """
        import json
        import random
        from itertools import product

        print("\nTraining XGBoost with Grid Search (this may take a while)...")

        # Missing feature values are replaced with 0 in every split.
        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        # Define hyperparameter grid
        param_grid = {
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 4, 5, 6],
            'n_estimators': [200, 500, 800, 1000],
            'subsample': [0.6, 0.7, 0.8, 0.9],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
            'reg_alpha': [0, 0.1, 0.5, 1.0],
            'reg_lambda': [1.0, 2.0, 3.0, 5.0],
            'gamma': [0, 0.1, 0.2, 0.5],
            'min_child_weight': [1, 3, 5, 7]
        }

        keys = list(param_grid)
        combinations = list(product(*(param_grid[key] for key in keys)))

        # Draw a manageable subset. A dedicated seeded generator makes the
        # search reproducible without touching the global random state.
        max_combinations = 50
        if len(combinations) > max_combinations:
            combinations = random.Random(42).sample(combinations, max_combinations)

        for target in self.target_columns:
            print(f"\n  Grid Search for {target}...")
            print(f"  Testing {len(combinations)} parameter combinations...")

            # Missing target values are filled with the mean of their own split.
            y_train = train_df[target].fillna(train_df[target].mean())
            y_val = val_df[target].fillna(val_df[target].mean())
            y_test = test_df[target].fillna(test_df[target].mean())

            best_mae = float('inf')
            best_params = None
            best_model = None
            best_val_metrics = None
            best_test_metrics = None
            tested = 0
            failures = 0
            first_error = None

            for i, combo in enumerate(combinations):
                if i % 10 == 0:
                    print(f"    Progress: {i}/{len(combinations)}")

                params = dict(zip(keys, combo))
                params.update({
                    'objective': 'reg:squarederror',
                    'random_state': 42,
                    'early_stopping_rounds': 20
                })

                try:
                    model = xgb.XGBRegressor(**params)
                    model.fit(X_train, y_train,
                              eval_set=[(X_val, y_val)],
                              verbose=False)

                    val_pred = model.predict(X_val)
                    test_pred = model.predict(X_test)

                    val_metrics = self.calculate_metrics(y_val.values, val_pred, 'XGBoost_GridSearch', target)
                    test_metrics = self.calculate_metrics(y_test.values, test_pred, 'XGBoost_GridSearch', target)
                except Exception as exc:
                    # Best effort: one bad combination must not stop the search,
                    # but the failure is counted and reported below.
                    failures += 1
                    if first_error is None:
                        first_error = exc
                    continue

                tested += 1

                # Model selection uses validation MAE only; a NaN MAE never wins.
                if val_metrics['MAE'] < best_mae:
                    best_mae = val_metrics['MAE']
                    best_params = params.copy()
                    best_model = model
                    best_val_metrics = val_metrics
                    best_test_metrics = test_metrics

            if failures:
                print(f"    Warning: {failures} of {len(combinations)} combinations failed; "
                      f"first error: {first_error!r}")

            if best_model is None:
                print(f"    No usable XGBoost model found for {target}")
                continue

            model_key = f'XGBoost_GridSearch_{target}'
            self.models[model_key] = best_model

            # Save model
            joblib.dump(best_model, f'models/{model_key}.pkl')

            self.results[model_key] = {
                'validation': best_val_metrics,
                'test': best_test_metrics,
                'best_params': best_params
            }

            # Only the winning configuration is kept per target.
            self.hyperparameter_results[target] = {
                'best_params': best_params,
                'best_metrics': {
                    'validation': best_val_metrics,
                    'test': best_test_metrics
                },
                'total_combinations_tested': tested
            }

            print(f"    Best MAE for {target}: {best_mae:.4f}")
            print(f"    Best params: {best_params}")

        # Persist a compact, JSON-serialisable summary of the best results.
        json_results = {
            target: {
                'best_params': data['best_params'],
                'best_val_mae': data['best_metrics']['validation']['MAE'],
                'best_test_mae': data['best_metrics']['test']['MAE'],
                'total_combinations_tested': data['total_combinations_tested']
            }
            for target, data in self.hyperparameter_results.items()
        }
        with open('results/best_hyperparameter_results.json', 'w') as f:
            json.dump(json_results, f, indent=2)

    def train_xgboost_models(self, train_df, val_df, test_df):
        """Fit one XGBoost regressor per target using a fixed parameter set.

        Serves as the untuned baseline. Each model is fitted on the training
        split with the validation split as early-stopping evaluation set, stored
        in ``self.models`` under ``XGBoost_Default_<target>``, dumped to
        ``models/<key>.pkl`` and scored on validation and test data.
        """
        print("\nTraining XGBoost models with default parameters...")

        # Feature matrices; missing values become 0.
        columns = self.feature_columns
        features_train = train_df[columns].fillna(0)
        features_val = val_df[columns].fillna(0)
        features_test = test_df[columns].fillna(0)

        # Fixed configuration shared by every target.
        fixed_params = dict(
            objective='reg:squarederror',
            random_state=42,
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            early_stopping_rounds=20,
        )

        for target in self.target_columns:
            print(f"  Training XGBoost for {target}...")

            # Missing targets are filled with the mean of their own split.
            labels_train = train_df[target].fillna(train_df[target].mean())
            labels_val = val_df[target].fillna(val_df[target].mean())
            labels_test = test_df[target].fillna(test_df[target].mean())

            regressor = xgb.XGBRegressor(**fixed_params)
            regressor.fit(
                features_train,
                labels_train,
                eval_set=[(features_val, labels_val)],
                verbose=False,
            )

            pred_val = regressor.predict(features_val)
            pred_test = regressor.predict(features_test)

            # Register and persist the fitted model.
            key = f'XGBoost_Default_{target}'
            self.models[key] = regressor
            joblib.dump(regressor, f'models/{key}.pkl')

            # Score on both hold-out splits.
            self.results[key] = {
                'validation': self.calculate_metrics(labels_val.values, pred_val, 'XGBoost_Default', target),
                'test': self.calculate_metrics(labels_test.values, pred_test, 'XGBoost_Default', target),
            }

    def train_traditional_ml_grid_search(self, train_df, val_df, test_df):
        """Tune RandomForest, GradientBoosting, Ridge and Lasso per target.

        For every model family at most 30 parameter combinations are evaluated;
        larger grids are subsampled with a fixed seed so runs are repeatable.
        The combination with the lowest validation MAE is kept in
        ``self.models`` under ``<family>_GridSearch_<target>``, dumped to
        ``models/<key>.pkl`` and recorded in ``self.results`` together with its
        validation metrics, test metrics and parameters.

        Args:
            train_df: Training rows with feature and target columns.
            val_df: Validation rows used for model selection.
            test_df: Test rows, scored once for the selected model.
        """
        import random
        from itertools import product

        print("\nTraining Traditional ML models with Grid Search...")

        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        # Define hyperparameter grids for each model
        ml_param_grids = {
            'RandomForest': {
                'n_estimators': [100, 200, 300, 500],
                'max_depth': [3, 5, 10, 15, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2', 0.5, 0.8]
            },
            'GradientBoosting': {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.05, 0.1, 0.2],
                'max_depth': [3, 4, 5, 6],
                'subsample': [0.6, 0.8, 1.0],
                'min_samples_split': [2, 5, 10]
            },
            'Ridge': {
                'alpha': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 50.0, 100.0]
            },
            'Lasso': {
                'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
            }
        }

        estimator_classes = {
            'RandomForest': RandomForestRegressor,
            'GradientBoosting': GradientBoostingRegressor,
            'Ridge': Ridge,
            'Lasso': Lasso,
        }
        # Only these two families receive an explicit random_state.
        seeded_estimators = {'RandomForest', 'GradientBoosting'}

        max_combinations = 30
        # Dedicated seeded generator: reproducible subsampling that leaves the
        # global random state alone.
        rng = random.Random(42)

        for model_name, param_grid in ml_param_grids.items():
            print(f"\n  Grid Search for {model_name}...")

            estimator_cls = estimator_classes[model_name]
            keys = list(param_grid)
            combinations = list(product(*(param_grid[key] for key in keys)))

            # Limit combinations for faster execution
            if len(combinations) > max_combinations:
                combinations = rng.sample(combinations, max_combinations)

            for target in self.target_columns:
                print(f"    {model_name} for {target}...")

                y_train = train_df[target].fillna(train_df[target].mean())
                y_val = val_df[target].fillna(val_df[target].mean())
                y_test = test_df[target].fillna(test_df[target].mean())

                best_mae = float('inf')
                best_params = None
                best_model = None
                best_val_metrics = None
                failures = 0
                first_error = None

                for combo in combinations:
                    params = dict(zip(keys, combo))
                    if model_name in seeded_estimators:
                        params['random_state'] = 42

                    try:
                        model = estimator_cls(**params)
                        model.fit(X_train, y_train)

                        val_pred = model.predict(X_val)
                        val_metrics = self.calculate_metrics(y_val.values, val_pred, model_name, target)
                    except Exception as exc:
                        # Best effort: skip the combination but remember that it
                        # failed so the problem is visible in the output.
                        failures += 1
                        if first_error is None:
                            first_error = exc
                        continue

                    if val_metrics['MAE'] < best_mae:
                        best_mae = val_metrics['MAE']
                        best_params = params.copy()
                        best_model = model
                        best_val_metrics = val_metrics

                if failures:
                    print(f"      Warning: {failures} of {len(combinations)} combinations failed; "
                          f"first error: {first_error!r}")

                if best_model is None:
                    print(f"      No usable {model_name} model found for {target}")
                    continue

                # The test split is touched only once, for the selected model.
                test_pred = best_model.predict(X_test)
                test_metrics = self.calculate_metrics(y_test.values, test_pred, model_name, target)

                model_key = f'{model_name}_GridSearch_{target}'
                self.models[model_key] = best_model

                # Save model
                joblib.dump(best_model, f'models/{model_key}.pkl')

                # The validation metrics recorded when the model was selected
                # are reused instead of predicting a second time.
                self.results[model_key] = {
                    'validation': best_val_metrics,
                    'test': test_metrics,
                    'best_params': best_params
                }

                print(f"      Best MAE: {best_mae:.4f}")

    def train_deep_learning_grid_search(self, train_df, val_df, test_df):
        """Tune two-layer LSTM and GRU networks per target.

        Sequences come from ``prepare_lstm_data``. For each target and network
        type at most 15 parameter combinations are trained (seeded subsample of
        the grid); the one with the lowest validation MAE is stored in
        ``self.models`` under ``<type>_GridSearch_<target>``, saved to
        ``models/<key>.h5`` and recorded in ``self.results``.

        A target is skipped when any of its three splits yields no sequences.
        Any error that escapes the per-combination handling is printed and ends
        the method without raising.

        Args:
            train_df: Training rows.
            val_df: Validation rows, used for early stopping and selection.
            test_df: Test rows, scored once for the selected model.
        """
        try:
            import random
            from itertools import product

            print("\nTraining Deep Learning models with Grid Search...")

            # Define hyperparameter grids
            dl_param_grids = {
                'LSTM': {
                    'units_1': [32, 50, 64, 100],
                    'units_2': [25, 32, 50, 64],
                    'dropout': [0.1, 0.2, 0.3],
                    'batch_size': [8, 16, 32],
                    'learning_rate': [0.001, 0.01, 0.1]
                },
                'GRU': {
                    'units_1': [32, 50, 64, 100],
                    'units_2': [25, 32, 50, 64],
                    'dropout': [0.1, 0.2, 0.3],
                    'batch_size': [8, 16, 32],
                    'learning_rate': [0.001, 0.01, 0.1]
                }
            }

            # Both network types share one architecture; only the recurrent
            # layer class differs.
            recurrent_layers = {'LSTM': LSTM, 'GRU': GRU}

            max_combinations = 15
            # Dedicated seeded generator so the sampled candidates are repeatable.
            rng = random.Random(42)

            for target in self.target_columns:
                print(f"  Preparing sequences for {target}...")

                X_train, y_train = self.prepare_lstm_data(train_df, target)
                X_val, y_val = self.prepare_lstm_data(val_df, target)
                X_test, y_test = self.prepare_lstm_data(test_df, target)

                # An empty test set would make the final predict call fail and,
                # via the outer handler, abort all remaining targets.
                if len(X_train) == 0 or len(X_val) == 0 or len(X_test) == 0:
                    print(f"    Insufficient data for {target}")
                    continue

                input_shape = (X_train.shape[1], X_train.shape[2])

                for model_type in ['LSTM', 'GRU']:
                    print(f"    Grid Search for {model_type} - {target}...")

                    layer_cls = recurrent_layers[model_type]
                    param_grid = dl_param_grids[model_type]
                    keys = list(param_grid)
                    combinations = list(product(*(param_grid[key] for key in keys)))

                    # Limit to 15 combinations for faster execution
                    if len(combinations) > max_combinations:
                        combinations = rng.sample(combinations, max_combinations)

                    best_mae = float('inf')
                    best_params = None
                    best_model = None
                    best_val_metrics = None
                    failures = 0
                    first_error = None

                    for combo in combinations:
                        params = dict(zip(keys, combo))

                        try:
                            model = Sequential()
                            model.add(layer_cls(params['units_1'], return_sequences=True,
                                                input_shape=input_shape))
                            model.add(Dropout(params['dropout']))
                            model.add(layer_cls(params['units_2']))
                            model.add(Dropout(params['dropout']))
                            model.add(Dense(25))
                            model.add(Dense(1))

                            # Compile with custom learning rate
                            optimizer = tf.keras.optimizers.Adam(learning_rate=params['learning_rate'])
                            model.compile(optimizer=optimizer, loss='mse')

                            # Train with early stopping
                            early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
                            model.fit(X_train, y_train,
                                      validation_data=(X_val, y_val),
                                      epochs=30, batch_size=params['batch_size'],
                                      callbacks=[early_stop], verbose=0)

                            val_pred = model.predict(X_val, verbose=0).flatten()
                            val_metrics = self.calculate_metrics(y_val, val_pred, model_type, target)
                        except Exception as exc:
                            # Best effort: skip this combination, report below.
                            failures += 1
                            if first_error is None:
                                first_error = exc
                            continue

                        if val_metrics['MAE'] < best_mae:
                            best_mae = val_metrics['MAE']
                            best_params = params.copy()
                            best_model = model
                            best_val_metrics = val_metrics

                    if failures:
                        print(f"      Warning: {failures} of {len(combinations)} combinations failed; "
                              f"first error: {first_error!r}")

                    if best_model is None:
                        print(f"      No usable {model_type} model found for {target}")
                        continue

                    test_pred = best_model.predict(X_test, verbose=0).flatten()
                    test_metrics = self.calculate_metrics(y_test, test_pred, model_type, target)

                    model_key = f'{model_type}_GridSearch_{target}'
                    self.models[model_key] = best_model

                    # Save model
                    best_model.save(f'models/{model_key}.h5')

                    # Validation metrics from the selection step are reused.
                    self.results[model_key] = {
                        'validation': best_val_metrics,
                        'test': test_metrics,
                        'best_params': best_params
                    }

                    print(f"      Best MAE: {best_mae:.4f}")

        except Exception as e:
            print(f"Error in deep learning grid search: {e}")

    def train_traditional_ml_models(self, train_df, val_df, test_df):
        """Fit baseline scikit-learn regressors with fixed parameters per target.

        A fresh estimator is created for every (model, target) pair. Refitting a
        single shared instance would leave all ``self.models`` entries of one
        family pointing at the same object, fitted on whichever target came
        last. Each model is stored under ``<model name>_<target>``, dumped to
        ``models/<key>.pkl`` and scored on the validation and test splits.

        Args:
            train_df: Training rows with feature and target columns.
            val_df: Validation rows.
            test_df: Test rows.
        """
        print("\nTraining traditional ML models with default parameters...")

        X_train = train_df[self.feature_columns].fillna(0)
        X_val = val_df[self.feature_columns].fillna(0)
        X_test = test_df[self.feature_columns].fillna(0)

        # Factories rather than instances, so every target gets its own
        # independent estimator.
        model_factories = {
            'RandomForest_Default': lambda: RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
            'GradientBoosting_Default': lambda: GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=6),
            'Ridge_Default': lambda: Ridge(alpha=1.0),
            'Lasso_Default': lambda: Lasso(alpha=1.0),
            'LinearRegression_Default': lambda: LinearRegression()
        }

        for model_name, make_model in model_factories.items():
            for target in self.target_columns:
                print(f"  Training {model_name} for {target}...")

                # Missing targets are filled with the mean of their own split.
                y_train = train_df[target].fillna(train_df[target].mean())
                y_val = val_df[target].fillna(val_df[target].mean())
                y_test = test_df[target].fillna(test_df[target].mean())

                model = make_model()
                model.fit(X_train, y_train)

                val_pred = model.predict(X_val)
                test_pred = model.predict(X_test)

                model_key = f'{model_name}_{target}'
                self.models[model_key] = model

                # Save model
                joblib.dump(model, f'models/{model_key}.pkl')

                val_metrics = self.calculate_metrics(y_val.values, val_pred, model_name, target)
                test_metrics = self.calculate_metrics(y_test.values, test_pred, model_name, target)

                self.results[model_key] = {
                    'validation': val_metrics,
                    'test': test_metrics
                }

    def prepare_lstm_data(self, df, target_col, sequence_length=7):
        """Prepare data for LSTM/GRU models"""
        sequences = []
        targets = []

        for unique_id in df['unique_id'].unique():
            group_data = df[df['unique_id'] == unique_id].sort_values('date')

            # Use a subset of features for LSTM to avoid overfitting
            lstm_features = ['dayofweek', 'month', 'is_weekend'] + \
                           [col for col in self.feature_columns if ('lag_' in col or 'roll_mean_' in col) and col in self.feature_columns]
            lstm_features = [col for col in lstm_features if col in group_data.columns][:10]  # Limit features

            X = group_data[lstm_features].fillna(0).values
            y = group_data[target_col].fillna(group_data[target_col].mean()).values

            for i in range(sequence_length, len(X)):
                sequences.append(X[i-sequence_length:i])
                targets.append(y[i])

        return np.array(sequences), np.array(targets)

    def train_deep_learning_models(self, train_df, val_df, test_df):
        """Fit one LSTM and one GRU network per target with fixed settings.

        Both networks use the same layout: two recurrent layers of 50 units,
        each followed by dropout 0.2, then Dense(25) and Dense(1). The LSTM is
        trained first, then the GRU. Each model is stored in ``self.models``
        under ``<type>_<target>``, saved to ``models/<key>.h5`` and scored on
        the validation and test sequences. A target is skipped when any split
        yields no sequences. Any exception is printed and ends the method.
        """
        try:
            print("\nTraining Deep Learning models...")

            for target in self.target_columns:
                print(f"  Preparing data for {target}...")

                # Sliding-window sequences for the three splits.
                X_train, y_train = self.prepare_lstm_data(train_df, target)
                X_val, y_val = self.prepare_lstm_data(val_df, target)
                X_test, y_test = self.prepare_lstm_data(test_df, target)

                if min(len(X_train), len(X_val), len(X_test)) == 0:
                    print(f"    Insufficient data for {target}, skipping deep learning models")
                    continue

                # One early-stopping callback per target, shared by both
                # networks; it is created once the first network is compiled.
                early_stop = None

                for family in ('LSTM', 'GRU'):
                    print(f"    Training {family} for {target}...")

                    # Resolve the layer class only now, so nothing is looked up
                    # before its network is actually about to be built.
                    layer_cls = LSTM if family == 'LSTM' else GRU

                    network = Sequential([
                        layer_cls(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
                        Dropout(0.2),
                        layer_cls(50),
                        Dropout(0.2),
                        Dense(25),
                        Dense(1)
                    ])
                    network.compile(optimizer='adam', loss='mse')

                    if early_stop is None:
                        early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

                    network.fit(X_train, y_train,
                                validation_data=(X_val, y_val),
                                epochs=50, batch_size=16,
                                callbacks=[early_stop], verbose=0)

                    # Predictions on both hold-out splits.
                    pred_val = network.predict(X_val, verbose=0).flatten()
                    pred_test = network.predict(X_test, verbose=0).flatten()

                    # Register and persist the trained network.
                    key = f'{family}_{target}'
                    self.models[key] = network
                    network.save(f'models/{key}.h5')

                    metrics_val = self.calculate_metrics(y_val, pred_val, family, target)
                    metrics_test = self.calculate_metrics(y_test, pred_test, family, target)

                    self.results[key] = {
                        'validation': metrics_val,
                        'test': metrics_test
                    }

        except Exception as e:
            print(f"Error in deep learning training: {e}")

    def train_time_series_models(self, train_df, val_df, test_df):
        """Fit one ARIMA model per target on the pooled training series.

        For every ``unique_id`` with at least 5 training rows, the target values
        of its train/val/test parts are forward-filled and appended to three
        pooled sequences. Several ARIMA orders are fitted on the pooled training
        sequence and the one with the lowest AIC is kept, pickled to
        ``models/ARIMA_<target>.pkl`` and scored.

        The model is forecast once over ``len(val) + len(test)`` steps. The
        first part is compared with the pooled validation values and the rest
        with the pooled test values, so test forecasts follow the validation
        horizon instead of overlapping it.

        NOTE(review): concatenating different ids into one sequence means the
        pooled series is not a single continuous timeline; confirm this
        aggregate approach is intended.

        Any exception is printed and ends the method without raising.
        """
        try:
            print("\nTraining Time Series models...")

            for target in self.target_columns:
                print(f"  Training ARIMA for {target}...")

                # Pooled values across all ids for this target.
                all_train_data = []
                all_val_data = []
                all_test_data = []

                for unique_id in train_df['unique_id'].unique():
                    # .ffill() replaces the deprecated fillna(method='ffill').
                    train_ts = train_df[train_df['unique_id'] == unique_id][target].ffill()
                    val_ts = val_df[val_df['unique_id'] == unique_id][target].ffill()
                    test_ts = test_df[test_df['unique_id'] == unique_id][target].ffill()

                    if len(train_ts) >= 5:  # Minimum data requirement
                        all_train_data.extend(train_ts.values)
                        all_val_data.extend(val_ts.values)
                        all_test_data.extend(test_ts.values)

                if len(all_train_data) < 20:  # Need sufficient data for ARIMA
                    print(f"    Insufficient data for ARIMA {target}, skipping...")
                    continue

                train_series = pd.Series(all_train_data)
                val_values = np.asarray(all_val_data, dtype=float)
                test_values = np.asarray(all_test_data, dtype=float)

                # Try different ARIMA orders and select the lowest AIC.
                arima_orders = [(1,1,1), (2,1,1), (1,1,2), (2,1,2), (0,1,1), (1,0,1)]
                best_aic = float('inf')
                best_model = None
                best_order = None

                for order in arima_orders:
                    try:
                        arima_fitted = ARIMA(train_series, order=order).fit()
                    except Exception as exc:
                        # Best effort: an order that cannot be fitted is skipped.
                        # Exception (not a bare except) keeps Ctrl-C working.
                        print(f"    ARIMA order {order} failed for {target}: {exc}")
                        continue

                    if arima_fitted.aic < best_aic:
                        best_aic = arima_fitted.aic
                        best_model = arima_fitted
                        best_order = order

                if best_model is None:
                    print(f"    No ARIMA order could be fitted for {target}, skipping...")
                    continue

                n_val = len(val_values)
                n_test = len(test_values)
                if n_val + n_test == 0:
                    print(f"    No validation/test data for ARIMA {target}, skipping...")
                    continue

                try:
                    # One forecast across both hold-out horizons: forecasts
                    # start right after the training data, so the test part is
                    # the segment that follows the validation part.
                    full_forecast = np.asarray(best_model.forecast(steps=n_val + n_test), dtype=float)
                    val_forecast = full_forecast[:n_val]
                    test_forecast = full_forecast[n_val:]

                    model_key = f'ARIMA_{target}'
                    self.models[model_key] = best_model

                    # Save model
                    with open(f'models/{model_key}.pkl', 'wb') as f:
                        pickle.dump(best_model, f)

                    val_metrics = self.calculate_metrics(val_values, val_forecast, 'ARIMA', target)
                    test_metrics = self.calculate_metrics(test_values, test_forecast, 'ARIMA', target)

                    self.results[model_key] = {
                        'validation': val_metrics,
                        'test': test_metrics,
                        'best_params': {'order': best_order, 'aic': best_aic}
                    }

                    print(f"    ARIMA {target} - Best order: {best_order}, AIC: {best_aic:.2f}")

                except Exception as e:
                    print(f"    ARIMA forecasting failed for {target}: {e}")

        except Exception as e:
            print(f"Error in time series training: {e}")

    def create_comparison_table(self):
        """Create comparison table of all models"""
        print("\nCreating comparison table...")

        comparison_data = []

        for model_key, metrics in self.results.items():
            # Better parsing of model name and target
            if model_key.count('_') >= 2:
                # Handle cases like 'XGBoost_GridSearch_usage_cpu'
                parts = model_key.split('_')
                if len(parts) >= 3:
                    # Find target (should be one of our target columns)
                    target = None
                    for i in range(len(parts)-1, 0, -1):
                        potential_target = '_'.join(parts[i:])
                        if potential_target in self.target_columns:
                            target = potential_target
                            model_name = '_'.join(parts[:i])
                            break

                    if target is None:
                        # Fallback to original method
                        model_name, target = model_key.rsplit('_', 1)
                else:
                    model_name, target = model_key.rsplit('_', 1)
            else:
                model_name, target = model_key.rsplit('_', 1)

            if 'validation' in metrics and 'test' in metrics:
                row = {
                    'Model': model_name,
                    'Target': target,
                    'Val_MAE': metrics['validation']['MAE'],
                    'Val_RMSE': metrics['validation']['RMSE'],
                    'Val_MAPE': metrics['validation']['MAPE'],
                    'Val_Bias': metrics['validation']['Bias'],
                    'Test_MAE': metrics['test']['MAE'],
                    'Test_RMSE': metrics['test']['RMSE'],
                    'Test_MAPE': metrics['test']['MAPE'],
                    'Test_Bias': metrics['test']['Bias']
                }

                # Add hyperparameters if available
                if 'best_params' in metrics:
                    # Convert params to string for display
                    params_str = str(metrics['best_params'])[:100] + "..." if len(str(metrics['best_params'])) > 100 else str(metrics['best_params'])
                    row['Hyperparameters'] = params_str
                else:
                    row['Hyperparameters'] = 'Default'

                comparison_data.append(row)

        comparison_df = pd.DataFrame(comparison_data)
        comparison_df = comparison_df.round(4)

        # Save comparison table
        comparison_df.to_csv('results/model_comparison.csv', index=False)

        print("\n" + "="*80)
        print("MODEL COMPARISON TABLE")
        print("="*80)
        print(comparison_df.to_string(index=False))

        return comparison_df

    def get_top_models_summary(self, comparison_df):
        """Get top 1 model for each target with detailed info"""
        print("\n" + "="*60)
        print("TOP 1 MODEL FOR EACH TARGET")
        print("="*60)

        top_models_data = []

        for target in self.target_columns:
            target_results = comparison_df[comparison_df['Target'] == target].copy()

            if len(target_results) > 0:
                # Find best model based on validation MAE
                best_idx = target_results['Val_MAE'].idxmin()
                best_row = target_results.loc[best_idx]

                model_name = best_row['Model']

                # Reconstruct model key properly
                model_key = f"{model_name}_{target}"

                # Get parameters if available
                params = "Default"
                if 'Hyperparameters' in best_row and pd.notna(best_row['Hyperparameters']):
                    params = best_row['Hyperparameters']
                elif model_key in self.results and 'best_params' in self.results[model_key]:
                    params = str(self.results[model_key]['best_params'])
                elif target in self.hyperparameter_results:
                    params = str(self.hyperparameter_results[target]['best_params'])

                top_model_info = {
                    'Target': target,
                    'Best_Model': model_name,
                    'Val_MAE': best_row['Val_MAE'],
                    'Val_RMSE': best_row['Val_RMSE'],
                    'Val_MAPE': best_row['Val_MAPE'],
                    'Val_Bias': best_row['Val_Bias'],
                    'Test_MAE': best_row['Test_MAE'],
                    'Test_RMSE': best_row['Test_RMSE'],
                    'Test_MAPE': best_row['Test_MAPE'],
                    'Test_Bias': best_row['Test_Bias'],
                    'Hyperparameters': params
                }

                top_models_data.append(top_model_info)

                # Copy best model to top_models directory
                try:
                    import shutil
                    # Try different file extensions
                    for ext in ['.pkl', '.h5']:
                        source_file = f'models/{model_key}{ext}'
                        if os.path.exists(source_file):
                            dest_file = f'top_models/{target}_best_model{ext}'
                            shutil.copy2(source_file, dest_file)
                            print(f"✅ Moved {model_name} for {target} to top_models/")
                            break
                    else:
                        print(f"⚠️  Model file not found for {model_key}")

                except Exception as e:
                    print(f"⚠️  Could not copy model for {target}: {e}")

        # Create DataFrame and save
        if top_models_data:
            top_models_df = pd.DataFrame(top_models_data)
            top_models_df = top_models_df.round(4)

            # Save to CSV
            top_models_df.to_csv('results/top_models_summary.csv', index=False)

            print("\n" + "="*60)
            print("TOP MODELS SUMMARY")
            print("="*60)
            print(top_models_df.to_string(index=False))
        else:
            print("No top models found!")
            top_models_df = pd.DataFrame()

        return top_models_df

    def run_complete_pipeline(self):
        """Execute every pipeline stage in order and return both report tables.

        Stages, in order: load the data, split it into train/validation/test,
        run each training routine on the three splits, build the model
        comparison table, then derive the per-target top-model summary.

        Returns:
            tuple: ``(comparison_df, top_models_df)`` as returned by
            ``create_comparison_table`` and ``get_top_models_summary``.
        """
        banner = "=" * 80
        print(banner)
        print("STARTING COMPLETE ML FORECASTING PIPELINE")
        print(banner)

        # Data loading and splitting
        prepared = self.load_and_prepare_data()
        train_df, val_df, test_df = self.create_train_val_test_split(prepared)
        splits = (train_df, val_df, test_df)

        # Training stages; the grid search runs first and the default-parameter
        # XGBoost run follows it for comparison.
        training_stages = (
            self.train_xgboost_grid_search,
            self.train_xgboost_models,
            self.train_traditional_ml_models,
            self.train_deep_learning_models,
            self.train_time_series_models,
        )
        for stage in training_stages:
            stage(*splits)

        # Reporting
        comparison_df = self.create_comparison_table()
        top_models_df = self.get_top_models_summary(comparison_df)

        print("\n✅ Pipeline complete! All models saved in 'models/' directory")
        print("✅ Top models moved to 'top_models/' directory")
        print("✅ Results saved in 'results/' directory")

        return comparison_df, top_models_df



In [None]:

# Build the pipeline around the feature CSV (replace with your actual data file path).
pipeline = ForecastingPipeline('/content/enhanced_features.csv')

# Train every model and collect the comparison and top-model tables.
comparison_df, top_models_df = pipeline.run_complete_pipeline()

# Closing banner followed by a short summary of the run.
print("\n" + "=" * 80, "PIPELINE EXECUTION COMPLETED SUCCESSFULLY!", "=" * 80, sep="\n")
print(
    f"📊 Total models trained: {len(pipeline.results)}",
    f"🏆 Best models identified for {len(top_models_df)} targets",
    "💾 All results saved in 'results/' directory",
    "🎯 Best models saved in 'top_models/' directory",
    sep="\n",
)

STARTING COMPLETE ML FORECASTING PIPELINE
Loading and preparing data...
Data loaded: (1080, 52)
Feature columns: 47
Target columns: ['usage_cpu', 'usage_storage', 'users_active']
Train: 744 samples
Validation: 216 samples
Test: 120 samples

Training XGBoost with Grid Search (this may take a while)...

  Grid Search for usage_cpu...
  Testing 50 parameter combinations...
    Progress: 0/50
    Progress: 10/50
    Progress: 20/50
    Progress: 30/50
    Progress: 40/50
    Best MAE for usage_cpu: 12.3551
    Best params: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 800, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 0.5, 'reg_lambda': 5.0, 'gamma': 0.1, 'min_child_weight': 5, 'objective': 'reg:squarederror', 'random_state': 42, 'early_stopping_rounds': 20}

  Grid Search for usage_storage...
  Testing 50 parameter combinations...
    Progress: 0/50
    Progress: 10/50
    Progress: 20/50
    Progress: 30/50
    Progress: 40/50
    Best MAE for usage_storage: 9.6439
   



    Training GRU for usage_cpu...




  Preparing data for usage_storage...
    Training LSTM for usage_storage...




    Training GRU for usage_storage...




  Preparing data for users_active...
    Training LSTM for users_active...




    Training GRU for users_active...





Training Time Series models...
  Training ARIMA for usage_cpu...
    ARIMA usage_cpu - Best order: (0, 1, 1), AIC: 6100.94
  Training ARIMA for usage_storage...
    ARIMA usage_storage - Best order: (0, 1, 1), AIC: 11155.09
  Training ARIMA for users_active...
    ARIMA users_active - Best order: (0, 1, 1), AIC: 8755.76

Creating comparison table...

MODEL COMPARISON TABLE
                   Model        Target  Val_MAE  Val_RMSE  Val_MAPE  Val_Bias  Test_MAE  Test_RMSE  Test_MAPE  Test_Bias                                                                                         Hyperparameters
      XGBoost_GridSearch     usage_cpu  12.3551   14.1620   17.8297    1.4082   12.4506    14.6107    18.2209     0.9505 {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 800, 'subsample': 0.7, 'colsample_bytree': 0....
      XGBoost_GridSearch usage_storage   9.6439   12.4008    0.9151   -0.0024   10.9121    14.3216     0.9917    -2.3178 {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators