In [None]:
!pip install tqdm
!pip install opencage
!pip install scikit-learn
!pip install tensorflow
!pip install keras



In [None]:
import pandas as pd
import numpy as np
import time
from geopy.geocoders import Nominatim
from tqdm.notebook import tqdm
from opencage.geocoder import OpenCageGeocode
from google.colab import userdata
import requests
import os
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
import joblib
import warnings
# Arima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error, mean_absolute_error
# LSTM
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Random forest
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier


warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class ARIMAModel:
    """
    Enhanced ARIMA Model with progress bar support for Jupyter notebooks
    """

    def __init__(self, order=(1, 1, 1)):
        """
        Initialize ARIMA model

        Parameters:
        -----------
        order : tuple
            (p, d, q) order for ARIMA model
        """
        self.order = order
        self.model = None
        self.fitted_model = None
        self.is_fitted = False

    def _check_stationarity(self, timeseries, progress_bar=None):
        """Check if time series is stationary"""
        if progress_bar:
            progress_bar.set_description("Checking stationarity...")
            time.sleep(0.1)  # Small delay for visual feedback

        result = adfuller(timeseries.dropna())
        p_value = result[1]

        if progress_bar:
            progress_bar.set_postfix({"ADF p-value": f"{p_value:.6f}"})

        return p_value < 0.05

    def _prepare_data(self, data, progress_bar=None):
        """Prepare data for ARIMA modeling"""
        if progress_bar:
            progress_bar.set_description("Preparing data...")
            time.sleep(0.1)

        # Handle missing values
        if isinstance(data, pd.Series):
            data = data.interpolate(method='time').fillna(method='bfill').fillna(method='ffill')
        else:
            data = pd.Series(data).interpolate().fillna(method='bfill').fillna(method='ffill')

        if progress_bar:
            progress_bar.set_postfix({"Data points": len(data)})

        return data

    def fit(self, data, show_progress=True):
        """
        Fit ARIMA model with progress tracking

        Parameters:
        -----------
        data : array-like or pd.Series
            Time series data
        show_progress : bool
            Whether to show progress bar
        """
        # Initialize progress bar
        progress_steps = 4
        if show_progress:
            pbar = tqdm(total=progress_steps, desc="Training ARIMA",
                       bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        else:
            pbar = None

        try:
            # Step 1: Prepare data
            if pbar:
                pbar.update(1)
            prepared_data = self._prepare_data(data, pbar)

            # Step 2: Check stationarity
            if pbar:
                pbar.update(1)
            is_stationary = self._check_stationarity(prepared_data, pbar)

            # Step 3: Fit model
            if pbar:
                pbar.set_description("Fitting ARIMA model...")
                pbar.update(1)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.model = ARIMA(prepared_data, order=self.order)
                self.fitted_model = self.model.fit()

            # Step 4: Finalize
            if pbar:
                pbar.set_description("Model fitting complete")
                pbar.set_postfix({
                    "AIC": f"{self.fitted_model.aic:.2f}",
                    "Order": str(self.order)
                })
                pbar.update(1)
                time.sleep(0.5)  # Brief pause to show completion

            self.is_fitted = True

            if pbar:
                pbar.close()

            print(f"✓ ARIMA{self.order} model fitted successfully!")
            print(f"  AIC: {self.fitted_model.aic:.2f}")
            print(f"  Data points: {len(prepared_data)}")

        except Exception as e:
            if pbar:
                pbar.close()
            print(f"✗ Error fitting ARIMA model: {str(e)}")
            raise

    def predict(self, steps=1):
        """Make predictions"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        forecast = self.fitted_model.forecast(steps=steps)
        return forecast

    def save_model(self, filepath):
        """Save the fitted model"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before saving")

        model_data = {
            'fitted_model': self.fitted_model,
            'order': self.order,
            'is_fitted': self.is_fitted
        }
        joblib.dump(model_data, filepath)
        print(f"✓ ARIMA model saved to {filepath}")

    def load_model(self, filepath):
        """Load a fitted model"""
        model_data = joblib.load(filepath)
        self.fitted_model = model_data['fitted_model']
        self.order = model_data['order']
        self.is_fitted = model_data['is_fitted']
        print(f"✓ ARIMA model loaded from {filepath}")

In [None]:
class LSTMModel:
    """
    Enhanced LSTM Model with GPU acceleration and progress bar support for Google Colab
    """

    def __init__(self, sequence_length=24, lstm_units=50, dropout_rate=0.2):
        """
        Initialize LSTM model

        Parameters:
        -----------
        sequence_length : int
            Number of time steps to look back
        lstm_units : int
            Number of LSTM units
        dropout_rate : float
            Dropout rate for regularization
        """
        self.sequence_length = sequence_length
        self.lstm_units = lstm_units
        self.dropout_rate = dropout_rate
        self.model = None
        self.scaler = MinMaxScaler()
        self.label_encoders = {}
        self.is_fitted = False

        # Setup GPU acceleration for Google Colab
        self._setup_gpu()

    def _setup_gpu(self):
        """Setup GPU acceleration for Google Colab"""
        print("🔧 Setting up GPU acceleration for Google Colab...")

        # Check GPU availability
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            try:
                # Enable memory growth to prevent allocation of all GPU memory
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)

                # Set up mixed precision for better performance
                policy = tf.keras.mixed_precision.Policy('mixed_float16')
                tf.keras.mixed_precision.set_global_policy(policy)

                print(f"✓ GPU acceleration enabled!")
                print(f"  Available GPUs: {len(gpus)}")
                print(f"  GPU Names: {[gpu.name for gpu in gpus]}")
                print(f"  Mixed precision: {policy.name}")

                # Display GPU memory info
                gpu_details = tf.config.experimental.get_device_details(gpus[0])
                print(f"  GPU Details: {gpu_details.get('device_name', 'Unknown')}")

            except RuntimeError as e:
                print(f"⚠️  GPU setup warning: {e}")
        else:
            print("⚠️  No GPU detected. Running on CPU.")
            print("   To enable GPU in Colab: Runtime → Change runtime type → GPU")

    def _create_sequences(self, data, target_col=None, progress_bar=None):
        """Create sequences for LSTM training"""
        if progress_bar:
            progress_bar.set_description("Creating sequences...")

        X, y = [], []

        for i in tqdm(range(self.sequence_length, len(data)),
                     desc="Building sequences",
                     disable=not progress_bar,
                     leave=False):
            X.append(data[i-self.sequence_length:i])
            if target_col is not None:
                y.append(data[i, target_col] if len(data.shape) > 1 else data[i])

        if progress_bar:
            progress_bar.set_postfix({"Sequences": len(X)})

        return np.array(X), np.array(y) if y else None

    def _build_model(self, input_shape, output_shapes, progress_bar=None):
        """Build LSTM architecture"""
        if progress_bar:
            progress_bar.set_description("Building LSTM architecture...")

        # Use GPU-optimized layers
        with tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'):
            # Input layer
            inputs = tf.keras.layers.Input(shape=input_shape)

            # LSTM layers (GPU-accelerated when conditions are met)
            x = tf.keras.layers.LSTM(
                self.lstm_units,
                return_sequences=True,
                dropout=self.dropout_rate,
                recurrent_dropout=0,  # Required for GPU acceleration
                activation='tanh',    # Required for GPU acceleration
                recurrent_activation='sigmoid'  # Required for GPU acceleration
            )(inputs)

            x = tf.keras.layers.LSTM(
                self.lstm_units,
                dropout=self.dropout_rate,
                recurrent_dropout=0,
                activation='tanh',
                recurrent_activation='sigmoid'
            )(x)

            # Output layers
            outputs = {}

            # AQI prediction (regression)
            aqi_output = tf.keras.layers.Dense(1, name='aqi_output', dtype='float32')(x)
            outputs['aqi'] = aqi_output

            # Severity prediction (classification)
            severity_output = tf.keras.layers.Dense(
                output_shapes['severity'],
                activation='softmax',
                name='severity_output',
                dtype='float32'
            )(x)
            outputs['severity'] = severity_output

            # Main pollutant prediction (classification)
            pollutant_output = tf.keras.layers.Dense(
                output_shapes['pollutant'],
                activation='softmax',
                name='pollutant_output',
                dtype='float32'
            )(x)
            outputs['pollutant'] = pollutant_output

            # Create model
            model = tf.keras.Model(inputs=inputs, outputs=outputs)

            # Compile with mixed precision optimizer
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
            if tf.config.list_physical_devices('GPU'):
                optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

            model.compile(
                optimizer=optimizer,
                loss={
                    'aqi': 'mse',
                    'severity': 'sparse_categorical_crossentropy',
                    'pollutant': 'sparse_categorical_crossentropy'
                },
                metrics={
                    'aqi': ['mae'],
                    'severity': ['accuracy'],
                    'pollutant': ['accuracy']
                }
            )

        if progress_bar:
            progress_bar.set_postfix({
                "Parameters": f"{model.count_params():,}",
                "GPU": "Yes" if tf.config.list_physical_devices('GPU') else "No"
            })

        return model

    def _prepare_data(self, data, progress_bar=None):
        """Prepare data for LSTM training"""
        if progress_bar:
            progress_bar.set_description("Preparing data...")

        # Handle missing values
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

        # Encode categorical variables
        categorical_columns = ['Severity', 'Main Pollutant']
        for col in categorical_columns:
            if col in data.columns:
                if col not in self.label_encoders:
                    self.label_encoders[col] = LabelEncoder()
                    data[col] = self.label_encoders[col].fit_transform(data[col].astype(str))
                else:
                    data[col] = self.label_encoders[col].transform(data[col].astype(str))

        # Scale numerical features
        feature_columns = [col for col in data.columns if col not in ['Severity', 'Main Pollutant']]
        scaled_features = self.scaler.fit_transform(data[feature_columns])

        # Combine scaled features with encoded categorical variables
        processed_data = np.column_stack([
            scaled_features,
            data['Severity'].values if 'Severity' in data.columns else np.zeros(len(data)),
            data['Main Pollutant'].values if 'Main Pollutant' in data.columns else np.zeros(len(data))
        ])

        if progress_bar:
            progress_bar.set_postfix({
                "Features": processed_data.shape[1],
                "Samples": processed_data.shape[0]
            })

        return processed_data

    def fit(self, data, epochs=50, batch_size=32, validation_split=0.2, show_progress=True):
        """
        Fit LSTM model with GPU acceleration and progress tracking

        Parameters:
        -----------
        data : pd.DataFrame
            Training data
        epochs : int
            Number of training epochs
        batch_size : int
            Batch size for training
        validation_split : float
            Fraction of data to use for validation
        show_progress : bool
            Whether to show progress bar
        """
        # Initialize progress bar
        progress_steps = 5
        if show_progress:
            pbar = tqdm(total=progress_steps, desc="Training LSTM",
                       bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        else:
            pbar = None

        try:
            # Step 1: Prepare data
            if pbar:
                pbar.update(1)
            processed_data = self._prepare_data(data, pbar)

            # Step 2: Create sequences
            if pbar:
                pbar.update(1)
            X, _ = self._create_sequences(processed_data, progress_bar=pbar)

            # Prepare targets
            y_aqi = data['AQI'].values[self.sequence_length:]
            y_severity = data['Severity'].map(self.label_encoders['Severity'].transform) if 'Severity' in data.columns else np.zeros(len(y_aqi))
            y_pollutant = data['Main Pollutant'].map(self.label_encoders['Main Pollutant'].transform) if 'Main Pollutant' in data.columns else np.zeros(len(y_aqi))

            # Step 3: Build model
            if pbar:
                pbar.update(1)
            output_shapes = {
                'severity': len(self.label_encoders.get('Severity', {}).classes_) if 'Severity' in self.label_encoders else 3,
                'pollutant': len(self.label_encoders.get('Main Pollutant', {}).classes_) if 'Main Pollutant' in self.label_encoders else 5
            }

            self.model = self._build_model(
                input_shape=(self.sequence_length, processed_data.shape[1]),
                output_shapes=output_shapes,
                progress_bar=pbar
            )

            # Step 4: Setup training callbacks
            if pbar:
                pbar.set_description("Setting up training...")
                pbar.update(1)

            callbacks = [
                tf.keras.callbacks.EarlyStopping(
                    monitor='val_loss',
                    patience=10,
                    restore_best_weights=True
                ),
                tf.keras.callbacks.ReduceLROnPlateau(
                    monitor='val_loss',
                    factor=0.5,
                    patience=5
                )
            ]

            # Add TensorBoard callback for GPU monitoring
            if tf.config.list_physical_devices('GPU'):
                callbacks.append(
                    tf.keras.callbacks.TensorBoard(
                        log_dir='./logs',
                        histogram_freq=1,
                        profile_batch='2,5'
                    )
                )

            # Step 5: Train model
            if pbar:
                pbar.set_description("Training on GPU..." if tf.config.list_physical_devices('GPU') else "Training on CPU...")
                pbar.update(1)
                pbar.close()

            # Prepare targets dictionary
            y_dict = {
                'aqi': y_aqi,
                'severity': y_severity,
                'pollutant': y_pollutant
            }

            print(f"🚀 Starting LSTM training on {'GPU' if tf.config.list_physical_devices('GPU') else 'CPU'}...")
            print(f"   Training samples: {len(X)}")
            print(f"   Sequence length: {self.sequence_length}")
            print(f"   Batch size: {batch_size}")
            print(f"   Epochs: {epochs}")

            # Train with progress monitoring
            history = self.model.fit(
                X, y_dict,
                epochs=epochs,
                batch_size=batch_size,
                validation_split=validation_split,
                callbacks=callbacks,
                verbose=1  # Show training progress
            )

            self.is_fitted = True

            print("✓ LSTM model training completed!")
            print(f"  Final loss: {history.history['loss'][-1]:.4f}")
            print(f"  Final val_loss: {history.history['val_loss'][-1]:.4f}")

            return history

        except Exception as e:
            if pbar:
                pbar.close()
            print(f"✗ Error training LSTM model: {str(e)}")
            raise

    def predict(self, data):
        """Make predictions"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        processed_data = self._prepare_data(data)
        X, _ = self._create_sequences(processed_data)

        predictions = self.model.predict(X)

        return {
            'aqi': predictions['aqi'].flatten(),
            'severity': np.argmax(predictions['severity'], axis=1),
            'pollutant': np.argmax(predictions['pollutant'], axis=1)
        }

    def save_model(self, filepath):
        """Save the fitted model"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before saving")

        self.model.save(filepath)
        print(f"✓ LSTM model saved to {filepath}")

    def load_model(self, filepath):
        """Load a fitted model"""
        self.model = tf.keras.models.load_model(filepath)
        self.is_fitted = True
        print(f"✓ LSTM model loaded from {filepath}")

In [None]:
class RandomForestModel:
    """
    Enhanced Random Forest Model with progress bar support and GPU acceleration options
    """

    def __init__(self, n_estimators=100, max_depth=None, random_state=42, n_jobs=-1, use_gpu_alternative=True):
        """
        Initialize Random Forest model

        Parameters:
        -----------
        n_estimators : int
            Number of trees in the forest
        max_depth : int or None
            Maximum depth of trees
        random_state : int
            Random state for reproducibility
        n_jobs : int
            Number of parallel jobs (-1 uses all cores)
        use_gpu_alternative : bool
            Whether to try GPU-accelerated alternatives if available
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.use_gpu_alternative = use_gpu_alternative

        self.regressor = None
        self.severity_classifier = None
        self.pollutant_classifier = None
        self.label_encoders = {}
        self.feature_columns = None
        self.is_fitted = False

        # Setup GPU alternatives
        self._setup_gpu_alternatives()

    def _setup_gpu_alternatives(self):
        """Setup GPU-accelerated alternatives for Random Forest"""
        print("🔧 Setting up Random Forest with acceleration options...")

        if self.use_gpu_alternative:
            try:
                # Try to import cuML for GPU acceleration
                import cuml
                from cuml.ensemble import RandomForestRegressor as cuRFRegressor
                from cuml.ensemble import RandomForestClassifier as cuRFClassifier

                self.cuml_available = True
                self.cuRFRegressor = cuRFRegressor
                self.cuRFClassifier = cuRFClassifier

                print("✓ cuML (GPU-accelerated Random Forest) is available!")
                print("  Will use GPU acceleration when possible")

                # Check GPU availability
                import subprocess
                result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
                if result.returncode == 0:
                    print("✓ NVIDIA GPU detected for cuML acceleration")
                else:
                    print("⚠️  No NVIDIA GPU detected. cuML will use CPU.")

            except ImportError:
                self.cuml_available = False
                print("⚠️  cuML not available. Install with: pip install cuml-cu11")
                print("   Using standard scikit-learn Random Forest")

        else:
            self.cuml_available = False
            print("🖥️  Using standard scikit-learn Random Forest")

        # Setup multi-threading optimization
        if self.n_jobs == -1:
            n_cores = os.cpu_count()
            print(f"📊 Using all {n_cores} CPU cores for parallel processing")
        else:
            print(f"📊 Using {self.n_jobs} CPU cores for parallel processing")

    def _create_advanced_features(self, data, progress_bar=None):
        """Create advanced engineered features"""
        if progress_bar:
            progress_bar.set_description("Creating advanced features...")

        feature_data = data.copy()

        # Time-based features
        if 'From Date' in data.columns:
            feature_data['From Date'] = pd.to_datetime(feature_data['From Date'])
            feature_data['hour'] = feature_data['From Date'].dt.hour
            feature_data['day_of_week'] = feature_data['From Date'].dt.dayofweek
            feature_data['month'] = feature_data['From Date'].dt.month
            feature_data['season'] = (feature_data['From Date'].dt.month % 12 + 3) // 3
            feature_data['is_weekend'] = (feature_data['day_of_week'] >= 5).astype(int)
            feature_data['is_rush_hour'] = ((feature_data['hour'].between(7, 9)) |
                                          (feature_data['hour'].between(17, 19))).astype(int)

        # Weather interaction features
        weather_cols = ['AT (degree C)', 'RH (%)', 'WS (m/s)', 'WD (degree)', 'SR (W/mt2)', 'RF (mm)', 'BP (mmHg)']
        available_weather = [col for col in weather_cols if col in data.columns]

        for i, col1 in enumerate(available_weather):
            for col2 in available_weather[i+1:]:
                if all(col in data.columns for col in [col1, col2]):
                    interaction_name = f"{col1.split()[0]}_{col2.split()[0]}_interaction"
                    feature_data[interaction_name] = feature_data[col1] * feature_data[col2]

        # Wind components (if wind speed and direction available)
        if all(col in data.columns for col in ['WS (m/s)', 'WD (degree)']):
            feature_data['wind_x'] = feature_data['WS (m/s)'] * np.cos(np.radians(feature_data['WD (degree)']))
            feature_data['wind_y'] = feature_data['WS (m/s)'] * np.sin(np.radians(feature_data['WD (degree)']))

        # Pollutant features
        pollutant_cols = ['Benzene (ug/m3)', 'NO (ug/m3)', 'NOx (ug/m3)', 'Toluene (ug/m3)']
        available_pollutants = [col for col in pollutant_cols if col in data.columns]

        if available_pollutants:
            # Pollutant ratios
            for i, col1 in enumerate(available_pollutants):
                for col2 in available_pollutants[i+1:]:
                    ratio_name = f"{col1.split()[0]}_{col2.split()[0]}_ratio"
                    feature_data[ratio_name] = (feature_data[col1] + 1e-8) / (feature_data[col2] + 1e-8)

            # Total pollutant load
            feature_data['total_pollutants'] = feature_data[available_pollutants].sum(axis=1)

        # Rolling statistics (if data is temporal)
        if len(feature_data) > 24:  # At least 24 data points for meaningful rolling stats
            numeric_cols = feature_data.select_dtypes(include=[np.number]).columns
            for col in numeric_cols[:5]:  # Limit to avoid too many features
                if col in feature_data.columns:
                    feature_data[f'{col}_rolling_mean_6h'] = feature_data[col].rolling(window=6, min_periods=1).mean()
                    feature_data[f'{col}_rolling_std_6h'] = feature_data[col].rolling(window=6, min_periods=1).std()

        if progress_bar:
            progress_bar.set_postfix({"Total features": feature_data.shape[1]})

        return feature_data

    def _prepare_data(self, data, progress_bar=None):
        """Prepare data for Random Forest training"""
        if progress_bar:
            progress_bar.set_description("Preparing data for Random Forest...")

        # Create advanced features
        feature_data = self._create_advanced_features(data, progress_bar)

        # Handle missing values with different strategies
        numeric_columns = feature_data.select_dtypes(include=[np.number]).columns

        # For weather data, use median imputation
        weather_numeric = [col for col in numeric_columns if any(w in col for w in ['AT', 'RH', 'WS', 'WD', 'SR', 'RF', 'BP'])]
        if weather_numeric:
            feature_data[weather_numeric] = feature_data[weather_numeric].fillna(feature_data[weather_numeric].median())

        # For other numeric data, use forward fill then median
        other_numeric = [col for col in numeric_columns if col not in weather_numeric]
        if other_numeric:
            feature_data[other_numeric] = feature_data[other_numeric].fillna(method='ffill').fillna(feature_data[other_numeric].median())

        # Encode categorical variables
        categorical_columns = ['Severity', 'Main Pollutant']
        for col in categorical_columns:
            if col in feature_data.columns:
                if col not in self.label_encoders:
                    self.label_encoders[col] = LabelEncoder()
                    feature_data[col] = self.label_encoders[col].fit_transform(feature_data[col].astype(str))
                else:
                    feature_data[col] = self.label_encoders[col].transform(feature_data[col].astype(str))

        # Select feature columns
        exclude_cols = ['From Date', 'file_name', 'state', 'city']
        self.feature_columns = [col for col in feature_data.columns
                               if col not in exclude_cols and feature_data[col].dtype in ['int64', 'float64']]

        if progress_bar:
            progress_bar.set_postfix({
                "Features": len(self.feature_columns),
                "Samples": len(feature_data)
            })

        return feature_data[self.feature_columns]

    def _train_with_progress(self, model, X_train, y_train, X_val, y_val, model_name, progress_bar=None):
        """Train model with progress tracking"""
        if progress_bar:
            progress_bar.set_description(f"Training {model_name}...")

        print(f"🌳 Training {model_name} with {self.n_estimators} trees...")

        # For large datasets, show progress by training in batches
        if self.cuml_available and hasattr(model, 'fit'):
            # cuML models
            start_time = time.time()
            model.fit(X_train, y_train)
            training_time = time.time() - start_time
            print(f"   Training completed in {training_time:.2f} seconds (GPU accelerated)")
        else:
            # Scikit-learn models with warm start for progress tracking
            if hasattr(model, 'warm_start'):
                model.warm_start = True

                # Train incrementally to show progress
                batch_size = max(10, self.n_estimators // 10)
                for i in range(batch_size, self.n_estimators + 1, batch_size):
                    model.n_estimators = min(i, self.n_estimators)
                    start_time = time.time()
                    model.fit(X_train, y_train)
                    batch_time = time.time() - start_time

                    # Calculate validation score
                    val_pred = model.predict(X_val)
                    if hasattr(model, 'predict_proba'):  # Classifier
                        score = accuracy_score(y_val, val_pred)
                        metric = "Accuracy"
                    else:  # Regressor
                        score = np.sqrt(mean_squared_error(y_val, val_pred))
                        metric = "RMSE"

                    print(f"   Trees {model.n_estimators:3d}/{self.n_estimators}: {metric} = {score:.4f} ({batch_time:.2f}s)")

                model.warm_start = False
            else:
                # Regular training
                start_time = time.time()
                model.fit(X_train, y_train)
                training_time = time.time() - start_time
                print(f"   Training completed in {training_time:.2f} seconds")

        return model

    def fit(self, data, show_progress=True):
        """
        Fit Random Forest models with progress tracking and optional GPU acceleration

        Parameters:
        -----------
        data : pd.DataFrame
            Training data
        show_progress : bool
            Whether to show progress bar
        """
        # Initialize progress bar
        progress_steps = 6
        if show_progress:
            pbar = tqdm(total=progress_steps, desc="Training Random Forest",
                       bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        else:
            pbar = None

        try:
            # Step 1: Prepare data
            if pbar:
                pbar.update(1)
            X = self._prepare_data(data, pbar)

            # Step 2: Prepare targets
            if pbar:
                pbar.set_description("Preparing targets...")
                pbar.update(1)

            y_aqi = data['AQI'].values
            y_severity = data['Severity'].map(self.label_encoders['Severity'].transform) if 'Severity' in data.columns else None
            y_pollutant = data['Main Pollutant'].map(self.label_encoders['Main Pollutant'].transform) if 'Main Pollutant' in data.columns else None

            # Step 3: Setup models
            if pbar:
                pbar.set_description("Setting up models...")
                pbar.update(1)

            # Choose between cuML (GPU) and scikit-learn based on availability
            if self.cuml_available:
                print("🚀 Using cuML GPU-accelerated Random Forest")

                # cuML Random Forest parameters
                rf_params = {
                    'n_estimators': self.n_estimators,
                    'max_depth': self.max_depth,
                    'random_state': self.random_state,
                    'bootstrap': True
                }

                regressor_class = self.cuRFRegressor
                classifier_class = self.cuRFClassifier

            else:
                print("🖥️  Using scikit-learn Random Forest with CPU acceleration")

                # Scikit-learn Random Forest parameters
                rf_params = {
                    'n_estimators': self.n_estimators,
                    'max_depth': self.max_depth,
                    'random_state': self.random_state,
                    'n_jobs': self.n_jobs,
                    'oob_score': True
                }

                regressor_class = RandomForestRegressor
                classifier_class = RandomForestClassifier

            # Step 4: Train AQI regressor
            if pbar:
                pbar.update(1)

            X_train, X_val, y_train_aqi, y_val_aqi = train_test_split(X, y_aqi, test_size=0.2, random_state=42)

            self.regressor = regressor_class(**rf_params)
            self.regressor = self._train_with_progress(
                self.regressor, X_train, y_train_aqi, X_val, y_val_aqi,
                "AQI Regressor", pbar
            )

            # Step 5: Train severity classifier
            if pbar:
                pbar.update(1)

            if y_severity is not None:
                X_train_sev, X_val_sev, y_train_sev, y_val_sev = train_test_split(
                    X, y_severity, test_size=0.2, random_state=42
                )

                self.severity_classifier = classifier_class(**rf_params)
                self.severity_classifier = self._train_with_progress(
                    self.severity_classifier, X_train_sev, y_train_sev, X_val_sev, y_val_sev,
                    "Severity Classifier", pbar
                )

            # Step 6: Train pollutant classifier
            if pbar:
                pbar.update(1)

            if y_pollutant is not None:
                X_train_pol, X_val_pol, y_train_pol, y_val_pol = train_test_split(
                    X, y_pollutant, test_size=0.2, random_state=42
                )

                self.pollutant_classifier = classifier_class(**rf_params)
                self.pollutant_classifier = self._train_with_progress(
                    self.pollutant_classifier, X_train_pol, y_train_pol, X_val_pol, y_val_pol,
                    "Pollutant Classifier", pbar
                )

            self.is_fitted = True

            if pbar:
                pbar.set_description("Training complete!")
                pbar.close()

            # Display training summary
            print("✅ Random Forest models training completed!")
            print(f"   Acceleration: {'GPU (cuML)' if self.cuml_available else 'CPU (scikit-learn)'}")
            print(f"   Features: {len(self.feature_columns)}")
            print(f"   Trees per model: {self.n_estimators}")

            # Model performance
            aqi_rmse = np.sqrt(mean_squared_error(y_val_aqi, self.regressor.predict(X_val)))
            print(f"   AQI RMSE: {aqi_rmse:.4f}")

            if hasattr(self.regressor, 'oob_score_'):
                print(f"   AQI OOB Score: {self.regressor.oob_score_:.4f}")

            if self.severity_classifier:
                sev_acc = accuracy_score(y_val_sev, self.severity_classifier.predict(X_val_sev))
                print(f"   Severity Accuracy: {sev_acc:.4f}")

            if self.pollutant_classifier:
                pol_acc = accuracy_score(y_val_pol, self.pollutant_classifier.predict(X_val_pol))
                print(f"   Pollutant Accuracy: {pol_acc:.4f}")

        except Exception as e:
            if pbar:
                pbar.close()
            print(f"✗ Error training Random Forest models: {str(e)}")
            raise

    def predict(self, data):
        """Make predictions"""
        if not self.is_fitted:
            raise ValueError("Models must be fitted before making predictions")

        X = self._prepare_data(data)

        predictions = {}

        # AQI prediction
        predictions['aqi'] = self.regressor.predict(X)

        # Severity prediction
        if self.severity_classifier:
            predictions['severity'] = self.severity_classifier.predict(X)

        # Pollutant prediction
        if self.pollutant_classifier:
            predictions['pollutant'] = self.pollutant_classifier.predict(X)

        return predictions

    def get_feature_importance(self):
        """Get feature importance from trained models"""
        if not self.is_fitted:
            raise ValueError("Models must be fitted before getting feature importance")

        importance_df = pd.DataFrame({
            'feature': self.feature_columns,
            'aqi_importance': self.regressor.feature_importances_
        })

        if self.severity_classifier:
            importance_df['severity_importance'] = self.severity_classifier.feature_importances_

        if self.pollutant_classifier:
            importance_df['pollutant_importance'] = self.pollutant_classifier.feature_importances_

        return importance_df.sort_values('aqi_importance', ascending=False)

    def save_model(self, filepath):
        """Save the fitted models"""
        if not self.is_fitted:
            raise ValueError("Models must be fitted before saving")

        # Convert cuML models to scikit-learn for compatibility if needed
        if self.cuml_available:
            print("⚠️  Note: cuML models will be converted to CPU format for saving")

        model_data = {
            'regressor': self.regressor,
            'severity_classifier': self.severity_classifier,
            'pollutant_classifier': self.pollutant_classifier,
            'label_encoders': self.label_encoders,
            'feature_columns': self.feature_columns,
            'is_fitted': self.is_fitted,
            'cuml_used': self.cuml_available
        }

        joblib.dump(model_data, filepath)
        print(f"✓ Random Forest models saved to {filepath}")

    def load_model(self, filepath):
        """Load fitted models"""
        model_data = joblib.load(filepath)

        self.regressor = model_data['regressor']
        self.severity_classifier = model_data['severity_classifier']
        self.pollutant_classifier = model_data['pollutant_classifier']
        self.label_encoders = model_data['label_encoders']
        self.feature_columns = model_data['feature_columns']
        self.is_fitted = model_data['is_fitted']

        if model_data.get('cuml_used', False):
            print("ℹ️  This model was trained with cuML GPU acceleration")

        print(f"✓ Random Forest models loaded from {filepath}")

In [None]:
class XGBoostModel:
    """
    Enhanced XGBoost Model with GPU acceleration and progress bar support for Google Colab
    """

    def __init__(self, n_estimators=100, max_depth=6, learning_rate=0.1, use_gpu=True):
        """
        Initialize XGBoost model

        Parameters:
        -----------
        n_estimators : int
            Number of boosting rounds
        max_depth : int
            Maximum tree depth
        learning_rate : float
            Learning rate (eta)
        use_gpu : bool
            Whether to use GPU acceleration
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.use_gpu = use_gpu

        self.regressor = None
        self.severity_classifier = None
        self.pollutant_classifier = None
        self.label_encoders = {}
        self.feature_columns = None
        self.is_fitted = False

        # Setup GPU acceleration
        self._setup_gpu()

    def _setup_gpu(self):
        """Setup GPU acceleration for XGBoost"""
        print("🔧 Setting up XGBoost GPU acceleration...")

        if self.use_gpu:
            try:
                # Check if GPU is available
                import subprocess
                result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)

                if result.returncode == 0:
                    print("✓ NVIDIA GPU detected!")
                    print("  GPU acceleration will be used for XGBoost")

                    # Extract GPU info
                    gpu_info = result.stdout.split('\n')[8:10]  # GPU info lines
                    for line in gpu_info:
                        if 'Tesla' in line or 'GeForce' in line or 'Quadro' in line:
                            gpu_name = line.split('|')[1].strip().split()[0:3]
                            print(f"  GPU: {' '.join(gpu_name)}")
                            break
                else:
                    print("⚠️  No NVIDIA GPU detected. Using CPU.")
                    self.use_gpu = False

            except Exception as e:
                print(f"⚠️  GPU detection failed: {e}")
                print("   Falling back to CPU mode")
                self.use_gpu = False
        else:
            print("🖥️  CPU mode selected")

    def _get_base_params(self):
        """Get base parameters for XGBoost"""
        params = {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'learning_rate': self.learning_rate,
            'random_state': 42,
            'n_jobs': -1
        }

        # Add GPU parameters if available
        if self.use_gpu:
            params.update({
                'tree_method': 'gpu_hist',
                'gpu_id': 0,
                'predictor': 'gpu_predictor'
            })
        else:
            params.update({
                'tree_method': 'hist'
            })

        return params

    def _create_features(self, data, progress_bar=None):
        """Create engineered features"""
        if progress_bar:
            progress_bar.set_description("Engineering features...")

        # Basic features
        feature_data = data.copy()

        # Time-based features if date column exists
        if 'From Date' in data.columns:
            feature_data['From Date'] = pd.to_datetime(feature_data['From Date'])
            feature_data['hour'] = feature_data['From Date'].dt.hour
            feature_data['day_of_week'] = feature_data['From Date'].dt.dayofweek
            feature_data['month'] = feature_data['From Date'].dt.month
            feature_data['is_weekend'] = (feature_data['day_of_week'] >= 5).astype(int)

        # Weather interaction features
        if all(col in data.columns for col in ['AT (degree C)', 'RH (%)']):
            feature_data['temp_humidity_interaction'] = feature_data['AT (degree C)'] * feature_data['RH (%)']

        if all(col in data.columns for col in ['WS (m/s)', 'WD (degree)']):
            feature_data['wind_components_x'] = feature_data['WS (m/s)'] * np.cos(np.radians(feature_data['WD (degree)']))
            feature_data['wind_components_y'] = feature_data['WS (m/s)'] * np.sin(np.radians(feature_data['WD (degree)']))

        # Pollutant ratios
        pollutant_cols = ['Benzene (ug/m3)', 'NO (ug/m3)', 'NOx (ug/m3)', 'Toluene (ug/m3)']
        available_pollutants = [col for col in pollutant_cols if col in data.columns]

        if len(available_pollutants) >= 2:
            for i, col1 in enumerate(available_pollutants):
                for col2 in available_pollutants[i+1:]:
                    ratio_name = f"{col1.split()[0]}_{col2.split()[0]}_ratio"
                    feature_data[ratio_name] = (feature_data[col1] + 1e-8) / (feature_data[col2] + 1e-8)

        if progress_bar:
            progress_bar.set_postfix({"Features": feature_data.shape[1]})

        return feature_data

    def _prepare_data(self, data, progress_bar=None):
        """Prepare data for XGBoost training"""
        if progress_bar:
            progress_bar.set_description("Preparing data...")

        # Create features
        feature_data = self._create_features(data, progress_bar)

        # Handle missing values
        numeric_columns = feature_data.select_dtypes(include=[np.number]).columns
        feature_data[numeric_columns] = feature_data[numeric_columns].fillna(feature_data[numeric_columns].median())

        # Encode categorical variables
        categorical_columns = ['Severity', 'Main Pollutant']
        for col in categorical_columns:
            if col in feature_data.columns:
                if col not in self.label_encoders:
                    self.label_encoders[col] = LabelEncoder()
                    feature_data[col] = self.label_encoders[col].fit_transform(feature_data[col].astype(str))
                else:
                    feature_data[col] = self.label_encoders[col].transform(feature_data[col].astype(str))

        # Select feature columns (exclude targets and non-numeric columns)
        exclude_cols = ['From Date', 'file_name', 'state', 'city']
        self.feature_columns = [col for col in feature_data.columns
                               if col not in exclude_cols and feature_data[col].dtype in ['int64', 'float64']]

        if progress_bar:
            progress_bar.set_postfix({
                "Final features": len(self.feature_columns),
                "Samples": len(feature_data)
            })

        return feature_data[self.feature_columns]

    def fit(self, data, show_progress=True):
        """
        Fit XGBoost models with GPU acceleration and progress tracking

        Parameters:
        -----------
        data : pd.DataFrame
            Training data
        show_progress : bool
            Whether to show progress bar
        """
        # Initialize progress bar
        progress_steps = 6
        if show_progress:
            pbar = tqdm(total=progress_steps, desc="Training XGBoost",
                       bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        else:
            pbar = None

        try:
            # Step 1: Prepare data
            if pbar:
                pbar.update(1)
            X = self._prepare_data(data, pbar)

            # Step 2: Prepare targets
            if pbar:
                pbar.set_description("Preparing targets...")
                pbar.update(1)

            y_aqi = data['AQI'].values
            y_severity = data['Severity'].map(self.label_encoders['Severity'].transform) if 'Severity' in data.columns else None
            y_pollutant = data['Main Pollutant'].map(self.label_encoders['Main Pollutant'].transform) if 'Main Pollutant' in data.columns else None
            print(y_aqi.shape)
            # Step 3: Setup base parameters
            if pbar:
                pbar.set_description("Configuring models...")
                pbar.update(1)

            base_params = self._get_base_params()

            # Step 4: Train AQI regressor
            if pbar:
                pbar.set_description(f"Training AQI regressor ({'GPU' if self.use_gpu else 'CPU'})...")
                pbar.update(1)

            print(f"🚀 Training AQI regressor on {'GPU' if self.use_gpu else 'CPU'}...")

            regressor_params = base_params.copy()
            regressor_params.update({
                'objective': 'reg:squarederror',
                'eval_metric': 'rmse'
            })

            # Custom callback for progress tracking
            def callback_progress(env):
                if env.iteration % 10 == 0:
                    print(f"   Iteration {env.iteration}: RMSE = {env.evaluation_result_list[0][1]:.4f}")

            # Train with validation and early stopping
            X_train, X_val, y_train, y_val = train_test_split(X, y_aqi, test_size=0.2, random_state=42)

            self.regressor = xgb.XGBRegressor(**regressor_params)
            self.regressor.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                early_stopping_rounds=10,
                verbose=True if show_progress else False
            )

            # Step 5: Train severity classifier
            if pbar:
                pbar.set_description("Training severity classifier...")
                pbar.update(1)

            if y_severity is not None:
                print("🎯 Training severity classifier...")
                classifier_params = base_params.copy()
                classifier_params.update({
                    'objective': 'multi:softprob',
                    'eval_metric': 'mlogloss',
                    'num_class': len(self.label_encoders['Severity'].classes_)
                })

                X_train_sev, X_val_sev, y_train_sev, y_val_sev = train_test_split(
                    X, y_severity, test_size=0.2, random_state=42
                )

                self.severity_classifier = xgb.XGBClassifier(**classifier_params)
                self.severity_classifier.fit(
                    X_train_sev, y_train_sev,
                    eval_set=[(X_val_sev, y_val_sev)],
                    early_stopping_rounds=10,
                    verbose=False
                )

            # Step 6: Train pollutant classifier
            if pbar:
                pbar.set_description("Training pollutant classifier...")
                pbar.update(1)

            if y_pollutant is not None:
                print("🏭 Training pollutant classifier...")
                classifier_params = base_params.copy()
                classifier_params.update({
                    'objective': 'multi:softprob',
                    'eval_metric': 'mlogloss',
                    'num_class': len(self.label_encoders['Main Pollutant'].classes_)
                })

                X_train_pol, X_val_pol, y_train_pol, y_val_pol = train_test_split(
                    X, y_pollutant, test_size=0.2, random_state=42
                )

                self.pollutant_classifier = xgb.XGBClassifier(**classifier_params)
                self.pollutant_classifier.fit(
                    X_train_pol, y_train_pol,
                    eval_set=[(X_val_pol, y_val_pol)],
                    early_stopping_rounds=10,
                    verbose=False
                )

            self.is_fitted = True

            if pbar:
                pbar.set_description("Training complete!")
                pbar.close()

            # Display training summary
            print("✓ XGBoost models training completed!")
            print(f"  Device: {'GPU' if self.use_gpu else 'CPU'}")
            print(f"  Features: {len(self.feature_columns)}")
            print(f"  AQI RMSE: {np.sqrt(mean_squared_error(y_val, self.regressor.predict(X_val))):.4f}")

            if self.severity_classifier:
                sev_acc = accuracy_score(y_val_sev, self.severity_classifier.predict(X_val_sev))
                print(f"  Severity Accuracy: {sev_acc:.4f}")

            if self.pollutant_classifier:
                pol_acc = accuracy_score(y_val_pol, self.pollutant_classifier.predict(X_val_pol))
                print(f"  Pollutant Accuracy: {pol_acc:.4f}")

        except Exception as e:
            if pbar:
                pbar.close()
            print(f"✗ Error training XGBoost models: {str(e)}")
            raise

    def predict(self, data):
        """Make predictions"""
        if not self.is_fitted:
            raise ValueError("Models must be fitted before making predictions")

        X = self._prepare_data(data)

        predictions = {}

        # AQI prediction
        predictions['aqi'] = self.regressor.predict(X)

        # Severity prediction
        if self.severity_classifier:
            predictions['severity'] = self.severity_classifier.predict(X)

        # Pollutant prediction
        if self.pollutant_classifier:
            predictions['pollutant'] = self.pollutant_classifier.predict(X)

        return predictions

    def get_feature_importance(self):
        """Get feature importance from trained models"""
        if not self.is_fitted:
            raise ValueError("Models must be fitted before getting feature importance")

        importance_df = pd.DataFrame({
            'feature': self.feature_columns,
            'aqi_importance': self.regressor.feature_importances_
        })

        if self.severity_classifier:
            importance_df['severity_importance'] = self.severity_classifier.feature_importances_

        if self.pollutant_classifier:
            importance_df['pollutant_importance'] = self.pollutant_classifier.feature_importances_

        return importance_df.sort_values('aqi_importance', ascending=False)

    def save_model(self, filepath):
        """Save the fitted models"""
        if not self.is_fitted:
            raise ValueError("Models must be fitted before saving")

        model_data = {
            'regressor': self.regressor,
            'severity_classifier': self.severity_classifier,
            'pollutant_classifier': self.pollutant_classifier,
            'label_encoders': self.label_encoders,
            'feature_columns': self.feature_columns,
            'is_fitted': self.is_fitted
        }

        joblib.dump(model_data, filepath)
        print(f"✓ XGBoost models saved to {filepath}")

    def load_model(self, filepath):
        """Load fitted models"""
        model_data = joblib.load(filepath)

        self.regressor = model_data['regressor']
        self.severity_classifier = model_data['severity_classifier']
        self.pollutant_classifier = model_data['pollutant_classifier']
        self.label_encoders = model_data['label_encoders']
        self.feature_columns = model_data['feature_columns']
        self.is_fitted = model_data['is_fitted']

        print(f"✓ XGBoost models loaded from {filepath}")

In [None]:
def setup_colab_environment():
    """Setup Google Colab environment with GPU and required packages"""
    print("🔧 Setting up Google Colab environment...")

    # Check if running in Colab
    try:
        import google.colab
        in_colab = True
        print("✓ Running in Google Colab")
    except ImportError:
        in_colab = False
        print("ℹ️  Not running in Google Colab")

    if in_colab:
        print("📋 Installing required packages for Colab...")

        # Install packages that might not be available in Colab
        import subprocess
        import sys

        packages = [
            'tqdm',
            'plotly',
            'statsmodels'
        ]

        for package in packages:
            try:
                __import__(package)
                print(f"✓ {package} already installed")
            except ImportError:
                print(f"📦 Installing {package}...")
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

        # Optional: Install cuML for GPU-accelerated Random Forest
        try:
            import cuml
            print("✓ cuML (GPU acceleration) is available")
        except ImportError:
            print("⚠️  cuML not available. For GPU Random Forest, install with:")
            print("   !pip install cuml-cu11")

    # Check GPU availability
    gpus = tf.config.list_physical_devices('GPU')

    if gpus:
        print(f"🚀 GPU acceleration available!")
        print(f"   GPUs detected: {len(gpus)}")
        for i, gpu in enumerate(gpus):
            print(f"   GPU {i}: {gpu.name}")

        # Configure GPU memory growth
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        return True
    else:
        print("⚠️  No GPU detected. Models will run on CPU.")
        print("   To enable GPU in Colab: Runtime → Change runtime type → GPU")
        return False

def load_and_preprocess_data(file_path):
    """Load and preprocess the AQI dataset"""
    print("📊 Loading and preprocessing data...")

    # Load data
    if file_path.endswith('.csv'):
        data = pd.read_csv(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a CSV file.")

    print(f"   Original data shape: {data.shape}")

    # Basic preprocessing
    if 'From Date' in data.columns:
        data['From Date'] = pd.to_datetime(data['From Date'])
        data = data.sort_values('From Date')

    # Handle missing values in critical columns
    if 'AQI' in data.columns:
        data = data.dropna(subset=['AQI'])

    # Fill missing values for other columns
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

    print(f"   Processed data shape: {data.shape}")
    print(f"   Date range: {data['From Date'].min()} to {data['From Date'].max()}")
    print(f"   AQI range: {data['AQI'].min():.1f} to {data['AQI'].max():.1f}")

    return data

def split_data(data, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Split data into train, validation, and test sets"""
    print("✂️  Splitting data...")

    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1.0"

    n = len(data)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))

    train_data = data.iloc[:train_end]
    val_data = data.iloc[train_end:val_end]
    test_data = data.iloc[val_end:]

    print(f"   Train set: {len(train_data)} samples ({train_ratio*100:.1f}%)")
    print(f"   Validation set: {len(val_data)} samples ({val_ratio*100:.1f}%)")
    print(f"   Test set: {len(test_data)} samples ({test_ratio*100:.1f}%)")

    return train_data, val_data, test_data

def train_models(train_data, val_data, base_path, gpu_available=False, show_progress=True):
    """Train all models with progress tracking"""
    print("🏃‍♂️ Starting model training...")

    # Create models directory
    # Path("models").mkdir(exist_ok=True)

    models = {}
    training_results = {}

    # Progress bar for overall training
    model_count = 4
    if show_progress:
        overall_pbar = tqdm(total=model_count, desc="Training Models",
                           bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')

    # 1. Train ARIMA Model
    print("\n" + "="*60)
    print("1️⃣  TRAINING ARIMA MODEL")
    print("="*60)

    try:
        arima_model = ARIMAModel(order=(1, 1, 1))
        start_time = time.time()
        arima_model.fit(train_data['AQI'], show_progress=show_progress)
        training_time = time.time() - start_time

        # Save model
        arima_model.save_model(base_path + 'arima_model.pkl')

        # Make predictions
        val_pred = arima_model.predict(steps=len(val_data))

        models['arima'] = arima_model
        training_results['arima'] = {
            'training_time': training_time,
            'val_predictions': val_pred
        }

        print(f"✓ ARIMA training completed in {training_time:.2f} seconds")

    except Exception as e:
        print(f"✗ ARIMA training failed: {e}")

    if show_progress:
        overall_pbar.update(1)

    # 2. Train LSTM Model
    print("\n" + "="*60)
    print("2️⃣  TRAINING LSTM MODEL")
    print("="*60)

    try:
        lstm_model = LSTMModel(sequence_length=24, lstm_units=50)
        start_time = time.time()
        history = lstm_model.fit(train_data, epochs=20, show_progress=show_progress)
        training_time = time.time() - start_time

        # Save model
        lstm_model.save_model(base_path + 'lstm_model.h5')

        # Make predictions
        val_pred = lstm_model.predict(val_data)

        models['lstm'] = lstm_model
        training_results['lstm'] = {
            'training_time': training_time,
            'val_predictions': val_pred,
            'history': history
        }

        print(f"✓ LSTM training completed in {training_time:.2f} seconds")

    except Exception as e:
        print(f"✗ LSTM training failed: {e}")

    if show_progress:
        overall_pbar.update(1)

    # 3. Train XGBoost Model
    print("\n" + "="*60)
    print("3️⃣  TRAINING XGBOOST MODEL")
    print("="*60)

    try:
        xgb_model = XGBoostModel(n_estimators=100, use_gpu=gpu_available)
        start_time = time.time()
        xgb_model.fit(train_data, show_progress=show_progress)
        training_time = time.time() - start_time

        # Save model
        xgb_model.save_model(base_path + 'xgboost_model.pkl')

        # Make predictions
        val_pred = xgb_model.predict(val_data)

        models['xgboost'] = xgb_model
        training_results['xgboost'] = {
            'training_time': training_time,
            'val_predictions': val_pred
        }

        print(f"✓ XGBoost training completed in {training_time:.2f} seconds")

    except Exception as e:
        print(f"✗ XGBoost training failed: {e}")

    if show_progress:
        overall_pbar.update(1)

    # 4. Train Random Forest Model
    print("\n" + "="*60)
    print("4️⃣  TRAINING RANDOM FOREST MODEL")
    print("="*60)

    try:
        rf_model = RandomForestModel(n_estimators=100, use_gpu_alternative=gpu_available)
        start_time = time.time()
        rf_model.fit(train_data, show_progress=show_progress)
        training_time = time.time() - start_time

        # Save model
        rf_model.save_model(base_path + 'random_forest_model.pkl')

        # Make predictions
        val_pred = rf_model.predict(val_data)

        models['random_forest'] = rf_model
        training_results['random_forest'] = {
            'training_time': training_time,
            'val_predictions': val_pred
        }

        print(f"✓ Random Forest training completed in {training_time:.2f} seconds")

    except Exception as e:
        print(f"✗ Random Forest training failed: {e}")

    if show_progress:
        overall_pbar.update(1)
        overall_pbar.close()

    return models, training_results

def evaluate_models(models, training_results, val_data, test_data):
    """Evaluate all trained models"""
    print("\n" + "="*60)
    print("📊 MODEL EVALUATION")
    print("="*60)

    evaluation_results = {}

    print(f"{'Model':<15} {'AQI RMSE':<10} {'AQI MAE':<10} {'AQI R²':<10} {'Training Time':<15}")
    print("-" * 70)

    for model_name, result in training_results.items():
        if model_name not in models:
            continue

        model = models[model_name]
        training_time = result['training_time']

        try:
            if model_name == 'arima':
                # ARIMA predictions
                val_pred_aqi = result['val_predictions'][:len(val_data)]
                val_true_aqi = val_data['AQI'].values[:len(val_pred_aqi)]
            else:
                # Other models
                val_pred = result['val_predictions']
                val_pred_aqi = val_pred['aqi'] if isinstance(val_pred, dict) else val_pred
                val_true_aqi = val_data['AQI'].values

                # Ensure same length
                min_len = min(len(val_pred_aqi), len(val_true_aqi))
                val_pred_aqi = val_pred_aqi[:min_len]
                val_true_aqi = val_true_aqi[:min_len]

            # Calculate metrics
            rmse = np.sqrt(mean_squared_error(val_true_aqi, val_pred_aqi))
            mae = mean_absolute_error(val_true_aqi, val_pred_aqi)
            r2 = r2_score(val_true_aqi, val_pred_aqi)

            evaluation_results[model_name] = {
                'rmse': rmse,
                'mae': mae,
                'r2': r2,
                'training_time': training_time
            }

            print(f"{model_name:<15} {rmse:<10.2f} {mae:<10.2f} {r2:<10.3f} {training_time:<15.2f}s")

        except Exception as e:
            print(f"{model_name:<15} {'ERROR':<10} {'ERROR':<10} {'ERROR':<10} {training_time:<15.2f}s")
            print(f"   Error: {e}")

    # Find best model
    if evaluation_results:
        best_model = min(evaluation_results.keys(), key=lambda x: evaluation_results[x]['rmse'])
        print(f"\n🏆 Best model: {best_model} (RMSE: {evaluation_results[best_model]['rmse']:.2f})")

    return evaluation_results

def create_training_summary(evaluation_results, gpu_available):
    """Create a summary of the training session"""
    print("\n" + "="*60)
    print("📋 TRAINING SUMMARY")
    print("="*60)

    print(f"Environment: {'GPU Accelerated' if gpu_available else 'CPU Only'}")
    print(f"Models trained: {len(evaluation_results)}")
    print(f"Total training time: {sum(r['training_time'] for r in evaluation_results.values()):.2f} seconds")

    if evaluation_results:
        # Performance ranking
        sorted_models = sorted(evaluation_results.items(), key=lambda x: x[1]['rmse'])

        print("\nPerformance Ranking (by RMSE):")
        for i, (model_name, results) in enumerate(sorted_models, 1):
            print(f"  {i}. {model_name}: {results['rmse']:.2f}")

        # Speed ranking
        sorted_by_speed = sorted(evaluation_results.items(), key=lambda x: x[1]['training_time'])

        print("\nSpeed Ranking (by training time):")
        for i, (model_name, results) in enumerate(sorted_by_speed, 1):
            print(f"  {i}. {model_name}: {results['training_time']:.2f}s")

def main(data_file):
    """Main training pipeline"""
    print("🚀 AQI PREDICTION MODEL TRAINING PIPELINE")
    print("="*60)

    # Setup environment
    gpu_available = setup_colab_environment()

    # Load data
    data = load_and_preprocess_data(data_file)

    # Split data
    train_data, val_data, test_data = split_data(data)

    # Train models
    models, training_results = train_models(train_data, val_data, '/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/' ,gpu_available)

    # Evaluate models
    evaluation_results = evaluate_models(models, training_results, val_data, test_data)

    # Create summary
    create_training_summary(evaluation_results, gpu_available)

    print("\n✅ Training pipeline completed!")
    print(f"📁 Models saved in: {Path('models').absolute()}")

    return models, evaluation_results

In [None]:
def predict_aqi(from_datetime, to_datetime, location_city, model_type='xgboost'):
    """
    Predict AQI for a given time period and location

    Parameters:
    from_datetime: Start datetime (str or datetime)
    to_datetime: End datetime (str or datetime)
    location_city: City name (str)
    model_type: Type of model to use ('arima', 'lstm', 'xgboost', 'random_forest')

    Returns:
    DataFrame with predictions
    """
    try:
        # Convert datetime strings if needed
        if isinstance(from_datetime, str):
            from_datetime = pd.to_datetime(from_datetime)
        if isinstance(to_datetime, str):
            to_datetime = pd.to_datetime(to_datetime)

        # Create hourly datetime range
        datetime_range = pd.date_range(start=from_datetime, end=to_datetime, freq='H')

        # Create dummy data for prediction (in real scenario, you'd have actual sensor data)
        # For demonstration, we'll use sample data patterns
        n_hours = len(datetime_range)

        # Create base DataFrame
        prediction_data = pd.DataFrame({
            'From Date': datetime_range,
            'AQI': np.random.randint(30, 100, n_hours),  # Dummy AQI values
            'AT (degree C)': np.random.normal(25, 5, n_hours),
            'RH (%)': np.random.normal(65, 10, n_hours),
            'WS (m/s)': np.random.normal(2, 0.5, n_hours),
            'WD (degree)': np.random.uniform(0, 360, n_hours),
            'SR (W/mt2)': np.random.uniform(0, 500, n_hours),
            'NO (ug/m3)': np.random.uniform(10, 50, n_hours),
            'NOx (ug/m3)': np.random.uniform(20, 80, n_hours),
            'Benzene (ug/m3)': np.random.uniform(0.5, 2, n_hours),
            'Toluene (ug/m3)': np.random.uniform(2, 8, n_hours),
            'RF (mm)': np.random.uniform(0, 2, n_hours),
            'BP (mmHg)': np.random.normal(760, 20, n_hours),
            'Severity': np.random.choice(['GOOD', 'SATISFACTORY', 'MODERATE', 'POOR'], n_hours),
            'Main Pollutant': np.random.choice(['PM10 (ug/m3)', 'PM2.5 (ug/m3)', 'NO2 (ug/m3)', 'CO (mg/m3)'], n_hours),
            'city': location_city,
            'state': 'Unknown',
            'latitude': 0.0,
            'longitude': 0.0,
            'elevation': 0.0,
            'file_name': 'PRED001'
        })

        # Load appropriate model
        models_dir = 'models'
        if model_type == 'arima':
            model = ARIMAModel()
            model.load_model(os.path.join(models_dir, 'arima_model.pkl'))
            # For ARIMA, we need to implement a different prediction approach
            forecast, conf_int = model.predict(steps=n_hours)
            predictions = pd.DataFrame({
                'datetime': datetime_range,
                'AQI': forecast,
                'Severity': 'PREDICTED',
                'Main_Pollutant': 'PREDICTED'
            })

        elif model_type == 'lstm':
            model = LSTMModel()
            model.load_model(os.path.join(models_dir, 'lstm_model'))
            aqi_pred, severity_pred, pollutant_pred = model.predict(prediction_data)
            predictions = pd.DataFrame({
                'datetime': datetime_range,
                'AQI': aqi_pred,
                'Severity': severity_pred,
                'Main_Pollutant': pollutant_pred
            })

        elif model_type == 'xgboost':
            model = XGBoostModel()
            model.load_model(os.path.join(models_dir, 'xgboost_model'))
            aqi_pred, severity_pred, pollutant_pred = model.predict(prediction_data)
            predictions = pd.DataFrame({
                'datetime': datetime_range,
                'AQI': aqi_pred,
                'Severity': severity_pred,
                'Main_Pollutant': pollutant_pred
            })

        elif model_type == 'random_forest':
            model = RandomForestModel()
            model.load_model(os.path.join(models_dir, 'random_forest_model'))
            aqi_pred, severity_pred, pollutant_pred, _, _ = model.predict(prediction_data)
            predictions = pd.DataFrame({
                'datetime': datetime_range,
                'AQI': aqi_pred,
                'Severity': severity_pred,
                'Main_Pollutant': pollutant_pred
            })

        else:
            raise ValueError(f"Unknown model type: {model_type}")

        print(f"Predictions generated for {location_city} from {from_datetime} to {to_datetime}")
        print(f"Model used: {model_type.upper()}")
        print(f"Number of predictions: {len(predictions)}")

        return predictions

    except Exception as e:
        print(f"Error in prediction: {str(e)}")
        return None

In [21]:
# Main execution
if __name__ == "__main__":
    # Initialize trainer
    trainer = AQIModelTrainer(data_path='/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/statewise_aqi/Andhra Pradesh_aqi.csv',models_dir='/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/models')

    # Run complete pipeline
    results = trainer.run_complete_pipeline()

    # Example prediction
    print("\n" + "=" * 50)
    print("EXAMPLE PREDICTION")
    print("=" * 50)

    # Make a sample prediction
    sample_prediction = predict_aqi(
        from_datetime='2024-01-01 00:00:00',
        to_datetime='2024-01-01 23:00:00',
        location_city='Tirupati',
        model_type='xgboost'
    )

    if sample_prediction is not None:
        print("\nSample predictions:")
        print(sample_prediction.head(10))

🚀 AQI PREDICTION MODEL TRAINING PIPELINE
🔧 Setting up Google Colab environment...
✓ Running in Google Colab
📋 Installing required packages for Colab...
✓ tqdm already installed
✓ plotly already installed
✓ statsmodels already installed
🚀 GPU acceleration available!
   GPUs detected: 1
   GPU 0: /physical_device:GPU:0
📊 Loading and preprocessing data...
   Original data shape: (224868, 21)
   Processed data shape: (224868, 21)
   Date range: 2016-07-01 10:00:00 to 2023-03-31 23:00:00
   AQI range: 0.0 to 500.0
✂️  Splitting data...
   Train set: 157407 samples (70.0%)
   Validation set: 33730 samples (15.0%)
   Test set: 33731 samples (15.0%)
🏃‍♂️ Starting model training...


Training Models:   0%|           | 0/4 [00:00<?]


1️⃣  TRAINING ARIMA MODEL


Training ARIMA:   0%|           | 0/4 [00:00<?]

✓ ARIMA(1, 1, 1) model fitted successfully!
  AIC: 1612107.68
  Data points: 157407
✓ ARIMA model saved to /content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/modelsarima_model.pkl
✓ ARIMA training completed in 90.10 seconds

2️⃣  TRAINING LSTM MODEL
🔧 Setting up GPU acceleration for Google Colab...
✓ GPU acceleration enabled!
  Available GPUs: 1
  GPU Names: ['/physical_device:GPU:0']
  Mixed precision: mixed_float16
  GPU Details: Tesla T4
AQI                         int64
AT (degree C)             float64
BP (mmHg)                 float64
Benzene (ug/m3)           float64
From Date          datetime64[ns]
Main Pollutant             object
NO (ug/m3)                float64
NOx (ug/m3)               float64
RF (mm)                   float64
RH (%)                    float64
SR (W/mt2)                float64
Severity                   object
Toluene (ug/m3)           float64
WD (degree)               float64
WS (m/s)                  float64
file_name                  object
state 

Training LSTM:   0%|           | 0/5 [00:00<?]

✗ Error training LSTM model: float() argument must be a string or a real number, not 'Timestamp'
✗ LSTM training failed: float() argument must be a string or a real number, not 'Timestamp'

3️⃣  TRAINING XGBOOST MODEL
🔧 Setting up XGBoost GPU acceleration...
✓ NVIDIA GPU detected!
  GPU acceleration will be used for XGBoost
  GPU: 0 Tesla T4


Training XGBoost:   0%|           | 0/6 [00:00<?]

✗ Error training XGBoost models: y should be a 1d array, got an array of shape () instead.
✗ XGBoost training failed: y should be a 1d array, got an array of shape () instead.

4️⃣  TRAINING RANDOM FOREST MODEL
🔧 Setting up Random Forest with acceleration options...
⚠️  cuML not available. Install with: pip install cuml-cu11
   Using standard scikit-learn Random Forest
📊 Using all 2 CPU cores for parallel processing


Training Random Forest:   0%|           | 0/6 [00:00<?]

✗ Error training Random Forest models: y should be a 1d array, got an array of shape () instead.
✗ Random Forest training failed: y should be a 1d array, got an array of shape () instead.

📊 MODEL EVALUATION
Model           AQI RMSE   AQI MAE    AQI R²     Training Time  
----------------------------------------------------------------------
arima           71.36      50.84      -0.951     90.10          s

🏆 Best model: arima (RMSE: 71.36)

📋 TRAINING SUMMARY
Environment: GPU Accelerated
Models trained: 1
Total training time: 90.10 seconds

Performance Ranking (by RMSE):
  1. arima: 71.36

Speed Ranking (by training time):
  1. arima: 90.10s

✅ Training pipeline completed!


NameError: name 'Path' is not defined