In [4]:
# Install third-party packages into the notebook runtime via shell escapes.
# tqdm, opencage, scikit-learn and tensorflow/keras are all imported in the cell below.
!pip install tqdm
!pip install opencage
!pip install scikit-learn
!pip install tensorflow
!pip install keras



In [10]:
import pandas as pd
import numpy as np
import time
from geopy.geocoders import Nominatim
from tqdm.notebook import tqdm
from opencage.geocoder import OpenCageGeocode
from google.colab import userdata
import requests
import os
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
import joblib
import warnings
# Arima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error, mean_absolute_error
# LSTM
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Random forest
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier


warnings.filterwarnings('ignore')

In [3]:
# Mount Google Drive at /content/drive (only works inside a Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
class ARIMAModel:
    """
    Thin wrapper around statsmodels' ARIMA for univariate AQI forecasting.

    The wrapper expects DataFrames with a 'From Date' column (anything
    ``pd.to_datetime`` accepts) and an 'AQI' column. All public methods other
    than ``predict``/``evaluate``/``save_model`` precondition checks report
    failures by printing a message and returning ``None`` values instead of
    raising, so callers must check the return value.
    """

    def __init__(self, order=(1, 1, 1)):
        """
        Initialize ARIMA model for AQI forecasting

        Parameters:
        order: tuple (p, d, q) for ARIMA parameters
        """
        self.order = order
        self.model = None          # unfitted statsmodels ARIMA object (set by fit)
        self.fitted_model = None   # fitted results object (set by fit / load_model)
        self.is_fitted = False

    def check_stationarity(self, timeseries):
        """
        Run the Augmented Dickey-Fuller test and print its outcome.

        Parameters:
        timeseries: pandas Series; NaN values are dropped before testing

        Returns:
        True when the ADF p-value is <= 0.05 (series treated as stationary),
        False otherwise.
        """
        result = adfuller(timeseries.dropna())
        adf_stat, p_value, critical_values = result[0], result[1], result[4]

        print('ADF Statistic:', adf_stat)
        print('p-value:', p_value)
        print('Critical Values:')
        for key, value in critical_values.items():
            print(f'\t{key}: {value}')

        is_stationary = p_value <= 0.05
        print("Series is stationary" if is_stationary else "Series is non-stationary")
        return is_stationary

    def prepare_data(self, df):
        """
        Build the date-indexed, gap-filled AQI series used by the model.

        The caller's DataFrame is left untouched: only the two required
        columns are copied before any conversion takes place.

        Parameters:
        df: DataFrame with 'From Date' and 'AQI' columns

        Returns:
        pandas Series of AQI values indexed by 'From Date' (as datetimes),
        sorted by date, with missing values forward- then backward-filled.
        """
        # Work on a private copy so the datetime conversion does not leak
        # back into the caller's frame.
        series_frame = df[['From Date', 'AQI']].copy()
        series_frame['From Date'] = pd.to_datetime(series_frame['From Date'])
        series_frame = series_frame.set_index('From Date').sort_index()

        # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
        return series_frame['AQI'].ffill().bfill()

    def fit(self, train_data):
        """
        Fit ARIMA model to training data

        Parameters:
        train_data: DataFrame with 'From Date' and 'AQI' columns

        Returns:
        The fitted statsmodels results object, or None when fitting failed
        (the error is printed, not raised).
        """
        try:
            # Prepare time series data
            ts_data = self.prepare_data(train_data)

            # Informational only: the result does not change the ARIMA order
            self.check_stationarity(ts_data)

            # Fit ARIMA model
            self.model = ARIMA(ts_data, order=self.order)
            self.fitted_model = self.model.fit()
            self.is_fitted = True

            print(f"ARIMA{self.order} model fitted successfully")
            print(self.fitted_model.summary())

            return self.fitted_model

        except Exception as e:
            print(f"Error fitting ARIMA model: {str(e)}")
            return None

    def predict(self, steps=1):
        """
        Forecast ``steps`` periods past the end of the training series.

        Parameters:
        steps: number of out-of-sample periods to forecast

        Returns:
        (forecast, conf_int) where forecast is the predicted mean and
        conf_int the matching confidence intervals; (None, None) when the
        forecast failed (the error is printed).

        Raises:
        ValueError: if the model has not been fitted or loaded.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        try:
            # One get_forecast call supplies both the point forecast and its
            # confidence interval, so the forecast is not computed twice.
            forecast_result = self.fitted_model.get_forecast(steps=steps)
            return forecast_result.predicted_mean, forecast_result.conf_int()

        except Exception as e:
            print(f"Error making predictions: {str(e)}")
            return None, None

    def evaluate(self, test_data):
        """
        Evaluate model performance

        Forecasts ``len(test_data)`` steps ahead and compares them
        positionally with the date-sorted test AQI values.
        NOTE(review): this presumes the test rows directly follow the
        training series at the same sampling interval - confirm with callers.

        Parameters:
        test_data: DataFrame with 'From Date' and 'AQI' columns

        Returns:
        (metrics, predictions) where metrics has keys 'MSE', 'RMSE', 'MAE';
        (None, None) when evaluation failed (the error is printed).

        Raises:
        ValueError: if the model has not been fitted or loaded.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before evaluation")

        try:
            # Prepare test data
            ts_test = self.prepare_data(test_data)

            # Make predictions
            predictions, _ = self.predict(steps=len(ts_test))

            # Calculate metrics
            actual = ts_test.values
            mse = mean_squared_error(actual, predictions)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(actual, predictions)

            metrics = {
                'MSE': mse,
                'RMSE': rmse,
                'MAE': mae
            }

            print("ARIMA Model Evaluation:")
            print(f"RMSE: {rmse:.4f}")
            print(f"MAE: {mae:.4f}")

            return metrics, predictions

        except Exception as e:
            print(f"Error evaluating model: {str(e)}")
            return None, None

    def save_model(self, filepath):
        """
        Save the fitted model

        Parameters:
        filepath: destination path handed to joblib.dump

        Raises:
        ValueError: if the model has not been fitted or loaded.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before saving")

        try:
            model_data = {
                'fitted_model': self.fitted_model,
                'order': self.order,
                'is_fitted': self.is_fitted
            }
            joblib.dump(model_data, filepath)
            print(f"ARIMA model saved to {filepath}")

        except Exception as e:
            print(f"Error saving model: {str(e)}")

    def load_model(self, filepath):
        """
        Load a saved model

        Restores 'fitted_model', 'order' and 'is_fitted' from a file written
        by save_model. Failures are printed, not raised.

        Parameters:
        filepath: path of the joblib file to read
        """
        try:
            # SECURITY: joblib.load unpickles arbitrary objects - only load
            # files that come from a trusted source.
            model_data = joblib.load(filepath)
            self.fitted_model = model_data['fitted_model']
            self.order = model_data['order']
            self.is_fitted = model_data['is_fitted']
            print(f"ARIMA model loaded from {filepath}")

        except Exception as e:
            print(f"Error loading model: {str(e)}")

In [12]:
class LSTMModel:
    """
    Three LSTM networks sharing one preprocessing pipeline.

    * ``regression_model``     - predicts the numeric AQI
    * ``classification_model`` - predicts the 'Severity' class
    * ``severity_model``       - predicts the 'Main Pollutant' class

    NOTE: the attribute names of the two classifiers are historical and do
    not match their roles; they are kept so existing callers and saved files
    keep working.

    Input frames need a 'From Date' column; training/evaluation frames also
    need 'AQI', 'Severity' and 'Main Pollutant'.
    """

    def __init__(self, sequence_length=24, lstm_units=50, dropout_rate=0.2):
        """
        Initialize LSTM model for AQI prediction

        Parameters:
        sequence_length: Number of time steps to look back
        lstm_units: Number of LSTM units
        dropout_rate: Dropout rate for regularization
        """
        self.sequence_length = sequence_length
        self.lstm_units = lstm_units
        self.dropout_rate = dropout_rate
        self.regression_model = None
        self.classification_model = None  # severity classifier (see class docstring)
        self.severity_model = None        # main-pollutant classifier (see class docstring)
        self.scaler = StandardScaler()
        self.severity_encoder = LabelEncoder()
        self.pollutant_encoder = LabelEncoder()
        self.feature_cols = None          # set by fit() / load_model()
        self.is_fitted = False

    def prepare_data(self, df, fit_encoders=None):
        """
        Prepare data for LSTM modeling

        Returns a date-sorted copy of ``df`` with time features added, the
        known numeric columns median-imputed and the label columns encoded.
        The caller's frame is not modified.

        Parameters:
        df: DataFrame with a 'From Date' column
        fit_encoders: True  -> (re)fit the label encoders on this frame;
                      False -> reuse the already fitted encoders, encoding
                               only the label columns that are present;
                      None  -> fit only while the model is not fitted yet.

        Returns:
        The prepared DataFrame.
        """
        if fit_encoders is None:
            fit_encoders = not self.is_fitted

        # Copy first: every step below assigns columns.
        df = df.copy()

        # Convert date and sort
        df['From Date'] = pd.to_datetime(df['From Date'])
        df = df.sort_values('From Date')

        # Add time features
        timestamps = df['From Date'].dt
        df['Hour'] = timestamps.hour
        df['Day'] = timestamps.day
        df['Month'] = timestamps.month
        df['Year'] = timestamps.year
        df['DayOfWeek'] = timestamps.dayofweek

        # Handle missing values (median of the frame being prepared)
        numeric_cols = ['AT (degree C)', 'BP (mmHg)', 'Benzene (ug/m3)', 'NO (ug/m3)',
                        'NOx (ug/m3)', 'RF (mm)', 'RH (%)', 'SR (W/mt2)', 'Toluene (ug/m3)',
                        'WD (degree)', 'WS (m/s)']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = df[col].fillna(df[col].median())

        # Encode categorical variables. Encoders must only be fitted on the
        # training frame; refitting them on validation/test data would change
        # the label <-> integer mapping the networks were trained with.
        label_specs = (
            ('Severity', 'Severity_encoded', self.severity_encoder),
            ('Main Pollutant', 'Main_Pollutant_encoded', self.pollutant_encoder),
        )
        for source_col, encoded_col, encoder in label_specs:
            if fit_encoders:
                df[encoded_col] = encoder.fit_transform(df[source_col])
            elif source_col in df.columns:
                df[encoded_col] = encoder.transform(df[source_col])

        return df

    def _feature_windows(self, data, feature_cols):
        """Return the sliding feature windows; one window per row from index sequence_length on."""
        # Extract the array once instead of slicing the DataFrame per window.
        values = data[feature_cols].values
        span = self.sequence_length
        return np.array([values[end - span:end] for end in range(span, len(data))])

    def _window_targets(self, data, target_col):
        """Return the target values aligned with the windows from _feature_windows."""
        return np.array(data[target_col].values[self.sequence_length:])

    def create_sequences(self, data, target_col, feature_cols):
        """
        Create sequences for LSTM input

        Parameters:
        data: prepared DataFrame (rows already in time order)
        target_col: name of the column to predict
        feature_cols: list of feature column names

        Returns:
        (X, y): X holds, for every row i >= sequence_length, the feature
        rows [i - sequence_length, i); y holds the target value at row i.
        """
        return self._feature_windows(data, feature_cols), self._window_targets(data, target_col)

    def _build_network(self, n_features, output_layer, loss, metrics):
        """Build and compile the shared two-layer LSTM stack topped with ``output_layer``."""
        model = Sequential([
            LSTM(self.lstm_units, return_sequences=True,
                 input_shape=(self.sequence_length, n_features)),
            Dropout(self.dropout_rate),
            LSTM(self.lstm_units // 2, return_sequences=False),
            Dropout(self.dropout_rate),
            Dense(25, activation='relu'),
            Dropout(self.dropout_rate),
            output_layer,
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss=loss, metrics=metrics)
        return model

    def build_regression_model(self, n_features):
        """
        Build LSTM model for AQI regression

        Parameters:
        n_features: number of feature columns per time step

        Returns:
        Compiled Keras model with a single linear output (MSE loss, MAE metric).
        """
        return self._build_network(n_features, Dense(1, activation='linear'),
                                   loss='mse', metrics=['mae'])

    def build_classification_model(self, n_features, n_classes):
        """
        Build LSTM model for classification

        Parameters:
        n_features: number of feature columns per time step
        n_classes: width of the softmax output

        Returns:
        Compiled Keras model (sparse categorical cross-entropy, accuracy metric).
        """
        return self._build_network(n_features, Dense(n_classes, activation='softmax'),
                                   loss='sparse_categorical_crossentropy',
                                   metrics=['accuracy'])

    def _make_callbacks(self):
        """Create fresh validation-driven callbacks so no state is shared between trainings."""
        return [
            EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.0001),
        ]

    def _train(self, model, X_train, y_train, X_val, y_val, epochs, batch_size):
        """Train one network, using validation data and callbacks only when X_val is given."""
        has_validation = X_val is not None
        return model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val) if has_validation else None,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=self._make_callbacks() if has_validation else [],
            verbose=1
        )

    def fit(self, train_data, validation_data=None, epochs=100, batch_size=32):
        """
        Fit LSTM models

        Parameters:
        train_data: DataFrame with 'From Date', 'AQI', 'Severity', 'Main Pollutant'
                    and any of the known feature columns
        validation_data: optional DataFrame with the same columns; enables
                         early stopping and learning-rate reduction
        epochs: maximum number of epochs per network
        batch_size: mini-batch size

        Returns:
        (history_reg, history_sev, history_pol) Keras History objects, or
        (None, None, None) when training failed (the error is printed).
        """
        try:
            # Prepare data; the training frame is the only one allowed to fit the encoders
            train_df = self.prepare_data(train_data, fit_encoders=True)

            # Select features
            feature_cols = ['AT (degree C)', 'BP (mmHg)', 'Benzene (ug/m3)', 'NO (ug/m3)',
                            'NOx (ug/m3)', 'RF (mm)', 'RH (%)', 'SR (W/mt2)', 'Toluene (ug/m3)',
                            'WD (degree)', 'WS (m/s)', 'Hour', 'Day', 'Month', 'Year', 'DayOfWeek',
                            'latitude', 'longitude', 'elevation']

            # Filter available columns
            available_features = [col for col in feature_cols if col in train_df.columns]

            # Scale features
            train_df[available_features] = self.scaler.fit_transform(train_df[available_features])

            # Create sequences (windows are built once and shared by all three targets)
            X_train = self._feature_windows(train_df, available_features)
            y_aqi = self._window_targets(train_df, 'AQI')
            y_severity = self._window_targets(train_df, 'Severity_encoded')
            y_pollutant = self._window_targets(train_df, 'Main_Pollutant_encoded')

            if len(X_train) == 0:
                raise ValueError("Not enough data to create sequences")

            # Build models. The softmax width comes from the encoders: a class
            # that only occurs in the first `sequence_length` rows (or only in
            # validation data) never shows up in the windowed targets, yet its
            # integer code must still be a valid output index.
            n_features = len(available_features)
            self.regression_model = self.build_regression_model(n_features)
            self.classification_model = self.build_classification_model(
                n_features, len(self.severity_encoder.classes_))
            self.severity_model = self.build_classification_model(
                n_features, len(self.pollutant_encoder.classes_))

            # Prepare validation data if provided
            X_val, y_val_aqi, y_val_severity, y_val_pollutant = None, None, None, None
            if validation_data is not None:
                val_df = self.prepare_data(validation_data, fit_encoders=False)
                val_df[available_features] = self.scaler.transform(val_df[available_features])
                val_windows = self._feature_windows(val_df, available_features)
                if len(val_windows) == 0:
                    # Too few rows for a single window: train without validation
                    print("Not enough validation data to create sequences; training without validation")
                else:
                    X_val = val_windows
                    y_val_aqi = self._window_targets(val_df, 'AQI')
                    y_val_severity = self._window_targets(val_df, 'Severity_encoded')
                    y_val_pollutant = self._window_targets(val_df, 'Main_Pollutant_encoded')

            # Train AQI regression model
            print("Training AQI regression model...")
            history_reg = self._train(self.regression_model, X_train, y_aqi,
                                      X_val, y_val_aqi, epochs, batch_size)

            # Train severity classification model
            print("Training severity classification model...")
            history_sev = self._train(self.classification_model, X_train, y_severity,
                                      X_val, y_val_severity, epochs, batch_size)

            # Train pollutant classification model
            print("Training pollutant classification model...")
            history_pol = self._train(self.severity_model, X_train, y_pollutant,
                                      X_val, y_val_pollutant, epochs, batch_size)

            self.is_fitted = True
            self.feature_cols = available_features
            print("LSTM models trained successfully")

            return history_reg, history_sev, history_pol

        except Exception as e:
            print(f"Error training LSTM models: {str(e)}")
            return None, None, None

    def predict(self, test_data):
        """
        Make predictions using fitted LSTM models

        Parameters:
        test_data: DataFrame with 'From Date' and the feature columns used in
                   training; label columns are ignored if present

        Returns:
        (aqi_pred, severity_labels, pollutant_labels), one entry per window
        (i.e. per row from index sequence_length on, in date order), or
        (None, None, None) when prediction failed (the error is printed).

        Raises:
        ValueError: if the models have not been fitted or loaded.
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before making predictions")

        try:
            # Labels are not needed to predict; dropping them also keeps
            # unseen label values from failing the encoder.
            features_only = test_data.drop(columns=['Severity', 'Main Pollutant'], errors='ignore')

            # Prepare test data
            test_df = self.prepare_data(features_only, fit_encoders=False)
            test_df[self.feature_cols] = self.scaler.transform(test_df[self.feature_cols])

            # Create sequences
            X_test = self._feature_windows(test_df, self.feature_cols)

            if len(X_test) == 0:
                raise ValueError("Not enough test data to create sequences")

            # Make predictions and convert them to their original format
            aqi_pred = self.regression_model.predict(X_test).flatten()
            severity_pred = np.argmax(self.classification_model.predict(X_test), axis=1)
            pollutant_pred = np.argmax(self.severity_model.predict(X_test), axis=1)

            # Decode categorical predictions
            severity_labels = self.severity_encoder.inverse_transform(severity_pred)
            pollutant_labels = self.pollutant_encoder.inverse_transform(pollutant_pred)

            return aqi_pred, severity_labels, pollutant_labels

        except Exception as e:
            print(f"Error making predictions: {str(e)}")
            return None, None, None

    def evaluate(self, test_data):
        """
        Evaluate model performance

        Parameters:
        test_data: DataFrame with 'From Date', 'AQI', 'Severity',
                   'Main Pollutant' and the training feature columns

        Returns:
        dict with 'AQI_RMSE', 'Severity_Accuracy', 'Pollutant_Accuracy', or
        None when evaluation failed (the error is printed).

        Raises:
        ValueError: if the models have not been fitted or loaded.
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before evaluation")

        try:
            # Ground truth aligned with the prediction windows, encoded with
            # the mapping learned during training.
            test_df = self.prepare_data(test_data, fit_encoders=False)
            y_aqi = self._window_targets(test_df, 'AQI')
            y_severity = self._window_targets(test_df, 'Severity_encoded')
            y_pollutant = self._window_targets(test_df, 'Main_Pollutant_encoded')

            # Make predictions
            aqi_pred, severity_pred, pollutant_pred = self.predict(test_data)

            # Calculate metrics
            aqi_rmse = np.sqrt(mean_squared_error(y_aqi, aqi_pred))
            severity_acc = accuracy_score(y_severity,
                                          self.severity_encoder.transform(severity_pred))
            pollutant_acc = accuracy_score(y_pollutant,
                                           self.pollutant_encoder.transform(pollutant_pred))

            metrics = {
                'AQI_RMSE': aqi_rmse,
                'Severity_Accuracy': severity_acc,
                'Pollutant_Accuracy': pollutant_acc
            }

            print("LSTM Model Evaluation:")
            print(f"AQI RMSE: {aqi_rmse:.4f}")
            print(f"Severity Accuracy: {severity_acc:.4f}")
            print(f"Pollutant Accuracy: {pollutant_acc:.4f}")

            return metrics

        except Exception as e:
            print(f"Error evaluating models: {str(e)}")
            return None

    def save_model(self, filepath_prefix):
        """
        Save the fitted models

        Writes three Keras files (``_regression.h5``, ``_severity.h5``,
        ``_pollutant.h5``) and one joblib file (``_preprocessing.pkl``), all
        prefixed with ``filepath_prefix``. Failures are printed, not raised.

        Raises:
        ValueError: if the models have not been fitted or loaded.
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before saving")

        try:
            # Save Keras models
            self.regression_model.save(f"{filepath_prefix}_regression.h5")
            self.classification_model.save(f"{filepath_prefix}_severity.h5")
            self.severity_model.save(f"{filepath_prefix}_pollutant.h5")

            # Save preprocessing objects
            preprocessing_data = {
                'scaler': self.scaler,
                'severity_encoder': self.severity_encoder,
                'pollutant_encoder': self.pollutant_encoder,
                'feature_cols': self.feature_cols,
                'sequence_length': self.sequence_length,
                'lstm_units': self.lstm_units,
                'dropout_rate': self.dropout_rate
            }
            joblib.dump(preprocessing_data, f"{filepath_prefix}_preprocessing.pkl")

            print(f"LSTM models saved with prefix {filepath_prefix}")

        except Exception as e:
            print(f"Error saving models: {str(e)}")

    def load_model(self, filepath_prefix):
        """
        Load saved models

        Reads the four files written by save_model and marks the instance as
        fitted. Failures are printed, not raised.
        """
        try:
            # Load Keras models
            self.regression_model = tf.keras.models.load_model(f"{filepath_prefix}_regression.h5")
            self.classification_model = tf.keras.models.load_model(f"{filepath_prefix}_severity.h5")
            self.severity_model = tf.keras.models.load_model(f"{filepath_prefix}_pollutant.h5")

            # Load preprocessing objects.
            # SECURITY: joblib.load unpickles arbitrary objects - only load
            # files that come from a trusted source.
            preprocessing_data = joblib.load(f"{filepath_prefix}_preprocessing.pkl")
            self.scaler = preprocessing_data['scaler']
            self.severity_encoder = preprocessing_data['severity_encoder']
            self.pollutant_encoder = preprocessing_data['pollutant_encoder']
            self.feature_cols = preprocessing_data['feature_cols']
            self.sequence_length = preprocessing_data['sequence_length']
            self.lstm_units = preprocessing_data['lstm_units']
            self.dropout_rate = preprocessing_data['dropout_rate']

            self.is_fitted = True
            print(f"LSTM models loaded from {filepath_prefix}")

        except Exception as e:
            print(f"Error loading models: {str(e)}")

In [13]:
class RandomForestModel:
    def __init__(self, n_estimators=100, max_depth=None, random_state=42, n_jobs=-1):
        """
        Set up hyper-parameters and empty model/preprocessing slots.

        Parameters:
        n_estimators: how many trees each forest grows
        max_depth: depth limit per tree (None leaves it unlimited)
        random_state: seed forwarded to the forests for reproducibility
        n_jobs: parallel worker count forwarded to the forests
        """
        # Preprocessing objects and bookkeeping; feature_cols stays None and
        # is_fitted stays False until training has happened.
        self.scaler = StandardScaler()
        self.severity_encoder, self.pollutant_encoder = LabelEncoder(), LabelEncoder()
        self.feature_cols = None
        self.is_fitted = False

        # The three estimators start out unset.
        self.regression_model = self.severity_classifier = self.pollutant_classifier = None

        # Hyper-parameters shared by all three forests.
        self.n_estimators, self.max_depth = n_estimators, max_depth
        self.random_state, self.n_jobs = random_state, n_jobs

    def prepare_data(self, df):
        """
        Prepare data for Random Forest modeling
        """
        # Convert date and add time features
        df['From Date'] = pd.to_datetime(df['From Date'])
        df['Hour'] = df['From Date'].dt.hour
        df['Day'] = df['From Date'].dt.day
        df['Month'] = df['From Date'].dt.month
        df['Year'] = df['From Date'].dt.year
        df['DayOfWeek'] = df['From Date'].dt.dayofweek
        df['Quarter'] = df['From Date'].dt.quarter
        df['DayOfYear'] = df['From Date'].dt.dayofyear

        # Handle missing values
        numeric_cols = ['AT (degree C)', 'BP (mmHg)', 'Benzene (ug/m3)', 'NO (ug/m3)',
                        'NOx (ug/m3)', 'RF (mm)', 'RH (%)', 'SR (W/mt2)', 'Toluene (ug/m3)',
                        'WD (degree)', 'WS (m/s)']

        for col in numeric_cols:
            if col in df.columns:
                df[col] = df[col].fillna(df[col].median())

        # Encode categorical variables
        if not self.is_fitted:
            df['Severity_encoded'] = self.severity_encoder.fit_transform(df['Severity'])
            df['Main_Pollutant_encoded'] = self.pollutant_encoder.fit_transform(df['Main Pollutant'])
        else:
            df['Severity_encoded'] = self.severity_encoder.transform(df['Severity'])
            df['Main_Pollutant_encoded'] = self.pollutant_encoder.transform(df['Main Pollutant'])

        return df

    def create_features(self, df):
        """
        Create additional features for better prediction
        """
        # Time-based features
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)
        df['IsRushHour'] = ((df['Hour'] >= 7) & (df['Hour'] <= 9) |
                           (df['Hour'] >= 17) & (df['Hour'] <= 19)).astype(int)

        # Seasonal features
        df['Season'] = pd.cut(df['Month'], bins=[0, 3, 6, 9, 12],
                             labels=['Winter', 'Spring', 'Summer', 'Fall'])
        df['Season_encoded'] = pd.Categorical(df['Season']).codes

        # Lagged features (if enough data)
        if len(df) > 24:
            df['AQI_lag1'] = df['AQI'].shift(1)
            df['AQI_lag24'] = df['AQI'].shift(24)
            df['AQI_rolling_mean_24'] = df['AQI'].rolling(window=24).mean()
            df['AQI_rolling_std_24'] = df['AQI'].rolling(window=24).std()
            df['AQI_rolling_max_24'] = df['AQI'].rolling(window=24).max()
            df['AQI_rolling_min_24'] = df['AQI'].rolling(window=24).min()

        # Pollutant ratios and interactions
        if 'NO (ug/m3)' in df.columns and 'NOx (ug/m3)' in df.columns:
            df['NO_NOx_ratio'] = df['NO (ug/m3)'] / (df['NOx (ug/m3)'] + 1e-8)

        if 'Benzene (ug/m3)' in df.columns and 'Toluene (ug/m3)' in df.columns:
            df['Benzene_Toluene_ratio'] = df['Benzene (ug/m3)'] / (df['Toluene (ug/m3)'] + 1e-8)

        # Weather interactions
        if 'AT (degree C)' in df.columns and 'RH (%)' in df.columns:
            df['Temp_Humidity_interaction'] = df['AT (degree C)'] * df['RH (%)']

        if 'WS (m/s)' in df.columns and 'WD (degree)' in df.columns:
            df['Wind_component_x'] = df['WS (m/s)'] * np.cos(np.radians(df['WD (degree)']))
            df['Wind_component_y'] = df['WS (m/s)'] * np.sin(np.radians(df['WD (degree)']))

        # Solar radiation and temperature interaction
        if 'SR (W/mt2)' in df.columns and 'AT (degree C)' in df.columns:
            df['Solar_Temp_interaction'] = df['SR (W/mt2)'] * df['AT (degree C)']

        # Air pressure and humidity interaction
        if 'BP (mmHg)' in df.columns and 'RH (%)' in df.columns:
            df['Pressure_Humidity_interaction'] = df['BP (mmHg)'] * df['RH (%)']

        return df

    def fit(self, train_data):
        """
        Fit Random Forest models

        Trains one regressor (AQI) and two classifiers (severity, main
        pollutant) on the prepared and feature-engineered training frame.

        Parameters:
        train_data: DataFrame with 'From Date', 'AQI', 'Severity',
                    'Main Pollutant' and any of the known feature columns;
                    it is copied before processing

        Returns:
        (regression_model, severity_classifier, pollutant_classifier), or
        (None, None, None) when training failed (the error is printed).
        """
        try:
            # prepare_data() fits the label encoders only while is_fitted is
            # False, so clear the flag to make a re-fit learn the encoding of
            # the new training labels instead of reusing the previous one.
            self.is_fitted = False

            # Prepare data
            train_df = self.create_features(self.prepare_data(train_data.copy()))

            # Select features
            base_features = ['AT (degree C)', 'BP (mmHg)', 'Benzene (ug/m3)', 'NO (ug/m3)',
                             'NOx (ug/m3)', 'RF (mm)', 'RH (%)', 'SR (W/mt2)', 'Toluene (ug/m3)',
                             'WD (degree)', 'WS (m/s)', 'Hour', 'Day', 'Month', 'Year', 'DayOfWeek',
                             'Quarter', 'DayOfYear', 'IsWeekend', 'IsRushHour', 'Season_encoded',
                             'latitude', 'longitude', 'elevation']

            # Add engineered features
            engineered_features = ['AQI_lag1', 'AQI_lag24', 'AQI_rolling_mean_24', 'AQI_rolling_std_24',
                                   'AQI_rolling_max_24', 'AQI_rolling_min_24', 'NO_NOx_ratio',
                                   'Benzene_Toluene_ratio', 'Temp_Humidity_interaction', 'Wind_component_x',
                                   'Wind_component_y', 'Solar_Temp_interaction', 'Pressure_Humidity_interaction']

            all_features = base_features + engineered_features

            # Filter available columns
            self.feature_cols = [col for col in all_features if col in train_df.columns]

            # Remove rows with NaN values (from lagged features). Only the
            # columns the models actually consume are inspected, so gaps in
            # unrelated columns of the raw data cannot discard training rows.
            target_cols = ['AQI', 'Severity_encoded', 'Main_Pollutant_encoded']
            train_df = train_df.dropna(subset=self.feature_cols + target_cols)

            if len(train_df) == 0:
                raise ValueError("No data remaining after removing NaN values")

            # Prepare features and targets
            X_train = train_df[self.feature_cols]
            y_aqi = train_df['AQI']
            y_severity = train_df['Severity_encoded']
            y_pollutant = train_df['Main_Pollutant_encoded']

            # Scale features (Random Forest doesn't require scaling, but we'll keep it for consistency)
            X_train_scaled = self.scaler.fit_transform(X_train)

            # Initialize models; all three forests share the same settings
            forest_params = dict(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
                oob_score=True
            )
            self.regression_model = RandomForestRegressor(**forest_params)
            self.severity_classifier = RandomForestClassifier(**forest_params)
            self.pollutant_classifier = RandomForestClassifier(**forest_params)

            # Fit models
            print("Training AQI regression model...")
            self.regression_model.fit(X_train_scaled, y_aqi)
            print(f"OOB Score (R²): {self.regression_model.oob_score_:.4f}")

            print("Training severity classification model...")
            self.severity_classifier.fit(X_train_scaled, y_severity)
            print(f"OOB Score (Accuracy): {self.severity_classifier.oob_score_:.4f}")

            print("Training pollutant classification model...")
            self.pollutant_classifier.fit(X_train_scaled, y_pollutant)
            print(f"OOB Score (Accuracy): {self.pollutant_classifier.oob_score_:.4f}")

            self.is_fitted = True
            print("Random Forest models trained successfully")

            # Feature importance
            print("\nTop 10 most important features for AQI prediction:")
            feature_importance = self.regression_model.feature_importances_
            top_indices = np.argsort(feature_importance)[::-1][:10]
            for rank, idx in enumerate(top_indices, start=1):
                print(f"{rank}. {self.feature_cols[idx]}: {feature_importance[idx]:.4f}")

            return self.regression_model, self.severity_classifier, self.pollutant_classifier

        except Exception as e:
            print(f"Error training Random Forest models: {str(e)}")
            return None, None, None

    def predict(self, test_data):
        """
        Make predictions using fitted Random Forest models
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before making predictions")

        try:
            # Prepare test data
            test_df = self.prepare_data(test_data.copy())
            test_df = self.create_features(test_df)

            # Handle missing lagged features for prediction
            if 'AQI_lag1' in self.feature_cols and 'AQI_lag1' not in test_df.columns:
                test_df['AQI_lag1'] = test_df['AQI'].shift(1).fillna(test_df['AQI'].mean())
            if 'AQI_lag24' in self.feature_cols and 'AQI_lag24' not in test_df.columns:
                test_df['AQI_lag24'] = test_df['AQI'].shift(24).fillna(test_df['AQI'].mean())

            # Handle rolling features
            rolling_features = ['AQI_rolling_mean_24', 'AQI_rolling_std_24', 'AQI_rolling_max_24', 'AQI_rolling_min_24']
            for feature in rolling_features:
                if feature in self.feature_cols and feature not in test_df.columns:
                    if 'mean' in feature:
                        test_df[feature] = test_df['AQI'].rolling(window=24).mean().fillna(test_df['AQI'].mean())
                    elif 'std' in feature:
                        test_df[feature] = test_df['AQI'].rolling(window=24).std().fillna(test_df['AQI'].std())
                    elif 'max' in feature:
                        test_df[feature] = test_df['AQI'].rolling(window=24).max().fillna(test_df['AQI'].max())
                    elif 'min' in feature:
                        test_df[feature] = test_df['AQI'].rolling(window=24).min().fillna(test_df['AQI'].min())

            # Fill remaining NaN values
            test_df = test_df.fillna(method='ffill').fillna(method='bfill')

            # Prepare features
            X_test = test_df[self.feature_cols]
            X_test_scaled = self.scaler.transform(X_test)

            # Make predictions
            aqi_pred = self.regression_model.predict(X_test_scaled)
            severity_pred = self.severity_classifier.predict(X_test_scaled)
            pollutant_pred = self.pollutant_classifier.predict(X_test_scaled)

            # Get prediction probabilities for additional information
            severity_proba = self.severity_classifier.predict_proba(X_test_scaled)
            pollutant_proba = self.pollutant_classifier.predict_proba(X_test_scaled)

            # Decode categorical predictions
            severity_labels = self.severity_encoder.inverse_transform(severity_pred)
            pollutant_labels = self.pollutant_encoder.inverse_transform(pollutant_pred)

            return aqi_pred, severity_labels, pollutant_labels, severity_proba, pollutant_proba

        except Exception as e:
            print(f"Error making predictions: {str(e)}")
            return None, None, None, None, None

    def evaluate(self, test_data):
        """
        Evaluate model performance on labelled data.

        The data goes through prepare_data() and create_features(); rows that
        still contain NaN afterwards are excluded from scoring. predict() is
        run on the full input and its outputs are filtered with the same row
        mask, so every prediction is scored against the ground truth of its
        own row.

        Parameters:
        test_data: DataFrame that, once prepared, provides the 'AQI',
                   'Severity_encoded' and 'Main_Pollutant_encoded' columns

        Returns:
        dict with the keys 'AQI_RMSE', 'AQI_MAE', 'AQI_R2',
        'Severity_Accuracy', 'Severity_F1', 'Pollutant_Accuracy' and
        'Pollutant_F1', or None when evaluation fails (the error is printed)

        Raises:
        ValueError: if the models have not been fitted
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before evaluation")

        try:
            # Prepare test data
            test_df = self.prepare_data(test_data.copy())
            test_df = self.create_features(test_df)

            # Mask of the rows dropna() would keep. The mask (rather than a
            # plain dropna) is kept so the predictions can be filtered the
            # same way: the NaN rows are not necessarily at the end, so
            # slicing the predictions from the start would pair them with
            # the wrong ground-truth rows.
            complete_rows = test_df.notna().all(axis=1).to_numpy()
            test_df = test_df[complete_rows]

            if len(test_df) == 0:
                raise ValueError("No test data remaining after removing NaN values")

            # Get actual values
            y_true_aqi = test_df['AQI'].to_numpy()
            y_true_severity = test_df['Severity_encoded'].to_numpy()
            y_true_pollutant = test_df['Main_Pollutant_encoded'].to_numpy()

            # Make predictions (predict() returns None values when it fails)
            aqi_pred, severity_pred, pollutant_pred, _, _ = self.predict(test_data)
            if aqi_pred is None:
                raise ValueError("Prediction failed; no predictions to evaluate")
            if len(aqi_pred) != len(complete_rows):
                raise ValueError("Number of predictions does not match the number of prepared rows")

            # Keep only the predictions that belong to the scored rows
            aqi_pred = np.asarray(aqi_pred)[complete_rows]
            severity_pred_encoded = self.severity_encoder.transform(
                np.asarray(severity_pred)[complete_rows])
            pollutant_pred_encoded = self.pollutant_encoder.transform(
                np.asarray(pollutant_pred)[complete_rows])

            # Regression metrics
            aqi_rmse = np.sqrt(mean_squared_error(y_true_aqi, aqi_pred))
            aqi_mae = np.mean(np.abs(y_true_aqi - aqi_pred))

            # Classification metrics
            severity_acc = accuracy_score(y_true_severity, severity_pred_encoded)
            pollutant_acc = accuracy_score(y_true_pollutant, pollutant_pred_encoded)
            severity_f1 = f1_score(y_true_severity, severity_pred_encoded, average='weighted')
            pollutant_f1 = f1_score(y_true_pollutant, pollutant_pred_encoded, average='weighted')

            # R² score; undefined (NaN) when the true values have no variance
            ss_res = np.sum((y_true_aqi - aqi_pred) ** 2)
            ss_tot = np.sum((y_true_aqi - np.mean(y_true_aqi)) ** 2)
            aqi_r2 = 1 - (ss_res / ss_tot) if ss_tot > 0 else float('nan')

            metrics = {
                'AQI_RMSE': aqi_rmse,
                'AQI_MAE': aqi_mae,
                'AQI_R2': aqi_r2,
                'Severity_Accuracy': severity_acc,
                'Severity_F1': severity_f1,
                'Pollutant_Accuracy': pollutant_acc,
                'Pollutant_F1': pollutant_f1
            }

            print("Random Forest Model Evaluation:")
            print(f"AQI RMSE: {aqi_rmse:.4f}")
            print(f"AQI MAE: {aqi_mae:.4f}")
            print(f"AQI R²: {aqi_r2:.4f}")
            print(f"Severity Accuracy: {severity_acc:.4f}")
            print(f"Severity F1: {severity_f1:.4f}")
            print(f"Pollutant Accuracy: {pollutant_acc:.4f}")
            print(f"Pollutant F1: {pollutant_f1:.4f}")

            return metrics

        except Exception as e:
            print(f"Error evaluating models: {str(e)}")
            return None

    def get_feature_importance(self):
        """
        Rank the regression model's features by importance.

        Returns:
        DataFrame with the columns 'feature' and 'importance', sorted by
        descending importance

        Raises:
        ValueError: if the models have not been fitted
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before getting feature importance")

        # Pair every feature name with the importance reported by the regressor
        ranking = pd.DataFrame({
            'feature': self.feature_cols,
            'importance': self.regression_model.feature_importances_
        })

        return ranking.sort_values(by='importance', ascending=False)

    def save_model(self, filepath_prefix):
        """
        Persist the fitted models and preprocessing objects with joblib.

        Four files are written, all starting with ``filepath_prefix``:
        "_regression.pkl", "_severity.pkl", "_pollutant.pkl" (one estimator
        each) and "_preprocessing.pkl" (scaler, encoders, feature list and
        hyperparameters in one dict). Errors raised while writing are
        printed, not propagated.

        Parameters:
        filepath_prefix: common path prefix of the files to write

        Raises:
        ValueError: if the models have not been fitted
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before saving")

        # (attribute holding the estimator, file name suffix), in write order
        estimator_files = (
            ('regression_model', 'regression'),
            ('severity_classifier', 'severity'),
            ('pollutant_classifier', 'pollutant'),
        )
        # Attributes bundled into the preprocessing file, keyed by their own name
        preprocessing_attrs = (
            'scaler', 'severity_encoder', 'pollutant_encoder', 'feature_cols',
            'n_estimators', 'max_depth', 'random_state', 'n_jobs',
        )

        try:
            # Save models
            for attr_name, suffix in estimator_files:
                joblib.dump(getattr(self, attr_name), f"{filepath_prefix}_{suffix}.pkl")

            # Save preprocessing objects
            preprocessing_data = {name: getattr(self, name) for name in preprocessing_attrs}
            joblib.dump(preprocessing_data, f"{filepath_prefix}_preprocessing.pkl")

            print(f"Random Forest models saved with prefix {filepath_prefix}")

        except Exception as e:
            print(f"Error saving models: {str(e)}")

    def load_model(self, filepath_prefix):
        """
        Load saved models and preprocessing objects.

        Reads the four files "<prefix>_regression.pkl", "<prefix>_severity.pkl",
        "<prefix>_pollutant.pkl" and "<prefix>_preprocessing.pkl". Everything
        is read and validated before the instance is modified, so a missing
        or incomplete file leaves the current state untouched instead of
        mixing newly loaded models with old preprocessing objects.

        NOTE: joblib.load unpickles the files, which can execute arbitrary
        code; only load files from a trusted source.

        Parameters:
        filepath_prefix: common path prefix of the files to read

        Returns:
        None. Failures are printed, not raised.
        """
        estimator_files = (
            ('regression_model', 'regression'),
            ('severity_classifier', 'severity'),
            ('pollutant_classifier', 'pollutant'),
        )
        preprocessing_keys = (
            'scaler', 'severity_encoder', 'pollutant_encoder', 'feature_cols',
            'n_estimators', 'max_depth', 'random_state', 'n_jobs',
        )

        try:
            # Load models into a staging dict first
            loaded = {
                attr_name: joblib.load(f"{filepath_prefix}_{suffix}.pkl")
                for attr_name, suffix in estimator_files
            }

            # Load preprocessing objects; a missing key raises here, before
            # anything on self has been replaced
            preprocessing_data = joblib.load(f"{filepath_prefix}_preprocessing.pkl")
            for key in preprocessing_keys:
                loaded[key] = preprocessing_data[key]

        except Exception as e:
            print(f"Error loading models: {str(e)}")
            return

        # Everything was read successfully: commit the new state in one go
        for attr_name, value in loaded.items():
            setattr(self, attr_name, value)

        self.is_fitted = True
        print(f"Random Forest models loaded from {filepath_prefix}")

In [14]:
class XGBoostModel:
    def __init__(self, n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42):
        """
        Initialize XGBoost models for AQI prediction

        Parameters:
        n_estimators: Number of boosting rounds
        max_depth: Maximum depth of trees
        learning_rate: Learning rate
        random_state: Random state for reproducibility
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.random_state = random_state

        # Models
        self.regression_model = None
        self.severity_classifier = None
        self.pollutant_classifier = None

        # Preprocessing
        self.scaler = StandardScaler()
        self.severity_encoder = LabelEncoder()
        self.pollutant_encoder = LabelEncoder()
        self.feature_cols = None
        self.is_fitted = False

    def prepare_data(self, df):
        """
        Prepare data for XGBoost modeling
        """
        # Convert date and add time features
        df['From Date'] = pd.to_datetime(df['From Date'])
        df['Hour'] = df['From Date'].dt.hour
        df['Day'] = df['From Date'].dt.day
        df['Month'] = df['From Date'].dt.month
        df['Year'] = df['From Date'].dt.year
        df['DayOfWeek'] = df['From Date'].dt.dayofweek
        df['Quarter'] = df['From Date'].dt.quarter

        # Handle missing values
        numeric_cols = ['AT (degree C)', 'BP (mmHg)', 'Benzene (ug/m3)', 'NO (ug/m3)',
                        'NOx (ug/m3)', 'RF (mm)', 'RH (%)', 'SR (W/mt2)', 'Toluene (ug/m3)',
                        'WD (degree)', 'WS (m/s)']

        for col in numeric_cols:
            if col in df.columns:
                df[col] = df[col].fillna(df[col].median())

        # Encode categorical variables
        if not self.is_fitted:
            df['Severity_encoded'] = self.severity_encoder.fit_transform(df['Severity'])
            df['Main_Pollutant_encoded'] = self.pollutant_encoder.fit_transform(df['Main Pollutant'])
        else:
            df['Severity_encoded'] = self.severity_encoder.transform(df['Severity'])
            df['Main_Pollutant_encoded'] = self.pollutant_encoder.transform(df['Main Pollutant'])

        return df

    def create_features(self, df):
        """
        Create additional features for better prediction
        """
        # Lagged features (if enough data)
        if len(df) > 24:
            df['AQI_lag1'] = df['AQI'].shift(1)
            df['AQI_lag24'] = df['AQI'].shift(24)
            df['AQI_rolling_mean_24'] = df['AQI'].rolling(window=24).mean()
            df['AQI_rolling_std_24'] = df['AQI'].rolling(window=24).std()

        # Pollutant ratios
        if 'NO (ug/m3)' in df.columns and 'NOx (ug/m3)' in df.columns:
            df['NO_NOx_ratio'] = df['NO (ug/m3)'] / (df['NOx (ug/m3)'] + 1e-8)

        # Weather interactions
        if 'AT (degree C)' in df.columns and 'RH (%)' in df.columns:
            df['Temp_Humidity_interaction'] = df['AT (degree C)'] * df['RH (%)']

        if 'WS (m/s)' in df.columns and 'WD (degree)' in df.columns:
            df['Wind_component_x'] = df['WS (m/s)'] * np.cos(np.radians(df['WD (degree)']))
            df['Wind_component_y'] = df['WS (m/s)'] * np.sin(np.radians(df['WD (degree)']))

        return df

    def fit(self, train_data):
        """
        Fit XGBoost models
        """
        try:
            # Prepare data
            train_df = self.prepare_data(train_data.copy())
            train_df = self.create_features(train_df)

            # Select features
            base_features = ['AT (degree C)', 'BP (mmHg)', 'Benzene (ug/m3)', 'NO (ug/m3)',
                            'NOx (ug/m3)', 'RF (mm)', 'RH (%)', 'SR (W/mt2)', 'Toluene (ug/m3)',
                            'WD (degree)', 'WS (m/s)', 'Hour', 'Day', 'Month', 'Year', 'DayOfWeek',
                            'Quarter', 'latitude', 'longitude', 'elevation']

            # Add engineered features
            engineered_features = ['AQI_lag1', 'AQI_lag24', 'AQI_rolling_mean_24', 'AQI_rolling_std_24',
                                 'NO_NOx_ratio', 'Temp_Humidity_interaction', 'Wind_component_x', 'Wind_component_y']

            all_features = base_features + engineered_features

            # Filter available columns
            self.feature_cols = [col for col in all_features if col in train_df.columns]

            # Remove rows with NaN values (from lagged features)
            train_df = train_df.dropna()

            if len(train_df) == 0:
                raise ValueError("No data remaining after removing NaN values")

            # Prepare features and targets
            X_train = train_df[self.feature_cols]
            y_aqi = train_df['AQI']
            y_severity = train_df['Severity_encoded']
            y_pollutant = train_df['Main_Pollutant_encoded']

            # Scale features
            X_train_scaled = self.scaler.fit_transform(X_train)

            # Initialize models
            self.regression_model = XGBRegressor(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                random_state=self.random_state,
                n_jobs=-1
            )

            self.severity_classifier = XGBClassifier(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                random_state=self.random_state,
                n_jobs=-1
            )

            self.pollutant_classifier = XGBClassifier(
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                random_state=self.random_state,
                n_jobs=-1
            )

            # Fit models
            print("Training AQI regression model...")
            self.regression_model.fit(X_train_scaled, y_aqi)

            print("Training severity classification model...")
            self.severity_classifier.fit(X_train_scaled, y_severity)

            print("Training pollutant classification model...")
            self.pollutant_classifier.fit(X_train_scaled, y_pollutant)

            self.is_fitted = True
            print("XGBoost models trained successfully")

            # Feature importance
            print("\nTop 10 most important features for AQI prediction:")
            feature_importance = self.regression_model.feature_importances_
            indices = np.argsort(feature_importance)[::-1][:10]
            for i, idx in enumerate(indices):
                print(f"{i+1}. {self.feature_cols[idx]}: {feature_importance[idx]:.4f}")

            return self.regression_model, self.severity_classifier, self.pollutant_classifier

        except Exception as e:
            print(f"Error training XGBoost models: {str(e)}")
            return None, None, None

    def predict(self, test_data):
        """
        Make predictions using fitted XGBoost models
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before making predictions")

        try:
            # Prepare test data
            test_df = self.prepare_data(test_data.copy())
            test_df = self.create_features(test_df)

            # Handle missing lagged features for prediction
            if 'AQI_lag1' in self.feature_cols and 'AQI_lag1' not in test_df.columns:
                test_df['AQI_lag1'] = test_df['AQI'].shift(1).fillna(test_df['AQI'].mean())
            if 'AQI_lag24' in self.feature_cols and 'AQI_lag24' not in test_df.columns:
                test_df['AQI_lag24'] = test_df['AQI'].shift(24).fillna(test_df['AQI'].mean())
            if 'AQI_rolling_mean_24' in self.feature_cols and 'AQI_rolling_mean_24' not in test_df.columns:
                test_df['AQI_rolling_mean_24'] = test_df['AQI'].rolling(window=24).mean().fillna(test_df['AQI'].mean())
            if 'AQI_rolling_std_24' in self.feature_cols and 'AQI_rolling_std_24' not in test_df.columns:
                test_df['AQI_rolling_std_24'] = test_df['AQI'].rolling(window=24).std().fillna(test_df['AQI'].std())

            # Fill remaining NaN values
            test_df = test_df.fillna(method='ffill').fillna(method='bfill')

            # Prepare features
            X_test = test_df[self.feature_cols]
            X_test_scaled = self.scaler.transform(X_test)

            # Make predictions
            aqi_pred = self.regression_model.predict(X_test_scaled)
            severity_pred = self.severity_classifier.predict(X_test_scaled)
            pollutant_pred = self.pollutant_classifier.predict(X_test_scaled)

            # Decode categorical predictions
            severity_labels = self.severity_encoder.inverse_transform(severity_pred)
            pollutant_labels = self.pollutant_encoder.inverse_transform(pollutant_pred)

            return aqi_pred, severity_labels, pollutant_labels

        except Exception as e:
            print(f"Error making predictions: {str(e)}")
            return None, None, None

    def evaluate(self, test_data):
        """
        Evaluate model performance
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before evaluation")

        try:
            # Prepare test data
            test_df = self.prepare_data(test_data.copy())
            test_df = self.create_features(test_df)
            test_df = test_df.dropna()

            if len(test_df) == 0:
                raise ValueError("No test data remaining after removing NaN values")

            # Get actual values
            y_true_aqi = test_df['AQI']
            y_true_severity = test_df['Severity_encoded']
            y_true_pollutant = test_df['Main_Pollutant_encoded']

            # Make predictions
            aqi_pred, severity_pred, pollutant_pred = self.predict(test_data)

            # Calculate metrics
            aqi_rmse = np.sqrt(mean_squared_error(y_true_aqi, aqi_pred[:len(y_true_aqi)]))
            severity_acc = accuracy_score(y_true_severity,
                                        self.severity_encoder.transform(severity_pred[:len(y_true_severity)]))
            pollutant_acc = accuracy_score(y_true_pollutant,
                                         self.pollutant_encoder.transform(pollutant_pred[:len(y_true_pollutant)]))

            # F1 scores
            severity_f1 = f1_score(y_true_severity,
                                 self.severity_encoder.transform(severity_pred[:len(y_true_severity)]),
                                 average='weighted')
            pollutant_f1 = f1_score(y_true_pollutant,
                                   self.pollutant_encoder.transform(pollutant_pred[:len(y_true_pollutant)]),
                                   average='weighted')

            metrics = {
                'AQI_RMSE': aqi_rmse,
                'Severity_Accuracy': severity_acc,
                'Severity_F1': severity_f1,
                'Pollutant_Accuracy': pollutant_acc,
                'Pollutant_F1': pollutant_f1
            }

            print(f"XGBoost Model Evaluation:")
            print(f"AQI RMSE: {aqi_rmse:.4f}")
            print(f"Severity Accuracy: {severity_acc:.4f}")
            print(f"Severity F1: {severity_f1:.4f}")
            print(f"Pollutant Accuracy: {pollutant_acc:.4f}")
            print(f"Pollutant F1: {pollutant_f1:.4f}")

            return metrics

        except Exception as e:
            print(f"Error evaluating models: {str(e)}")
            return None

    def save_model(self, filepath_prefix):
        """
        Save the fitted models
        """
        if not self.is_fitted:
            raise ValueError("Models must be fitted before saving")

        try:
            # Save models
            joblib.dump(self.regression_model, f"{filepath_prefix}_regression.pkl")
            joblib.dump(self.severity_classifier, f"{filepath_prefix}_severity.pkl")
            joblib.dump(self.pollutant_classifier, f"{filepath_prefix}_pollutant.pkl")

            # Save preprocessing objects
            preprocessing_data = {
                'scaler': self.scaler,
                'severity_encoder': self.severity_encoder,
                'pollutant_encoder': self.pollutant_encoder,
                'feature_cols': self.feature_cols,
                'n_estimators': self.n_estimators,
                'max_depth': self.max_depth,
                'learning_rate': self.learning_rate,
                'random_state': self.random_state
            }
            joblib.dump(preprocessing_data, f"{filepath_prefix}_preprocessing.pkl")

            print(f"XGBoost models saved with prefix {filepath_prefix}")

        except Exception as e:
            print(f"Error saving models: {str(e)}")

    def load_model(self, filepath_prefix):
        """
        Load saved models
        """
        try:
            # Load models
            self.regression_model = joblib.load(f"{filepath_prefix}_regression.pkl")
            self.severity_classifier = joblib.load(f"{filepath_prefix}_severity.pkl")
            self.pollutant_classifier = joblib.load(f"{filepath_prefix}_pollutant.pkl")

            # Load preprocessing objects
            preprocessing_data = joblib.load(f"{filepath_prefix}_preprocessing.pkl")
            self.scaler = preprocessing_data['scaler']
            self.severity_encoder = preprocessing_data['severity_encoder']
            self.pollutant_encoder = preprocessing_data['pollutant_encoder']
            self.feature_cols = preprocessing_data['feature_cols']
            self.n_estimators = preprocessing_data['n_estimators']
            self.max_depth = preprocessing_data['max_depth']
            self.learning_rate = preprocessing_data['learning_rate']
            self.random_state = preprocessing_data['random_state']

            self.is_fitted = True
            print(f"XGBoost models loaded from {filepath_prefix}")

        except Exception as e:
            print(f"Error loading models: {str(e)}")