In [30]:
!pip install tqdm
!pip install opencage
!pip install scikit-learn
!pip install tensorflow
!pip install keras



In [44]:
import pandas as pd
import numpy as np
import time
from geopy.geocoders import Nominatim
from tqdm.notebook import tqdm
from opencage.geocoder import OpenCageGeocode
from google.colab import userdata
import requests
import os
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
import joblib
import warnings
# Arima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, r2_score
warnings.filterwarnings('ignore')

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
class ARIMAModel:
    """
    Enhanced ARIMA Model with progress bar support for Jupyter notebooks
    """

    def __init__(self, order=(1, 1, 1)):
        """
        Initialize ARIMA model

        Parameters:
        -----------
        order : tuple
            (p, d, q) order for ARIMA model
        """
        self.order = order
        self.model = None
        self.fitted_model = None
        self.is_fitted = False

    def _check_stationarity(self, timeseries, progress_bar=None):
        """Check if time series is stationary"""
        if progress_bar:
            progress_bar.set_description("Checking stationarity...")
            time.sleep(0.1)  # Small delay for visual feedback

        result = adfuller(timeseries.dropna())
        p_value = result[1]

        if progress_bar:
            progress_bar.set_postfix({"ADF p-value": f"{p_value:.6f}"})

        return p_value < 0.05

    def _prepare_data(self, data, progress_bar=None):
        """Prepare data for ARIMA modeling"""
        if progress_bar:
            progress_bar.set_description("Preparing data...")
            time.sleep(0.1)

        # Handle missing values
        if isinstance(data, pd.Series):
            data = data.interpolate(method='time').fillna(method='bfill').fillna(method='ffill')
        else:
            data = pd.Series(data).interpolate().fillna(method='bfill').fillna(method='ffill')

        if progress_bar:
            progress_bar.set_postfix({"Data points": len(data)})

        return data

    def fit(self, data, show_progress=True):
        """
        Fit ARIMA model with progress tracking

        Parameters:
        -----------
        data : array-like or pd.Series
            Time series data
        show_progress : bool
            Whether to show progress bar
        """
        # Initialize progress bar
        progress_steps = 4
        if show_progress:
            pbar = tqdm(total=progress_steps, desc="Training ARIMA",
                       bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        else:
            pbar = None

        try:
            # Step 1: Prepare data
            if pbar:
                pbar.update(1)
            prepared_data = self._prepare_data(data, pbar)

            # Step 2: Check stationarity
            if pbar:
                pbar.update(1)
            is_stationary = self._check_stationarity(prepared_data, pbar)

            # Step 3: Fit model
            if pbar:
                pbar.set_description("Fitting ARIMA model...")
                pbar.update(1)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.model = ARIMA(prepared_data, order=self.order)
                self.fitted_model = self.model.fit()

            # Step 4: Finalize
            if pbar:
                pbar.set_description("Model fitting complete")
                pbar.set_postfix({
                    "AIC": f"{self.fitted_model.aic:.2f}",
                    "Order": str(self.order)
                })
                pbar.update(1)
                time.sleep(0.5)  # Brief pause to show completion

            self.is_fitted = True

            if pbar:
                pbar.close()

            print(f"  ARIMA{self.order} model fitted successfully!")
            print(f"  AIC: {self.fitted_model.aic:.2f}")
            print(f"  Data points: {len(prepared_data)}")

        except Exception as e:
            if pbar:
                pbar.close()
            print(f"✗ Error fitting ARIMA model: {str(e)}")
            raise

    def predict(self, steps=1):
        """Make predictions"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        forecast = self.fitted_model.forecast(steps=steps)
        return forecast

    def save_model(self, filepath):
        """Save the fitted model"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before saving")

        model_data = {
            'fitted_model': self.fitted_model,
            'order': self.order,
            'is_fitted': self.is_fitted
        }
        joblib.dump(model_data, filepath)
        print(f"ARIMA model saved to {filepath}")

    def load_model(self, filepath):
        """Load a fitted model"""
        model_data = joblib.load(filepath)
        self.fitted_model = model_data['fitted_model']
        self.order = model_data['order']
        self.is_fitted = model_data['is_fitted']
        print(f"ARIMA model loaded from {filepath}")

In [None]:
def load_and_preprocess_data(file_path):
    """Load and preprocess the AQI dataset"""
    print("Loading and preprocessing data...")

    # Load data
    if file_path.endswith('.csv'):
        data = pd.read_csv(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a CSV file.")

    print(f"   Original data shape: {data.shape}")

    # Basic preprocessing
    if 'From Date' in data.columns:
        data['From Date'] = pd.to_datetime(data['From Date'])
        data = data.sort_values('From Date')

    # Handle missing values in critical columns
    if 'AQI' in data.columns:
        data = data.dropna(subset=['AQI'])

    # Fill missing values for other columns
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

    print(f"   Processed data shape: {data.shape}")
    print(f"   Date range: {data['From Date'].min()} to {data['From Date'].max()}")
    print(f"   AQI range: {data['AQI'].min():.1f} to {data['AQI'].max():.1f}")

    return data

def split_data(data, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Split data into train, validation, and test sets"""
    print("Splitting data...")

    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1.0"

    n = len(data)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))

    train_data = data.iloc[:train_end]
    val_data = data.iloc[train_end:val_end]
    test_data = data.iloc[val_end:]

    print(f"   Train set: {len(train_data)} samples ({train_ratio*100:.1f}%)")
    print(f"   Validation set: {len(val_data)} samples ({val_ratio*100:.1f}%)")
    print(f"   Test set: {len(test_data)} samples ({test_ratio*100:.1f}%)")

    return train_data, val_data, test_data

def train_models(train_data, val_data, base_path, gpu_available=False, show_progress=True):
    """Train all models with progress tracking"""
    print("Starting model training...")

    # Create models directory
    # Path("models").mkdir(exist_ok=True)

    models = {}
    training_results = {}

    # Progress bar for overall training
    model_count = 4
    if show_progress:
        overall_pbar = tqdm(total=model_count, desc="Training Models",
                           bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')

    # 1. Train ARIMA Model
    print("\n" + "="*60)
    print("1️⃣  TRAINING ARIMA MODEL")
    print("="*60)

    try:
        arima_model = ARIMAModel(order=(1, 1, 1))
        start_time = time.time()
        arima_model.fit(train_data['AQI'], show_progress=show_progress)
        training_time = time.time() - start_time

        # Save model
        arima_model.save_model(base_path + 'arima_model.pkl')

        # Make predictions
        val_pred = arima_model.predict(steps=len(val_data))

        models['arima'] = arima_model
        training_results['arima'] = {
            'training_time': training_time,
            'val_predictions': val_pred
        }

        print(f"✓ ARIMA training completed in {training_time:.2f} seconds")

    except Exception as e:
        print(f"✗ ARIMA training failed: {e}")

    if show_progress:
        overall_pbar.update(1)
    return models, training_results

def evaluate_models(models, training_results, val_data, test_data):
    """Evaluate all trained models"""
    print("\n" + "="*60)
    print("📊 MODEL EVALUATION")
    print("="*60)

    evaluation_results = {}

    print(f"{'Model':<15} {'AQI RMSE':<10} {'AQI MAE':<10} {'AQI R²':<10} {'Training Time':<15}")
    print("-" * 70)

    for model_name, result in training_results.items():
        if model_name not in models:
            continue

        model = models[model_name]
        training_time = result['training_time']

        try:
            if model_name == 'arima':
                # ARIMA predictions
                val_pred_aqi = result['val_predictions'][:len(val_data)]
                val_true_aqi = val_data['AQI'].values[:len(val_pred_aqi)]
            # Calculate metrics
            rmse = np.sqrt(mean_squared_error(val_true_aqi, val_pred_aqi))
            mae = mean_absolute_error(val_true_aqi, val_pred_aqi)
            r2 = r2_score(val_true_aqi, val_pred_aqi)

            evaluation_results[model_name] = {
                'rmse': rmse,
                'mae': mae,
                'r2': r2,
                'training_time': training_time
            }

            print(f"{model_name:<15} {rmse:<10.2f} {mae:<10.2f} {r2:<10.3f} {training_time:<15.2f}s")

        except Exception as e:
            print(f"{model_name:<15} {'ERROR':<10} {'ERROR':<10} {'ERROR':<10} {training_time:<15.2f}s")
            print(f"   Error: {e}")

    # Find best model
    if evaluation_results:
        best_model = min(evaluation_results.keys(), key=lambda x: evaluation_results[x]['rmse'])
        print(f"\n🏆 Best model: {best_model} (RMSE: {evaluation_results[best_model]['rmse']:.2f})")

    return evaluation_results

def create_training_summary(evaluation_results, gpu_available):
    """Create a summary of the training session"""
    print("\n" + "="*60)
    print("TRAINING SUMMARY")
    print("="*60)

    print(f"Environment: {'GPU Accelerated' if gpu_available else 'CPU Only'}")
    print(f"Models trained: {len(evaluation_results)}")
    print(f"Total training time: {sum(r['training_time'] for r in evaluation_results.values()):.2f} seconds")

    if evaluation_results:
        # Performance ranking
        sorted_models = sorted(evaluation_results.items(), key=lambda x: x[1]['rmse'])

        print("\nPerformance Ranking (by RMSE):")
        for i, (model_name, results) in enumerate(sorted_models, 1):
            print(f"  {i}. {model_name}: {results['rmse']:.2f}")

        # Speed ranking
        sorted_by_speed = sorted(evaluation_results.items(), key=lambda x: x[1]['training_time'])

        print("\nSpeed Ranking (by training time):")
        for i, (model_name, results) in enumerate(sorted_by_speed, 1):
            print(f"  {i}. {model_name}: {results['training_time']:.2f}s")

def main(data_file):
    """Main training pipeline"""
    print("AQI PREDICTION MODEL TRAINING PIPELINE")
    print("="*60)

    # Setup environment
    # gpu_available = setup_colab_environment()

    # Load data
    data = load_and_preprocess_data(data_file)

    # Split data
    train_data, val_data, test_data = split_data(data)

    # Train models
    models, training_results = train_models(train_data, val_data, '/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/', True)

    # Evaluate models
    evaluation_results = evaluate_models(models, training_results, val_data, test_data)

    # Create summary
    create_training_summary(evaluation_results, True)

    print("\nTraining pipeline completed!")
    print(f"Models saved in: /content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/models")

    return models, evaluation_results

In [45]:
if __name__ == "__main__":
    models, results = main('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/statewise_aqi/')

AQI PREDICTION MODEL TRAINING PIPELINE
📊 Loading and preprocessing data...
   Original data shape: (224868, 21)
   Processed data shape: (224868, 21)
   Date range: 2016-07-01 10:00:00 to 2023-03-31 23:00:00
   AQI range: 0.0 to 500.0
✂️  Splitting data...
   Train set: 157407 samples (70.0%)
   Validation set: 33730 samples (15.0%)
   Test set: 33731 samples (15.0%)
🏃‍♂️ Starting model training...


Training Models:   0%|           | 0/4 [00:00<?]


1️⃣  TRAINING ARIMA MODEL


Training ARIMA:   0%|           | 0/4 [00:00<?]

✓ ARIMA(1, 1, 1) model fitted successfully!
  AIC: 1612107.68
  Data points: 157407
✓ ARIMA model saved to /content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/arima_model.pkl
✓ ARIMA training completed in 99.47 seconds

📊 MODEL EVALUATION
Model           AQI RMSE   AQI MAE    AQI R²     Training Time  
----------------------------------------------------------------------
arima           71.36      50.84      -0.951     99.47          s

🏆 Best model: arima (RMSE: 71.36)


TypeError: create_training_summary() missing 1 required positional argument: 'gpu_available'