In [23]:
!pip install tqdm
!pip install opencage
!pip install scikit-learn
!pip install tensorflow
!pip install keras



In [24]:
import pandas as pd
import numpy as np
import time
from geopy.geocoders import Nominatim
from tqdm.notebook import tqdm
from opencage.geocoder import OpenCageGeocode
from google.colab import userdata
import requests
import os
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
import joblib
import warnings
# Arima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error, mean_absolute_error
warnings.filterwarnings('ignore')

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
class ARIMAModel:
    """
    Enhanced ARIMA Model with progress bar support for Jupyter notebooks
    """

    def __init__(self, order=(1, 1, 1)):
        """
        Initialize ARIMA model

        Parameters:
        -----------
        order : tuple
            (p, d, q) order for ARIMA model
        """
        self.order = order
        self.model = None
        self.fitted_model = None
        self.is_fitted = False

    def _check_stationarity(self, timeseries, progress_bar=None):
        """Check if time series is stationary"""
        if progress_bar:
            progress_bar.set_description("Checking stationarity...")
            time.sleep(0.1)  # Small delay for visual feedback

        result = adfuller(timeseries.dropna())
        p_value = result[1]

        if progress_bar:
            progress_bar.set_postfix({"ADF p-value": f"{p_value:.6f}"})

        return p_value < 0.05

    def _prepare_data(self, data, progress_bar=None):
        """Prepare data for ARIMA modeling"""
        if progress_bar:
            progress_bar.set_description("Preparing data...")
            time.sleep(0.1)

        # Handle missing values
        if isinstance(data, pd.Series):
            data = data.interpolate(method='time').fillna(method='bfill').fillna(method='ffill')
        else:
            data = pd.Series(data).interpolate().fillna(method='bfill').fillna(method='ffill')

        if progress_bar:
            progress_bar.set_postfix({"Data points": len(data)})

        return data

    def fit(self, data, show_progress=True):
        """
        Fit ARIMA model with progress tracking

        Parameters:
        -----------
        data : array-like or pd.Series
            Time series data
        show_progress : bool
            Whether to show progress bar
        """
        # Initialize progress bar
        progress_steps = 4
        if show_progress:
            pbar = tqdm(total=progress_steps, desc="Training ARIMA",
                       bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        else:
            pbar = None

        try:
            # Step 1: Prepare data
            if pbar:
                pbar.update(1)
            prepared_data = self._prepare_data(data, pbar)

            # Step 2: Check stationarity
            if pbar:
                pbar.update(1)
            is_stationary = self._check_stationarity(prepared_data, pbar)

            # Step 3: Fit model
            if pbar:
                pbar.set_description("Fitting ARIMA model...")
                pbar.update(1)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.model = ARIMA(prepared_data, order=self.order)
                self.fitted_model = self.model.fit()

            # Step 4: Finalize
            if pbar:
                pbar.set_description("Model fitting complete")
                pbar.set_postfix({
                    "AIC": f"{self.fitted_model.aic:.2f}",
                    "Order": str(self.order)
                })
                pbar.update(1)
                time.sleep(0.5)  # Brief pause to show completion

            self.is_fitted = True

            if pbar:
                pbar.close()

            print(f"‚úì ARIMA{self.order} model fitted successfully!")
            print(f"  AIC: {self.fitted_model.aic:.2f}")
            print(f"  Data points: {len(prepared_data)}")

        except Exception as e:
            if pbar:
                pbar.close()
            print(f"‚úó Error fitting ARIMA model: {str(e)}")
            raise

    def predict(self, steps=1):
        """Make predictions"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before making predictions")

        forecast = self.fitted_model.forecast(steps=steps)
        return forecast

    def save_model(self, filepath):
        """Save the fitted model"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before saving")

        model_data = {
            'fitted_model': self.fitted_model,
            'order': self.order,
            'is_fitted': self.is_fitted
        }
        joblib.dump(model_data, filepath)
        print(f"‚úì ARIMA model saved to {filepath}")

    def load_model(self, filepath):
        """Load a fitted model"""
        model_data = joblib.load(filepath)
        self.fitted_model = model_data['fitted_model']
        self.order = model_data['order']
        self.is_fitted = model_data['is_fitted']
        print(f"‚úì ARIMA model loaded from {filepath}")

In [27]:
def setup_colab_environment():
    """Setup Google Colab environment with GPU and required packages"""
    print("üîß Setting up Google Colab environment...")

    # Check if running in Colab
    try:
        import google.colab
        in_colab = True
        print("‚úì Running in Google Colab")
    except ImportError:
        in_colab = False
        print("‚ÑπÔ∏è  Not running in Google Colab")

    if in_colab:
        print("üìã Installing required packages for Colab...")

        # Install packages that might not be available in Colab
        import subprocess
        import sys

        packages = [
            'tqdm',
            'plotly',
            'statsmodels'
        ]

        for package in packages:
            try:
                __import__(package)
                print(f"‚úì {package} already installed")
            except ImportError:
                print(f"üì¶ Installing {package}...")
                subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

        # Optional: Install cuML for GPU-accelerated Random Forest
        try:
            import cuml
            print("‚úì cuML (GPU acceleration) is available")
        except ImportError:
            print("‚ö†Ô∏è  cuML not available. For GPU Random Forest, install with:")
            print("   !pip install cuml-cu11")

    # Check GPU availability
    gpus = tf.config.list_physical_devices('GPU')

    if gpus:
        print(f"üöÄ GPU acceleration available!")
        print(f"   GPUs detected: {len(gpus)}")
        for i, gpu in enumerate(gpus):
            print(f"   GPU {i}: {gpu.name}")

        # Configure GPU memory growth
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        return True
    else:
        print("‚ö†Ô∏è  No GPU detected. Models will run on CPU.")
        print("   To enable GPU in Colab: Runtime ‚Üí Change runtime type ‚Üí GPU")
        return False

def load_and_preprocess_data(file_path):
    """Load and preprocess the AQI dataset"""
    print("üìä Loading and preprocessing data...")

    # Load data
    if file_path.endswith('.csv'):
        data = pd.read_csv(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a CSV file.")

    print(f"   Original data shape: {data.shape}")

    # Basic preprocessing
    if 'From Date' in data.columns:
        data['From Date'] = pd.to_datetime(data['From Date'])
        data = data.sort_values('From Date')

    # Handle missing values in critical columns
    if 'AQI' in data.columns:
        data = data.dropna(subset=['AQI'])

    # Fill missing values for other columns
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

    print(f"   Processed data shape: {data.shape}")
    print(f"   Date range: {data['From Date'].min()} to {data['From Date'].max()}")
    print(f"   AQI range: {data['AQI'].min():.1f} to {data['AQI'].max():.1f}")

    return data

def split_data(data, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Split data into train, validation, and test sets"""
    print("‚úÇÔ∏è  Splitting data...")

    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1.0"

    n = len(data)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))

    train_data = data.iloc[:train_end]
    val_data = data.iloc[train_end:val_end]
    test_data = data.iloc[val_end:]

    print(f"   Train set: {len(train_data)} samples ({train_ratio*100:.1f}%)")
    print(f"   Validation set: {len(val_data)} samples ({val_ratio*100:.1f}%)")
    print(f"   Test set: {len(test_data)} samples ({test_ratio*100:.1f}%)")

    return train_data, val_data, test_data

def train_models(train_data, val_data, base_path, gpu_available=False, show_progress=True):
    """Train all models with progress tracking"""
    print("üèÉ‚Äç‚ôÇÔ∏è Starting model training...")

    # Create models directory
    # Path("models").mkdir(exist_ok=True)

    models = {}
    training_results = {}

    # Progress bar for overall training
    model_count = 4
    if show_progress:
        overall_pbar = tqdm(total=model_count, desc="Training Models",
                           bar_format='{l_bar}{bar} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')

    # 1. Train ARIMA Model
    print("\n" + "="*60)
    print("1Ô∏è‚É£  TRAINING ARIMA MODEL")
    print("="*60)

    try:
        arima_model = ARIMAModel(order=(1, 1, 1))
        start_time = time.time()
        arima_model.fit(train_data['AQI'], show_progress=show_progress)
        training_time = time.time() - start_time

        # Save model
        arima_model.save_model(base_path + 'arima_model.pkl')

        # Make predictions
        val_pred = arima_model.predict(steps=len(val_data))

        models['arima'] = arima_model
        training_results['arima'] = {
            'training_time': training_time,
            'val_predictions': val_pred
        }

        print(f"‚úì ARIMA training completed in {training_time:.2f} seconds")

    except Exception as e:
        print(f"‚úó ARIMA training failed: {e}")

    if show_progress:
        overall_pbar.update(1)
    return models, training_results

def evaluate_models(models, training_results, val_data, test_data):
    """Evaluate all trained models"""
    print("\n" + "="*60)
    print("üìä MODEL EVALUATION")
    print("="*60)

    evaluation_results = {}

    print(f"{'Model':<15} {'AQI RMSE':<10} {'AQI MAE':<10} {'AQI R¬≤':<10} {'Training Time':<15}")
    print("-" * 70)

    for model_name, result in training_results.items():
        if model_name not in models:
            continue

        model = models[model_name]
        training_time = result['training_time']

        try:
            if model_name == 'arima':
                # ARIMA predictions
                val_pred_aqi = result['val_predictions'][:len(val_data)]
                val_true_aqi = val_data['AQI'].values[:len(val_pred_aqi)]
            # Calculate metrics
            rmse = np.sqrt(mean_squared_error(val_true_aqi, val_pred_aqi))
            mae = mean_absolute_error(val_true_aqi, val_pred_aqi)
            r2 = r2_score(val_true_aqi, val_pred_aqi)

            evaluation_results[model_name] = {
                'rmse': rmse,
                'mae': mae,
                'r2': r2,
                'training_time': training_time
            }

            print(f"{model_name:<15} {rmse:<10.2f} {mae:<10.2f} {r2:<10.3f} {training_time:<15.2f}s")

        except Exception as e:
            print(f"{model_name:<15} {'ERROR':<10} {'ERROR':<10} {'ERROR':<10} {training_time:<15.2f}s")
            print(f"   Error: {e}")

    # Find best model
    if evaluation_results:
        best_model = min(evaluation_results.keys(), key=lambda x: evaluation_results[x]['rmse'])
        print(f"\nüèÜ Best model: {best_model} (RMSE: {evaluation_results[best_model]['rmse']:.2f})")

    return evaluation_results

def create_training_summary(evaluation_results, gpu_available):
    """Create a summary of the training session"""
    print("\n" + "="*60)
    print("TRAINING SUMMARY")
    print("="*60)

    print(f"Environment: {'GPU Accelerated' if gpu_available else 'CPU Only'}")
    print(f"Models trained: {len(evaluation_results)}")
    print(f"Total training time: {sum(r['training_time'] for r in evaluation_results.values()):.2f} seconds")

    if evaluation_results:
        # Performance ranking
        sorted_models = sorted(evaluation_results.items(), key=lambda x: x[1]['rmse'])

        print("\nPerformance Ranking (by RMSE):")
        for i, (model_name, results) in enumerate(sorted_models, 1):
            print(f"  {i}. {model_name}: {results['rmse']:.2f}")

        # Speed ranking
        sorted_by_speed = sorted(evaluation_results.items(), key=lambda x: x[1]['training_time'])

        print("\nSpeed Ranking (by training time):")
        for i, (model_name, results) in enumerate(sorted_by_speed, 1):
            print(f"  {i}. {model_name}: {results['training_time']:.2f}s")

def main(data_file):
    """Main training pipeline"""
    print("AQI PREDICTION MODEL TRAINING PIPELINE")
    print("="*60)

    # Setup environment
    # gpu_available = setup_colab_environment()

    # Load data
    data = load_and_preprocess_data(data_file)

    # Split data
    train_data, val_data, test_data = split_data(data)

    # Train models
    models, training_results = train_models(train_data, val_data, '/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/' ,gpu_available)

    # Evaluate models
    evaluation_results = evaluate_models(models, training_results, val_data, test_data)

    # Create summary
    create_training_summary(evaluation_results, gpu_available)

    print("\nTraining pipeline completed!")
    print(f"Models saved in: {Path('models').absolute()}")

    return models, evaluation_results

In [28]:
def predict_aqi(from_datetime, to_datetime, location_city, model_type='xgboost'):
    """
    Predict AQI for a given time period and location

    Parameters:
    from_datetime: Start datetime (str or datetime)
    to_datetime: End datetime (str or datetime)
    location_city: City name (str)
    model_type: Type of model to use ('arima', 'lstm', 'xgboost', 'random_forest')

    Returns:
    DataFrame with predictions
    """
    try:
        # Convert datetime strings if needed
        if isinstance(from_datetime, str):
            from_datetime = pd.to_datetime(from_datetime)
        if isinstance(to_datetime, str):
            to_datetime = pd.to_datetime(to_datetime)

        # Create hourly datetime range
        datetime_range = pd.date_range(start=from_datetime, end=to_datetime, freq='H')

        # Create dummy data for prediction (in real scenario, you'd have actual sensor data)
        # For demonstration, we'll use sample data patterns
        n_hours = len(datetime_range)

        # Create base DataFrame
        prediction_data = pd.DataFrame({
            'From Date': datetime_range,
            'AQI': np.random.randint(30, 100, n_hours),  # Dummy AQI values
            'AT (degree C)': np.random.normal(25, 5, n_hours),
            'RH (%)': np.random.normal(65, 10, n_hours),
            'WS (m/s)': np.random.normal(2, 0.5, n_hours),
            'WD (degree)': np.random.uniform(0, 360, n_hours),
            'SR (W/mt2)': np.random.uniform(0, 500, n_hours),
            'NO (ug/m3)': np.random.uniform(10, 50, n_hours),
            'NOx (ug/m3)': np.random.uniform(20, 80, n_hours),
            'Benzene (ug/m3)': np.random.uniform(0.5, 2, n_hours),
            'Toluene (ug/m3)': np.random.uniform(2, 8, n_hours),
            'RF (mm)': np.random.uniform(0, 2, n_hours),
            'BP (mmHg)': np.random.normal(760, 20, n_hours),
            'Severity': np.random.choice(['GOOD', 'SATISFACTORY', 'MODERATE', 'POOR'], n_hours),
            'Main Pollutant': np.random.choice(['PM10 (ug/m3)', 'PM2.5 (ug/m3)', 'NO2 (ug/m3)', 'CO (mg/m3)'], n_hours),
            'city': location_city,
            'state': 'Unknown',
            'latitude': 0.0,
            'longitude': 0.0,
            'elevation': 0.0,
            'file_name': 'PRED001'
        })

        # Load appropriate model
        models_dir = 'models'
        if model_type == 'arima':
            model = ARIMAModel()
            model.load_model(os.path.join(models_dir, 'arima_model.pkl'))
            # For ARIMA, we need to implement a different prediction approach
            forecast, conf_int = model.predict(steps=n_hours)
            predictions = pd.DataFrame({
                'datetime': datetime_range,
                'AQI': forecast,
                'Severity': 'PREDICTED',
                'Main_Pollutant': 'PREDICTED'
            })
        print(f"Predictions generated for {location_city} from {from_datetime} to {to_datetime}")
        print(f"Model used: {model_type.upper()}")
        print(f"Number of predictions: {len(predictions)}")

        return predictions

    except Exception as e:
        print(f"Error in prediction: {str(e)}")
        return None

In [29]:
if __name__ == "__main__":
    models, results = main('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/statewise_aqi/Andhra Pradesh_aqi.csv')

üöÄ AQI PREDICTION MODEL TRAINING PIPELINE
üîß Setting up Google Colab environment...
‚úì Running in Google Colab
üìã Installing required packages for Colab...
‚úì tqdm already installed
‚úì plotly already installed
‚úì statsmodels already installed
‚ö†Ô∏è  cuML not available. For GPU Random Forest, install with:
   !pip install cuml-cu11
üöÄ GPU acceleration available!
   GPUs detected: 1
   GPU 0: /physical_device:GPU:0
üìä Loading and preprocessing data...
   Original data shape: (224868, 21)
   Processed data shape: (224868, 21)
   Date range: 2016-07-01 10:00:00 to 2023-03-31 23:00:00
   AQI range: 0.0 to 500.0
‚úÇÔ∏è  Splitting data...
   Train set: 157407 samples (70.0%)
   Validation set: 33730 samples (15.0%)
   Test set: 33731 samples (15.0%)
üèÉ‚Äç‚ôÇÔ∏è Starting model training...


Training Models:   0%|           | 0/4 [00:00<?]


1Ô∏è‚É£  TRAINING ARIMA MODEL


Training ARIMA:   0%|           | 0/4 [00:00<?]

‚úì ARIMA(1, 1, 1) model fitted successfully!
  AIC: 1612107.68
  Data points: 157407
‚úì ARIMA model saved to /content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/arima_model.pkl
‚úì ARIMA training completed in 113.35 seconds

2Ô∏è‚É£  TRAINING LSTM MODEL
üîß Setting up GPU acceleration for Google Colab...
‚úì GPU acceleration enabled!
  Available GPUs: 1
  GPU Names: ['/physical_device:GPU:0']
  Mixed precision: mixed_float16
  GPU Details: Tesla T4


Training LSTM:   0%|           | 0/5 [00:00<?]

Initial
AQI                         int64
AT (degree C)             float64
BP (mmHg)                 float64
Benzene (ug/m3)           float64
From Date          datetime64[ns]
Main Pollutant             object
NO (ug/m3)                float64
NOx (ug/m3)               float64
RF (mm)                   float64
RH (%)                    float64
SR (W/mt2)                float64
Severity                   object
Toluene (ug/m3)           float64
WD (degree)               float64
WS (m/s)                  float64
file_name                  object
state                      object
city                       object
latitude                  float64
longitude                 float64
elevation                 float64
dtype: object
1
processed_data


Building sequences:   0%|          | 0/157383 [00:00<?, ?it/s]

_create_sequences
‚úó Error training LSTM model: y should be a 1d array, got an array of shape () instead.
‚úó LSTM training failed: y should be a 1d array, got an array of shape () instead.

3Ô∏è‚É£  TRAINING XGBOOST MODEL
üîß Setting up XGBoost GPU acceleration...
‚úì NVIDIA GPU detected!
  GPU acceleration will be used for XGBoost
  GPU: 0 Tesla T4


Training XGBoost:   0%|           | 0/6 [00:00<?]

‚úó Error training XGBoost models: y should be a 1d array, got an array of shape () instead.
‚úó XGBoost training failed: y should be a 1d array, got an array of shape () instead.

4Ô∏è‚É£  TRAINING RANDOM FOREST MODEL
üîß Setting up Random Forest with acceleration options...
‚ö†Ô∏è  cuML not available. Install with: pip install cuml-cu11
   Using standard scikit-learn Random Forest
üìä Using all 2 CPU cores for parallel processing


Training Random Forest:   0%|           | 0/6 [00:00<?]

‚úó Error training Random Forest models: y should be a 1d array, got an array of shape () instead.
‚úó Random Forest training failed: y should be a 1d array, got an array of shape () instead.

üìä MODEL EVALUATION
Model           AQI RMSE   AQI MAE    AQI R¬≤     Training Time  
----------------------------------------------------------------------
arima           ERROR      ERROR      ERROR      113.35         s
   Error: name 'r2_score' is not defined

üìã TRAINING SUMMARY
Environment: GPU Accelerated
Models trained: 0
Total training time: 0.00 seconds

‚úÖ Training pipeline completed!


NameError: name 'Path' is not defined