# AQI Prediction Backend - Google Colab Notebook

This notebook runs the AQI Prediction backend inside Google Colab and exposes it through an ngrok tunnel so that your frontend application can reach it.

## Instructions
1. Run each cell in order
2. Copy the ngrok URL provided at the end to connect your frontend


## Step 1: Set up the environment

First, we'll install the required packages:

In [None]:
!pip install fastapi uvicorn pandas scikit-learn numpy python-multipart joblib pydantic python-dotenv requests fastapi-cors statsmodels matplotlib pmdarima pyngrok

## Step 2: Create backend files

Now we'll create the necessary backend files:

In [None]:
# Create directories
!mkdir -p models uploads

### Create time_series_models.py

In [None]:
%%writefile time_series_models.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pmdarima as pm
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Union
import joblib
import os

# Ensure the "models" directory exists as soon as this module is imported:
# the fit_* helpers below write their fitted models into it with joblib.dump.
os.makedirs("models", exist_ok=True)

class TimeSeriesModels:
    """
    Static helpers that fit, persist, load and forecast with ARIMA-family
    time series models for AQI prediction.

    Fitted models are written to and read from the ``models/`` directory
    with joblib, one file per model family (``<name>_model.pkl``).
    """

    # Histories shorter than this are padded with synthetic points.
    _MIN_HISTORY = 14
    # Number of rows a padded history ends up with.
    _PADDED_HISTORY = 15

    @staticmethod
    def prepare_data_for_ts(data_df: pd.DataFrame, target_col: str = 'aqi') -> pd.DataFrame:
        """
        Prepare data for time series modeling

        Parameters:
        -----------
        data_df : pd.DataFrame
            Input data. If it has a 'date' column, that column is parsed,
            sorted and moved into the index. The input is never modified.
        target_col : str
            Column name of the target variable

        Returns:
        --------
        df : pd.DataFrame
            Copy of the data ordered by date. Non-empty histories with a
            datetime index and fewer than 14 rows are padded to 15 rows with
            synthetic (randomly perturbed) points placed before the earliest
            date; every non-target column of a synthetic row is set to 0.
            Empty input is returned as an empty frame.
        """
        df = data_df.copy()
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])
            df = df.sort_values('date').set_index('date')

        # Nothing to extrapolate from an empty frame (the original indexing
        # with iloc[0] raised IndexError here), and long histories need no padding.
        if df.empty or len(df) >= TimeSeriesModels._MIN_HISTORY:
            return df

        print(f"Warning: Only {len(df)} data points provided. Adding synthetic data for better modeling.")

        # Synthetic dates can only be derived from a datetime index.
        if not isinstance(df.index, pd.DatetimeIndex):
            return df

        earliest_date = df.index.min()
        target = df[target_col]

        # Average day-to-day change, used to extrapolate backwards in time.
        avg_change = 0.0
        if len(df) > 1:
            avg_change = float(target.diff().mean())
            if np.isnan(avg_change):
                avg_change = 0.0

        base_val = target.iloc[0]
        synthetic_dates = []
        synthetic_rows = []
        for offset in range(1, TimeSeriesModels._PADDED_HISTORY - len(df) + 1):
            row = {col: 0 for col in df.columns}
            # Walk the trend backwards and add Gaussian noise; AQI cannot be negative.
            row[target_col] = max(0, base_val - (avg_change * offset) + np.random.normal(0, 5))
            synthetic_dates.append(earliest_date - timedelta(days=offset))
            synthetic_rows.append(row)

        synthetic = pd.DataFrame(
            synthetic_rows,
            index=pd.DatetimeIndex(synthetic_dates, name=df.index.name),
            columns=df.columns,
        )
        return pd.concat([synthetic, df]).sort_index()

    @staticmethod
    def fit_arima_model(data: pd.DataFrame, target_col: str = 'aqi', order: tuple = (5, 1, 0)):
        """
        Fit an ARIMA model to the data

        Parameters:
        -----------
        data : pd.DataFrame
            Time series data with a 'date' column or a datetime index
        target_col : str
            Column name of the target variable
        order : tuple
            ARIMA order (p, d, q) parameters

        Returns:
        --------
        fitted_model : ARIMA model
            The fitted results, also saved to models/arima_model.pkl
        """
        df = TimeSeriesModels.prepare_data_for_ts(data, target_col)

        fitted_model = ARIMA(df[target_col], order=order).fit()
        joblib.dump(fitted_model, "models/arima_model.pkl")

        return fitted_model

    @staticmethod
    def predict_arima(model, steps: int = 7):
        """
        Generate predictions using the fitted ARIMA model

        Parameters:
        -----------
        model : ARIMA model
            Fitted ARIMA model
        steps : int
            Number of steps to forecast

        Returns:
        --------
        forecast : pd.Series
            Forecast values
        """
        return model.forecast(steps=steps)

    @staticmethod
    def auto_arima_forecast(data: pd.DataFrame, target_col: str = 'aqi', forecast_steps: int = 7):
        """
        Automatically find the best ARIMA parameters and forecast

        Parameters:
        -----------
        data : pd.DataFrame
            Time series data with a 'date' column or a datetime index
        target_col : str
            Column name of the target variable
        forecast_steps : int
            Number of steps to forecast

        Returns:
        --------
        forecast : pd.Series
            Forecast values
        model : ARIMA model
            Fitted pmdarima model, also saved to models/auto_arima_model.pkl
        """
        df = TimeSeriesModels.prepare_data_for_ts(data, target_col)

        # Non-seasonal stepwise search over the ARIMA orders.
        model = pm.auto_arima(
            df[target_col],
            seasonal=False,
            stepwise=True,
            suppress_warnings=True,
            error_action="ignore",
            max_order=6,
            trace=False
        )
        joblib.dump(model, "models/auto_arima_model.pkl")

        forecast = model.predict(n_periods=forecast_steps)

        return forecast, model

    @staticmethod
    def fit_sarimax_model(data: pd.DataFrame, target_col: str = 'aqi',
                          order: tuple = (1, 1, 1), seasonal_order: tuple = (1, 1, 1, 7)):
        """
        Fit a SARIMAX model to the data

        Parameters:
        -----------
        data : pd.DataFrame
            Time series data with a 'date' column or a datetime index
        target_col : str
            Column name of the target variable
        order : tuple
            ARIMA order (p, d, q) parameters
        seasonal_order : tuple
            Seasonal order (P, D, Q, s) parameters

        Returns:
        --------
        fitted_model : SARIMAX model
            The fitted results, saved to models/sarimax_model.pkl. If building
            or fitting the SARIMAX model fails, a plain ARIMA model fitted on
            the same data is returned instead.
        """
        df = TimeSeriesModels.prepare_data_for_ts(data, target_col)

        period = seasonal_order[3]
        if len(df) < period:
            print(f"Warning: Not enough data points for seasonal modeling with period={seasonal_order[3]}.")
            period = min(7, len(df) // 2)
            if period < 2:
                # A period of 0 or 1 is not a seasonal cycle; drop the
                # seasonal component instead of passing an invalid order.
                seasonal_order = (0, 0, 0, 0)
            else:
                seasonal_order = (seasonal_order[0], seasonal_order[1], seasonal_order[2], period)

        try:
            # Model construction is inside the try so that an invalid order
            # also triggers the ARIMA fallback instead of escaping it.
            model = SARIMAX(df[target_col], order=order, seasonal_order=seasonal_order)
            fitted_model = model.fit(disp=False)
            joblib.dump(fitted_model, "models/sarimax_model.pkl")
            return fitted_model
        except Exception as e:
            print(f"Error fitting SARIMAX model: {e}")
            print("Falling back to ARIMA model")
            return TimeSeriesModels.fit_arima_model(data, target_col)

    @staticmethod
    def predict_sarimax(model, steps: int = 7):
        """
        Generate predictions using the fitted SARIMAX model

        Parameters:
        -----------
        model : SARIMAX model
            Fitted SARIMAX model
        steps : int
            Number of steps to forecast

        Returns:
        --------
        forecast : pd.Series
            Forecast values
        """
        return model.forecast(steps=steps)

    @staticmethod
    def _auto_arima_or_arima(data: pd.DataFrame, target_col: str):
        """Fit auto ARIMA, falling back to the fixed-order ARIMA if that fails."""
        try:
            _, model = TimeSeriesModels.auto_arima_forecast(data, target_col)
            return model
        except Exception as e:
            print(f"Auto ARIMA failed: {e}")
            return TimeSeriesModels.fit_arima_model(data, target_col)

    @staticmethod
    def get_best_model_for_data(data: pd.DataFrame, target_col: str = 'aqi'):
        """
        Determine the best model for the given data

        With at least 14 rows SARIMAX is tried first; otherwise (or if
        SARIMAX raises) auto ARIMA is used, with fixed-order ARIMA as the
        last resort. Returns the fitted model.
        """
        if len(data) >= TimeSeriesModels._MIN_HISTORY:
            try:
                return TimeSeriesModels.fit_sarimax_model(data, target_col)
            except Exception as e:
                print(f"SARIMAX failed: {e}")
        return TimeSeriesModels._auto_arima_or_arima(data, target_col)

    @staticmethod
    def load_model(model_name: str):
        """
        Load a saved model

        Returns the object stored in models/<model_name>_model.pkl, or None
        when that file does not exist.
        """
        model_path = f"models/{model_name}_model.pkl"

        if not os.path.exists(model_path):
            return None
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code - only load model files this service wrote itself.
        return joblib.load(model_path)

    @staticmethod
    def _fit_by_name(model_name: str, df: pd.DataFrame, target_col: str):
        """Fit the model family selected by name; unknown names use auto ARIMA."""
        key = model_name.lower()
        if key == 'arima':
            return TimeSeriesModels.fit_arima_model(df, target_col)
        if key == 'sarimax':
            return TimeSeriesModels.fit_sarimax_model(df, target_col)
        _, model = TimeSeriesModels.auto_arima_forecast(df, target_col)
        return model

    @staticmethod
    def _forecast_values(model, steps: int):
        """Forecast `steps` periods with either a statsmodels or a pmdarima model."""
        if hasattr(model, 'forecast'):
            return model.forecast(steps=steps)
        # pmdarima models expose predict(n_periods=...) rather than forecast().
        return model.predict(n_periods=steps)

    @staticmethod
    def forecast_with_model(model_name: str, data: pd.DataFrame, steps: int = 7, target_col: str = 'aqi'):
        """
        Generate forecasts using the specified model

        Parameters:
        -----------
        model_name : str
            'arima', 'sarimax' (case-insensitive when fitting) or any other
            name, which selects auto ARIMA
        data : pd.DataFrame
            Time series data with a 'date' column or a datetime index
        steps : int
            Number of daily steps to forecast
        target_col : str
            Column name of the target variable

        Returns:
        --------
        forecast_df : pd.DataFrame
            Columns 'date' (one day after the last observed date onwards)
            and `target_col` (rounded, clipped at 0)
        """
        df = TimeSeriesModels.prepare_data_for_ts(data, target_col)
        last_date = df.index[-1]

        # NOTE(review): a previously saved model is reused as-is, so its
        # forecast reflects the data it was originally fitted on rather than
        # `data`. Confirm this is intended before relying on it across
        # different cities or uploads.
        model = TimeSeriesModels.load_model(model_name)
        if model is None:
            model = TimeSeriesModels._fit_by_name(model_name, df, target_col)

        try:
            forecast = TimeSeriesModels._forecast_values(model, steps)
        except Exception as e:
            print(f"Error generating forecast with {model_name}: {e}")
            # The stored model could not forecast; refit on the current data.
            model = TimeSeriesModels._fit_by_name(model_name, df, target_col)
            forecast = TimeSeriesModels._forecast_values(model, steps)

        dates = [last_date + timedelta(days=i + 1) for i in range(steps)]
        forecast_df = pd.DataFrame({
            'date': dates,
            target_col: forecast.values if hasattr(forecast, 'values') else forecast
        })

        # AQI is reported as a non-negative whole number.
        forecast_df[target_col] = forecast_df[target_col].apply(lambda x: max(0, round(x)))

        return forecast_df

    @staticmethod
    def process_csv_data(csv_file_path: str, target_col: str = 'aqi', date_col: str = 'date'):
        """
        Process CSV data for time series modeling

        Reads the file, checks that `date_col` and `target_col` exist, parses
        the dates and sorts by them.

        Returns:
        --------
        df : pd.DataFrame or None
            The sorted data, or None when reading or validation fails (the
            error is printed; callers are expected to check for None).
        """
        try:
            df = pd.read_csv(csv_file_path)

            if date_col not in df.columns:
                raise ValueError(f"Date column '{date_col}' not found in CSV file")
            if target_col not in df.columns:
                raise ValueError(f"Target column '{target_col}' not found in CSV file")

            df[date_col] = pd.to_datetime(df[date_col])
            return df.sort_values(date_col)
        except Exception as e:
            print(f"Error processing CSV file: {e}")
            return None

### Create main.py

In [None]:
%%writefile main.py
from fastapi import FastAPI, HTTPException, Depends, Query, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Optional, Dict, Any, Union
import pandas as pd
import numpy as np
import joblib
from datetime import datetime, timedelta
import os
import requests
from pydantic import BaseModel
import json
from time_series_models import TimeSeriesModels

# Models for request/response
class PollutantData(BaseModel):
    """Per-pollutant readings attached to an AQI data point; every field is required."""
    # NOTE(review): units/scale are not defined in this file - the fetchers pass
    # provider values through unchanged and use 0 for pollutants they do not get.
    pm25: float
    pm10: float
    no2: float
    o3: float
    co: float
    so2: float
    nh3: float

class AQIDataPoint(BaseModel):
    """One AQI value for a city on a given date, either observed/simulated history or a forecast."""
    date: str  # the fetchers in this file emit "%Y-%m-%d" strings
    city: str
    location: Optional[str] = None  # only the AQICN fetcher in this file populates it
    aqi: float
    pollutants: Optional[PollutantData] = None
    predicted: Optional[bool] = False  # the fetchers set False; forecast rows are flagged True

class AQIRequest(BaseModel):
    """Request body for POST /api/fetch-aqi."""
    city: str
    state: Optional[str] = None  # forwarded to the AirVisual fetcher only
    country: Optional[str] = "India"  # forwarded to the AirVisual fetcher only
    api_key: str  # sent to the provider as AirVisual "key" / AQICN "token"
    platform: str = "airvisual"  # 'airvisual' or 'aqicn'; any value other than "airvisual" is routed to AQICN

class PredictionRequest(BaseModel):
    """Request body for POST /api/predict: the history to forecast from and the model to use."""
    historical_data: List[AQIDataPoint]
    # "ARIMA"/"SARIMAX" (matched case-insensitively) use fitted time series
    # models; any other name is served by the simulated forecasts.
    model_name: str = "ARIMA"  # Default to ARIMA if not specified

class CSVDataRequest(BaseModel):
    """CSV prediction options: column names and model choice."""
    # NOTE(review): not referenced by any endpoint visible in this section -
    # /api/predict-csv declares the same options as query parameters instead.
    target_column: str = "aqi"
    date_column: str = "date"
    city_column: Optional[str] = None
    model_name: str = "ARIMA"

# Create the FastAPI application object that all route decorators below attach to
app = FastAPI(title="AQI Prediction API")

# Add CORS middleware so a browser frontend served from another origin can call this API
# NOTE(review): FastAPI's CORS documentation says wildcard origins cannot be
# combined with credentialed requests - confirm whether allow_credentials is
# actually needed, or list the frontend origin explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins (you should restrict this in production)
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Create the working directories at import time if they don't exist:
# uploaded CSV files are written to UPLOAD_DIR and /api/models lists "models"
UPLOAD_DIR = "uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs("models", exist_ok=True)

# API Endpoints
@app.get("/")
def read_root():
    """Liveness check: report that the API process is up."""
    status_payload = dict(message="AQI Prediction API is running")
    return status_payload

@app.post("/api/fetch-aqi", response_model=List[AQIDataPoint])
async def fetch_aqi_data(request: AQIRequest):
    """
    Fetch AQI data from the selected platform (airvisual or aqicn)

    Any platform value other than "airvisual" is routed to AQICN.

    Raises:
        HTTPException: the status raised by the provider fetcher is passed
            through unchanged; any other failure is reported as 500.
    """
    try:
        if request.platform == "airvisual":
            return await fetch_airvisual_data(request.city, request.state, request.country, request.api_key)
        return await fetch_aqicn_data(request.city, request.api_key)
    except HTTPException:
        # The fetchers already chose a meaningful status (provider status or
        # 400); do not flatten it into a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching AQI data: {str(e)}") from e

@app.post("/api/predict", response_model=List[AQIDataPoint])
async def predict_aqi(request: PredictionRequest):
    """
    Generate AQI predictions based on historical data and chosen model

    The history is flattened into a DataFrame (one row per data point, with
    pollutant readings as separate columns when present), handed to
    generate_predictions, and the resulting rows are converted back into
    AQIDataPoint objects.

    Raises:
        HTTPException: 500 if building the frame, predicting or converting
            the result fails.
    """
    pollutant_names = ("pm25", "pm10", "no2", "o3", "co", "so2", "nh3")
    try:
        records = []
        for point in request.historical_data:
            record = {
                "date": point.date,
                "city": point.city,
                # Always present (possibly None) so the location of the
                # history is carried through to the forecast rows.
                "location": point.location,
                "aqi": point.aqi
            }
            if point.pollutants:
                for name in pollutant_names:
                    record[name] = getattr(point.pollutants, name)
            records.append(record)

        df = pd.DataFrame(records)

        if not df.empty:
            df['date'] = pd.to_datetime(df['date'])
            df = df.sort_values('date')

        predictions = generate_predictions(df, request.model_name)

        result = []
        for _, row in predictions.iterrows():
            pollutants = None
            if all(name in row.index for name in pollutant_names):
                pollutants = PollutantData(**{name: float(row[name]) for name in pollutant_names})

            # Dates may arrive as Timestamp, datetime or datetime.date
            # depending on the model branch; all of them offer strftime.
            raw_date = row["date"]
            date_text = raw_date.strftime("%Y-%m-%d") if hasattr(raw_date, "strftime") else str(raw_date)

            location = row["location"] if "location" in row.index else None
            if pd.isna(location):
                # NaN is what pandas uses for a missing location; the
                # response field expects a string or None.
                location = None

            result.append(AQIDataPoint(
                date=date_text,
                city=row["city"],
                location=location,
                aqi=float(row["aqi"]),
                pollutants=pollutants,
                predicted=bool(row["predicted"])
            ))

        return result
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error generating predictions: {str(e)}") from e

@app.post("/api/predict-csv", response_model=List[AQIDataPoint])
async def predict_from_csv(
    file: UploadFile = File(...),
    target_column: str = Query("aqi"),
    date_column: str = Query("date"),
    city_column: Optional[str] = Query(None),
    model_name: str = Query("ARIMA")
):
    """
    Generate predictions from uploaded CSV data

    The upload is stored under UPLOAD_DIR, parsed with
    TimeSeriesModels.process_csv_data and passed to
    generate_predictions_from_csv.

    Raises:
        HTTPException: 400 when the CSV cannot be processed or has no rows,
            500 for any other failure.
    """
    try:
        # The filename is client-controlled: keep only its final component so
        # it cannot point outside the upload directory.
        safe_name = os.path.basename(file.filename or "") or "upload.csv"
        file_path = os.path.join(UPLOAD_DIR, safe_name)
        content = await file.read()
        with open(file_path, "wb") as f:
            f.write(content)

        df = TimeSeriesModels.process_csv_data(file_path, target_column, date_column)

        if df is None:
            raise HTTPException(status_code=400, detail="Failed to process CSV file")
        if df.empty:
            raise HTTPException(status_code=400, detail="CSV file contains no data rows")

        # The prediction helper looks the dates up under the name 'date'.
        if date_column != 'date' and 'date' not in df.columns:
            df = df.rename(columns={date_column: 'date'})

        if city_column and city_column in df.columns:
            city = df[city_column].iloc[0]
            if city_column != 'city':
                # Expose the user-selected column under the name the
                # prediction helper and the response conversion read.
                df['city'] = df[city_column]
        else:
            city = "Unknown"
            df['city'] = city

        predictions_df = generate_predictions_from_csv(df, model_name, target_column)

        result = []
        for _, row in predictions_df.iterrows():
            raw_date = row["date"]
            date_text = raw_date.strftime("%Y-%m-%d") if hasattr(raw_date, "strftime") else str(raw_date)
            result.append(AQIDataPoint(
                date=date_text,
                # CSV city values are not guaranteed to be strings.
                city=str(row["city"]) if "city" in row.index else str(city),
                aqi=float(row[target_column]),
                predicted=bool(row["predicted"])
            ))

        return result
    except HTTPException:
        # Keep the deliberate 400 responses instead of rewrapping them as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing CSV file: {str(e)}") from e

@app.get("/api/models")
async def list_available_models():
    """
    Return the names of every model a client may request.

    The result combines the built-in model names with the name of each
    ``*_model.pkl`` file found in the ``models`` directory, without
    duplicates and in no particular order.
    """
    suffix = "_model.pkl"

    # Start from the built-in names; the set takes care of de-duplication.
    names = {"ARIMA", "SARIMAX", "RandomForest", "LSTM"}

    # Add whatever has been trained and saved so far.
    for entry in os.listdir("models"):
        if entry.endswith(suffix):
            names.add(entry.replace(suffix, ""))

    return {"models": list(names)}

# Helper functions for data fetching and predictions
async def fetch_airvisual_data(city: str, state: Optional[str], country: str, api_key: str) -> List[AQIDataPoint]:
    """
    Fetch data from AirVisual API

    Only the current reading comes from the provider; the 14 preceding days
    are simulated by randomly varying the current values.

    Returns:
        15 data points (14 simulated days plus today), oldest first.

    Raises:
        HTTPException: with the provider's HTTP status when the request
            fails, or 400 when the payload status is not "success".
    """
    request_timeout_seconds = 10

    base_url = "https://api.airvisual.com/v2/city"
    params = {
        "city": city,
        "state": state if state and state != "All States" else "Delhi",  # Default to Delhi if not specified
        "country": country,
        "key": api_key
    }

    # requests blocks the event loop while waiting; the timeout bounds how
    # long a stalled provider can hold up this worker.
    response = requests.get(base_url, params=params, timeout=request_timeout_seconds)

    if not response.ok:
        raise HTTPException(status_code=response.status_code,
                           detail=f"AirVisual API error: {response.text}")

    data = response.json()

    if data.get("status") != "success":
        raise HTTPException(status_code=400, detail="Failed to get data from AirVisual")

    # Read the clock once so every generated date is relative to the same day.
    now = datetime.now()
    city_name = city

    pollution = data["data"]["current"]["pollution"]
    current_aqi = pollution["aqius"]
    pollutants = PollutantData(
        pm25=pollution.get("pm25", 0),
        pm10=pollution.get("pm10", 0),
        no2=0,  # AirVisual free API doesn't provide these values
        o3=0,
        co=0,
        so2=0,
        nh3=0
    )

    result = [AQIDataPoint(
        date=now.strftime("%Y-%m-%d"),
        city=city_name,
        aqi=current_aqi,
        pollutants=pollutants,
        predicted=False
    )]

    # Generate historical data (simulated)
    for days_back in range(1, 15):
        # int() turns the numpy integers into plain Python numbers before
        # they are handed to the response models.
        variation = int(np.random.randint(-10, 11))
        simulated_pollutants = PollutantData(
            pm25=max(0, pollutants.pm25 + int(np.random.randint(-5, 6))),
            pm10=max(0, pollutants.pm10 + int(np.random.randint(-7, 8))),
            no2=0,
            o3=0,
            co=0,
            so2=0,
            nh3=0
        )

        result.append(AQIDataPoint(
            date=(now - timedelta(days=days_back)).strftime("%Y-%m-%d"),
            city=city_name,
            aqi=max(0, current_aqi + variation),
            pollutants=simulated_pollutants,
            predicted=False
        ))

    # ISO dates sort chronologically as plain strings.
    result.sort(key=lambda point: point.date)
    return result

async def fetch_aqicn_data(city: str, api_key: str) -> List[AQIDataPoint]:
    """
    Fetch data from AQICN API

    Only the current reading comes from the provider; the 14 preceding days
    are simulated by randomly varying the current values.

    Returns:
        15 data points (14 simulated days plus today), oldest first.

    Raises:
        HTTPException: with the provider's HTTP status when the request
            fails, or 400 when the payload status is not "ok".
    """
    request_timeout_seconds = 10

    # For "Area, City" style input only the part after the last comma is queried.
    base_city = city.split(",")[-1].strip() if "," in city else city

    base_url = f"https://api.waqi.info/feed/{base_city}/"
    params = {"token": api_key}

    # requests blocks the event loop while waiting; the timeout bounds how
    # long a stalled provider can hold up this worker.
    response = requests.get(base_url, params=params, timeout=request_timeout_seconds)

    if not response.ok:
        raise HTTPException(status_code=response.status_code,
                           detail=f"AQICN API error: {response.text}")

    data = response.json()

    if data.get("status") != "ok":
        raise HTTPException(status_code=400, detail="Failed to get data from AQICN")

    # Read the clock once so every generated date is relative to the same day.
    now = datetime.now()

    # The station name is split on commas: the first part is used as the
    # specific location and the last part as the city.
    location_parts = data["data"]["city"]["name"].split(",")
    has_multiple_parts = len(location_parts) > 1
    specific_location = location_parts[0].strip() if has_multiple_parts else ""
    city_name = location_parts[-1].strip() if has_multiple_parts else base_city

    current_aqi = data["data"]["aqi"]
    iaqi = data["data"]["iaqi"]
    pollutants = PollutantData(
        pm25=iaqi.get("pm25", {}).get("v", 0),
        pm10=iaqi.get("pm10", {}).get("v", 0),
        no2=iaqi.get("no2", {}).get("v", 0),
        o3=iaqi.get("o3", {}).get("v", 0),
        co=iaqi.get("co", {}).get("v", 0),
        so2=iaqi.get("so2", {}).get("v", 0),
        nh3=0  # AQICN doesn't provide NH3 typically
    )

    result = [AQIDataPoint(
        date=now.strftime("%Y-%m-%d"),
        city=city_name,
        location=specific_location,
        aqi=current_aqi,
        pollutants=pollutants,
        predicted=False
    )]

    # Generate historical data (simulated)
    for days_back in range(1, 15):
        # int() turns the numpy integers into plain Python numbers before
        # they are handed to the response models.
        variation = int(np.random.randint(-10, 11))
        simulated_pollutants = PollutantData(
            pm25=max(0, pollutants.pm25 + int(np.random.randint(-5, 6))),
            pm10=max(0, pollutants.pm10 + int(np.random.randint(-7, 8))),
            no2=max(0, pollutants.no2 + int(np.random.randint(-4, 5))),
            o3=max(0, pollutants.o3 + int(np.random.randint(-3, 4))),
            co=max(0, pollutants.co + int(np.random.randint(-2, 3))),
            so2=max(0, pollutants.so2 + int(np.random.randint(-1, 2))),
            nh3=0
        )

        result.append(AQIDataPoint(
            date=(now - timedelta(days=days_back)).strftime("%Y-%m-%d"),
            city=city_name,
            location=specific_location,
            aqi=max(0, current_aqi + variation),
            pollutants=simulated_pollutants,
            predicted=False
        ))

    # ISO dates sort chronologically as plain strings.
    result.sort(key=lambda point: point.date)
    return result

_POLLUTANT_COLUMNS = ("pm25", "pm10", "no2", "o3", "co", "so2", "nh3")


def _nan_to_zero(value):
    """Return 0 for a missing (None/NaN) reading, otherwise the value itself."""
    return 0 if pd.isna(value) else value


def _simulate_aqi_values(model_name: str, last_aqi, current_aqi, horizon: int = 7) -> list:
    """Simulate `horizon` daily AQI values for the models that are not fitted time series models."""
    values = []
    for day in range(horizon):
        if day == 0 and current_aqi is not None:
            # For today, use the actual current AQI
            values.append(current_aqi)
            continue

        previous = values[-1] if values else last_aqi
        if model_name == "RandomForest":
            # Step-wise decay with noise that grows with the horizon
            candidate = previous * 0.9 + np.random.normal(0, 2 + day)
        elif model_name == "LSTM":
            trend = -2  # Slight downward trend
            seasonality = 5 * np.sin(day / 7 * 2 * np.pi)  # Weekly cycle
            residual = np.random.normal(0, 3)
            candidate = previous + trend + seasonality + residual
        else:
            # Simple linear decrease from the last observed value, with noise
            candidate = last_aqi - day * 2 + np.random.normal(0, 5)
        values.append(max(0, candidate))
    return values


def _add_pollutant_forecasts(forecast_df: pd.DataFrame, history: pd.DataFrame,
                             last_aqi, current_point) -> pd.DataFrame:
    """Add one column per pollutant, scaling the latest reading by each row's AQI ratio."""
    latest_row = history.iloc[-1]
    for pollutant in _POLLUTANT_COLUMNS:
        base_val = _nan_to_zero(latest_row.get(pollutant, 0))
        column_values = []
        for _, row in forecast_df.iterrows():
            is_actual = not bool(row["predicted"])
            if is_actual and current_point is not None and pollutant in current_point:
                # For actual data points, use the actual value if available
                column_values.append(_nan_to_zero(current_point[pollutant]))
            else:
                aqi_ratio = row["aqi"] / last_aqi if last_aqi > 0 else 1
                predicted_val = base_val * aqi_ratio * (0.95 + np.random.random() * 0.1)
                column_values.append(max(0, round(predicted_val)))
        forecast_df[pollutant] = column_values
    return forecast_df


def generate_predictions(df: pd.DataFrame, model_name: str) -> pd.DataFrame:
    """
    Generate AQI predictions using the specified model

    "ARIMA" and "SARIMAX" (case-insensitive) forecast with
    TimeSeriesModels.forecast_with_model and prepend today's actual value;
    every other model name produces a simulated 7-day series starting today.
    When the history carries pollutant columns, a value is generated for each
    pollutant as well.

    Parameters:
        df: history with at least 'date', 'city' and 'aqi' columns, ordered
            by date; may also contain 'location', 'predicted' and pollutant
            columns.
        model_name: name of the model to use.

    Returns:
        DataFrame with 'date' (as "%Y-%m-%d" strings), 'city', 'aqi',
        'predicted' and, when available, 'location' and pollutant columns.
        An empty frame with the basic columns is returned for empty input or
        when prediction fails (the error is printed).
    """
    empty_result = pd.DataFrame(columns=["date", "city", "location", "aqi", "predicted"])
    if df.empty:
        return empty_result

    last_row = df.iloc[-1]
    city = last_row["city"]
    # Missing locations show up as None or NaN; NaN is truthy, so normalise it.
    location = last_row["location"] if "location" in df.columns else None
    if pd.isna(location):
        location = None
    # Needed by both branches: it scales the pollutant forecasts below.
    last_aqi = last_row["aqi"]

    # A Timestamp (not datetime.date) so it sorts together with forecast dates.
    today = pd.Timestamp(datetime.now().date())

    # Latest data point that is not itself a prediction
    actual_data = df[~df["predicted"].eq(True)] if "predicted" in df.columns else df
    current_aqi_point = None
    if not actual_data.empty:
        current_aqi_point = actual_data.sort_values("date", ascending=False).iloc[0].to_dict()

    try:
        if model_name.upper() in ("ARIMA", "SARIMAX"):
            forecast_df = TimeSeriesModels.forecast_with_model(model_name.lower(), df, 7).copy()
            # Normalise to naive timestamps so today's row can be merged in.
            forecast_df["date"] = pd.to_datetime(forecast_df["date"], utc=True).dt.tz_localize(None)
            forecast_df["city"] = city
            forecast_df["predicted"] = True
            if location:
                forecast_df["location"] = location

            if current_aqi_point is not None and not forecast_df.empty:
                # The forecast starts after the last observation, so today's
                # actual value is inserted as a non-predicted row.
                today_row = {
                    "date": today,
                    "aqi": current_aqi_point["aqi"],
                    "city": city,
                    "predicted": False
                }
                if location:
                    today_row["location"] = location

                forecast_df = pd.concat([forecast_df, pd.DataFrame([today_row])], ignore_index=True)
                # If a forecast row falls on today as well, keep the actual one
                # (False sorts before True).
                forecast_df = (
                    forecast_df.sort_values(["date", "predicted"])
                    .drop_duplicates("date", keep="first")
                    .reset_index(drop=True)
                )
        else:
            # Simulation-based forecast covering today plus the next six days
            forecast_df = pd.DataFrame({
                "date": pd.date_range(today, periods=7, freq="D"),
                "city": city,
                "predicted": True
            })
            if location:
                forecast_df["location"] = location

            current_aqi = current_aqi_point["aqi"] if current_aqi_point else None
            simulated = _simulate_aqi_values(model_name, last_aqi, current_aqi)
            forecast_df["aqi"] = [round(val) for val in simulated]

        if "pollutants" in df.columns or any(col in df.columns for col in _POLLUTANT_COLUMNS):
            forecast_df = _add_pollutant_forecasts(forecast_df, df, last_aqi, current_aqi_point)

        # Always hand back plain date strings, whatever branch produced them.
        forecast_df["date"] = pd.to_datetime(forecast_df["date"]).dt.strftime("%Y-%m-%d")

    except Exception as e:
        print(f"Error generating predictions: {str(e)}")
        # Return empty dataframe if prediction fails
        return empty_result

    return forecast_df

def generate_predictions_from_csv(df: pd.DataFrame, model_name: str, target_col: str = 'aqi') -> pd.DataFrame:
    """
    Generate predictions from CSV data

    Builds a 7-day forecast that continues the supplied history and returns
    the history and the forecast together as one DataFrame sorted by date.

    Args:
        df: Historical data. Must contain a 'date' column and ``target_col``.
            A 'city' column is optional; 'Unknown' is used when it is absent.
            The caller's DataFrame is never modified.
        model_name: Model selector, matched case-insensitively.
            "ARIMA" / "SARIMAX" delegate to
            ``TimeSeriesModels.forecast_with_model``. "RandomForest" and
            "LSTM" do not train anything: they produce heuristic values
            derived from the last observed value plus random noise. Any
            other name falls back to a noisy linear decline.
        target_col: Name of the column to forecast.

    Returns:
        The historical rows (``predicted`` = False) together with the
        forecast rows (``predicted`` = True), sorted by 'date'.

    Raises:
        ValueError: If a required column is missing or ``df`` has no rows.
    """
    horizon = 7  # number of days to forecast

    # Check if we have the minimum required columns
    for col in ('date', target_col):
        if col not in df.columns:
            raise ValueError(f"Required column {col} not found in CSV data")

    # Fail early with a clear message instead of an IndexError from .iloc below
    if df.empty:
        raise ValueError("CSV data contains no rows")

    # Work on a copy so that adding/converting columns does not leak into
    # the DataFrame owned by the caller
    df = df.copy()
    if 'city' not in df.columns:
        df['city'] = 'Unknown'
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date')

    city = df['city'].iloc[0]
    last_date = df['date'].max()
    model_key = model_name.upper()
    uses_time_series_model = model_key in ("ARIMA", "SARIMAX")

    if uses_time_series_model:
        # Use time series model
        forecast = TimeSeriesModels.forecast_with_model(
            model_name.lower(), df, horizon, target_col
        )
        forecast['predicted'] = True
        forecast['city'] = city
    else:
        # For other models, use a simpler approach seeded by the last actual value
        last_actual = df[target_col].iloc[-1]

        forecast = pd.DataFrame(
            {'date': [last_date + timedelta(days=step + 1) for step in range(horizon)]}
        )
        forecast['city'] = city
        forecast['predicted'] = True

        if model_key == "RANDOMFOREST":
            # Random forest tends to have less extreme predictions
            vals = [
                max(0, last_actual * (0.95 + 0.1 * np.random.randn()) - step * 2)
                for step in range(horizon)
            ]
        elif model_key == "LSTM":
            # Smoothed recurrence with a 7-day sinusoidal cycle and a downward trend
            vals = []
            prev = last_actual
            for step in range(horizon):
                val = prev * 0.9 + last_actual * 0.1 + 5 * np.sin(step / 7 * 2 * np.pi) - step
                vals.append(max(0, val + np.random.randn() * 5))
                # The un-noised value feeds the next step, so noise does not accumulate
                prev = val
        else:
            # Default simple approach
            vals = [
                max(0, last_actual - step * 3 + np.random.randn() * 7)
                for step in range(horizon)
            ]
        forecast[target_col] = [round(val) for val in vals]

    # Combine historical and forecast data
    historical = df.copy()
    historical['predicted'] = False
    if uses_time_series_model:
        # Kept from the original time-series path: every valid date is
        # <= last_date, so the only rows this removes are those whose date
        # is missing (NaT)
        historical = historical[historical['date'] <= last_date]

    combined = pd.concat([historical, forecast])
    return combined.sort_values('date')

# Run the server with: uvicorn main:app --reload
# Alternatively, executing this module directly starts uvicorn programmatically
# with the same "main:app" target, bound to all interfaces (0.0.0.0) on port
# 8000 with auto-reload enabled.
if __name__ == "__main__":
    # Imported here so uvicorn is only required when the file is run as a script
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)

## Step 3: Run the FastAPI server

Now, let's run the FastAPI server using ngrok to make it publicly accessible:

In [None]:
# Import necessary libraries
from pyngrok import ngrok, conf
import threading
import time
import uvicorn

# Start the FastAPI app in a separate thread
def start_server() -> None:
    """Serve the "main:app" application with uvicorn on 127.0.0.1, port 8000.

    Used as the target of the background thread created right after this
    definition. The server binds to the loopback interface only.
    """
    uvicorn.run("main:app", host="127.0.0.1", port=8000)

# Start the server in a background thread.
# daemon=True means the thread does not keep the Python process alive on exit.
server_thread = threading.Thread(target=start_server, daemon=True)
server_thread.start()

# Wait for the server to start.
# NOTE: this is a fixed 2-second pause; no readiness check is performed.
time.sleep(2)

# Setup ngrok: open a tunnel to local port 8000 (the port start_server uses)
# and print the public URL the frontend should be pointed at.
ngrok_tunnel = ngrok.connect(8000)
print(f"\n\n✅ Backend server is running at: {ngrok_tunnel.public_url}")
print("\n📋 Use this URL in your frontend settings to connect to the backend")
print("\n📝 API Documentation: {}/docs".format(ngrok_tunnel.public_url))
print("\nKeep this notebook running while using the backend!\n")

# Keep the notebook running: this loop blocks the cell until it is interrupted.
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    # Interrupting the cell closes the tunnel; the server thread itself is
    # not explicitly stopped here.
    print("Shutting down...")
    ngrok.disconnect(ngrok_tunnel.public_url)

## Step 4: Test the API

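Once the server is running, you can test the endpoints using the URL provided above. Note that the Step 3 cell keeps running until you interrupt it, and interrupting it closes the ngrok tunnel, so send your test requests from outside this notebook (for example, from your frontend or the `/docs` page) while that cell is still running.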

### Example: Creating and Using a CSV for predictions

Let's create a sample CSV file (`sample_aqi_data.csv`) that you can use to test CSV-based predictions:

In [None]:
# Create a sample CSV file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Generate sample data: the last 30 calendar days as YYYY-MM-DD strings,
# newest first
dates = [
    (datetime.now() - timedelta(days=days_ago)).strftime('%Y-%m-%d')
    for days_ago in range(30)
]

# Baseline level that the synthetic AQI series fluctuates around
base_aqi = 100


def _synthetic_aqi(step):
    """Return one non-negative, rounded AQI sample for row index ``step``."""
    linear_part = -step * 0.5                        # slight decrease per row
    weekly_part = 15 * np.sin(step / 7 * 2 * np.pi)  # 7-row sinusoidal cycle
    jitter = np.random.normal(0, 10)                 # Gaussian noise, std 10
    return max(0, round(base_aqi + linear_part + weekly_part + jitter))


# One AQI value per date, combining trend, weekly seasonality and noise
aqi_values = [_synthetic_aqi(step) for step in range(30)]

# Assemble the table; the scalar city name is broadcast to every row
sample_columns = {
    'date': dates,
    'city': 'Delhi',
    'aqi': aqi_values,
}
df = pd.DataFrame(sample_columns)

# Write the table to disk without the index column
csv_path = 'sample_aqi_data.csv'
df.to_csv(csv_path, index=False)

print(f"Created sample CSV file: {csv_path}")
df.head()

## How to use this backend with your frontend

1. Copy the ngrok URL displayed above
2. In your frontend application, open the Backend Settings
3. Enable backend integration
4. Paste the ngrok URL
5. Test the connection
6. Save settings

Now your frontend will use this backend for AQI predictions using ARIMA and SARIMAX models!