In [1]:
import requests
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Configuration
LATITUDE = 24.8607
LONGITUDE = 67.0011
CITY_NAME = "Karachi"
AQICN_API_TOKEN = "6fd579d03bc442add85f9d948a9a05424f8d5fbb"
PAST_DAYS = 180

os.makedirs("data", exist_ok=True)

def fetch_aqicn_current(lat, lon, token, city_name):
    """Fetch current AQI data from AQICN API"""
    url = f"https://api.waqi.info/feed/{city_name}/"
    params = {'token': token}
    
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        
        if data['status'] == 'ok':
            return data['data']
        
        # Fallback to coordinates
        url = f"https://api.waqi.info/feed/geo:{lat};{lon}/"
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
        
        if data['status'] == 'ok':
            return data['data']
    except Exception as e:
        print(f"Error fetching AQICN data: {e}")
    
    return None

def fetch_open_meteo(lat, lon, past_days):
    """Fetch air quality data from Open-Meteo"""
    url = (
        "https://air-quality-api.open-meteo.com/v1/air-quality"
        f"?latitude={lat}&longitude={lon}"
        "&hourly=pm2_5,pm10,nitrogen_dioxide,ozone,sulphur_dioxide,carbon_monoxide"
        f"&past_days={past_days}&timezone=UTC"
    )
    
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        
        df = pd.DataFrame(data["hourly"])
        df["time"] = pd.to_datetime(df["time"], utc=True)
        df = df[df["time"] <= pd.Timestamp.utcnow()]
        
        return df
    except Exception as e:
        print(f"Error fetching pollution data: {e}")
        return None

def fetch_weather_data(lat, lon, start_date, end_date):
    """Fetch weather data from Meteostat"""
    try:
        import meteostat as ms
        
        stations = ms.stations.nearby(ms.Point(lat, lon), limit=5)
        if stations.empty:
            return None
        
        station_id = stations.index[0]
        ts = ms.hourly(station_id, start_date, end_date, timezone="UTC")
        weather_df = ts.fetch()
        
        if weather_df is None or weather_df.empty:
            return None
        
        weather_df = weather_df.reset_index()
        if 'time' not in weather_df.columns:
            weather_df = weather_df.rename(columns={weather_df.columns[0]: 'time'})
        
        weather_df['time'] = pd.to_datetime(weather_df['time'])
        return weather_df
    except:
        return None

def calculate_pm25_aqi(pm25):
    """Calculate AQI from PM2.5 concentration"""
    if pd.isna(pm25) or pm25 < 0:
        return np.nan
    
    breakpoints = [
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 500.4, 301, 500),
    ]
    
    for C_low, C_high, I_low, I_high in breakpoints:
        if C_low <= pm25 <= C_high:
            aqi = ((I_high - I_low) / (C_high - C_low)) * (pm25 - C_low) + I_low
            return round(aqi)
    
    return 500

def calculate_pm10_aqi(pm10):
    """Calculate AQI from PM10 concentration"""
    if pd.isna(pm10) or pm10 < 0:
        return np.nan
    
    breakpoints = [
        (0, 54, 0, 50),
        (55, 154, 51, 100),
        (155, 254, 101, 150),
        (255, 354, 151, 200),
        (355, 424, 201, 300),
        (425, 604, 301, 500),
    ]
    
    for C_low, C_high, I_low, I_high in breakpoints:
        if C_low <= pm10 <= C_high:
            aqi = ((I_high - I_low) / (C_high - C_low)) * (pm10 - C_low) + I_low
            return round(aqi)
    
    return 500

def get_aqi_category(aqi):
    """Get AQI category from value"""
    if pd.isna(aqi):
        return 'Unknown'
    elif aqi <= 50:
        return 'Good'
    elif aqi <= 100:
        return 'Moderate'
    elif aqi <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif aqi <= 200:
        return 'Unhealthy'
    elif aqi <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

def get_aqi_color(aqi):
    """Get color code for AQI"""
    if pd.isna(aqi):
        return 'gray'
    elif aqi <= 50:
        return 'green'
    elif aqi <= 100:
        return 'yellow'
    elif aqi <= 150:
        return 'orange'
    elif aqi <= 200:
        return 'red'
    elif aqi <= 300:
        return 'purple'
    else:
        return 'maroon'

def add_time_features(df):
    """Add time-based features"""
    df['hour'] = df['time'].dt.hour
    df['day_of_week'] = df['time'].dt.dayofweek
    df['day_of_month'] = df['time'].dt.day
    df['month'] = df['time'].dt.month
    df['day_of_year'] = df['time'].dt.dayofyear
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    
    def get_time_of_day(hour):
        if 6 <= hour < 12:
            return 'morning'
        elif 12 <= hour < 17:
            return 'afternoon'
        elif 17 <= hour < 21:
            return 'evening'
        else:
            return 'night'
    
    df['time_of_day'] = df['hour'].apply(get_time_of_day)
    
    def get_season(month):
        if month in [12, 1, 2]:
            return 'winter'
        elif month in [3, 4, 5]:
            return 'spring'
        elif month in [6, 7, 8]:
            return 'summer'
        else:
            return 'fall'
    
    df['season'] = df['month'].apply(get_season)
    return df

def add_lag_features(df):
    """Add lag features for time series"""
    for lag in [1, 3, 6, 12, 24]:
        df[f'aqi_lag_{lag}h'] = df['aqi'].shift(lag)
    
    for lag in [1, 6, 24]:
        df[f'pm25_lag_{lag}h'] = df['pm2_5'].shift(lag)
    
    return df

def add_rolling_features(df):
    """Add rolling statistics"""
    for window in [6, 12, 24]:
        df[f'aqi_ma_{window}h'] = df['aqi'].rolling(window=window, min_periods=1).mean()
        df[f'aqi_std_{window}h'] = df['aqi'].rolling(window=window, min_periods=1).std()
    
    for window in [6, 24]:
        df[f'pm25_ma_{window}h'] = df['pm2_5'].rolling(window=window, min_periods=1).mean()
    
    return df

def main():
    print(f"Collecting AQI data for {CITY_NAME} (last {PAST_DAYS} days)...")
    
    # Fetch pollution data
    pollution_df = fetch_open_meteo(LATITUDE, LONGITUDE, PAST_DAYS)
    if pollution_df is None:
        print("Failed to fetch pollution data")
        return
    
    # Calculate AQI
    pollution_df['aqi_pm25'] = pollution_df['pm2_5'].apply(calculate_pm25_aqi)
    pollution_df['aqi_pm10'] = pollution_df['pm10'].apply(calculate_pm10_aqi)
    pollution_df['aqi'] = pollution_df[['aqi_pm25', 'aqi_pm10']].max(axis=1)
    
    # Fetch weather data
    start_date = pollution_df['time'].min()
    end_date = pollution_df['time'].max()
    weather_df = fetch_weather_data(LATITUDE, LONGITUDE, start_date, end_date)
    
    # Merge data
    pollution_df['time'] = pollution_df['time'].dt.floor('h')
    
    if weather_df is not None:
        weather_df['time'] = pd.to_datetime(weather_df['time']).dt.floor('h')
        merged_df = pollution_df.merge(weather_df, on='time', how='left')
    else:
        merged_df = pollution_df.copy()
    
    # Add features
    merged_df = add_time_features(merged_df)
    merged_df = add_lag_features(merged_df)
    merged_df = add_rolling_features(merged_df)
    merged_df['aqi_category'] = merged_df['aqi'].apply(get_aqi_category)
    merged_df['aqi_color'] = merged_df['aqi'].apply(get_aqi_color)
    
    # Handle missing values
    numeric_cols = merged_df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if merged_df[col].isnull().sum() > 0:
            merged_df[col] = merged_df[col].interpolate(method='linear', limit_direction='both')
    
    merged_df = merged_df.ffill().bfill()
    
    # Save dataset
    output_path = "data/cleaned_aqi_data_v2.csv"
    merged_df.to_csv(output_path, index=False)
    
    print(f"\nDataset saved: {output_path}")
    print(f"Records: {len(merged_df):,}")
    print(f"Features: {len(merged_df.columns)}")
    print(f"Date range: {merged_df['time'].min()} to {merged_df['time'].max()}")
    print(f"Mean AQI: {merged_df['aqi'].mean():.1f}")
    print(f"Max AQI: {merged_df['aqi'].max():.0f}")
    print("\nAQI Distribution:")
    for category, count in merged_df['aqi_category'].value_counts().items():
        print(f"  {category}: {count} ({count/len(merged_df)*100:.1f}%)")

if __name__ == "__main__":
    main()

Collecting AQI data for Karachi (last 180 days)...

Dataset saved: data/cleaned_aqi_data_v2.csv
Records: 4,340
Features: 47
Date range: 2025-08-20 00:00:00+00:00 to 2026-02-16 19:00:00+00:00
Mean AQI: 102.0
Max AQI: 500

AQI Distribution:
  Moderate: 2565 (59.1%)
  Unhealthy for Sensitive Groups: 1244 (28.7%)
  Unhealthy: 371 (8.5%)
  Good: 98 (2.3%)
  Hazardous: 62 (1.4%)


In [2]:
df = pd.read_csv("data/cleaned_aqi_data_v2.csv")

In [3]:
df.columns

Index(['time', 'pm2_5', 'pm10', 'nitrogen_dioxide', 'ozone', 'sulphur_dioxide',
       'carbon_monoxide', 'aqi_pm25', 'aqi_pm10', 'aqi', 'temp', 'rhum',
       'prcp', 'snwd', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'cldc', 'coco',
       'hour', 'day_of_week', 'day_of_month', 'month', 'day_of_year',
       'is_weekend', 'time_of_day', 'season', 'aqi_lag_1h', 'aqi_lag_3h',
       'aqi_lag_6h', 'aqi_lag_12h', 'aqi_lag_24h', 'pm25_lag_1h',
       'pm25_lag_6h', 'pm25_lag_24h', 'aqi_ma_6h', 'aqi_std_6h', 'aqi_ma_12h',
       'aqi_std_12h', 'aqi_ma_24h', 'aqi_std_24h', 'pm25_ma_6h', 'pm25_ma_24h',
       'aqi_category', 'aqi_color'],
      dtype='object')