In [4]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load your dataset
from supabase import create_client
import pandas as pd

import os

# Supabase connection settings. Environment variables take precedence so the
# notebook can be pointed at another project (or key) without editing the
# source; the literals remain only as backward-compatible defaults.
# NOTE(review): prefer removing the literal key from the notebook entirely once
# every environment that runs it exports SUPABASE_URL / SUPABASE_KEY.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://qmjpxafyxikhynczflch.supabase.co")
SUPABASE_KEY = os.environ.get("SUPABASE_KEY", "sb_publishable_3yT3_nQMXBfLu27TyCU9UQ_jI57jX4q")

# Client object shared by the data-loading and forecast-upload cells.
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

# Parse the 'date' column into pandas datetimes.
# NOTE(review): `df` is not created in this cell; in the visible notebook it is
# first assigned in a later cell, so on a fresh kernel this line raises
# NameError. Confirm the intended cell order.
df['date'] = pd.to_datetime(df['date'])

# Window bounds: the newest timestamp in the data and the instant 30 days earlier.
last_date = df['date'].max()
start_date = last_date - pd.Timedelta(days=30)

# Keep rows strictly inside the window; the newest timestamp itself is excluded.
after_start = df['date'] > start_date
before_last = df['date'] < last_date
last_30_days_df = df[after_start & before_last]

# Write the slice to disk without the positional index column.
last_30_days_df.to_csv('last_30_days_data.csv', index=False)

print("Filtered data saved to 'last_30_days_data.csv'")

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Fetch the date and PM2.5 columns from `air_quality_raw`, oldest row first.
# NOTE(review): later cells index 'pm10', 'no2' and 'o3', which this select
# does not request -- confirm the table schema and extend the column list.
# NOTE(review): Supabase projects usually cap the rows returned per request;
# with ascending order a capped response would hold only the OLDEST rows.
# Verify the row count, or paginate, if the table can exceed that cap.
response = (
    supabase.table("air_quality_raw")
    .select("date, pm25")
    .order("date", desc=False)
    .execute()
)

# Fail early with a clear message; an empty result would otherwise surface as
# a KeyError on the missing 'date' column below.
if not response.data:
    raise ValueError("No rows returned from Supabase table 'air_quality_raw'")

df = pd.DataFrame(response.data)
df["date"] = pd.to_datetime(df["date"])


In [None]:
# Drop the ' co' and ' so2' columns (the labels carry a leading space here
# because whitespace is only stripped from the headers in a later cell).
# errors='ignore' keeps the cell re-runnable and tolerant of frames that do
# not contain these columns, instead of raising KeyError.
df = df.drop(columns=[' co', ' so2'], errors='ignore')
print(df.head())

In [None]:
# Trim leading/trailing whitespace from every column label, then preview the
# first 30 rows.
df.columns = [label.strip() for label in df.columns]
print(df.head(30))

In [None]:
# Coerce the pollutant columns to numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce').
cols = ['pm25', 'pm10', 'no2', 'o3']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
df.info()

In [None]:
# Parse the dates and order the rows chronologically
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date').reset_index(drop=True)

pollutant_cols = ['pm25', 'pm10', 'no2', 'o3']

# Interpolate missing values linearly (rows are treated as equally spaced)
df[pollutant_cols] = df[pollutant_cols].interpolate(method='linear')

# Fill any remaining NaN at the start/end using forward fill, then backward fill
df[pollutant_cols] = df[pollutant_cols].ffill().bfill()

# Use the date as the index for the rest of the notebook
df = df.set_index('date')

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
df.info()
print(df.head(30))

In [None]:
# Column labels that still contain at least one null entry.
has_null_by_column = df.isna().any(axis=0)
missing_cols = df.columns[has_null_by_column]
print("Columns with Missing Values:")
print(missing_cols)

In [None]:
# Show every row in which at least one column is NaN.
rows_with_nan = df.isna().any(axis=1)
print(df.loc[rows_with_nan])

In [None]:
# Treat the '-' placeholder as missing, then carry the last valid value forward.
df = df.replace('-', np.nan).ffill()
df.info()

In [None]:
# Preview the first 30 rows of the cleaned frame.
print(df.iloc[:30])

In [None]:
# Pollutant columns fed to the model.
features = ['pm25', 'pm10', 'no2', 'o3']

# Take those columns and fill any gaps with a linear trend between the
# neighbouring known values.
data = df.loc[:, features].interpolate(method='linear')

# Report the remaining null count per column and preview the result.
null_counts = data.isnull().sum()
print("\nMissing values after interpolation:")
print(null_counts)
print("\nProcessed Data Head:")
print(data.head())

In [None]:
# Scaling and Sequence Generation

# MinMaxScaler is imported here because no other visible cell imports it;
# without this line the scaler construction below raises NameError.
from sklearn.preprocessing import MinMaxScaler

# Scale every feature column into the range [0, 1].
# NOTE(review): the scaler is fit on the whole series, i.e. including the rows
# that later end up in the test split -- consider fitting on the training
# portion only if the test metric must be leakage-free.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data)

# Define Sequence Parameters
N_IN = 7      # Lookback window (7 rows; 7 days assuming one row per day)
N_OUT = 2     # Prediction horizon (2 rows)
N_FEATURES = len(features)  # Number of pollutant columns

# Function to Create Sequences (Sliding Window)
def create_sequences(data, n_in, n_out):
    """Slice a 2-D array into overlapping (input, target) windows.

    Window ``k`` takes rows ``k .. k + n_in - 1`` as the input and the
    ``n_out`` rows immediately after them as the target; consecutive windows
    are shifted by one row.

    Args:
        data: 2-D array indexed as ``data[row, column]``.
        n_in: Number of rows in each input window.
        n_out: Number of rows in each target window.

    Returns:
        Tuple ``(X, y)`` of numpy arrays built from the stacked input and
        target windows. Both are empty when ``data`` has fewer than
        ``n_in + n_out`` rows.
    """
    # Number of start positions that leave room for a full input + target span
    window_count = len(data) - n_in - n_out + 1

    inputs = [data[start:start + n_in, :] for start in range(window_count)]
    targets = [
        data[start + n_in:start + n_in + n_out, :]
        for start in range(window_count)
    ]

    return np.array(inputs), np.array(targets)

# Build the sliding-window samples from the scaled series
X, y = create_sequences(scaled_data, N_IN, N_OUT)

print(f"\nShape of Input (X): (Samples, Lookback Days, Features) -> {X.shape}")
print(f"Shape of Output (y): (Samples, Prediction Horizon, Features) -> {y.shape}")

# Chronological split: the earliest 80% of the windows are used for training
# and the remainder for testing; the order is kept, nothing is shuffled.
train_size = int(len(X) * 0.8)
X_train = X[:train_size]
X_test = X[train_size:]
y_train = y[:train_size]
y_test = y[train_size:]

print(f"Train/Test Split: {len(X_train)} training samples, {len(X_test)} test samples.")

In [None]:
# Build, Train, and Evaluate the LSTM Model

# Build the Model
model = Sequential()

# LSTM layer with 50 units. return_sequences is left at its default (False),
# so only the final hidden state is handed to the Dense layer below.
model.add(LSTM(50, activation='relu', input_shape=(N_IN, N_FEATURES)))

# Output layer: one value per (day, pollutant) pair, emitted as a flat vector
# of N_OUT * N_FEATURES values.
model.add(Dense(N_OUT * N_FEATURES))
model.compile(optimizer='adam', loss='mse')

# Display Model Summary
print("\nModel Summary:")
model.summary()

# The Dense layer emits flat vectors, so the 3-D targets
# (samples, N_OUT, N_FEATURES) are flattened to (samples, N_OUT * N_FEATURES).
y_train_flat = y_train.reshape(X_train.shape[0], -1)
y_test_flat = y_test.reshape(X_test.shape[0], -1)

# Training the Model
# EarlyStopping halts training once the validation loss has not improved for
# `patience` epochs. restore_best_weights=True rolls the model back to the best
# epoch; without it the model keeps the weights of the last (worse) epoch.
callback = EarlyStopping(monitor='val_loss',
                         patience=10,
                         restore_best_weights=True,
                         verbose=1)

history = model.fit(X_train, y_train_flat,
                    epochs=100,
                    batch_size=32,
                    verbose=1,
                    validation_split=0.1,  # Use 10% of training data for validation
                    callbacks=[callback])

# Evaluate Model on the Test Set
test_loss = model.evaluate(X_test, y_test_flat, verbose=0)
print(f"\nTest Set Mean Squared Error (MSE): {test_loss:.4f}")

# Make Predictions on Test Set
y_pred_scaled = model.predict(X_test)

# Reshape predictions back to (Samples, N_OUT, N_FEATURES)
y_pred_scaled = y_pred_scaled.reshape(y_test.shape)

# Inverse Transform (rescale to original values).
# The scaler works on 2-D input, so the 3-D arrays are flattened to
# (rows, N_FEATURES) first.
y_test_original = scaler.inverse_transform(y_test.reshape(-1, N_FEATURES))
y_pred_original = scaler.inverse_transform(y_pred_scaled.reshape(-1, N_FEATURES))

# Reshape back to 3D for comparison plots
y_test_original = y_test_original.reshape(y_test.shape)
y_pred_original = y_pred_original.reshape(y_test.shape)

In [None]:
# Final 2-Day Forecast

# Model input: the most recent N_IN scaled rows with a leading batch axis of
# size 1, i.e. shape (1, N_IN, N_FEATURES).
last_7_days_scaled = scaled_data[-N_IN:]
X_input = last_7_days_scaled.reshape(1, N_IN, N_FEATURES)

# Predict, then unflatten the (1, N_OUT * N_FEATURES) output into one row per
# forecast day so the scaler can map it back to original units.
forecast_scaled = model.predict(X_input).reshape(N_OUT, N_FEATURES)
final_forecast_original = scaler.inverse_transform(forecast_scaled)

# Label the forecast rows with the N_OUT calendar days following the last
# index entry of df.
last_date = df.index[-1]
first_forecast_day = last_date + pd.Timedelta(days=1)
forecast_dates = pd.date_range(start=first_forecast_day, periods=N_OUT, freq='D')

forecast_df = pd.DataFrame(
    final_forecast_original,
    index=forecast_dates,
    columns=features,
)

print("\n\n#####################################################")
print("## ✅ 2-DAY AIR QUALITY FORECAST (NEXT 48 HOURS) ✅ ##")
print("#####################################################")
print(forecast_df)
print("-----------------------------------------------------")

In [None]:
# AQI CALCULATION (POST-PROCESSING, NO MODEL CHANGE) ---

# AQI breakpoint tables (CPCB – India)
# Each tuple is (conc_low, conc_high, index_low, index_high).
aqi_breakpoints = {
    "pm25": [(0,30,0,50),(31,60,51,100),(61,90,101,200),(91,120,201,300),(121,250,301,400),(251,500,401,500)],
    "pm10": [(0,50,0,50),(51,100,51,100),(101,250,101,200),(251,350,201,300),(351,430,301,400),(431,600,401,500)],
    "no2":  [(0,40,0,50),(41,80,51,100),(81,180,101,200),(181,280,201,300),(281,400,301,400),(401,1000,401,500)],
    "o3":   [(0,50,0,50),(51,100,51,100),(101,168,101,200),(169,208,201,300),(209,748,301,400)]
}

def calculate_sub_aqi(pollutant, value):
    """Return the AQI sub-index of one pollutant by linear interpolation.

    Args:
        pollutant: Key into ``aqi_breakpoints`` ('pm25', 'pm10', 'no2', 'o3').
        value: Concentration to convert.

    Returns:
        The interpolated sub-index as a number, or ``None`` when ``value`` is
        ``None`` or NaN.

    The brackets have integer edges, so a fractional value such as 30.5 lies
    between two brackets. Such values are snapped to the lower edge of the
    next bracket. Values below the first bracket clamp to its lower edge, and
    values above the last bracket return that bracket's top index. Values
    inside a bracket are interpolated exactly as before.

    Raises:
        KeyError: If ``pollutant`` has no breakpoint table.
    """
    import math

    brackets = aqi_breakpoints[pollutant]

    if value is None or math.isnan(value):
        return None

    # Concentrations below the first bracket are treated as its lower edge.
    value = max(value, brackets[0][0])

    for bp_lo, bp_hi, i_lo, i_hi in brackets:
        if value <= bp_hi:
            # A value in the gap just below this bracket snaps up to bp_lo.
            effective = max(value, bp_lo)
            return ((i_hi - i_lo)/(bp_hi - bp_lo)) * (effective - bp_lo) + i_lo

    # Above the highest breakpoint: cap at the top index of the table.
    return float(brackets[-1][3])

# Calculate AQI for each predicted day
_AQI_POLLUTANTS = ('pm25', 'pm10', 'no2', 'o3')

# (upper AQI limit, label) pairs in ascending order; anything above the last
# limit is labelled "Severe".
_AQI_BANDS = (
    (50, "Good"),
    (100, "Satisfactory"),
    (200, "Moderate"),
    (300, "Poor"),
    (400, "Very Poor"),
)

aqi_values = []
aqi_categories = []

for _, day in forecast_df.iterrows():
    # The day's AQI is the largest pollutant sub-index, truncated to an int.
    final_aqi = int(max(calculate_sub_aqi(name, day[name]) for name in _AQI_POLLUTANTS))
    aqi_values.append(final_aqi)

    # Pick the first band whose upper limit covers the AQI.
    label = next((text for limit, text in _AQI_BANDS if final_aqi <= limit), "Severe")
    aqi_categories.append(label)

# Add AQI to forecast table
forecast_df["AQI"] = aqi_values
forecast_df["AQI_Category"] = aqi_categories

from datetime import datetime, timedelta

# Upload the day-1 forecast. The values are read from the first row of
# forecast_df; the names previously used here (pred_pm25, aqi, aqi_category)
# are not defined in any visible cell and raised NameError.
# NOTE(review): forecast_date is stamped from the current UTC clock, not from
# forecast_df's index -- confirm that this is the intended date.
# NOTE(review): only the first of the N_OUT forecast days is uploaded.
day_one = forecast_df.iloc[0]

record = {
    "forecast_date": (datetime.utcnow() + timedelta(days=1)).isoformat(),
    "day_ahead": 1,
    "pm25": float(day_one["pm25"]),
    "aqi": int(day_one["AQI"]),
    "aqi_level": str(day_one["AQI_Category"]),
    "model_name": "Teammate_Model_Google_Aligned"
}

supabase.table("air_quality_forecast").insert(record).execute()
