In [2]:
# ==============================================================
# STEP 0: Imports and global configuration
# ==============================================================
import os, math, random, warnings, argparse
from datetime import datetime, timedelta


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import joblib


try:
    import gradio as gr
    _HAS_GRADIO = True
except Exception:
    _HAS_GRADIO = False


# ---- Runtime behaviour ------------------------------------------------
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# ---- Reproducibility ---------------------------------------------------
# Seed both generators the notebook draws from: NumPy's global RNG and the
# stdlib `random` module.
np.random.seed(42)
random.seed(42)

# ---- Output locations --------------------------------------------------
# Every artefact (CSV, model pickle, scaler pickle) lives under OUTDIR.
OUTDIR = "data"
os.makedirs(OUTDIR, exist_ok=True)  # idempotent: no error if it already exists
CSV_PATH, MODEL_PATH, SCALER_PATH = (
    os.path.join(OUTDIR, filename)
    for filename in ("pollution.csv", "pollution_model.pkl", "scaler.pkl")
)

# ---- Simulation parameters ---------------------------------------------
START = datetime(year=2024, month=1, day=1)  # first simulated timestamp
N_DAYS = 30                                  # number of simulated days
TIMESTEP_MIN = 60                            # minutes between consecutive readings
CITY_CENTER = (28.6139, 77.2090)             # (lat, lon) the sensors are scattered around
RADIUS_KM = 20                               # radius passed to random_point for sensor placement

# ---- Modelling schema ---------------------------------------------------
TARGET = "pm25"
FEATURES = ["pm25_lag1", "pm25_lag24", "hour", "dayofweek", "lat", "lon"]




def random_point(center, radius_km):
    """Draw a random (lat, lon) position within ``radius_km`` of ``center``.

    Args:
        center: ``(lat, lon)`` pair in degrees.
        radius_km: Maximum distance from ``center`` in kilometres.

    Returns:
        Tuple ``(lat, lon)`` of the sampled point, in degrees.

    Consumes exactly two draws from the stdlib ``random`` generator
    (distance first, then angle).
    """
    center_lat, center_lon = center

    # Taking the square root of the uniform draw spreads points evenly over
    # the disc's area instead of clustering them near the centre.
    distance_km = radius_km * math.sqrt(random.random())
    angle = random.random() * 2 * math.pi

    # Kilometre -> degree conversion: a fixed factor for latitude, and a
    # factor shrunk by cos(latitude) for longitude.
    km_per_deg_lat = 110.574
    km_per_deg_lon = 111.320 * math.cos(math.radians(center_lat))

    lat_offset = distance_km * math.cos(angle) / km_per_deg_lat
    lon_offset = distance_km * math.sin(angle) / km_per_deg_lon
    return center_lat + lat_offset, center_lon + lon_offset

In [4]:
# STEP 1: Generate synthetic dataset (if missing)
# ==============================================================
# CSV_PATH acts as a cache: the simulation only runs when the file does not
# exist yet, so delete the file to force a fresh dataset.
if not os.path.exists(CSV_PATH):
    print("Generating synthetic dataset...")

    # 20 fixed sensor locations scattered around the city centre.
    sensor_records = []
    for i in range(20):
        lat, lon = random_point(CITY_CENTER, RADIUS_KM)
        sensor_records.append({"sensor_id": f"S{i+1}", "lat": lat, "lon": lon})
    sensors = pd.DataFrame(sensor_records)

    rows = []
    for day in range(N_DAYS):
        for t in range(0, 24*60, TIMESTEP_MIN):
            ts = START + timedelta(days=day, minutes=t)
            hour = ts.hour
            # Daily sinusoid: baseline 40 with amplitude 20 over the 24-hour cycle.
            base_pm = 40 + 20*math.sin((hour/24.0)*2*math.pi)
            # Iterate the plain records rather than sensors.iterrows(): the
            # sensor table never changes, so rebuilding a pandas Series per
            # sensor on every timestep was pure loop-invariant overhead.
            for sensor in sensor_records:
                # Per-reading jitter on the mean plus Gaussian noise, floored at 5.
                pm = max(5, np.random.normal(base_pm + random.uniform(-5,5), 8))
                rows.append({"timestamp": ts, "sensor_id": sensor["sensor_id"],
                             "lat": sensor["lat"], "lon": sensor["lon"], "pm25": round(pm,2)})
    df = pd.DataFrame(rows)
    df.to_csv(CSV_PATH, index=False)
    print(f"Synthetic data saved to {CSV_PATH}")
else:
    # parse_dates keeps "timestamp" a datetime column, matching the freshly
    # generated frame above.
    df = pd.read_csv(CSV_PATH, parse_dates=["timestamp"])
    print(f"Loaded existing dataset from {CSV_PATH}")

Generating synthetic dataset...
Synthetic data saved to data/pollution.csv


In [None]:
# STEP 2: Feature engineering, train/test split and scaling
# ==============================================================
# The training code below consumes RANDOM_STATE, X_train_scaled and y_train,
# but no cell in this notebook defined them, so it failed with NameError on a
# fresh kernel. They are built here from `df` (STEP 1) using the FEATURES /
# TARGET / SCALER_PATH names declared in the configuration cell.
# NOTE(review): this reconstructs a step that appears to have been deleted;
# confirm the lag definitions match the original intent.
RANDOM_STATE = 42  # same seed value the configuration cell uses for random / numpy

steps_per_day = (24 * 60) // TIMESTEP_MIN  # readings per sensor in 24 hours

features_df = df.sort_values(["sensor_id", "timestamp"]).copy()
pm_by_sensor = features_df.groupby("sensor_id")[TARGET]
features_df["pm25_lag1"] = pm_by_sensor.shift(1)               # previous reading of the same sensor
features_df["pm25_lag24"] = pm_by_sensor.shift(steps_per_day)  # same sensor, 24 hours earlier
features_df["hour"] = features_df["timestamp"].dt.hour
features_df["dayofweek"] = features_df["timestamp"].dt.dayofweek
# The first readings of every sensor have no lag values yet.
features_df = features_df.dropna(subset=FEATURES).reset_index(drop=True)

X = features_df[FEATURES]
y = features_df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, SCALER_PATH)
print(f"Scaler saved to {SCALER_PATH}")

# STEP 3: Model Training
# ==============================================================
print("Training model...")

# Define the model
model = RandomForestRegressor(random_state=RANDOM_STATE)

# Define the parameter grid for GridSearchCV
# (3 * 4 * 3 * 3 = 108 combinations, each fitted on 3 folds)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=3, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit GridSearchCV to the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Get the best model
best_model = grid_search.best_estimator_

print(f"Best parameters found: {grid_search.best_params_}")
print("Model training complete.")

# Save the best model
joblib.dump(best_model, MODEL_PATH)
print(f"Best model saved to {MODEL_PATH}")

In [10]:
# STEP 0: Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import gradio as gr

# STEP 1: Output locations
# The model pickle and both plots produced by this cell are written under OUTDIR.
OUTDIR = "data"
os.makedirs(OUTDIR, exist_ok=True)  # idempotent: no error if it already exists
MODEL_PATH = os.path.join(OUTDIR, "rf_model.pkl")

# STEP 2: Generate a synthetic dataset (regenerated in memory on every run; nothing is cached on disk)
def generate_synthetic_data(n=1000, seed=42):
    """Build a synthetic PM2.5 table with lag, time and location features.

    Args:
        n: Number of rows to generate.
        seed: Seed for the private random generator. The default of 42
            reproduces exactly the values the function produced when it
            reseeded the global NumPy generator. Pass ``None`` for a
            non-deterministic draw.

    Returns:
        DataFrame with columns ``pm25_lag1``, ``pm25_lag24``, ``hour``,
        ``dayofweek``, ``lat``, ``lon`` and the target ``pm25``.
    """
    # A private legacy RandomState yields the same stream as
    # np.random.seed(seed) followed by np.random.* calls, but leaves the
    # global generator untouched, so calling this function no longer resets
    # the randomness of unrelated code.
    rng = np.random.RandomState(seed)

    # Draw order matters for reproducibility: keep it as listed.
    df = pd.DataFrame({
        "pm25_lag1": rng.randint(30, 200, n),
        "pm25_lag24": rng.randint(20, 180, n),
        "hour": rng.randint(0, 24, n),
        "dayofweek": rng.randint(0, 7, n),
        "lat": 28.5 + rng.rand(n) * 0.5,
        "lon": 77.0 + rng.rand(n) * 0.5
    })
    # Target: weighted sum of both lags, a small daily sinusoid and Gaussian noise.
    df["pm25"] = (
        0.6 * df["pm25_lag1"] +
        0.3 * df["pm25_lag24"] +
        2 * np.sin(df["hour"]/24*2*np.pi) +
        rng.randn(n)*10
    )
    return df

df = generate_synthetic_data(2000)

# STEP 3: Train/test split
X = df[["pm25_lag1", "pm25_lag24", "hour", "dayofweek", "lat", "lon"]]
y = df["pm25"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# STEP 4: Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save model (predict_pollution_gr reloads it from MODEL_PATH)
joblib.dump(model, MODEL_PATH)

# STEP 5: Evaluate & save plots
y_pred = model.predict(X_test)

# Report hold-out metrics; previously this step only drew plots and the
# imported metric functions were never used.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse:.2f} | R^2: {r2:.3f}")

# Plot 1: Prediction vs True
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
diag = [y_test.min(), y_test.max()]
ax.plot(diag, diag, "r--")  # y = x reference line: perfect predictions fall on it
ax.set_xlabel("True PM2.5")
ax.set_ylabel("Predicted PM2.5")
ax.set_title("Prediction vs True")
fig.savefig(os.path.join(OUTDIR, "pred_vs_true.png"))
plt.close(fig)

# Plot 2: Sensor map (median PM2.5 by location)
# NOTE(review): lat/lon in this synthetic frame are continuous random draws,
# so most (lat, lon) groups probably contain a single row - confirm whether a
# per-sensor grouping was intended.
df_map = df.groupby(["lat","lon"])["pm25"].median().reset_index()
fig, ax = plt.subplots(figsize=(6, 5))
points = ax.scatter(df_map["lon"], df_map["lat"], c=df_map["pm25"], cmap="coolwarm", s=60)
fig.colorbar(points, ax=ax, label="Median PM2.5")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("Sensor Map - Median PM2.5")
fig.savefig(os.path.join(OUTDIR, "sensor_map_median_pm25.png"))
plt.close(fig)

print("✅ Model trained & plots saved.")

# STEP 6: Prediction function for Gradio
def predict_pollution_gr(pm25_lag1, pm25_lag24, hour, dayofweek, lat, lon):
    """Gradio callback: predict PM2.5 for one set of inputs.

    Args:
        pm25_lag1: PM2.5 reading for the previous hour.
        pm25_lag24: PM2.5 reading 24 hours earlier.
        hour: Hour of day.
        dayofweek: Day of week.
        lat: Latitude.
        lon: Longitude.

    Returns:
        Tuple ``(prediction, pred_vs_true_path, sensor_map_path)``. The
        prediction is a float rounded to 2 decimals; each path is ``None``
        when the corresponding PNG does not exist under ``OUTDIR``.

    Raises:
        gr.Error: If an input is missing or the model cannot be loaded or
            evaluated. The original code returned the message text as the
            first output, but that output is wired to a ``gr.Number``, which
            is not a text field; raising ``gr.Error`` lets Gradio show the
            message to the user instead.
    """
    inputs = {
        "pm25_lag1": pm25_lag1,
        "pm25_lag24": pm25_lag24,
        "hour": hour,
        "dayofweek": dayofweek,
        "lat": lat,
        "lon": lon,
    }
    # Guard against None values (e.g. a field the UI left empty) before they
    # reach the model.
    missing = [name for name, value in inputs.items() if value is None]
    if missing:
        raise gr.Error(f"Error: missing input(s): {', '.join(missing)}")

    try:
        # The model is reloaded on every call, so a retrained file is picked up.
        # NOTE(review): joblib.load unpickles the file - only load model files
        # produced by this notebook.
        model = joblib.load(MODEL_PATH)
        x = pd.DataFrame([list(inputs.values())], columns=list(inputs))
        pred = float(model.predict(x)[0])
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e

    img1 = os.path.join(OUTDIR,'pred_vs_true.png')
    img2 = os.path.join(OUTDIR,'sensor_map_median_pm25.png')

    img1_out = img1 if os.path.exists(img1) else None
    img2_out = img2 if os.path.exists(img2) else None

    return round(pred,2), img1_out, img2_out

# STEP 7: Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Smart City Pollution Predictor")

    with gr.Row():
        # Left column: the six model inputs and the trigger button.
        with gr.Column():
            pm25_lag1_in = gr.Number(label="PM2.5 (previous hour)", value=60)
            pm25_lag24_in = gr.Number(label="PM2.5 (24 hours ago)", value=50)
            hour_in = gr.Slider(0, 23, step=1, label="Hour of Day", value=12)
            day_in = gr.Dropdown(list(range(7)), label="Day of Week", value=0)
            lat_in = gr.Number(label="Latitude", value=28.6139)
            lon_in = gr.Number(label="Longitude", value=77.2090)
            btn = gr.Button("Predict PM2.5")

        # Right column: the prediction and the two saved evaluation plots.
        with gr.Column():
            out = gr.Number(label="Predicted PM2.5")
            img_out1 = gr.Image(label="Prediction vs True Plot")
            img_out2 = gr.Image(label="Sensor Map Plot")

    # Input order must match the parameter order of predict_pollution_gr.
    btn.click(
        predict_pollution_gr,
        inputs=[pm25_lag1_in, pm25_lag24_in, hour_in, day_in, lat_in, lon_in],
        outputs=[out, img_out1, img_out2]
    )

# Launch after the Blocks context has closed, so the layout is complete
# before the server starts (it was previously launched from inside the
# `with` block).
# share=True publishes a temporary public URL: anyone with the link can call
# the predictor. Drop it for local-only use.
demo.launch(share=True)


✅ Model trained & plots saved.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f50a45f597787d21d9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
