In [11]:
!pip install -U openaq



In [17]:
import requests
import pandas as pd

lat, lon = 28.31733, 76.91603  # Manesar
url = f"https://power.larc.nasa.gov/api/temporal/hourly/point"
params = {
    "parameters": "T2M,RH2M,PRECTOTCORR",
    "community": "RE",
    "longitude": lon,
    "latitude": lat,
    "format": "JSON",
    "start": "20240901",
    "end": "20250901"
}

r = requests.get(url, params=params)
data = r.json()["properties"]["parameter"]

df_weather = pd.DataFrame({
    "datetime": list(data["T2M"].keys()),
    "temperature": list(data["T2M"].values()),
    "humidity": list(data["RH2M"].values()),
    "precipitation": list(data["PRECTOTCORR"].values())
})

df_weather.to_csv("weather_data.csv", index=False)
print(df_weather.head())


     datetime  temperature  humidity  precipitation
0  2024090100        28.09     75.37            0.0
1  2024090101        27.71     76.67            0.0
2  2024090102        27.27     79.20            0.0
3  2024090103        26.74     81.70            0.0
4  2024090104        26.30     84.17            0.0


In [23]:
df_weather.count()


Unnamed: 0,0
datetime,8784
temperature,8784
humidity,8784
precipitation,8784


In [25]:
import requests
import pandas as pd
import time

# -------------------------------
# CONFIGURATION
# -------------------------------
API_KEY = "72dd209011465d910f7b44d557d8764f26bfeb1d8306bd00bef2781308e44913"
SENSOR_ID = "10326358"  # Example sensor ID
PARAMETER = "pm25"
LIMIT = 1000
START_DATE = "2024-09-01"
END_DATE = "2025-09-01"

HEADERS = {"X-API-Key": API_KEY}
BASE_URL = f"https://api.openaq.org/v3/sensors/{SENSOR_ID}/hours"

# -------------------------------
# 1️⃣ Fetch all PM2.5 data (in chunks)
# -------------------------------
all_results = []
page = 1

print("🔄 Fetching hourly PM2.5 data from OpenAQ...")

while True:
    params = {
        "date_from": START_DATE,
        "date_to": END_DATE,
        "limit": LIMIT,
        "page": page
    }

    response = requests.get(BASE_URL, headers=HEADERS, params=params)
    if response.status_code != 200:
        print(f"❌ Error {response.status_code}: {response.text}")
        break

    data = response.json()
    results = data.get("results", [])
    if not results:
        print(f"✅ No more data. Finished at page {page}.")
        break

    all_results.extend(results)
    print(f"📦 Page {page}: fetched {len(results)} entries (total = {len(all_results)})")

    page += 1
    time.sleep(1)  # avoid rate limit

# -------------------------------
# 2️⃣ Convert to DataFrame
# -------------------------------
if not all_results:
    print("⚠️ No results found. Exiting.")
    exit()

aq_data = pd.DataFrame(all_results)

# Parse datetime and value columns
aq_data["datetime"] = pd.to_datetime(aq_data["period"].apply(lambda x: x["datetimeFrom"]["utc"]))
aq_data["pm25"] = aq_data["value"]
aq_data = aq_data[["datetime", "pm25"]]

# Save raw PM2.5 data
aq_data.to_csv("pm25_data.csv", index=False)
print(f"💾 Saved {len(aq_data)} PM2.5 entries → pm25_data.csv")


🔄 Fetching hourly PM2.5 data from OpenAQ...
📦 Page 1: fetched 1000 entries (total = 1000)
📦 Page 2: fetched 1000 entries (total = 2000)
📦 Page 3: fetched 1000 entries (total = 3000)
📦 Page 4: fetched 1000 entries (total = 4000)
📦 Page 5: fetched 1000 entries (total = 5000)
📦 Page 6: fetched 1000 entries (total = 6000)
📦 Page 7: fetched 1000 entries (total = 7000)
📦 Page 8: fetched 1000 entries (total = 8000)
📦 Page 9: fetched 327 entries (total = 8327)
✅ No more data. Finished at page 10.
💾 Saved 8327 PM2.5 entries → pm25_data.csv


MergeError: incompatible merge keys [0] datetime64[ns, UTC] and dtype('<M8[ns]'), must be the same type

In [26]:
pmData= pd.read_csv('pm25_data.csv')
pmData.head()

Unnamed: 0,datetime,pm25
0,2024-08-20 06:00:00+00:00,10.1
1,2024-08-20 07:00:00+00:00,11.1
2,2024-08-20 09:00:00+00:00,11.9
3,2024-08-20 10:00:00+00:00,12.3
4,2024-08-20 11:00:00+00:00,12.8


In [27]:
pmData= pd.read_csv('weather_data.csv')
pmData.head()

Unnamed: 0,datetime,temperature,humidity,precipitation
0,2024090100,28.09,75.37,0.0
1,2024090101,27.71,76.67,0.0
2,2024090102,27.27,79.2,0.0
3,2024090103,26.74,81.7,0.0
4,2024090104,26.3,84.17,0.0


In [28]:
# -------------------------------
# 3️⃣ Merge with weather data
# -------------------------------
try:
    df_weather = pd.read_csv("weather_data.csv")
except FileNotFoundError:
    print("⚠️ weather_data.csv not found. Please place your NASA weather file in the same folder.")
    exit()

# Parse datetime for both datasets
aq_data["datetime"] = pd.to_datetime(aq_data["datetime"]).dt.tz_localize(None)
df_weather["datetime"] = pd.to_datetime(df_weather["datetime"], format="%Y%m%d%H").dt.tz_localize(None)

# Merge nearest timestamps
df_merged = pd.merge_asof(
    aq_data.sort_values("datetime"),
    df_weather.sort_values("datetime"),
    on="datetime",
    direction="nearest",
    tolerance=pd.Timedelta("1H")  # allow up to 1-hour difference
)

# Drop rows missing PM2.5 or weather info
df_merged = df_merged.dropna(subset=["pm25", "temperature", "humidity", "precipitation"], how="any")

# Save merged dataset
df_merged.to_csv("air_quality_weather.csv", index=False)
print(f"✅ Final merged dataset saved → air_quality_weather.csv ({len(df_merged)} rows)")


✅ Final merged dataset saved → air_quality_weather.csv (7456 rows)


  tolerance=pd.Timedelta("1H")  # allow up to 1-hour difference


In [29]:
import pandas as pd

# -------------------------------
# 1️⃣ Load PM2.5 data
# -------------------------------
pm_df = pd.read_csv("pm25_data.csv")
pm_df["datetime"] = pd.to_datetime(pm_df["datetime"]).dt.tz_localize(None)
pm_df = pm_df.sort_values("datetime")

# Rename PM2.5 column for clarity
pm_df.rename(columns={"value": "pm25"}, inplace=True, errors="ignore")

# -------------------------------
# 2️⃣ Load weather data
# -------------------------------
weather_df = pd.read_csv("weather_data.csv")
weather_df["datetime"] = pd.to_datetime(weather_df["datetime"], format="%Y%m%d%H").dt.tz_localize(None)
weather_df = weather_df.sort_values("datetime")

# -------------------------------
# 3️⃣ Merge by nearest timestamp (within 1 hour)
# -------------------------------
merged_df = pd.merge_asof(
    pm_df.sort_values("datetime"),
    weather_df.sort_values("datetime"),
    on="datetime",
    direction="nearest",
    tolerance=pd.Timedelta("1H")
)

# Drop rows with missing weather or PM2.5
merged_df = merged_df.dropna(subset=["pm25", "temperature", "humidity", "precipitation"], how="any")

# -------------------------------
# 4️⃣ Create next-hour PM2.5 column
# -------------------------------
merged_df["pm25_next"] = merged_df["pm25"].shift(-1)

# Drop last row (no next-hour value)
merged_df = merged_df.dropna(subset=["pm25_next"])

# -------------------------------
# 5️⃣ Rename columns clearly
# -------------------------------
merged_df = merged_df.rename(columns={"pm25": "pm25_current"})

# -------------------------------
# 6️⃣ Save final dataset
# -------------------------------
merged_df.to_csv("air_quality_forecast_dataset.csv", index=False)

print(f"✅ Final dataset saved → air_quality_forecast_dataset.csv ({len(merged_df)} rows)")
print("📊 Columns:", list(merged_df.columns))
print(merged_df.head())


✅ Final dataset saved → air_quality_forecast_dataset.csv (7455 rows)
📊 Columns: ['datetime', 'pm25_current', 'temperature', 'humidity', 'precipitation', 'pm25_next']
               datetime  pm25_current  temperature  humidity  precipitation  \
175 2024-08-31 23:00:00          32.1        28.09     75.37            0.0   
176 2024-09-01 00:00:00          39.2        28.09     75.37            0.0   
177 2024-09-01 01:00:00          23.8        27.71     76.67            0.0   
178 2024-09-01 02:00:00          20.8        27.27     79.20            0.0   
179 2024-09-01 03:00:00          20.0        26.74     81.70            0.0   

     pm25_next  
175       39.2  
176       23.8  
177       20.8  
178       20.0  
179       18.9  


  tolerance=pd.Timedelta("1H")


In [32]:
merged_df.isna().count()

Unnamed: 0,0
datetime,7455
pm25_current,7455
temperature,7455
humidity,7455
precipitation,7455
pm25_next,7455


In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# -------------------------------
# 1. Load your dataset
# -------------------------------
# For demonstration, we combine OpenAQ + weather data
# Replace this CSV with actual collected/merged data
# Columns: ['pm25', 'temperature', 'humidity', 'precipitation']
df = pd.read_csv("air_quality_forecast_dataset.csv")

# Ensure numeric and no missing values
df = df.dropna()
for col in ['pm25_current', 'temperature', 'humidity', 'precipitation']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna()

# -------------------------------
# 2. Split data
# -------------------------------
X = df[['pm25_current', 'temperature', 'humidity', 'precipitation']]  # features
y = df['pm25_next']  # target (next hour/day PM2.5 prediction)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------------
# 3. Train model
# -------------------------------
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
model.fit(X_train, y_train)

# -------------------------------
# 4. Evaluate
# -------------------------------
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# -------------------------------
# 5. Save trained model
# -------------------------------
joblib.dump(model, "forecast_model.pkl")
print("Model saved to forecast_model.pkl")


MAE: 4.42
RMSE: 67.11
Model saved to forecast_model.pkl


In [36]:
!pip install xgboost lightgbm



In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from scipy.stats import randint

# -------------------------------
# 1. Load your dataset
# -------------------------------
df = pd.read_csv("air_quality_forecast_dataset.csv")

# Ensure numeric and no missing values
df = df.dropna()
for col in ['pm25_current', 'temperature', 'humidity', 'precipitation']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna()

# -------------------------------
# 2. Split data
# -------------------------------
X = df[['pm25_current', 'temperature', 'humidity', 'precipitation']]
y = df['pm25_next']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------------------------------------------------
# 3. Hyperparameter Tuning with RandomizedSearchCV
#    - Choose ONE of the following model sections to uncomment and run.
# ----------------------------------------------------------------

# == Option 1: Tuned Random Forest Regressor ==
print("--- Tuning Random Forest ---")
rf_param_dist = {
    'n_estimators': randint(100, 500),      # Number of trees in the forest
    'max_depth': randint(10, 50),           # Maximum depth of the tree
    'min_samples_split': randint(2, 20),    # Minimum samples required to split a node
    'min_samples_leaf': randint(1, 20),     # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2', None]  # Number of features to consider at every split
}
model_tuner = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=rf_param_dist,
    n_iter=50,       # Number of parameter settings that are sampled
    cv=5,            # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1        # Use all available cores
)
# Fit the tuner to find the best hyperparameters
print("Starting hyperparameter search...")
model_tuner.fit(X_train, y_train)

# Get the best model found by the tuner
best_model = model_tuner.best_estimator_
print("\nBest Hyperparameters Found:")
print(model_tuner.best_params_)

# -------------------------------
# 4. Evaluate the BEST model
# -------------------------------
print("\n--- Evaluating Best Model ---")
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # RMSE is the square root of MSE

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# -------------------------------
# 5. Save the BEST trained model
# -------------------------------
joblib.dump(best_model, "tuned_forecast_model1.pkl")
print("\nTuned model saved to tuned_forecast_model.pkl")

--- Tuning Random Forest ---
Starting hyperparameter search...
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best Hyperparameters Found:
{'max_depth': 11, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 13, 'n_estimators': 413}

--- Evaluating Best Model ---
MAE: 4.38
RMSE: 8.07

Tuned model saved to tuned_forecast_model.pkl


In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from scipy.stats import randint

# -------------------------------
# 1. Load your dataset
# -------------------------------
df = pd.read_csv("air_quality_forecast_dataset.csv")

# Ensure numeric and no missing values
df = df.dropna()
for col in ['pm25_current', 'temperature', 'humidity', 'precipitation']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna()

# -------------------------------
# 2. Split data
# -------------------------------
X = df[['pm25_current', 'temperature', 'humidity', 'precipitation']]
y = df['pm25_next']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------------------------------------------------
# 3. Hyperparameter Tuning with RandomizedSearchCV
#    - Choose ONE of the following model sections to uncomment and run.
# ----------------------------------------------------------------

# # == Option 2: Tuned XGBoost Regressor ==
print("--- Tuning XGBoost ---")
xgb_param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': randint(3, 15),
    'subsample': [0.7, 0.8, 0.9, 1.0],         # Fraction of training data to use per tree
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]   # Fraction of features to use per tree
}
model_tuner = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    param_distributions=xgb_param_dist,
    n_iter=50,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the tuner to find the best hyperparameters
print("Starting hyperparameter search...")
model_tuner.fit(X_train, y_train)

# Get the best model found by the tuner
best_model = model_tuner.best_estimator_
print("\nBest Hyperparameters Found:")
print(model_tuner.best_params_)

# -------------------------------
# 4. Evaluate the BEST model
# -------------------------------
print("\n--- Evaluating Best Model ---")
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # RMSE is the square root of MSE

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# -------------------------------
# 5. Save the BEST trained model
# -------------------------------
joblib.dump(best_model, "tuned_forecast_model2.pkl")
print("\nTuned model saved to tuned_forecast_model.pkl")

--- Tuning XGBoost ---
Starting hyperparameter search...
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best Hyperparameters Found:
{'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 413, 'subsample': 0.8}

--- Evaluating Best Model ---
MAE: 4.42
RMSE: 8.07

Tuned model saved to tuned_forecast_model.pkl


In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from scipy.stats import randint

# -------------------------------
# 1. Load your dataset
# -------------------------------
df = pd.read_csv("air_quality_forecast_dataset.csv")

# Ensure numeric and no missing values
df = df.dropna()
for col in ['pm25_current', 'temperature', 'humidity', 'precipitation']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna()

# -------------------------------
# 2. Split data
# -------------------------------
X = df[['pm25_current', 'temperature', 'humidity', 'precipitation']]
y = df['pm25_next']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------------------------------------------------
# 3. Hyperparameter Tuning with RandomizedSearchCV
#    - Choose ONE of the following model sections to uncomment and run.
# ----------------------------------------------------------------

# == Option 3: Tuned LightGBM Regressor ==
print("--- Tuning LightGBM ---")
lgb_param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': randint(20, 100),            # Main driver of model complexity
    'max_depth': randint(5, 30),
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}
model_tuner = RandomizedSearchCV(
    estimator=lgb.LGBMRegressor(random_state=42),
    param_distributions=lgb_param_dist,
    n_iter=50,
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the tuner to find the best hyperparameters
print("Starting hyperparameter search...")
model_tuner.fit(X_train, y_train)

# Get the best model found by the tuner
best_model = model_tuner.best_estimator_
print("\nBest Hyperparameters Found:")
print(model_tuner.best_params_)

# -------------------------------
# 4. Evaluate the BEST model
# -------------------------------
print("\n--- Evaluating Best Model ---")
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # RMSE is the square root of MSE

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# -------------------------------
# 5. Save the BEST trained model
# -------------------------------
joblib.dump(best_model, "tuned_forecast_model3.pkl")
print("\nTuned model saved to tuned_forecast_model.pkl")

--- Tuning LightGBM ---
Starting hyperparameter search...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 5964, number of used features: 4
[LightGBM] [Info] Start training from score 35.166757

Best Hyperparameters Found:
{'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 406, 'num_leaves': 26, 'subsample': 0.7}

--- Evaluating Best Model ---
MAE: 4.43
RMSE: 8.20

Tuned model saved to tuned_forecast_model.pkl


In [40]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# --- Model Imports ---
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from scipy.stats import randint, uniform

# -------------------------------
# 1. Load your dataset
# -------------------------------
df = pd.read_csv("air_quality_forecast_dataset.csv")

# Ensure numeric and no missing values
df = df.dropna()
for col in ['pm25_current', 'temperature', 'humidity', 'precipitation']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna()

# -------------------------------
# 2. Split data
# -------------------------------
X = df[['pm25_current', 'temperature', 'humidity', 'precipitation']]
y = df['pm25_next']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------------------------------------------------
# 3. Hyperparameter Tuning with RandomizedSearchCV
#    - Choose ONE of the following model sections to uncomment and run.
# ----------------------------------------------------------------

# == Option 1: Tuned SVR (with Scaling) ==
# A pipeline first scales the data, then trains the SVR
print("--- Tuning Support Vector Regressor (SVR) ---")
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])
svr_param_dist = {
    'svr__C': uniform(0.1, 100),            # Regularization parameter
    'svr__gamma': ['scale', 'auto'] + list(np.logspace(-3, 2, 6)), # Kernel coefficient
    'svr__kernel': ['rbf', 'poly', 'sigmoid']
}
model_tuner = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=svr_param_dist,
    n_iter=50, cv=5, verbose=1, random_state=42, n_jobs=-1
)

# Fit the tuner to find the best hyperparameters
print("Starting hyperparameter search...")
model_tuner.fit(X_train, y_train)

# Get the best model found by the tuner
best_model = model_tuner.best_estimator_
print("\nBest Hyperparameters Found:")
print(model_tuner.best_params_)

# -------------------------------
# 4. Evaluate the BEST model
# -------------------------------
print("\n--- Evaluating Best Model ---")
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# -------------------------------
# 5. Save the BEST trained model
# -------------------------------
joblib.dump(best_model, "tuned_forecast_model5.pkl")
print("\nTuned model saved to tuned_forecast_model.pkl")

--- Tuning Support Vector Regressor (SVR) ---
Starting hyperparameter search...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# --- Model Imports ---
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from scipy.stats import randint, uniform

# -------------------------------
# 1. Load your dataset
# -------------------------------
df = pd.read_csv("air_quality_forecast_dataset.csv")

# Ensure numeric and no missing values
df = df.dropna()
for col in ['pm25_current', 'temperature', 'humidity', 'precipitation']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna()

# -------------------------------
# 2. Split data
# -------------------------------
X = df[['pm25_current', 'temperature', 'humidity', 'precipitation']]
y = df['pm25_next']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------------------------------------------------
# 3. Hyperparameter Tuning with RandomizedSearchCV
#    - Choose ONE of the following model sections to uncomment and run.
# ----------------------------------------------------------------


# == Option 2: Tuned MLP Regressor (Neural Network, with Scaling) ==
print("--- Tuning MLP Regressor ---")
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(max_iter=1000, early_stopping=True, random_state=42))
])
mlp_param_dist = {
    'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'mlp__activation': ['relu', 'tanh', 'logistic'],
    'mlp__solver': ['adam'],
    'mlp__alpha': uniform(0.0001, 0.1), # L2 penalty (regularization)
    'mlp__learning_rate_init': uniform(0.001, 0.1)
}
model_tuner = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=mlp_param_dist,
    n_iter=25, cv=5, verbose=1, random_state=42, n_jobs=-1 # Fewer iterations as NN can be slow
)


# Fit the tuner to find the best hyperparameters
print("Starting hyperparameter search...")
model_tuner.fit(X_train, y_train)

# Get the best model found by the tuner
best_model = model_tuner.best_estimator_
print("\nBest Hyperparameters Found:")
print(model_tuner.best_params_)

# -------------------------------
# 4. Evaluate the BEST model
# -------------------------------
print("\n--- Evaluating Best Model ---")
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# -------------------------------
# 5. Save the BEST trained model
# -------------------------------
joblib.dump(best_model, "tuned_forecast_model6.pkl")
print("\nTuned model saved to tuned_forecast_model.pkl")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# --- Model Imports ---
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from scipy.stats import randint, uniform

# -------------------------------
# 1. Load your dataset
# -------------------------------
df = pd.read_csv("air_quality_forecast_dataset.csv")

# Ensure numeric and no missing values
df = df.dropna()
for col in ['pm25_current', 'temperature', 'humidity', 'precipitation']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna()

# -------------------------------
# 2. Split data
# -------------------------------
X = df[['pm25_current', 'temperature', 'humidity', 'precipitation']]
y = df['pm25_next']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------------------------------------------------
# 3. Hyperparameter Tuning with RandomizedSearchCV
#    - Choose ONE of the following model sections to uncomment and run.
# ----------------------------------------------------------------

# == Option 3: Tuned CatBoost Regressor ==
print("--- Tuning CatBoost ---")
cbr_param_dist = {
    'iterations': randint(200, 1000),
    'learning_rate': uniform(0.01, 0.3),
    'depth': randint(4, 10),
    'l2_leaf_reg': randint(1, 10) # L2 regularization
}
# CatBoost can be verbose, so silent=True is helpful
model_tuner = RandomizedSearchCV(
    estimator=CatBoostRegressor(random_state=42, verbose=0),
    param_distributions=cbr_param_dist,
    n_iter=50, cv=5, verbose=1, random_state=42, n_jobs=-1
)

# Fit the tuner to find the best hyperparameters
print("Starting hyperparameter search...")
model_tuner.fit(X_train, y_train)

# Get the best model found by the tuner
best_model = model_tuner.best_estimator_
print("\nBest Hyperparameters Found:")
print(model_tuner.best_params_)

# -------------------------------
# 4. Evaluate the BEST model
# -------------------------------
print("\n--- Evaluating Best Model ---")
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# -------------------------------
# 5. Save the BEST trained model
# -------------------------------
joblib.dump(best_model, "tuned_forecast_model7.pkl")
print("\nTuned model saved to tuned_forecast_model.pkl")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# --- Model Imports ---
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
from scipy.stats import randint, uniform

# -------------------------------
# 1. Load your dataset
# -------------------------------
df = pd.read_csv("air_quality_forecast_dataset.csv")

# Ensure numeric and no missing values
df = df.dropna()
for col in ['pm25_current', 'temperature', 'humidity', 'precipitation']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.dropna()

# -------------------------------
# 2. Split data
# -------------------------------
X = df[['pm25_current', 'temperature', 'humidity', 'precipitation']]
y = df['pm25_next']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ----------------------------------------------------------------
# 3. Hyperparameter Tuning with RandomizedSearchCV
#    - Choose ONE of the following model sections to uncomment and run.
# ----------------------------------------------------------------

# == Option 4: Tuned Scikit-learn Gradient Boosting Regressor ==
print("--- Tuning Gradient Boosting ---")
gbr_param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(3, 10),
    'subsample': uniform(0.7, 0.3) # 0.3 here means 0.7 to 1.0
}
model_tuner = RandomizedSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_distributions=gbr_param_dist,
    n_iter=50, cv=5, verbose=1, random_state=42, n_jobs=-1
)


# Fit the tuner to find the best hyperparameters
print("Starting hyperparameter search...")
model_tuner.fit(X_train, y_train)

# Get the best model found by the tuner
best_model = model_tuner.best_estimator_
print("\nBest Hyperparameters Found:")
print(model_tuner.best_params_)

# -------------------------------
# 4. Evaluate the BEST model
# -------------------------------
print("\n--- Evaluating Best Model ---")
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# -------------------------------
# 5. Save the BEST trained model
# -------------------------------
joblib.dump(best_model, "tuned_forecast_model8.pkl")
print("\nTuned model saved to tuned_forecast_model.pkl")