In [9]:
# Cell 1: Imports & configuration
import os
import numpy as np
import pandas as pd

# sklearn utils
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.inspection import permutation_importance

# Regressors
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Optional boosters (wrapped in try/except to not break if libs missing)
try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None
try:
    from lightgbm import LGBMRegressor
except Exception:
    LGBMRegressor = None
try:
    from catboost import CatBoostRegressor
except Exception:
    CatBoostRegressor = None

In [10]:
df = pd.read_csv("/Users/ravigurjar/Desktop/temp/sample/dessertation/data/cleaned/final_df.csv")

In [11]:

# Local config (change if you want)
TARGET_COL = "aqi_value"
ASSETS_PATH = "../asset/prediction"
RANDOM_STATE = 42
TEST_SIZE = 0.25
USE_IMPUTATION = False  
TIME_COL_NAMES = {"date", "timestamp", "datetime"}
KFOLD_SPLITS = 5
PERM_N_REPEATS = 10


os.makedirs(ASSETS_PATH, exist_ok=True)
print("Config loaded. Assets path:", ASSETS_PATH)

Config loaded. Assets path: ../asset/prediction


In [12]:
# Cell 2: Basic checks (assumes a DataFrame named `df` already exists in the notebook)
if 'df' not in globals():
    raise RuntimeError("DataFrame 'df' not found. Load your dataset into variable `df` before running further cells.")

if TARGET_COL not in df.columns:
    raise KeyError(f"Target column '{TARGET_COL}' not found. Available columns: {df.columns.tolist()}")

print("df found with rows:", len(df), "columns:", df.shape[1])

df found with rows: 7835 columns: 13


In [13]:
# Cell 3: Feature selection (numeric) and leakage exclusion
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
features = [c for c in numeric_cols]

if len(features) == 0:
    raise ValueError("No numeric features found after excluding leakage. Inspect df.columns")

X = df[features].copy()
y = df[TARGET_COL].copy()

print("Selected numeric features (count):", len(features))
print(features)

Selected numeric features (count): 11
['latitude', 'longitude', 'pm25_ugm3', 'co_ppm', 'no2_ppb', 'so2_ppb', 'o3_ppb', 'temperature_c', 'relative_humidity', 'wind_speed_ms', 'wind_direction_deg']


In [14]:
# Cell 4: Missing values - either drop rows with NaNs or impute numeric features
from sklearn.impute import SimpleImputer

if USE_IMPUTATION:
    imputer = SimpleImputer(strategy="mean")
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
    X = X_imputed
    print("Imputation applied (mean) to numeric features.")
else:
    mask = X.notna().all(axis=1) & (~y.isna())
    X = X.loc[mask]
    y = y.loc[mask]
    print("Dropped rows with missing features/target. Remaining rows:", len(X))

Dropped rows with missing features/target. Remaining rows: 7324


In [15]:
# Cell 5: Train-test split (time-aware if timestamp exists, otherwise random)
time_col = None
for name in TIME_COL_NAMES:
    if name in df.columns:
        time_col = name
        break

if time_col:
    # align indices and sort by time
    idx = X.index
    tmp = df.loc[idx, :].copy()
    tmp[time_col] = pd.to_datetime(tmp[time_col])
    tmp = tmp.sort_values(by=time_col)
    X = X.loc[tmp.index]
    y = y.loc[tmp.index]
    split_idx = int((1 - TEST_SIZE) * len(X))
    X_train = X.iloc[:split_idx]
    X_test  = X.iloc[split_idx:]
    y_train = y.iloc[:split_idx]
    y_test  = y.iloc[split_idx:]
    print(f"Time-aware split used ({time_col}). Train: {len(X_train)}, Test: {len(X_test)}")
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE,
                                                        random_state=RANDOM_STATE, shuffle=True)
    print(f"Random split used. Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")

Time-aware split used (timestamp). Train: 5493, Test: 1831


In [16]:
# Cell 6: Scaling (fit scaler on train only)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled  = pd.DataFrame(scaler.transform(X_test),  columns=X_test.columns,  index=X_test.index)
feature_names = list(X_train.columns)
print("Scaling complete. Feature count:", len(feature_names))

Scaling complete. Feature count: 11


In [17]:
# Cell 7: Define the models dictionary (core models)
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0, random_state=RANDOM_STATE),
    "Lasso": Lasso(alpha=0.01, random_state=RANDOM_STATE, max_iter=5000),
    "SVR_rbf": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostRegressor(n_estimators=200, random_state=RANDOM_STATE),
    "KNeighbors": KNeighborsRegressor(n_neighbors=7),
    "MLP": MLPRegressor(hidden_layer_sizes=(128,64), max_iter=1000, early_stopping=True, random_state=RANDOM_STATE)
}

if XGBRegressor is not None:
    models["XGBoost"] = XGBRegressor(n_estimators=300, learning_rate=0.05, random_state=RANDOM_STATE, verbosity=0)
if LGBMRegressor is not None:
    models["LightGBM"] = LGBMRegressor(n_estimators=300, learning_rate=0.05, random_state=RANDOM_STATE)
if CatBoostRegressor is not None:
    models["CatBoost"] = CatBoostRegressor(iterations=500, learning_rate=0.05, verbose=0, random_seed=RANDOM_STATE)

print("Models defined:", list(models.keys()))

Models defined: ['LinearRegression', 'Ridge', 'Lasso', 'SVR_rbf', 'RandomForest', 'GradientBoosting', 'AdaBoost', 'KNeighbors', 'MLP']


In [24]:
import numpy as np
import pandas as pd

# 1) Clean target (y_train, y_test) -> remove '-' and force numeric
y_train = pd.to_numeric(y_train.replace("-", np.nan), errors="coerce")
y_test  = pd.to_numeric(y_test.replace("-", np.nan), errors="coerce")

# 2) Replace '-' in X and force numeric where possible
X_train = X_train.replace("-", np.nan)
X_test  = X_test.replace("-", np.nan)

# If you have obvious non-numeric columns like 'timestamp', drop them here
for col in ["timestamp"]:
    if col in X_train.columns:
        X_train = X_train.drop(columns=[col])
        X_test  = X_test.drop(columns=[col])

# Keep only numeric columns
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
X_train = X_train[numeric_cols].apply(pd.to_numeric, errors="coerce")
X_test  = X_test[numeric_cols].apply(pd.to_numeric, errors="coerce")

# 3) Drop rows with NaNs in TRAIN
mask_train = (~X_train.isna().any(axis=1)) & (~y_train.isna())
X_train = X_train[mask_train]
y_train = y_train[mask_train]

# 4) Drop rows with NaNs in TEST
mask_test = (~X_test.isna().any(axis=1)) & (~y_test.isna())
X_test = X_test[mask_test]
y_test = y_test[mask_test]

print("Train shape after cleaning:", X_train.shape, y_train.shape)
print("Test shape after cleaning:", X_test.shape, y_test.shape)

Train shape after cleaning: (5481, 11) (5481,)
Test shape after cleaning: (1806, 11) (1806,)


In [25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns
)

feature_names = X_train.columns.tolist()

In [26]:
# Cell 9: Training loop (CV on train, fit final model, predict on test, compute importances)
results = []
preds_long = []
importances_list = []
KFOLD = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=RANDOM_STATE)

for name, model in models.items():
    print(f"\n--- {name} ---")
    try:
        tree_like = name in ("RandomForest", "GradientBoosting", "AdaBoost", "XGBoost", "LightGBM", "CatBoost")
        if tree_like:
            Xtr = X_train.values
            Xte = X_test.values
        else:
            Xtr = X_train_scaled.values
            Xte = X_test_scaled.values

        # Cross-validation on training set (helps detect overfitting)
        try:
            cv_scores = cross_val_score(model, Xtr, y_train.values, cv=KFOLD, scoring="r2", n_jobs=-1)
            cv_mean = float(np.mean(cv_scores))
            cv_std = float(np.std(cv_scores))
        except Exception as e:
            print("CV failed:", e)
            cv_mean = np.nan
            cv_std = np.nan

        # Fit on full training set
        model.fit(Xtr, y_train)

        # Predict on test
        y_pred = model.predict(Xte)

        # Metrics
        r2 = float(r2_score(y_test, y_pred))
        mse = float(mean_squared_error(y_test, y_pred))
        rmse = float(np.sqrt(mse))

        # Record per-sample predictions
        temp_df = pd.DataFrame({
            "model": name,
            "sample_index": X_test.index.astype(str),
            "y_true": y_test.values,
            "y_pred": y_pred
        })
        preds_long.append(temp_df)

        # Feature importances (coef_, feature_importances_, or permutation)
        try:
            if hasattr(model, "coef_"):
                coef = np.asarray(model.coef_).ravel()
                imp_df = pd.DataFrame({"feature": feature_names, "importance": coef})
            elif hasattr(model, "feature_importances_"):
                fi = np.asarray(model.feature_importances_).ravel()
                imp_df = pd.DataFrame({"feature": feature_names, "importance": fi})
            else:
                perm = permutation_importance(model, Xte, y_test, n_repeats=PERM_N_REPEATS, random_state=RANDOM_STATE, n_jobs=-1)
                imp_df = pd.DataFrame({"feature": feature_names, "importance": perm.importances_mean})
        except Exception as e:
            print("Importance extraction failed:", e)
            imp_df = pd.DataFrame({"feature": feature_names, "importance": [np.nan]*len(feature_names)})

        imp_df["model"] = name
        importances_list.append(imp_df[["model","feature","importance"]])

        results.append({
            "model": name,
            "r2_test": r2,
            "rmse_test": rmse,
            "cv_r2_mean": cv_mean,
            "cv_r2_std": cv_std
        })

        print(f"{name} | r2_test: {r2:.4f} rmse_test: {rmse:.4f} cv_r2: {cv_mean:.4f} ± {cv_std:.4f}")

    except Exception as e:
        print(f"Model {name} FAILED: {e}")
        results.append({
            "model": name,
            "r2_test": np.nan,
            "rmse_test": np.nan,
            "cv_r2_mean": np.nan,
            "cv_r2_std": np.nan
        })


--- LinearRegression ---
LinearRegression | r2_test: 0.1479 rmse_test: 83.0358 cv_r2: 0.8626 ± 0.0122

--- Ridge ---


Ridge | r2_test: 0.1493 rmse_test: 82.9641 cv_r2: 0.8626 ± 0.0122

--- Lasso ---
Lasso | r2_test: 0.1518 rmse_test: 82.8436 cv_r2: 0.8626 ± 0.0122

--- SVR_rbf ---
SVR_rbf | r2_test: 0.0045 rmse_test: 89.7499 cv_r2: 0.7817 ± 0.0494

--- RandomForest ---
RandomForest | r2_test: 0.3825 rmse_test: 70.6862 cv_r2: 0.9973 ± 0.0043

--- GradientBoosting ---
GradientBoosting | r2_test: 0.3542 rmse_test: 72.2844 cv_r2: 0.9991 ± 0.0011

--- AdaBoost ---
AdaBoost | r2_test: 0.3340 rmse_test: 73.4081 cv_r2: 0.9688 ± 0.0026

--- KNeighbors ---
KNeighbors | r2_test: -0.3545 rmse_test: 104.6901 cv_r2: 0.7980 ± 0.0065

--- MLP ---
MLP | r2_test: 0.4454 rmse_test: 66.9883 cv_r2: 0.9973 ± 0.0003


In [28]:
# Cell 10: Assemble outputs and save CSVs to assets/
metrics_df = pd.DataFrame(results).sort_values(by="r2_test", ascending=False).reset_index(drop=True)
preds_all_df = pd.concat(preds_long, ignore_index=True) if preds_long else pd.DataFrame(columns=["model","sample_index","y_true","y_pred"])
importances_df = pd.concat(importances_list, ignore_index=True) if importances_list else pd.DataFrame(columns=["model","feature","importance"])

# Construct combined single-file output (metrics rows then prediction rows)
metrics_rows = metrics_df.copy()
metrics_rows["row_type"] = "metric"
metrics_rows["sample_index"] = ""
# ensure metric columns exist
metrics_rows = metrics_rows.rename(columns={"r2_test":"r2","rmse_test":"rmse"})
for c in ["y_true","y_pred","cv_r2_mean","cv_r2_std"]:
    if c not in metrics_rows.columns:
        metrics_rows[c] = ""

predictions_out = preds_all_df.copy()
predictions_out["row_type"] = "prediction"
# ensure metric columns present but empty in prediction rows
for c in ["r2","rmse","cv_r2_mean","cv_r2_std"]:
    predictions_out[c] = ""

# Align column order
export_cols = ["row_type","model","sample_index","y_true","y_pred","r2","rmse","cv_r2_mean","cv_r2_std"]
final_combined = pd.concat([metrics_rows[export_cols], predictions_out[export_cols]], ignore_index=True)

OUT_PATH = os.path.join(ASSETS_PATH, "model_predictions_metrics.csv")
final_combined.to_csv(OUT_PATH, index=False)
metrics_df.to_csv(os.path.join(ASSETS_PATH, "model_metrics_table.csv"), index=False)
preds_all_df.to_csv(os.path.join(ASSETS_PATH, "test_predictions_by_model_long.csv"), index=False)
importances_df.to_csv(os.path.join(ASSETS_PATH, "feature_importances_long.csv"), index=False)

print("Saved combined results to:", OUT_PATH)
print("Also saved: model_metrics_table.csv, test_predictions_by_model_long.csv, feature_importances_long.csv")

Saved combined results to: ../asset/prediction/model_predictions_metrics.csv
Also saved: model_metrics_table.csv, test_predictions_by_model_long.csv, feature_importances_long.csv


In [29]:
metrics_df

Unnamed: 0,model,r2_test,rmse_test,cv_r2_mean,cv_r2_std
0,MLP,0.4454,66.988286,0.997327,0.000334
1,RandomForest,0.382479,70.686249,0.997294,0.004298
2,GradientBoosting,0.354239,72.284433,0.999073,0.001076
3,AdaBoost,0.334007,73.408092,0.968848,0.00256
4,Lasso,0.151797,82.843581,0.862639,0.012196
5,Ridge,0.149326,82.964143,0.862635,0.012191
6,LinearRegression,0.147856,83.035824,0.862628,0.012245
7,SVR_rbf,0.00448,89.74989,0.781669,0.049422
8,KNeighbors,-0.354545,104.69013,0.798007,0.006514


In [31]:
preds_all_df.head(12)

Unnamed: 0,model,sample_index,y_true,y_pred
0,LinearRegression,7357,76.0,66.043885
1,LinearRegression,7358,38.0,51.371611
2,LinearRegression,7359,87.0,73.985199
3,LinearRegression,7360,83.0,75.943895
4,LinearRegression,7361,75.0,70.26325
5,LinearRegression,7362,128.0,102.827856
6,LinearRegression,7363,73.0,63.511652
7,LinearRegression,7364,53.0,49.918251
8,LinearRegression,7365,23.0,38.545911
9,LinearRegression,7366,54.0,54.282143


In [32]:
importances_df.head(12)

Unnamed: 0,model,feature,importance
0,LinearRegression,latitude,-5.724093
1,LinearRegression,longitude,12.837533
2,LinearRegression,pm25_ugm3,45.247357
3,LinearRegression,co_ppm,-0.182084
4,LinearRegression,no2_ppb,2.273526
5,LinearRegression,so2_ppb,0.155375
6,LinearRegression,o3_ppb,1.105466
7,LinearRegression,temperature_c,0.686551
8,LinearRegression,relative_humidity,-0.304541
9,LinearRegression,wind_speed_ms,-0.667958
