# Train AQI forecasting model for the 72-hour horizon



In [None]:
!pip install hopsworks==4.2.*

Collecting hopsworks==4.2.*
  Using cached hopsworks-4.2.9-py3-none-any.whl.metadata (11 kB)
Collecting pyhumps==1.6.1 (from hopsworks==4.2.*)
  Using cached pyhumps-1.6.1-py3-none-any.whl.metadata (3.7 kB)
Collecting furl (from hopsworks==4.2.*)
  Using cached furl-2.1.4-py2.py3-none-any.whl.metadata (25 kB)
Collecting boto3 (from hopsworks==4.2.*)
  Downloading boto3-1.40.59-py3-none-any.whl.metadata (6.6 kB)
Collecting pandas<2.2.0 (from hopsworks==4.2.*)
  Downloading pandas-2.1.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pyjks (from hopsworks==4.2.*)
  Downloading pyjks-20.0.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting mock (from hopsworks==4.2.*)
  Downloading mock-5.2.0-py3-none-any.whl.metadata (3.1 kB)
Collecting avro==1.11.3 (from hopsworks==4.2.*)
  Downloading avro-1.11.3.tar.gz (90 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import hopsworks
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# -------------------------------
# 1. CONNECT TO HOPSWORKS
# -------------------------------
# The project handle returned by login() is the entry point for both the
# feature store (training data) and the model registry (used when registering).
project = hopsworks.login()
fs, mr = project.get_feature_store(), project.get_model_registry()
print("‚úÖ Connected to project:", project.name)

# -------------------------------
# 2. LOAD FEATURE GROUP
# -------------------------------
# Name and version of the feature group that holds the engineered features.
FEATURE_GROUP_NAME = "model_features"
FEATURE_GROUP_VERSION = 1

fg = fs.get_feature_group(FEATURE_GROUP_NAME, version=FEATURE_GROUP_VERSION)
df = fg.read()
print("‚úÖ Loaded feature data. Shape:", df.shape)

# -------------------------------
# 3. DEFINE AQI CONVERSION
# -------------------------------
# Representative numeric AQI assigned to each category of the 1-5 scale.
# Each value is the upper bound of the band noted beside it (not a midpoint).
aqi_scale_map = {
    1: 50,    # Good (0-50)
    2: 100,   # Moderate (51-100)
    3: 150,   # Unhealthy for Sensitive Groups (101-150)
    4: 200,   # Unhealthy (151-200)
    5: 300,   # Very Unhealthy (201-300+)
}

# Candidate columns are selected by name only.
aqi_cols = [c for c in df.columns if 'aqi' in c.lower()]

# Convert a column only when EVERY non-null value is one of the 1-5 categories.
# Mapping cell by cell (and keeping unmatched cells as they were) would leave a
# column such as a fractional rolling mean with some rows on the 1-5 scale and
# others on the 50-300 scale. Columns that are already converted (values
# 50..300) fail this check too, so re-running the cell is a no-op.
scale_levels = list(aqi_scale_map)
converted_cols = []
skipped_cols = []
for col in aqi_cols:
    observed = df[col].dropna()
    if observed.empty or not observed.isin(scale_levels).all():
        skipped_cols.append(col)
        continue
    # Missing values stay missing: map() yields NaN for NaN input.
    df[col] = df[col].map(aqi_scale_map)
    converted_cols.append(col)

print(f"üîÑ Converted scaled AQI columns to numeric AQI for {len(converted_cols)} columns")
if skipped_cols:
    print(f"Left {len(skipped_cols)} 'aqi' columns unchanged (values not purely on the 1-5 scale): {skipped_cols}")

# -------------------------------
# 4. PREPARE DATA
# -------------------------------
H = 72  # forecast horizon in hours
target_col = f"aqi_t_plus_{H}"

if target_col not in df.columns:
    raise ValueError(f"‚ùå Target column '{target_col}' not found in feature group.")

# Drop only where target is missing
df_sup = df.dropna(subset=[target_col]).copy()

# Columns not to use as features
non_feature_cols = ["datetime", "timestamp"]

# The forward fill and the positional train/validation split below are only
# meaningful when rows are in chronological order, and nothing above sorts the
# frame after it is read. Sort on the first time column that is present.
time_col = next((c for c in non_feature_cols if c in df_sup.columns), None)
if time_col is not None:
    df_sup = df_sup.sort_values(time_col, kind="stable")
else:
    print(f"Warning: none of {non_feature_cols} found; assuming rows are already in chronological order.")

# Any other 'aqi_t_plus_*' column is a future AQI value (a label for another
# horizon). It would be unknown at prediction time, so using it as an input
# leaks the future. Exclude all of them, not just this notebook's target.
target_prefix = "aqi_t_plus_"
leaky_cols = [c for c in df_sup.columns if c.startswith(target_prefix) and c != target_col]
if leaky_cols:
    print(f"Excluding {len(leaky_cols)} other future-horizon columns from the features: {leaky_cols}")

excluded_cols = set(non_feature_cols) | set(leaky_cols) | {target_col}
features = [c for c in df_sup.columns if c not in excluded_cols]

print(f"üìä Total features before cleaning: {len(features)}")

# Handle missing / invalid input values safely
X = df_sup[features].copy()

# Replace invalid pollutant readings (e.g., negatives or zeros)
# NOTE(review): this mask is applied to every feature column, not only to
# pollutant readings. Features for which zero or negative values are legitimate
# would be blanked and then forward-filled. Confirm against the feature group
# schema and restrict the mask to pollutant columns if needed.
X = X.replace([np.inf, -np.inf], np.nan)
X[X <= 0] = np.nan

# Forward fill only (avoid leaking future data)
X = X.ffill()

# Drop rows still having missing values after ffill
missing_before = X.isna().sum().sum()
X = X.dropna()
missing_after = X.isna().sum().sum()

print(f"üßπ Cleaned missing values: {missing_before} ‚Üí {missing_after}")

# Align target with cleaned feature set
y = df_sup.loc[X.index, target_col].astype(float)

# Time-aware split (no shuffle)
# NOTE(review): the last H hours of the training window have targets that fall
# inside the validation period; consider leaving a gap of H rows between them.
split_frac = 0.8
split_idx = int(len(X) * split_frac)
X_train, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]

# Fail here with a clear message instead of an opaque error inside fit/predict.
if len(X_train) == 0 or len(X_val) == 0:
    raise ValueError(
        f"Not enough rows left after cleaning to split: train={len(X_train)}, val={len(X_val)}"
    )

print(f"‚úÖ Data prepared | Train: {len(X_train)}, Val: {len(X_val)}")

# -------------------------------
# 5. TRAIN & EVALUATE MODELS
# -------------------------------
def metrics(y_true, y_pred):
    """Score predictions against the true values.

    Returns a 3-tuple ``(mae, rmse, r2)``: mean absolute error, root mean
    squared error (square root of the mean squared error) and the R^2 score.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
    )

# Candidate regressors, keyed by the label shown in the comparison table.
# Every estimator is seeded so repeated runs give the same scores.
models = {
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    "Ridge": Ridge(alpha=1.0, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

# One record per candidate with its validation scores.
results = []

for name in models:
    model = models[name]
    print(f"\nüöÄ Training {name} ...")
    # Fit on the training split, then score on the held-out validation split.
    preds = model.fit(X_train, y_train).predict(X_val)
    mae, rmse, r2 = metrics(y_val, preds)
    print(f"{name} -> MAE: {mae:.2f}, RMSE: {rmse:.2f}, R¬≤: {r2:.3f}")
    results.append(dict(Model=name, MAE=mae, RMSE=rmse, R2=r2))

# Lowest RMSE first, so the first row is the best candidate.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="RMSE")
print("\nüìä Model Comparison:\n", results_df)

# -------------------------------
# 6. REGISTER BEST MODEL
# -------------------------------
# results_df is sorted by ascending RMSE, so its first row is the winner.
best_metrics = results_df.iloc[0]
best_model_name = best_metrics["Model"]
best_model = models[best_model_name]

print(f"\nüèÜ Best Model: {best_model_name}")
print(f"üìà Metrics -> MAE: {best_metrics['MAE']:.2f}, RMSE: {best_metrics['RMSE']:.2f}, R¬≤: {best_metrics['R2']:.3f}")

# Serialize the fitted estimator to disk; this file is what gets uploaded below.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)
model_filename = f"{best_model_name}_H{H}.pkl"
model_path = os.path.join(model_dir, model_filename)
joblib.dump(best_model, model_path)
print(f"üíæ Model saved locally at: {model_path}")

# Describe the model's inputs and outputs using the training frames.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

# Validation scores of the winner as plain floats, keyed in lower case.
registry_metrics = {
    key.lower(): float(best_metrics[key]) for key in ("MAE", "RMSE", "R2")
}

# Create the registry entry, then upload the serialized model file to it.
model = mr.sklearn.create_model(
    name=f"AQI_{best_model_name}_H{H}",
    metrics=registry_metrics,
    model_schema=model_schema,
    description=f"{best_model_name} model trained for exact AQI prediction ({H}-hour horizon)",
)
model.save(model_path)

print(f"\n‚úÖ Registered model '{best_model_name}' in Hopsworks (version {model.version})")


Connection closed.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1251499
‚úÖ Connected to project: pearls_aqi_predictor
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.98s) 
‚úÖ Loaded feature data. Shape: (6776, 155)
üîÑ Converted scaled AQI columns to numeric AQI for 19 columns
üìä Total features before cleaning: 153
üßπ Cleaned missing values: 6 ‚Üí 0
‚úÖ Data prepared | Train: 5360, Val: 1341

üöÄ Training RandomForest ...
RandomForest -> MAE: 19.64, RMSE: 36.11, R¬≤: 0.748

üöÄ Training Ridge ...
Ridge -> MAE: 39.87, RMSE: 55.88, R¬≤: 0.397

üöÄ Training GradientBoosting ...
GradientBoosting -> MAE: 30.83, RMSE: 46.31, R¬≤: 0.586

üìä Model Comparison:
               Model        MAE       RMSE        R2
0      RandomForest  19.643446  36.113914  0.748284
2  GradientBoosting  30.832045  46.314794  0.585999
1             Ridge  39.867615  55.882982  0.397272

üèÜ Best Model: RandomForest
üìà Metrics -> MAE: 19.64, R

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /content/models/RandomForest_H72.pkl: 0.000%|          | 0/10913249 elapsed<00:00 remaining<?

Uploading /content/model_schema.json: 0.000%|          | 0/11837 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1251499/models/AQI_RandomForest_H72/1

‚úÖ Registered model 'RandomForest' in Hopsworks (version 1)
