In [1]:
# Week 2 - Model Implementation

# ## Goal
# Train models to predict PM2.5 from weather features.
# Load cleaned data from Week 1 and save the best model for deployment in Week 3.

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib
import os

# ### Load Clean Data
data_path = '../Week_1_(Dataset_Preparation)/data/clean_pollution_data.csv'

if not os.path.exists(data_path):
    print("Cleaned dataset not found. Please ensure Week 1 output exists at the expected path.")
else:
    df = pd.read_csv(data_path)
    # Use correct column names from df
    X = df[['TEMP', 'hour', 'month']]
    y = df['pm2.5']

    # ### Train/Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # ### Modeling Pipelines
    lr_pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LinearRegression())
    ])
    rf_pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('rf', RandomForestRegressor(random_state=42))
    ])

    # ### Train Models
    lr_pipe.fit(X_train, y_train)
    rf_pipe.fit(X_train, y_train)

    # ### Evaluation
    import numpy as np
    def eval(model, X_test, y_test):
        preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, preds)
        mse = mean_squared_error(y_test, preds)
        rmse = np.sqrt(mse)
        print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}")
        return mae, rmse

    print("Linear Regression Performance:")
    eval(lr_pipe, X_test, y_test)

    print("Random Forest Performance:")
    eval(rf_pipe, X_test, y_test)

    # ### Save Best Model
    model_path = 'saved_model/pollution_predictor.pkl'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(rf_pipe, model_path)
    print(f"Best model saved to {model_path}.")

Linear Regression Performance:
MAE: 65.43, RMSE: 88.65
Random Forest Performance:
MAE: 67.62, RMSE: 91.96
Best model saved to saved_model/pollution_predictor.pkl.
