In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display

In [3]:
# 1. Load dataset
df = pd.read_csv("WWTP/main_2023.csv")

In [4]:
# 2. Create lag feature for linear regression model
df['Nitrate_lag1'] = df['Nitrate'].shift(1)
df = df.dropna().reset_index(drop=True)

In [5]:
# 3. Define feature sets and target
features_base = ["Influent", "oxygen_avg", "Ammonium", "Phosphate", "Temperature"]
features_lr   = features_base + ["Nitrate_lag1"]

X_base = df[features_base]
X_lr   = df[features_lr]
y = df["Nitrate"]
nitrate_range = y.max() - y.min()

In [6]:
# 4. Train/test split (same split for all models)
X_base_train, X_base_test, X_lr_train, X_lr_test, y_train, y_test = train_test_split(
    X_base, X_lr, y, test_size=0.2, random_state=42
)

In [7]:
# 5. Define all models
models = {
    "Linear Regression (with lag)": LinearRegression(),
    "Polynomial Regression (deg=3)": make_pipeline(PolynomialFeatures(3), LinearRegression()),
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=None, max_features='sqrt', random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
}

In [8]:
# 6. Evaluate models and collect metrics
results = []
for name, model in models.items():
    if name == "Linear Regression (with lag)":
        model.fit(X_lr_train, y_train)
        y_pred = model.predict(X_lr_test)
    else:
        model.fit(X_base_train, y_train)
        y_pred = model.predict(X_base_test)
    r2   = r2_score(y_test, y_pred)
    mae  = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_pct = (rmse / nitrate_range) * 100
    results.append({
        "Model": name,
        "R²": round(r2, 3),
        "MAE": round(mae, 3),
        "RMSE": round(rmse, 3),
        "RMSE (%)": f"{rmse_pct:.2f}%"
    })

In [9]:
# 7. Display comparison table
results_df = pd.DataFrame(results).sort_values(by="R²", ascending=False)
display(results_df)

Unnamed: 0,Model,R²,MAE,RMSE,RMSE (%)
0,Linear Regression (with lag),0.906,0.579,0.741,7.72%
2,Random Forest,0.56,1.199,1.599,16.67%
3,Gradient Boosting,0.54,1.235,1.636,17.05%
1,Polynomial Regression (deg=3),0.5,1.321,1.705,17.76%
