# Water Quality Prediction using Random Forest
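This notebook trains a Random Forest regressor to predict BSK5 from the previous three observations (lags 1–3) of eight pollutant measurements, together with the `id`, month, and year of each record.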

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


## Step 1: Load and Explore Dataset

In [None]:
# Read the semicolon-delimited CSV into a DataFrame and preview it.
data_path = '../data/water_quality.csv'
df = pd.read_csv(data_path, delimiter=';')
# Last expression of the cell: the notebook renders the first five rows.
df.head(5)


## Step 2: Parse Dates and Sort Data

In [None]:
# Convert the day.month.year strings in 'date' to datetime64 values.
parsed_dates = pd.to_datetime(df['date'], format='%d.%m.%Y')
df['date'] = parsed_dates

# Order rows by 'id', then chronologically inside each 'id', and renumber
# the index 0..n-1 so positional and label indexing agree afterwards.
df = df.sort_values(by=['id', 'date'], ignore_index=True)


## Step 3: Handle Missing Values

In [None]:
# Forward-fill gaps separately inside each 'id' group.
# The frame is sorted by ('id', 'date'), so an ungrouped forward fill would
# copy the last values of one 'id' into the leading gaps of the next one.
# Leading gaps of a group therefore stay NaN here.
# `fillna(method='ffill')` is also deprecated in recent pandas; `ffill()` is
# the supported spelling.
# dropna=False keeps rows whose 'id' is missing in their own group instead of
# blanking their values.
value_cols = [name for name in df.columns if name != 'id']
df[value_cols] = df.groupby('id', dropna=False)[value_cols].ffill()

## Step 4: Add Date-based Features

In [None]:
# Expose the calendar month and year of each record as numeric columns.
for date_part in ('month', 'year'):
    df[date_part] = getattr(df['date'].dt, date_part)


## Step 5: Create Lag Features

In [None]:
# Measurements to lag, and how many rows back to look within each 'id'.
# NOTE: shift() is positional, so a lag counts previous rows of the same
# 'id', not calendar days.
feature_cols = ['NH4', 'Suspended', 'O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
lag_days = [1, 2, 3]

for n_back in lag_days:
    # Shift all measurement columns at once, per 'id' group; the first
    # n_back rows of every group become NaN.
    shifted = df.groupby('id')[feature_cols].shift(n_back)
    for name in feature_cols:
        df[f'{name}_lag{n_back}'] = shifted[name]


## Step 6: Drop NaNs after Lagging

In [None]:
# Remove, in place, every row that still has a missing value in any column
# (this includes the leading rows of each 'id' whose lagged values are NaN).
df.dropna(axis='index', how='any', inplace=True)

## Step 7: Define Features and Target

In [None]:
# Collect the lagged column names in the order they were created:
# all measurements at lag 1, then lag 2, then lag 3.
lagged_names = []
for lag in lag_days:
    for col in feature_cols:
        lagged_names.append(f'{col}_lag{lag}')

# Predictors: lagged measurements plus the 'id' and calendar columns.
predictor_names = lagged_names + ['id', 'month', 'year']
X = df[predictor_names]

# Target: the BSK5 value of the current row.
y = df['BSK5']


## Step 8: Time-Based Train/Test Split

In [None]:
# Chronological 80/20 split.
# df is ordered by ('id', 'date'), so slicing the first 80% of rows would
# hold out the last 'id' groups rather than the latest dates. Rank the rows
# by date first, then train on the earliest 80% and test on the most recent
# 20%. The stable sort keeps the existing row order among equal dates.
# Assumes X and y are row-aligned with df (both are selected from it).
chrono_order = np.argsort(df['date'].to_numpy(), kind='stable')
train_size = int(len(df) * 0.8)
train_rows, test_rows = chrono_order[:train_size], chrono_order[train_size:]
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]


## Step 9: Train Random Forest Regressor

In [None]:
# Fit a 200-tree random forest on the training rows; the fixed seed makes
# the fitted model reproducible. fit() returns the estimator itself.
model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)


## Step 10: Make Predictions and Evaluate Model

In [None]:
# Predict BSK5 for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true values with both metrics.
mse, r2 = (
    metric(y_test, y_pred) for metric in (mean_squared_error, r2_score)
)

# Report both scores rounded to three decimals.
print(f"Mean Squared Error (MSE): {mse:.3f}")
print(f"R2 Score: {r2:.3f}")


## Step 11: Plot Actual vs Predicted BSK5

In [None]:
# Overlay true and predicted BSK5 for the test rows, in test-set order,
# using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values, label='Actual BSK5', color='blue')
ax.plot(y_pred, label='Predicted BSK5', color='red')
ax.set_xlabel('Test Samples')
ax.set_ylabel('BSK5')
ax.set_title('Water Quality Forecasting (BSK5)')
ax.legend()
ax.grid()
fig.tight_layout()
plt.show()
