In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Location of the semicolon-separated input CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file before running on another machine.
DATA_PATH = r'C:\Users\91790\OneDrive\Documents\PB_All_2000_2021.csv'
df = pd.read_csv(DATA_PATH, sep=';')

# EDA: size, schema, summary statistics and per-column missing-value counts.
print(df.shape)
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appends a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())

# Pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
fig.tight_layout()
plt.show()

# Columns used as prediction targets further down; one histogram (with KDE) each.
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
for col in pollutants:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {col}')
    fig.tight_layout()
    plt.show()
    # Release the figure explicitly so repeated runs do not accumulate open figures.
    plt.close(fig)

# Preprocessing
# Each step builds a new frame instead of mutating in place, which avoids
# SettingWithCopyWarning on filtered slices.
df = df.drop_duplicates()
df = df.assign(date=pd.to_datetime(df['date'], format='%d.%m.%Y'))
# Rows without an id or a date cannot be turned into features (id is one-hot
# encoded and year/month are derived from date), so drop them explicitly.
df = df.dropna(subset=['id', 'date']).sort_values(by=['id', 'date'])
df = df.assign(year=df['date'].dt.year, month=df['date'].dt.month)

# Outlier screening applies to measured quantities only. `id` is treated as a
# categorical key below, and year/month are derived calendar fields, so a
# z-score on them carries no meaning. Excluding them by name also removes the
# dependence on whether .dt.year/.dt.month come back as int32 or int64.
non_measurement_cols = ('id', 'year', 'month')
numeric_cols = [
    c for c in df.select_dtypes(include='number').columns
    if c not in non_measurement_cols
]
col_means = df[numeric_cols].mean()
col_stds = df[numeric_cols].std()
z_scores = ((df[numeric_cols] - col_means) / col_stds).abs()

# A missing value is not an outlier. NaN compares False against the threshold,
# so without the isna() term every row with any gap would be discarded here
# and the imputation below would have nothing left to fill.
is_inlier = (z_scores < 3) | z_scores.isna()
df = df[is_inlier.all(axis=1)].copy()

# Mean-impute the remaining gaps in the measurement columns only (a Series
# passed to fillna fills just the columns it is indexed by).
# NOTE(review): this also imputes the target columns, using statistics from
# all rows; confirm that is intended.
df = df.fillna(df[numeric_cols].mean())
df['id'] = df['id'].astype(str)

X = df.drop(columns=pollutants + ['date'])
y = df[pollutants]

X = pd.get_dummies(X, columns=['id'], drop_first=True)
# NOTE(review): the scaler is fit on every row, i.e. before any train/test
# split that happens downstream, so X_scaled carries statistics from rows that
# may later be used for testing.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



In [None]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Split the dataset
# Split the unscaled feature matrix so the scaler can be fit on the training
# rows only. Fitting it on all rows (as X_scaled was) lets test-set means and
# variances leak into the training data. The same random_state and row count
# give the same partition as splitting X_scaled would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Define the model: one independent random forest per target column.
base_model = RandomForestRegressor(n_estimators=100, random_state=42)
multioutput_model = MultiOutputRegressor(base_model)

# Train the model
multioutput_model.fit(X_train, y_train)

# Predict: columns of y_pred follow the column order of y.
y_pred = multioutput_model.predict(X_test)

# Evaluate each target separately on the held-out rows.
for i, col in enumerate(y.columns):
    actual = y_test[col]
    predicted = y_pred[:, i]
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    print(f"\nPollutant: {col}")
    print(f"R^2 Score: {r2_score(actual, predicted):.3f}")
    print(f"RMSE: {rmse:.3f}")



In [None]:
# mean_squared_error is used below, so import it here instead of relying on
# an import executed by an earlier cell.
from sklearn.metrics import mean_squared_error, r2_score
from joblib import dump
import numpy as np

# Overall R² score (macro average)
# Per-target R² and RMSE on the held-out split; the R² values are collected so
# they can be averaged with equal weight per target.
r2_scores = []
for i, col in enumerate(y.columns):
    actual = y_test[col]
    predicted = y_pred[:, i]
    score = r2_score(actual, predicted)
    r2_scores.append(score)
    print(f"\nPollutant: {col}")
    print(f"R^2 Score: {score:.3f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(actual, predicted)):.3f}")

# Unweighted mean of the per-target R² values. This is a goodness-of-fit
# summary for regression, not a classification accuracy, despite the label
# printed below.
mean_r2 = np.mean(r2_scores)
print(f"\n Overall Accuracy (Mean R² Score): {mean_r2:.3f}")

# Save the model
# NOTE(review): only the estimator is persisted. The inputs were one-hot
# encoded and standardized before training, so the fitted scaler and the
# feature-column layout are also needed to score new data.
MODEL_PATH = 'water_quality_model.joblib'
dump(multioutput_model, MODEL_PATH)
print(f"\n Model saved as '{MODEL_PATH}'")
