In [34]:
import pandas as pd
import numpy as np 
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [None]:
df = pd.read_csv(r"C:\Users\Lenovo\Downloads\afa2e701598d20110228.csv", sep=';')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         2861 non-null   int64  
 1   date       2861 non-null   object 
 2   NH4        2858 non-null   float64
 3   BSK5       2860 non-null   float64
 4   Suspended  2845 non-null   float64
 5   O2         2858 non-null   float64
 6   NO3        2860 non-null   float64
 7   NO2        2858 non-null   float64
 8   SO4        2812 non-null   float64
 9   PO4        2833 non-null   float64
 10  CL         2812 non-null   float64
dtypes: float64(9), int64(1), object(1)
memory usage: 246.0+ KB


In [19]:
pollutants = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
dy = df.dropna(subset=pollutants)
print(df)


      id        date    NH4  BSK5  Suspended     O2    NO3    NO2     SO4  \
0      1  17.02.2000  0.330  2.77       12.0  12.30   9.50  0.057  154.00   
1      1  11.05.2000  0.044  3.00       51.6  14.61  17.75  0.034  352.00   
2      1  11.09.2000  0.032  2.10       24.5   9.87  13.80  0.173  416.00   
3      1  13.12.2000  0.170  2.23       35.6  12.40  17.13  0.099  275.20   
4      1  02.03.2001  0.000  3.03       48.8  14.69  10.00  0.065  281.60   
...   ..         ...    ...   ...        ...    ...    ...    ...     ...   
2856  22  06.10.2020  0.046  2.69        3.6   8.28   3.80  0.038  160.00   
2857  22  27.10.2020  0.000  1.52        0.5  11.26   0.56  0.031  147.20   
2858  22  03.12.2020  0.034  0.29        0.8  11.09   2.58  0.042  209.92   
2859  22  12.01.2021  0.000  2.10        0.0  14.31   3.94  0.034  121.60   
2860  22  10.02.2021  0.000  1.78        0.0  14.30   6.30  0.033  134.40   

        PO4       CL  
0     0.454   289.50  
1     0.090  1792.00  
2     

In [13]:
df.head()


Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,17.02.2000,0.33,2.77,12.0,12.3,9.5,0.057,154.0,0.454,289.5
1,1,11.05.2000,0.044,3.0,51.6,14.61,17.75,0.034,352.0,0.09,1792.0
2,1,11.09.2000,0.032,2.1,24.5,9.87,13.8,0.173,416.0,0.2,2509.0
3,1,13.12.2000,0.17,2.23,35.6,12.4,17.13,0.099,275.2,0.377,1264.0
4,1,02.03.2001,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0


In [17]:
df[pollutants].isnull().sum()

O2     0
NO3    0
NO2    0
SO4    0
PO4    0
CL     0
dtype: int64

In [18]:
df.isnull().sum()

id           0
date         0
NH4          0
BSK5         0
Suspended    0
O2           0
NO3          0
NO2          0
SO4          0
PO4          0
CL           0
dtype: int64

In [30]:
df = df.sort_values(by=['id' , 'date'])
df.head()
df['date'] =pd.to_datetime(df['date'] , format='%d.%m.%Y')
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

X = df[['id' , 'year']]
y = df[pollutants]

In [31]:
X_encoded = pd.get_dummies(X, columns=['id'], drop_first=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Train the MultiOutputRegressor with RandomForest
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

In [32]:
y_pred = model.predict(X_test)

# Evaluate model
print("Model Performance on the Test Data:")
for i, pollutant in enumerate(pollutants):
    print(f'{pollutant}:')
    print('   MSE:', mean_squared_error(y_test.iloc[:, i], y_pred[:, i]))
    print('   R2:', r2_score(y_test.iloc[:, i], y_pred[:, i]))
    print()

Model Performance on the Test Data:
O2:
   MSE: 23.804974219550807
   R2: -0.024655347555051943

NO3:
   MSE: 14.190326747205722
   R2: 0.46420823872601225

NO2:
   MSE: 0.9021202011124738
   R2: -2.306102849515763

SO4:
   MSE: 799.3361629017251
   R2: 0.7636185777198815

PO4:
   MSE: 0.23319148787932148
   R2: 0.14388427070369814

CL:
   MSE: 31223.168320328627
   R2: 0.6324865646628188



In [33]:
station_id ='22'
year_input =2024

input_data = pd.DataFrame({'year': [year_input], 'id': [station_id]})
input_encoded = pd.get_dummies(input_data, columns=['id'])

# Align columns with training data
missing_cols = set(X_encoded.columns) - set(input_encoded.columns)
for col in missing_cols:
    input_encoded[col] = 0
input_encoded = input_encoded[X_encoded.columns]  # Reorder to match

# Predict pollutants
predicted_pollutants = model.predict(input_encoded)[0]

print(f"\nPredicted pollutant levels for station '{station_id}' in {year_input}:")
for p, val in zip(pollutants, predicted_pollutants):
    print(f"  {p}: {val:.2f}")


Predicted pollutant levels for station '22' in 2024:
  O2: 14.04
  NO3: 5.79
  NO2: 0.04
  SO4: 132.33
  PO4: 0.52
  CL: 65.75


In [35]:
joblib.dump(model, 'pollution_model.pkl')
joblib.dump(X_encoded.columns.tolist(), 'model_columns.pkl')
print('✅ Model and feature columns saved!')

✅ Model and feature columns saved!
