In [1]:
# Prerequisite: install the required libraries (pandas, numpy, scikit-learn) with pip before running this notebook.

import pandas as pd
import numpy as np 
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the ';'-separated CSV (path is relative to the working directory)
# into `df`, then display the frame as the cell output.
df = pd.read_csv('PB_ALL_2000_2021.csv', delimiter=';')
df

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,17.02.2000,0.330,2.77,12.0,12.30,9.500,0.057,154.00,0.454,289.50
1,1,11.05.2000,0.044,3.00,51.6,14.61,17.750,0.034,352.00,0.090,1792.00
2,1,11.09.2000,0.032,2.10,24.5,9.87,13.800,0.173,416.00,0.200,2509.00
3,1,13.12.2000,0.170,2.23,35.6,12.40,17.130,0.099,275.20,0.377,1264.00
4,1,02.03.2001,0.000,3.03,48.8,14.69,10.000,0.065,281.60,0.134,1462.00
5,1,07.06.2001,0.020,4.02,34.0,10.61,11.800,0.016,287.00,0.208,1183.00
6,1,10.09.2001,0.863,3.91,147.0,10.96,20.500,0.284,595.20,0.674,4023.00
7,1,06.11.2001,0.060,2.97,71.2,13.47,25.800,0.095,314.00,0.390,1907.00
8,1,12.03.2002,0.168,4.15,27.0,17.82,3.945,0.058,153.60,0.110,473.00
9,1,06.06.2002,0.001,7.11,74.4,19.28,2.260,0.017,409.60,0.181,1782.00


In [4]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 11 columns):
id           2861 non-null int64
date         2861 non-null object
NH4          2858 non-null float64
BSK5         2860 non-null float64
Suspended    2845 non-null float64
O2           2858 non-null float64
NO3          2860 non-null float64
NO2          2858 non-null float64
SO4          2812 non-null float64
PO4          2833 non-null float64
CL           2812 non-null float64
dtypes: float64(9), int64(1), object(1)
memory usage: 245.9+ KB


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2861, 11)

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max), transposed so
# that every described column becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,2861.0,12.397064,6.084226,1.0,8.0,14.0,16.0,22.0
NH4,2858.0,0.758734,2.486247,0.0,0.08,0.22,0.5,39.427
BSK5,2860.0,4.316182,2.973997,0.0,2.16,3.8,5.8,50.9
Suspended,2845.0,12.931905,16.543097,0.0,6.0,10.0,15.0,595.0
O2,2858.0,9.508902,4.42826,0.0,7.0925,8.995,11.52,90.0
NO3,2860.0,4.316846,6.881188,0.0,1.39,2.8,5.5825,133.4
NO2,2858.0,0.246128,2.182777,0.0,0.03,0.059,0.12575,109.0
SO4,2812.0,59.362313,96.582641,0.0,27.0525,37.8,64.64,3573.4
PO4,2833.0,0.418626,0.771326,0.0,0.13,0.27,0.47,13.879
CL,2812.0,93.731991,394.512184,0.02,26.8,33.9,45.6075,5615.28


In [8]:
# Count the missing values in each column.
df.isna().sum()

id            0
date          0
NH4           3
BSK5          1
Suspended    16
O2            3
NO3           1
NO2           3
SO4          49
PO4          28
CL           49
dtype: int64

In [10]:
# The 'date' column is loaded as strings (object dtype) in day.month.year
# form; parse it into datetime values using that explicit format, then
# display the frame.
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y')
df

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,2000-02-17,0.330,2.77,12.0,12.30,9.500,0.057,154.00,0.454,289.50
1,1,2000-05-11,0.044,3.00,51.6,14.61,17.750,0.034,352.00,0.090,1792.00
2,1,2000-09-11,0.032,2.10,24.5,9.87,13.800,0.173,416.00,0.200,2509.00
3,1,2000-12-13,0.170,2.23,35.6,12.40,17.130,0.099,275.20,0.377,1264.00
4,1,2001-03-02,0.000,3.03,48.8,14.69,10.000,0.065,281.60,0.134,1462.00
5,1,2001-06-07,0.020,4.02,34.0,10.61,11.800,0.016,287.00,0.208,1183.00
6,1,2001-09-10,0.863,3.91,147.0,10.96,20.500,0.284,595.20,0.674,4023.00
7,1,2001-11-06,0.060,2.97,71.2,13.47,25.800,0.095,314.00,0.390,1907.00
8,1,2002-03-12,0.168,4.15,27.0,17.82,3.945,0.058,153.60,0.110,473.00
9,1,2002-06-06,0.001,7.11,74.4,19.28,2.260,0.017,409.60,0.181,1782.00


In [11]:
# Re-inspect the dtypes after parsing: 'date' should now be datetime64[ns].
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2861 entries, 0 to 2860
Data columns (total 11 columns):
id           2861 non-null int64
date         2861 non-null datetime64[ns]
NH4          2858 non-null float64
BSK5         2860 non-null float64
Suspended    2845 non-null float64
O2           2858 non-null float64
NO3          2860 non-null float64
NO2          2858 non-null float64
SO4          2812 non-null float64
PO4          2833 non-null float64
CL           2812 non-null float64
dtypes: datetime64[ns](1), float64(9), int64(1)
memory usage: 245.9 KB


In [12]:
# Order the rows by 'id' first and by 'date' within each id. The existing
# index labels are kept (no reset_index). Then preview the first five rows.
df = df.sort_values(['id', 'date'])
df.head(5)

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL
0,1,2000-02-17,0.33,2.77,12.0,12.3,9.5,0.057,154.0,0.454,289.5
1,1,2000-05-11,0.044,3.0,51.6,14.61,17.75,0.034,352.0,0.09,1792.0
2,1,2000-09-11,0.032,2.1,24.5,9.87,13.8,0.173,416.0,0.2,2509.0
3,1,2000-12-13,0.17,2.23,35.6,12.4,17.13,0.099,275.2,0.377,1264.0
4,1,2001-03-02,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0


In [13]:
# Add calendar 'year' and 'month' columns taken from the datetime 'date' column.
df['year'], df['month'] = df['date'].dt.year, df['date'].dt.month

In [14]:
# Preview the first five rows to confirm the new 'year' and 'month' columns.
df.head(n=5)

Unnamed: 0,id,date,NH4,BSK5,Suspended,O2,NO3,NO2,SO4,PO4,CL,year,month
0,1,2000-02-17,0.33,2.77,12.0,12.3,9.5,0.057,154.0,0.454,289.5,2000,2
1,1,2000-05-11,0.044,3.0,51.6,14.61,17.75,0.034,352.0,0.09,1792.0,2000,5
2,1,2000-09-11,0.032,2.1,24.5,9.87,13.8,0.173,416.0,0.2,2509.0,2000,9
3,1,2000-12-13,0.17,2.23,35.6,12.4,17.13,0.099,275.2,0.377,1264.0,2000,12
4,1,2001-03-02,0.0,3.03,48.8,14.69,10.0,0.065,281.6,0.134,1462.0,2001,3


In [15]:
# List the column labels currently present in the frame.
df.columns

Index(['id', 'date', 'NH4', 'BSK5', 'Suspended', 'O2', 'NO3', 'NO2', 'SO4',
       'PO4', 'CL', 'year', 'month'],
      dtype='object')

In [None]:
# Column names of the six measurements selected as pollutants; the other
# measurement columns (NH4, BSK5, Suspended) are not included.
# NOTE(review): presumably these are the prediction targets for the imported
# MultiOutputRegressor -- confirm against the later cells.
pollutnts = ['O2', 'NO3', 'NO2', 'SO4', 'PO4', 'CL']
# The identifier above is misspelled. It is kept so existing references still
# work, and the correctly spelled name is bound to the same list so that a
# reference to `pollutants` does not raise NameError.
pollutants = pollutnts