# import 

In [211]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils import shuffle
from matplotlib.pyplot import figure
import warnings
warnings.filterwarnings('ignore')

# data

In [212]:
# Input files, relative to the notebook's working directory.
# Forward slashes are used because they are accepted on Windows as well as on
# Linux/macOS, whereas a backslash-separated path only resolves on Windows.
data_path = '../data/data_ready.csv'
traffic_path = '../data/traffic_data.xlsx'

In [213]:
# Prepared measurement table; its first column becomes the row index.
data = pd.read_csv(data_path, index_col=0)

# Worksheet 'udaljenosti' of the traffic workbook, also indexed by its first column.
traffic = pd.read_excel(
    traffic_path,
    sheet_name='udaljenosti',
    engine='openpyxl',
    index_col=0,
)

# traffic

In [214]:
def traffic_norm(x):
    """Scale column ``x`` of the global ``traffic`` table by its maximum.

    The row labelled ``x`` is removed first. The remaining values of column
    ``x`` are divided by their largest value and stored under ``norm``.
    Only rows whose label contains ``'_80'`` are kept, sorted by label.

    Parameters
    ----------
    x : label that exists both as a row and as a column of ``traffic``.

    Returns
    -------
    pandas.DataFrame
        Single column ``norm``, indexed by the sorted ``'_80'`` row labels.
    """
    without_self = traffic.drop(x, axis=0)
    column = without_self[[x]]
    column['norm'] = column[[x]] / column[[x]].max()
    normalised = column.drop(x, axis=1).filter(like='_80', axis=0)
    return normalised.sort_index()

In [215]:
def traffic_s_norm(data,y):
    """Multiply the ``'_80'`` columns of ``data`` by the factors in ``y.norm``.

    The ``'_80'`` columns are taken from a copy of ``data`` and sorted by
    name. They are then paired positionally with the values of ``y.norm``:
    first sorted column with first factor, and so on. Pairing stops at the
    shorter of the two sequences. ``data`` itself is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame containing columns whose name includes ``'_80'``.
    y : object exposing a ``norm`` sequence of factors (e.g. the frame
        returned by ``traffic_norm``).

    Returns
    -------
    pandas.DataFrame
        Only the name-sorted ``'_80'`` columns, rescaled.
    """
    scaled = data.copy().filter(like='_80', axis=1).sort_index(axis=1)
    for column, factor in zip(scaled.columns, y.norm):
        scaled[column] = scaled[column] * factor
    return scaled

# all data

In [216]:
def data_sites(site_1,pollutant_1):
    """Build the feature frame and target frame for one site/pollutant pair.

    Reads the global ``data`` table. The features are every column whose
    name contains ``site_1``, four weather columns and the calendar/dummy
    columns listed below; the target is the column ``pollutant_1``.

    Parameters
    ----------
    site_1 : substring used to select the site's columns.
    pollutant_1 : name of the target column.

    Returns
    -------
    (X, Y) : X is the feature DataFrame without ``pollutant_1``;
             Y is a DataFrame holding ``pollutant_1``.
    """
    weather_cols = ['Cloud_Cover_Mean','Temperature_Air_2m_Max_Day_Time', 'Temperature_Air_2m_Min_Night_Time','Wind_Speed_10m_Mean']
    calendar_cols = [
        'year', 'dayofyear',
        'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan', 'month_Jul',
        'month_Jun', 'month_Mar', 'month_May', 'month_Nov', 'month_Oct', 'month_Sep',
        'weekday_Friday', 'weekday_Monday', 'weekday_Saturday', 'weekday_Sunday',
        'weekday_Thursday', 'weekday_Tuesday', 'weekday_Wednesday',
        'season_fall', 'season_spring', 'season_summer', 'season_winter',
        'holiday', 'holiday_school',
    ]

    source = data.copy()
    target = source[[pollutant_1]]
    site_cols = source.filter(like=site_1, axis=1)
    combined = pd.concat(
        [site_cols, target, source[weather_cols], source[calendar_cols]],
        axis=1,
    )
    return combined.drop(pollutant_1, axis=1), combined[[pollutant_1]]

# data without traffic

In [217]:
def data_sites_bez_traffic(site_1,pollutant_1):
    """Build features/target for one site/pollutant pair WITHOUT traffic columns.

    Reads the global ``data`` table. The features are every column whose
    name contains ``site_1``, four weather columns and the calendar/dummy
    columns, minus every column whose name contains ``'_80'`` (the traffic
    columns). The target is the column ``pollutant_1``.

    Parameters
    ----------
    site_1 : substring used to select the site's columns.
    pollutant_1 : name of the target column.

    Returns
    -------
    (X, Y) : X is the feature DataFrame without the ``'_80'`` columns and
             without ``pollutant_1``; Y is a DataFrame holding ``pollutant_1``.
    """
    df = data.copy()
    df2 = df[[pollutant_1]]
    df1 = df.filter(like=site_1, axis=1)
    df3 = df[['Cloud_Cover_Mean','Temperature_Air_2m_Max_Day_Time', 'Temperature_Air_2m_Min_Night_Time','Wind_Speed_10m_Mean']]
    df4 = df[['year', 'dayofyear', 'month_Apr', 'month_Aug', 'month_Dec',
       'month_Feb', 'month_Jan', 'month_Jul', 'month_Jun', 'month_Mar',
       'month_May', 'month_Nov', 'month_Oct', 'month_Sep', 'weekday_Friday',
       'weekday_Monday', 'weekday_Saturday', 'weekday_Sunday',
       'weekday_Thursday', 'weekday_Tuesday', 'weekday_Wednesday',
       'season_fall', 'season_spring', 'season_summer', 'season_winter', 'holiday', 'holiday_school']]
    d = pd.concat([df1,df2,df3,df4],axis=1)
    traffic_cols = d.filter(like='_80', axis=1).columns

    # Both drops must be applied to the same frame. The earlier version
    # restarted from `d` for the second drop, which discarded the first one
    # and silently left the traffic columns in X.
    X = d.drop(traffic_cols, axis=1)
    X = X.drop(pollutant_1, axis=1)
    Y = d[[pollutant_1]]
    return X,Y

# data with normalized traffic

In [218]:
def data_sites_s_norm_traffic(site_1,pollutant_1,traffic_1):
    """Build features/target with the raw traffic columns swapped for ``traffic_1``.

    Reads the global ``data`` table. The site, weather and calendar columns
    are assembled as usual, then every ``'_80'`` column is removed and the
    columns of ``traffic_1`` whose name contains ``site_1`` are appended in
    their place.

    Parameters
    ----------
    site_1 : substring used to select the site's columns (in ``data`` and in
        ``traffic_1``).
    pollutant_1 : name of the target column.
    traffic_1 : DataFrame of replacement traffic columns (e.g. the output of
        ``traffic_s_norm``); it is copied, not modified.

    Returns
    -------
    (X, Y) : X is the feature DataFrame without ``pollutant_1``;
             Y is a DataFrame holding ``pollutant_1``.
    """
    weather_cols = ['Cloud_Cover_Mean','Temperature_Air_2m_Max_Day_Time', 'Temperature_Air_2m_Min_Night_Time','Wind_Speed_10m_Mean']
    calendar_cols = [
        'year', 'dayofyear',
        'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan', 'month_Jul',
        'month_Jun', 'month_Mar', 'month_May', 'month_Nov', 'month_Oct', 'month_Sep',
        'weekday_Friday', 'weekday_Monday', 'weekday_Saturday', 'weekday_Sunday',
        'weekday_Thursday', 'weekday_Tuesday', 'weekday_Wednesday',
        'season_fall', 'season_spring', 'season_summer', 'season_winter',
        'holiday', 'holiday_school',
    ]

    source = data.copy()
    replacement = traffic_1.copy().filter(like=site_1, axis=1)
    target = source[[pollutant_1]]
    site_cols = source.filter(like=site_1, axis=1)
    combined = pd.concat(
        [site_cols, target, source[weather_cols], source[calendar_cols]],
        axis=1,
    )

    raw_traffic_cols = combined.filter(like='_80', axis=1).columns
    features = pd.concat([combined.drop(raw_traffic_cols, axis=1), replacement], axis=1)
    return features.drop(pollutant_1, axis=1), combined[[pollutant_1]]

# data split 

In [219]:
def prep_data(X,Y,train_end='2020-01-02',test_start='2020-01-03',test_end='2020-03-10',random_state=1):
    """Split X/Y by index label, shuffle the training rows and standardise X.

    Parameters
    ----------
    X, Y : pandas objects sharing an index that can be label-sliced with the
        given bounds (assumes sorted, date-like labels — TODO confirm).
    train_end : last label of the training slice (``.loc`` slices include
        both end points).
    test_start, test_end : first and last label of the test slice.
    random_state : seed forwarded to ``sklearn.utils.shuffle`` so the row
        order, and therefore any order-sensitive model fitted afterwards, is
        reproducible between runs. Pass ``None`` for an unseeded shuffle.

    Returns
    -------
    X_train, X_test : arrays standardised with a ``StandardScaler`` fitted on
        the training rows only.
    y_train, y_test : the matching slices of ``Y``; ``y_train`` is shuffled
        together with ``X_train``.
    """
    X_train = X.loc[:train_end]
    X_test = X.loc[test_start:test_end]
    y_train = Y.loc[:train_end]
    y_test = Y.loc[test_start:test_end]

    # Shuffle features and targets together so rows stay aligned.
    X_train, y_train = shuffle(X_train, y_train, random_state=random_state)

    # Fit the scaler on training data only to avoid leaking test statistics.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train,X_test,y_train,y_test

# models: rf and knn

In [220]:
# Regressor classes (not instances) keyed by a short name.
models = {
    'kn': KNeighborsRegressor,
    'rf': RandomForestRegressor,
}

In [221]:
# Site name fragments used to select a site's columns.
sites = [
    'Nord',
    'Sud',
    'West',
    'Ost',
    'DonBosco',
]

# Target column names per pollutant, listed in the same order as `sites`.
# O3 has only two entries, so zip(sites, pollutant_O3) covers Nord and Sud only.
pollutant_O3 = ['N_O3', 'S_O3']
pollutant_PM10 = ['N_PM10K', 'S_PM10K', 'W_PM10K', 'O_PM10K', 'D_PM10K']
pollutant_NO2 = ['N_NO2', 'S_NO2', 'W_NO2', 'O_NO2', 'D_NO2']


In [230]:
# Baseline: default k-nearest-neighbours regressor on the unmodified site features.
# zip() stops at the shorter list, so only the sites that have an O3 series are evaluated.
for site, pollutant in zip(sites, pollutant_O3):
    X, Y = data_sites(site, pollutant)
    X_train, X_test, Y_train, Y_test = prep_data(X, Y)

    model = KNeighborsRegressor()
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    # Each metric is computed once and reused for printing.
    # mean_squared_error returns the (non-root) MSE by default; the `squared`
    # keyword is deprecated in scikit-learn 1.4 and removed in 1.6, so it is
    # not passed.
    r2 = r2_score(Y_test, predictions)
    mse = mean_squared_error(Y_test, predictions)

    print(f'{site} {pollutant} ')
    print('R2_Score:', r2)
    print('MSE:', mse)
        

Nord N_O3 
R2_Score: 0.25267539030052666
MSE: 262.75326926747283
Sud S_O3 
R2_Score: 0.5666366641708473
MSE: 112.8602047461655


In [229]:
# Random forest on site features whose raw traffic ('_80') columns are
# replaced by rescaled versions.
# NOTE(review): traffic_norm() reads the global `traffic`; the multitarget
# section further down rebinds that name, so this cell has to run before it.
for site, pollutant in zip(sites, pollutant_O3):
    temp_traffic = traffic_norm(site)  # per-row scaling factors for this site ('norm' column)
    norm_traffic = traffic_s_norm(data, temp_traffic)  # '_80' columns of data multiplied by those factors
    X, Y = data_sites_s_norm_traffic(site, pollutant, norm_traffic)
    X_train, X_test, Y_train, Y_test = prep_data(X, Y)

    model = RandomForestRegressor(random_state=1)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    # Each metric is computed once and reused for printing.
    # mean_squared_error returns the (non-root) MSE by default; the `squared`
    # keyword is deprecated in scikit-learn 1.4 and removed in 1.6, so it is
    # not passed.
    r2 = r2_score(Y_test, predictions)
    mse = mean_squared_error(Y_test, predictions)

    print(f'{site} {pollutant} ')
    print('R2_Score:', r2)
    print('MSE:', mse)

Nord N_O3 
R2_Score: 0.6615830344708817
MSE: 118.98465929565943
Sud S_O3 
R2_Score: 0.78233034763903
MSE: 56.687401774502284


# multitarget


In [184]:
# Working copy of the full table for the multi-target experiment, so `data` itself stays untouched.
multi = data.copy()

In [185]:
# Sub-frame holding every column whose name contains '_80' (the traffic columns).
# NOTE(review): this rebinds the global `traffic`, which until here was the
# table read from the Excel file and is what traffic_norm() expects. After
# this cell has run, traffic_norm() sees this frame instead, so cells that
# call it must be run before this one (or after re-loading the Excel table).
traffic = multi.filter(like='_80',axis=1)

In [186]:
# Column groups per pollutant family. `like=` is a plain substring match on
# the column name, so 'NO' also picks up the NO2 and NOX columns.
PM10 = data.filter(like='PM10K', axis='columns')
NO = data.filter(like='NO', axis='columns')
O3 = data.filter(like='_O3', axis='columns')
NOX = data.filter(like='NOX', axis='columns')
Ox = data.filter(like='_Ox', axis='columns')

In [187]:
# NO2 columns of every site; used as the multi-output target below.
NO2 = data.filter(like='NO2', axis='columns')

In [188]:
# Feature table for the multi-target model: the PM10K, NO* (which includes
# NO2 and NOX), O3 and Ox columns are removed.
# It is derived from `data`, of which `multi` is an unmodified copy, instead
# of from `multi` itself. Re-running the cell therefore yields the same frame
# rather than raising KeyError on columns that were already dropped.
multi = (
    data
    .drop(columns=PM10.columns)
    .drop(columns=NO.columns)
    .drop(columns=O3.columns)
    .drop(columns=Ox.columns)
)


In [189]:
# Label-based split (both ends of a .loc slice are inclusive):
# everything up to 2020-01-02 is used for training, 2020-01-03 .. 2020-03-10 for testing.
X_train, X_test = multi.loc[:'2020-01-02'], multi.loc['2020-01-03':'2020-03-10']
y_train, y_test = NO2.loc[:'2020-01-02'], NO2.loc['2020-01-03':'2020-03-10']

In [190]:
# Scaler for the multi-target features (zero mean / unit variance per column once fitted).
sc = StandardScaler()

In [191]:
# Fit the scaler on the training rows only, then apply the same transform to the test rows.
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)

In [192]:
# Random forest with a fixed seed so repeated fits on the same data give the same trees.
model = RandomForestRegressor(random_state=1)

In [193]:
# y_train holds all NO2 columns, so one forest is fitted to predict every NO2 column at once.
model.fit(X_train,y_train)

RandomForestRegressor(random_state=1)

In [194]:
# Predictions for the test period; one output column per column of y_train.
predictions = model.predict(X_test)

In [195]:
# Overall scores across all NO2 columns (scikit-learn averages the per-column
# scores uniformly by default for multi-output targets).
# mean_squared_error returns the (non-root) MSE by default; the `squared`
# keyword is deprecated in scikit-learn 1.4 and removed in 1.6, so it is not passed.
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

In [196]:
# Test targets as a DataFrame restricted to the NO2 columns.
# NOTE(review): despite its name, `PM` holds the NO2 test targets, not PM10 values.
PM = pd.DataFrame(y_test,columns=NO2.columns)

In [197]:
# Predictions labelled with the NO2 column names. This frame gets a default
# integer index, so it is compared with the targets by position, not by label.
pred = pd.DataFrame(predictions,columns=NO2.columns)

In [198]:
# Per-target scores: one R2 / MSE pair for each NO2 column.
for i in PM.columns:
    r2 = r2_score(PM[i], pred[i])
    # The default of mean_squared_error is the (non-root) MSE; the `squared`
    # keyword is deprecated in scikit-learn 1.4 and removed in 1.6, so it is
    # not passed.
    mse = mean_squared_error(PM[i], pred[i])
    print(i)
    print(f'R2 {r2}')
    print(f'mse {mse}')
        

D_NO2
R2 0.5097208073011139
mse 60.21778730697081
N_NO2
R2 0.6440888226873311
mse 41.02318103932081
O_NO2
R2 0.43580035852599086
mse 64.80312925898099
S_NO2
R2 0.5145406181175305
mse 46.30993135763425
W_NO2
R2 0.68279452127843
mse 37.43785815109094


# multitarget without traffic

In [199]:
# Same feature table with the traffic columns removed.
# `traffic` here must be the '_80' sub-frame built in the multitarget section,
# not the table read from the Excel file.
multi_bez_traffic = multi.drop(columns=traffic.columns)

In [201]:
# Same label-based split as for the model with traffic (both slice ends inclusive).
X_train_1, X_test_1 = (
    multi_bez_traffic.loc[:'2020-01-02'],
    multi_bez_traffic.loc['2020-01-03':'2020-03-10'],
)
y_train, y_test = NO2.loc[:'2020-01-02'], NO2.loc['2020-01-03':'2020-03-10']

In [202]:
# Fresh scaler for the feature set without traffic columns (replaces the previous `sc`).
sc = StandardScaler()

In [203]:
# Fit the scaler on the training rows only, then apply the same transform to the test rows.
X_train_1, X_test_1 = sc.fit_transform(X_train_1), sc.transform(X_test_1)

In [204]:
# Second forest, same seed as `model`, to be trained without the traffic features.
model_1 = RandomForestRegressor(random_state=1)

In [205]:
# Multi-output fit on the traffic-free features; targets are all NO2 columns.
model_1.fit(X_train_1,y_train)

RandomForestRegressor(random_state=1)

In [206]:
# Test-period predictions of the traffic-free model (overwrites the earlier `predictions`).
predictions = model_1.predict(X_test_1)

In [207]:
# Overall scores across all NO2 columns for the traffic-free model
# (scikit-learn averages the per-column scores uniformly by default).
# mean_squared_error returns the (non-root) MSE by default; the `squared`
# keyword is deprecated in scikit-learn 1.4 and removed in 1.6, so it is not passed.
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

In [208]:
# Test targets for the traffic-free run, restricted to the NO2 columns
# (despite the name, these are NO2 values).
PM_1 = pd.DataFrame(y_test,columns=NO2.columns)

In [209]:
# Predictions of the traffic-free model labelled with the NO2 column names
# (overwrites the earlier `pred`; default integer index, compared by position).
pred = pd.DataFrame(predictions,columns=NO2.columns)

In [210]:
# Per-target scores of the model trained without traffic features.
# The loop uses PM_1, this section's own test targets, rather than PM from
# the previous section, so the cell does not depend on that section's state.
for i in PM_1.columns:
    r2 = r2_score(PM_1[i], pred[i])
    # The default of mean_squared_error is the (non-root) MSE; the `squared`
    # keyword is deprecated in scikit-learn 1.4 and removed in 1.6, so it is
    # not passed.
    mse = mean_squared_error(PM_1[i], pred[i])
    print(i)
    print(f'R2 {r2}')
    print(f'mse {mse}')

D_NO2
R2 0.479453642230824
mse 63.935305275772485
N_NO2
R2 0.6469012533906251
mse 40.69901349062813
O_NO2
R2 0.3367670723403098
mse 76.17794479212412
S_NO2
R2 0.4757164504088498
mse 50.013525538128576
W_NO2
R2 0.6878047786638679
mse 36.84652755348445
