# Purpose of the Notebook:

> This notebook trains regressors that forecast the absolute humidity (AH) six hours ahead.
For this purpose, two different regressors are taken into account and their performance is compared.
> Furthermore, these regressors are saved for further usage.


# Forecasting

In [1]:
# imports
import pickle

import pandas as pd
from sklearn.metrics import d2_absolute_error_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import X3_Forecasting as forecast
import orga_functions as org

In [2]:
# import the regressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 

In [3]:
# load the processed air-quality table (';'-separated), using the 'date' column as index
# NOTE: no date parsing is requested, so the index holds the timestamps as strings
# the normalised variant (03_AirQuality_normalized.csv) is deliberately not used here
processed_csv = org.path("02_AirQuality_processed.csv")
df = pd.read_csv(processed_csv, sep=';', index_col='date')

> We can distinguish between the data in its original form and the normalised form.
Normalising the AH values means that the results can no longer be interpreted directly by humans.
The usefulness of partial normalisation is also unclear; therefore, the normalised data set is no longer considered.

In [4]:
# preview of the loaded frame (first and last rows, all columns)
df

Unnamed: 0_level_0,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-03-10 18:00:00,2.6,1360.0,150.000000,11.9,1046.000000,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-03-10 19:00:00,2.0,1292.0,112.000000,9.4,955.000000,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-03-10 20:00:00,2.2,1402.0,88.000000,9.0,939.000000,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10 21:00:00,2.2,1376.0,80.000000,9.2,948.000000,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10 22:00:00,1.6,1272.0,51.000000,6.5,836.000000,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-02-06 19:00:00,1.6,985.0,218.607666,4.5,953.579453,227.0,891.0,165.0,875.0,774.0,6.0,38.0,0.3584
2005-02-06 20:00:00,1.8,1002.0,218.607666,5.3,780.000000,252.0,855.0,179.0,892.0,857.0,5.8,36.4,0.3385
2005-02-06 21:00:00,1.4,938.0,218.607666,3.7,953.579453,193.0,937.0,149.0,805.0,737.0,5.8,35.4,0.3286
2005-02-06 22:00:00,1.1,896.0,218.607666,2.6,953.579453,158.0,1033.0,126.0,782.0,610.0,5.4,36.6,0.3304


## Features

In [5]:
# list of features which will be used for the algorithm - based on 05_Feature_selection
# "ah" is the current absolute humidity; "t" is presumably the temperature column — TODO confirm
features = ["t", "ah"]

## Target

In [6]:
# name of the label column the regressors are trained to predict
target = 'ah_target'

In [7]:
# build the label column: the absolute humidity value found 6 rows further down
# NOTE(review): shift() moves by rows, not by time. This equals "6 hours ahead"
# only if the index is gap-free hourly data — confirm against the preprocessing.
FORECAST_HORIZON = 6  # number of rows to look ahead (hours, for hourly data)
df[target] = df["ah"].shift(periods=-FORECAST_HORIZON)

In [8]:
# show 8 rows so the shift can be checked by eye: ah_target of row 1 should equal ah of row 7
df.head(8)

Unnamed: 0_level_0,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah,ah_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10 18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,0.7603
2004-03-10 19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,0.7702
2004-03-10 20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,0.7648
2004-03-10 21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,0.7517
2004-03-10 22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,0.7465
2004-03-10 23:00:00,1.2,1197.0,38.0,4.7,750.0,89.0,1337.0,96.0,1393.0,949.0,11.2,59.2,0.7848,0.7366
2004-03-11 00:00:00,1.2,1185.0,31.0,3.6,953.579453,62.0,848.762615,77.0,1333.0,733.0,11.3,56.8,0.7603,0.7353
2004-03-11 01:00:00,1.0,1136.0,31.0,3.3,953.579453,62.0,848.762615,76.0,1333.0,730.0,10.7,60.0,0.7702,0.7417


> As we can see, the ah_target value in the first row equals the ah value in the seventh row (0.7603), indicating that the shift by 6 rows was successful.

In [9]:
# (rows, columns) before rows without a target are removed
df.shape

(7998, 14)

## Limitation of the ML_DataFrame

In [10]:
# drop rows where target is unknown (the trailing rows that the shift left
# without a future value); reassign instead of mutating the frame in place
df = df.dropna(subset=[target])

# invariant needed for model fitting: no missing labels remain
assert df[target].notna().all()

> Due to the shift, the last 6 rows have NaN as ah_target and are dropped.

In [11]:
# preview of the frame after dropping the rows without a target
df

Unnamed: 0_level_0,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah,ah_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10 18:00:00,2.6,1360.0,150.000000,11.9,1046.000000,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,0.7603
2004-03-10 19:00:00,2.0,1292.0,112.000000,9.4,955.000000,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,0.7702
2004-03-10 20:00:00,2.2,1402.0,88.000000,9.0,939.000000,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,0.7648
2004-03-10 21:00:00,2.2,1376.0,80.000000,9.2,948.000000,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,0.7517
2004-03-10 22:00:00,1.6,1272.0,51.000000,6.5,836.000000,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,0.7465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-02-06 13:00:00,1.2,911.0,218.607666,3.5,953.579453,188.0,981.0,135.0,790.0,558.0,11.1,23.5,0.3105,0.3584
2005-02-06 14:00:00,1.0,868.0,218.607666,2.1,953.579453,127.0,1081.0,100.0,753.0,420.0,10.6,26.0,0.3320,0.3385
2005-02-06 15:00:00,0.8,868.0,218.607666,1.9,953.579453,96.0,1128.0,78.0,755.0,363.0,10.3,27.7,0.3481,0.3286
2005-02-06 16:00:00,1.0,904.0,218.607666,2.7,953.579453,138.0,1040.0,100.0,789.0,410.0,10.2,28.3,0.3516,0.3304


In [12]:
# (rows, columns) after the drop
df.shape

(7992, 14)

## Preparation Data

The data set is split into a training set and a test set. The split is not shuffled, so the last 1000 rows form the test set.

In [13]:
# ordered hold-out split: with shuffle=False the rows keep their order, so the
# last TEST_SIZE rows of df become the test set and all earlier rows the training set.
# random_state was removed: it only seeds the shuffling and has no effect when shuffle=False.
# train_test_split is imported in the imports cell at the top of the notebook.
TEST_SIZE = 1000  # absolute number of rows held out for testing

training, test = train_test_split(df, test_size=TEST_SIZE, shuffle=False)

# sanity checks: the split partitions df and the test set has the requested size
assert len(training) + len(test) == len(df)
assert len(test) == TEST_SIZE

In [14]:
# separate each partition into the model inputs (feature columns) and the label column
X_train, y_train = training[features], training[target]
x_test, y_test = test[features], test[target]

> Both sets are subdivided into the features and the target, thus all other columns are not included any further

In [15]:
# inspect the training inputs: only the columns listed in `features` remain
X_train

Unnamed: 0_level_0,t,ah
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-03-10 18:00:00,13.6,0.7578
2004-03-10 19:00:00,13.3,0.7255
2004-03-10 20:00:00,11.9,0.7502
2004-03-10 21:00:00,11.0,0.7867
2004-03-10 22:00:00,11.2,0.7888
...,...,...
2004-12-26 21:00:00,11.8,1.1381
2004-12-26 22:00:00,11.8,1.0741
2004-12-26 23:00:00,12.0,1.0095
2004-12-27 00:00:00,11.2,1.0587


> As an example, nox_gt was not defined as a feature. It can be seen here that the nox_gt column is no longer part of the dataframe.

## Models/Regressors

### Linear Regression

In [16]:
# fit a linear regression model on the training features and labels
clf_lin_reg = LinearRegression()
clf_lin_reg.fit(X=X_train, y=y_train)

In [17]:
# predictions of the linear model for the held-out test features
pred_linr_y = clf_lin_reg.predict(X=x_test)

### Performance Check

In [18]:
# visual comparison of the actual and estimated values
# NOTE(review): check_df comes from the project module X3_Forecasting; judging by the
# output it returns a frame with future_ah / predicted_ah indexed by future_datetime.
# Presumably it runs the passed model on x_test itself — confirm in that module.
check_linr = forecast.check_df(x_test, y_test, clf_lin_reg)
check_linr

Unnamed: 0_level_0,future_ah,predicted_ah
future_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-12-27 08:00:00,0.9302,0.914418
2004-12-27 09:00:00,0.9113,0.887897
2004-12-27 10:00:00,0.9335,0.870580
2004-12-27 11:00:00,0.9261,0.841111
2004-12-27 12:00:00,0.9379,0.822809
...,...,...
2005-02-06 19:00:00,0.3584,0.375556
2005-02-06 20:00:00,0.3385,0.390809
2005-02-06 21:00:00,0.3286,0.402679
2005-02-06 22:00:00,0.3304,0.405050


In [19]:
# mean absolute error of the linear model on the test set:
# average |actual future AH - predicted AH|, in the same units as ah
mean_absolute_error(y_true=y_test, y_pred=pred_linr_y)

0.056396974387893865

In [20]:
# D² score of the linear model (fraction of absolute error explained, not a variance):
# 1.0 is a perfect fit, 0.0 matches a model that always predicts the median of y_test
d2_absolute_error_score(y_true=y_test, y_pred=pred_linr_y)

0.6954328439949589

### Decision Tree

In [21]:
# fit a decision tree regressor on the training data;
# random_state is fixed so that repeated runs build the same tree
clf_tree = DecisionTreeRegressor(random_state=0)
clf_tree.fit(X=X_train, y=y_train)

In [22]:
# predictions of the decision tree for the held-out test features
pred_tree_y = clf_tree.predict(X=x_test)

### Performance Check

In [23]:
# visual comparison of the actual and estimated values
# NOTE(review): check_df comes from the project module X3_Forecasting; judging by the
# output it returns a frame with future_ah / predicted_ah indexed by future_datetime.
# Presumably it runs the passed model on x_test itself — confirm in that module.
check_tree_df = forecast.check_df(x_test, y_test, clf_tree)
check_tree_df

Unnamed: 0_level_0,future_ah,predicted_ah
future_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-12-27 08:00:00,0.9302,0.9206
2004-12-27 09:00:00,0.9113,1.0727
2004-12-27 10:00:00,0.9335,0.8063
2004-12-27 11:00:00,0.9261,0.7616
2004-12-27 12:00:00,0.9379,0.8225
...,...,...
2005-02-06 19:00:00,0.3584,0.4273
2005-02-06 20:00:00,0.3385,0.4799
2005-02-06 21:00:00,0.3286,0.5062
2005-02-06 22:00:00,0.3304,0.5062


In [24]:
# mean absolute error of the decision tree on the test set:
# average |actual future AH - predicted AH|, in the same units as ah
mean_absolute_error(y_true=y_test, y_pred=pred_tree_y)

0.08347020000000001

In [25]:
# D² score of the decision tree (fraction of absolute error explained, not a variance):
# 1.0 is a perfect fit, 0.0 matches a model that always predicts the median of y_test
d2_absolute_error_score(y_true=y_test, y_pred=pred_tree_y)

0.5492261473050031

## Saving

### Save Training/Test Split

In [26]:
# write the full training partition (all columns, date index included) to disk
new_path = org.path("00_training.csv")
training.to_csv(path_or_buf=new_path, sep=';', index=True)

In [27]:
# write the full test partition (all columns, date index included) to disk
new_path = org.path("00_testing.csv")
test.to_csv(path_or_buf=new_path, sep=';', index=True)

In [28]:
# store the test features and the test labels as two separate CSV files
x_path, y_path = org.path("00_x_test.csv"), org.path("00_y_test.csv")

x_test.to_csv(path_or_buf=x_path, sep=';', index=True)
y_test.to_csv(path_or_buf=y_path, sep=';', index=True)

### Save Model

In [29]:
# persist the fitted linear regression model as a pickle file
# NOTE: pickle files should only be loaded from a trusted source; loading a model
# with a different scikit-learn version than it was saved with may not work
new_path = org.path("LinearRegression_ah_regressor.pkl")

with open(new_path, mode='wb') as model_file:
    pickle.dump(clf_lin_reg, model_file)

In [30]:
# persist the fitted decision tree model as a pickle file
# NOTE: pickle files should only be loaded from a trusted source; loading a model
# with a different scikit-learn version than it was saved with may not work
new_path = org.path("DecisionTree_ah_regressor.pkl")

with open(new_path, mode='wb') as model_file:
    pickle.dump(clf_tree, model_file)

> These regressors are stored to be used later for unit tests and also in reality.