# Forecasting

In [1]:
# imports
import pandas as pd
from sklearn.linear_model import LinearRegression
import orga_functions as org
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the preprocessed air-quality table; the 'date' column becomes the row index.
# NOTE(review): no parse_dates is passed, so the index presumably stays as plain
# strings rather than timestamps — confirm that is intended.
csv_path = org.path("01_AirQuality_processed.csv")
df = pd.read_csv(csv_path, sep=';', index_col='date')
df

Unnamed: 0_level_0,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-03-10 18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-03-10 19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-03-10 20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10 21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10 22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-02-06 19:00:00,1.6,985.0,,4.5,,227.0,891.0,165.0,875.0,774.0,6.0,38.0,0.3584
2005-02-06 20:00:00,1.8,1002.0,,5.3,780.0,252.0,855.0,179.0,892.0,857.0,5.8,36.4,0.3385
2005-02-06 21:00:00,1.4,938.0,,3.7,,193.0,937.0,149.0,805.0,737.0,5.8,35.4,0.3286
2005-02-06 22:00:00,1.1,896.0,,2.6,,158.0,1033.0,126.0,782.0,610.0,5.4,36.6,0.3304


### Define target & feature list

In [3]:
# Name of the column that will hold the value to be predicted
target = "ah_target"

In [4]:
# Columns used as model inputs
features = [
    "co_gt",
    "pt08_s1_co",
]

### Shift

In [5]:
# Shift the absolute-humidity column by 6 rows and store it as the target.
# With a positive offset every row receives the value recorded 6 rows earlier,
# and the first 6 rows are left as NaN.
df[target] = df['ah'].shift(6)

In [6]:
# Preview the first 8 rows to inspect how the target column lines up with 'ah'
df.head(n=8)

Unnamed: 0_level_0,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah,ah_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10 18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,
2004-03-10 19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,
2004-03-10 20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,
2004-03-10 21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,
2004-03-10 22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,
2004-03-10 23:00:00,1.2,1197.0,38.0,4.7,750.0,89.0,1337.0,96.0,1393.0,949.0,11.2,59.2,0.7848,
2004-03-11 00:00:00,1.2,1185.0,31.0,3.6,,62.0,,77.0,1333.0,733.0,11.3,56.8,0.7603,0.7578
2004-03-11 01:00:00,1.0,1136.0,31.0,3.3,,62.0,,76.0,1333.0,730.0,10.7,60.0,0.7702,0.7255


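- Wrong shift: a positive `periods` moves values downwards, so each row's target is the humidity from 6 rows *earlier* (a past value), and the first 6 targets are NaN.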

In [7]:
# Negative offset: every row's target is the absolute humidity recorded 6 rows later.
# NOTE(review): the shift counts rows, not hours — this equals a 6-hour horizon only
# if the index is gap-free and hourly; confirm against the preprocessing step.
df[target] = df['ah'].shift(-6)
df.head(n=8)

Unnamed: 0_level_0,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah,ah_target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10 18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,0.7603
2004-03-10 19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,0.7702
2004-03-10 20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,0.7648
2004-03-10 21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,0.7517
2004-03-10 22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,0.7465
2004-03-10 23:00:00,1.2,1197.0,38.0,4.7,750.0,89.0,1337.0,96.0,1393.0,949.0,11.2,59.2,0.7848,0.7366
2004-03-11 00:00:00,1.2,1185.0,31.0,3.6,,62.0,,77.0,1333.0,733.0,11.3,56.8,0.7603,0.7353
2004-03-11 01:00:00,1.0,1136.0,31.0,3.3,,62.0,,76.0,1333.0,730.0,10.7,60.0,0.7702,0.7417


- Correct shift: a negative `periods` moves values upwards, so each row's target is the humidity from 6 rows *later* (a future value).

### Restricting the ML DataFrame to features and target

In [8]:
# Drop the humidity columns the target is derived from / related to, so they
# cannot be used as inputs. Reassigning instead of mutating in place, together
# with errors='ignore', keeps this cell safe to re-run: a second execution no
# longer raises KeyError because 'ah'/'rh' are already gone.
df = df.drop(columns=['ah', 'rh'], errors='ignore')

# Drop rows where the target is unknown (NaN rows introduced by the shift)
df = df.dropna(subset=[target])

In [9]:
# Keep only the model inputs and the target column
df = df.loc[:, [*features, target]]

In [10]:
# Placeholder for missing-value treatment: for now every row that contains
# at least one NaN is discarded.
df = df.dropna(axis=0, how='any')

In [11]:
# (rows, columns) remaining after column selection and NaN removal
df.shape

(5852, 3)

## Preparation

In [12]:
# Separate the predictor columns from the target column
X, y = df[features], df[target]

In [13]:
# Ordered hold-out split: with shuffle=False the row order is preserved and the
# last 1000 rows become the test set, the rest the training set.
# random_state is intentionally not passed — it only seeds shuffling, so with
# shuffle=False it had no effect and merely suggested randomness that is not there.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1000, shuffle=False
)

## Model

In [14]:
# Fit a linear-regression baseline on the training split (fit returns the
# estimator itself), predict the held-out rows and report the mean absolute error.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.5186838523839459

In [15]:
# First target value of the test split, selected explicitly by position.
# The Series is indexed by date labels, so a bare integer key (y_test[0]) only
# worked through pandas' deprecated positional fallback; .iloc states the intent
# and keeps working once that fallback is removed.
y_test.iloc[0]

0.5867

In [16]:
# define train/test cv-split
# (disabled sketch: the notebook currently uses a single train_test_split hold-out instead)
#tscv = TimeSeriesSplit(gap=0, n_splits=5, test_size=1000, max_train_size=None)

In [17]:
#tscv
#X.index

In [18]:
# (disabled sketch) iterate over the cross-validation folds.
# split() yields positional index arrays, so both the train and the test
# selection must go through .iloc.
#for train_index, test_index in tscv.split(X):
#    print("TRAIN:", train_index, "TEST:", test_index)
#    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#    y_train, y_test = y.iloc[train_index], y.iloc[test_index]