In [1]:
import torch

In [2]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
import numpy as np
import pandas as pd
import torch.nn as nn
import os
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
import utils
import importlib

In [4]:
os.listdir('hka-aqm-am/')[:10]
pd.read_csv('hka-aqm-am/hka-aqm-am001_2022_09_05.dat', skiprows=1, sep=';', engine='python').head()

Unnamed: 0,date_time,device_id,tmp,hum,CO2,VOC,vis,IR,WIFI,BLE,rssi,channel_rssi,snr,gateway,channel_index,spreading_factor,bandwidth,f_cnt
0,2022-09-05 12:57:18,hka-aqm-am001,25.05,51.89,991,611,106,16,0,0,-127,-127,-18.2,drag-lps8-02,1,10,125000,2
1,2022-09-05 12:58:00,hka-aqm-am001,25.05,51.89,995,602,30,7,2,1,-128,-128,-15.5,drag-lps8-02,4,10,125000,3
2,2022-09-05 13:20:12,hka-aqm-am001,25.05,52.08,1000,544,109,20,2,0,-133,-133,-13.8,drag-lps8-01,2,10,125000,5
3,2022-09-05 13:35:18,hka-aqm-am001,25.17,51.79,728,450,109,17,2,0,-124,-124,-6.8,drag-lps8-02,5,10,125000,6
4,2022-09-05 13:50:25,hka-aqm-am001,25.3,51.98,670,465,95,14,2,0,-132,-132,-15.2,drag-lps8-02,0,10,125000,7


In [5]:
# load all files into one single df
# df = pd.concat([pd.read_csv('hka-aqm-am/' + f, skiprows=1, sep=';', engine='python') for f in os.listdir('hka-aqm-am/')])
df = pd.concat([pd.read_csv('hka-aqm-am/' + f.removeprefix('._'), skiprows=1, sep=';', engine='python') for f in os.listdir('hka-aqm-am/')])
df.shape


(1216072, 18)

In [6]:
train_df, test_df, train_loader, test_loader, scaler, y_test = utils.get_data_for_transformer(df, aggregation_level='quarter_hour', window_size=20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['CO2_scaled'] = scaler.fit_transform(df_train[['CO2']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['CO2_scaled'] = scaler.transform(df_test[['CO2']])


Training data shape: torch.Size([245095, 20, 1]) torch.Size([245095, 1])
Testing data shape: torch.Size([63746, 20, 1]) torch.Size([63746, 1])


In [7]:
# print number of training samples
print(f"Number of training samples: {len(train_df)}")

Number of training samples: 245095


In [8]:
trained_model = utils.train_transformer_model(device, train_loader, test_loader, epochs=10)

  from .autonotebook import tqdm as notebook_tqdm
  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch 1/10, Validation Loss: 0.0141
Epoch 2/10, Validation Loss: 0.0127
Epoch 3/10, Validation Loss: 0.0174
Epoch 4/10, Validation Loss: 0.0320
Epoch 5/10, Validation Loss: 0.0292
Epoch 6/10, Validation Loss: 0.0359
Early stopping!


In [19]:
# save the trained model and the scaler
torch.save(trained_model.state_dict(), 'models/transformer_model.pth')
torch.save(scaler, 'models/scaler_transformer.pth')

In [9]:
utils.evaluate_transformer_model(device, test_loader, trained_model, scaler, y_test)

Score (RMSE): 21.2643


In [None]:
importlib.reload(utils)
_, _, _, _, _, _ = utils.get_data_for_transformer(df, aggregation_level='quarter_hour', window_size=20)

In [18]:
importlib.reload(utils)
utils.predict_data(trained_model, scaler, df)

cpy:  (1197180, 19)
help:  (309157, 9)
NaN count:  230702
(1197180, 20)
230702
torch.Size([966478, 20, 1])
53274 rows processed out of 966478
102450 rows processed out of 966478
151626 rows processed out of 966478
200802 rows processed out of 966478
254076 rows processed out of 966478
303252 rows processed out of 966478
352428 rows processed out of 966478
401604 rows processed out of 966478
450780 rows processed out of 966478
504054 rows processed out of 966478
553230 rows processed out of 966478
602406 rows processed out of 966478
651582 rows processed out of 966478
700758 rows processed out of 966478
754032 rows processed out of 966478
803208 rows processed out of 966478
852384 rows processed out of 966478
901560 rows processed out of 966478
950736 rows processed out of 966478
NaN count after prediction:  230702


Unnamed: 0,date_time,device_id,tmp,hum,CO2,VOC,vis,IR,WIFI,BLE,...,channel_rssi,snr,gateway,channel_index,spreading_factor,bandwidth,f_cnt,date_time_rounded,CO2_context,CO2_pred
0,2022-09-05 12:57:18,hka-aqm-am001,25.05,51.89,991,611,106,16,0,0,...,-127,-18.2,drag-lps8-02,1,10,125000,2,2022-09-05 12:45:00,,
1,2022-09-05 12:58:00,hka-aqm-am001,25.05,51.89,995,602,30,7,2,1,...,-128,-15.5,drag-lps8-02,4,10,125000,3,2022-09-05 12:45:00,,
2,2022-09-05 13:20:12,hka-aqm-am001,25.05,52.08,1000,544,109,20,2,0,...,-133,-13.8,drag-lps8-01,2,10,125000,5,2022-09-05 13:15:00,,
3,2022-09-05 13:35:18,hka-aqm-am001,25.17,51.79,728,450,109,17,2,0,...,-124,-6.8,drag-lps8-02,5,10,125000,6,2022-09-05 13:30:00,,
4,2022-09-05 13:50:25,hka-aqm-am001,25.30,51.98,670,465,95,14,2,0,...,-132,-15.2,drag-lps8-02,0,10,125000,7,2022-09-05 13:45:00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197175,2023-09-26 22:50:49,hka-aqm-am308,27.38,37.58,520,950,7,3,2,0,...,-73,12.2,drag-lps8-05,4,8,125000,12637,2023-09-26 22:45:00,"[507.0, 510.0, 514.0, 511.0, 513.0, 513.0, 513...",1966.095093
1197176,2023-09-26 23:05:56,hka-aqm-am308,27.36,37.58,529,948,4,2,1,0,...,-72,10.0,drag-lps8-05,6,8,125000,12638,2023-09-26 23:00:00,"[510.0, 514.0, 511.0, 513.0, 513.0, 513.0, 518...",1956.046875
1197177,2023-09-26 23:21:03,hka-aqm-am308,27.35,37.61,529,966,4,0,6,27,...,-75,11.8,drag-lps8-05,0,8,125000,12639,2023-09-26 23:15:00,"[514.0, 511.0, 513.0, 513.0, 513.0, 518.0, 516...",1969.131104
1197178,2023-09-26 23:36:09,hka-aqm-am308,27.34,37.65,525,944,4,0,0,0,...,-75,12.0,drag-lps8-05,3,8,125000,12640,2023-09-26 23:30:00,"[511.0, 513.0, 513.0, 513.0, 518.0, 516.0, 517...",1969.001343


In [None]:
df.describe()

Unnamed: 0,tmp,hum,CO2,VOC,vis,IR,WIFI,BLE,rssi,channel_rssi,channel_index,spreading_factor,bandwidth,f_cnt
count,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0
mean,25.142613,36.72473,607.723873,925.523094,801.135497,227.006169,2.476212,5.361508,-95.298145,-95.298145,3.497678,7.568132,125000.0,7406.347787
std,4.123532,9.568186,1058.630803,1161.608153,3225.759145,1355.515628,1.74345,11.378924,21.379657,21.379657,2.292534,0.683923,0.0,7658.187112
min,6.92,7.76,264.0,447.0,0.0,0.0,0.0,0.0,-139.0,-139.0,0.0,7.0,125000.0,1.0
25%,22.73,29.65,421.0,611.0,8.0,1.0,1.0,0.0,-113.0,-113.0,1.0,7.0,125000.0,1221.0
50%,24.65,36.12,448.0,675.0,121.0,28.0,2.0,1.0,-99.0,-99.0,3.0,7.0,125000.0,4477.0
75%,27.6,43.03,506.0,858.0,634.0,159.0,4.0,5.0,-79.0,-79.0,5.0,8.0,125000.0,11820.0
max,583.72,622.15,24001.0,21930.0,65535.0,46896.0,13.0,128.0,-32.0,-32.0,7.0,12.0,125000.0,37271.0


In [None]:
df.columns

Index(['date_time', 'device_id', 'tmp', 'hum', 'CO2', 'VOC', 'vis', 'IR',
       'WIFI', 'BLE', 'rssi', 'channel_rssi', 'snr', 'gateway',
       'channel_index', 'spreading_factor', 'bandwidth', 'f_cnt'],
      dtype='object')

In [None]:
df.date_time = pd.to_datetime(df.date_time)
# sort by date ascendin, get 80% percentil
df.sort_values(by='date_time', ascending=True).reset_index(drop=True)['date_time'].quantile(0.8)

Timestamp('2023-07-17 12:26:11')

In [None]:
# Data Preprocessing
start_id = max(df[df['obs_num'] == 0].index.tolist()) + 1
df = df[start_id:].copy()
df['sn_value'] = df['sn_value'].astype(float)
df_train = df[df['year'] < 2000]
df_test = df[df['year'] >= 2000]

spots_train = df_train['sn_value'].to_numpy().reshape(-1, 1)
spots_test = df_test['sn_value'].to_numpy().reshape(-1, 1)

scaler = StandardScaler()
spots_train = scaler.fit_transform(spots_train).flatten().tolist()
spots_test = scaler.transform(spots_test).flatten().tolist()

KeyError: 'obs_num'