In [1]:
import torch

In [2]:
has_mps = torch.backends.mps.is_built()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: mps


In [3]:
import numpy as np
import pandas as pd
import torch.nn as nn
import os
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau
import utils
import importlib

In [4]:
pd.read_csv('hka-aqm-am/hka-aqm-am001_2022_09_05.dat', skiprows=1, sep=';', engine='python')

Unnamed: 0,date_time,device_id,tmp,hum,CO2,VOC,vis,IR,WIFI,BLE,rssi,channel_rssi,snr,gateway,channel_index,spreading_factor,bandwidth,f_cnt
0,2022-09-05 12:57:18,hka-aqm-am001,25.05,51.89,991,611,106,16,0,0,-127,-127,-18.2,drag-lps8-02,1,10,125000,2
1,2022-09-05 12:58:00,hka-aqm-am001,25.05,51.89,995,602,30,7,2,1,-128,-128,-15.5,drag-lps8-02,4,10,125000,3
2,2022-09-05 13:20:12,hka-aqm-am001,25.05,52.08,1000,544,109,20,2,0,-133,-133,-13.8,drag-lps8-01,2,10,125000,5
3,2022-09-05 13:35:18,hka-aqm-am001,25.17,51.79,728,450,109,17,2,0,-124,-124,-6.8,drag-lps8-02,5,10,125000,6
4,2022-09-05 13:50:25,hka-aqm-am001,25.3,51.98,670,465,95,14,2,0,-132,-132,-15.2,drag-lps8-02,0,10,125000,7
5,2022-09-05 14:05:31,hka-aqm-am001,25.39,51.54,646,518,96,14,3,1,-136,-136,-14.5,drag-outd-01,7,10,125000,8
6,2022-09-05 14:35:45,hka-aqm-am001,25.58,51.63,642,579,116,19,0,0,-134,-134,-15.5,drag-outd-01,1,10,125000,10
7,2022-09-05 15:47:39,hka-aqm-am001,25.71,50.27,617,633,25,9,1,0,-138,-138,-16.5,drag-lps8-01,7,10,125000,15


In [5]:
# load all files into one single df
# df = pd.concat([pd.read_csv('hka-aqm-am/' + f, skiprows=1, sep=';', engine='python') for f in os.listdir('hka-aqm-am/')])
df = pd.concat([pd.read_csv('hka-aqm-am/' + f.removeprefix('._'), skiprows=1, sep=';', engine='python') for f in os.listdir('hka-aqm-am/')])
df.shape


(608036, 18)

In [6]:
importlib.reload(utils)
train_df, test_df, train_loader, test_loader, scaler, y_test = utils.get_data_for_transformer(df, aggregation_level='quarter_hour', window_size=20)

training data cutoff:  2023-07-15 04:00:00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['CO2_scaled'] = scaler.fit_transform(df_train[['CO2']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['CO2_scaled'] = scaler.transform(df_test[['CO2']])


Training data shape: torch.Size([245095, 20, 1]) torch.Size([245095, 1])
Testing data shape: torch.Size([63746, 20, 1]) torch.Size([63746, 1])


In [7]:
# print number of training samples
print(f"Number of training samples: {len(train_df)}")

Number of training samples: 245095


In [8]:
trained_model = utils.train_transformer_model(device, train_loader, test_loader, scaler, epochs=10)

     actual   predicted
0     429.0  481.647949
1     429.0  475.280457
2     429.0  471.442200
3     436.0  470.414154
4     408.0  470.744598
..      ...         ...
123   446.0  472.253845
124   451.0  478.653809
125   452.0  480.361084
126   453.0  476.426575
127   458.0  471.951691

[128 rows x 2 columns]
Epoch 1/10, Validation Loss: 0.1943
     actual   predicted
0     429.0  432.102936
1     429.0  426.151947
2     429.0  424.158142
3     436.0  423.777710
4     408.0  414.291992
..      ...         ...
123   446.0  416.292572
124   451.0  419.934479
125   452.0  421.243805
126   453.0  419.697205
127   458.0  419.178101

[128 rows x 2 columns]
Epoch 2/10, Validation Loss: 0.0270
     actual   predicted
0     429.0  427.274200
1     429.0  423.061981
2     429.0  422.946167
3     436.0  423.584564
4     408.0  412.824158
..      ...         ...
123   446.0  418.639618
124   451.0  421.086273
125   452.0  420.422638
126   453.0  417.935333
127   458.0  419.496338

[128 rows x 2 c

In [9]:
model_file = 'models/transformer_model.pth'
scaler_file = 'models/scaler_transformer.pth'

utils.save_model(trained_model, model_file)
utils.save_scaler(scaler, scaler_file)

In [10]:
importlib.reload(utils)
utils.evaluate_transformer_model(device, test_loader, trained_model, scaler, y_test)

       actual   predicted
0       429.0  434.997076
1       429.0  428.147924
2       429.0  428.074467
3       436.0  427.954775
4       408.0  412.128690
...       ...         ...
63741   520.0  541.661265
63742   529.0  536.592044
63743   529.0  544.858355
63744   525.0  527.103041
63745   525.0  524.914237

[63746 rows x 2 columns]
Score (RMSE): 14.5382


In [16]:
importlib.reload(utils)
_, _, _, _, _, _ = utils.get_data_for_transformer(df, aggregation_level='quarter_hour', window_size=20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['CO2_scaled'] = scaler.fit_transform(df_train[['CO2']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['CO2_scaled'] = scaler.transform(df_test[['CO2']])


Training data shape: torch.Size([245095, 20, 1]) torch.Size([245095, 1])
Testing data shape: torch.Size([63746, 20, 1]) torch.Size([63746, 1])


In [37]:
importlib.reload(utils)
utils.predict_data(trained_model, scaler, df)

cpy:  (598590, 19)
help:  (309157, 10)
NaN count:  115351
(598590, 20)
115351
torch.Size([483239, 20, 1])
50048 rows processed out of 483239
100096 rows processed out of 483239
150016 rows processed out of 483239
200064 rows processed out of 483239
250112 rows processed out of 483239
300032 rows processed out of 483239
350080 rows processed out of 483239
400000 rows processed out of 483239
450048 rows processed out of 483239
NaN count after prediction:  115351


Unnamed: 0,date_time,device_id,tmp,hum,CO2,VOC,vis,IR,WIFI,BLE,...,channel_rssi,snr,gateway,channel_index,spreading_factor,bandwidth,f_cnt,date_time_rounded,CO2_context,CO2_pred
0,2022-10-18 00:08:48,hka-aqm-am117,23.24,57.57,497,785,29,1,3,15,...,-63,9.0,drag-lps8-01,3,7,125000,778,2022-10-18 00:00:00,"[-0.3408824944928015, -0.31949503430793347, -0...",480.846527
1,2022-10-18 00:23:55,hka-aqm-am117,23.26,57.19,503,771,29,1,0,0,...,-63,8.8,drag-lps8-01,4,7,125000,779,2022-10-18 00:15:00,"[-0.31949503430793347, -0.2909784207281094, -0...",487.861694
2,2022-10-18 00:39:02,hka-aqm-am117,23.26,56.91,502,748,29,1,3,15,...,-63,8.2,drag-lps8-01,6,7,125000,780,2022-10-18 00:30:00,"[-0.2909784207281094, -0.30523672751802144, -0...",497.003357
3,2022-10-18 00:54:09,hka-aqm-am117,23.26,56.70,504,717,29,1,3,15,...,-61,6.8,drag-lps8-01,5,7,125000,781,2022-10-18 00:45:00,"[-0.30523672751802144, -0.23394519356846138, -...",497.617493
4,2022-10-18 01:09:15,hka-aqm-am117,23.25,56.41,504,661,29,1,1,0,...,-65,10.0,drag-lps8-01,7,7,125000,782,2022-10-18 01:00:00,"[-0.23394519356846138, -0.16265365961890133, -...",500.481049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598585,2023-03-13 22:47:47,hka-aqm-am211,21.85,42.57,543,955,5,0,4,7,...,-128,-8.5,drag-lps8-03,4,8,125000,5099,2023-03-13 22:45:00,"[0.9922691903639718, 1.0136566505488398, 0.992...",448.632141
598586,2023-03-13 23:02:53,hka-aqm-am211,21.81,42.57,531,943,5,2,4,7,...,-126,-7.5,drag-lps8-03,1,8,125000,5100,2023-03-13 23:00:00,"[1.0136566505488398, 0.9922691903639718, 0.963...",445.328033
598587,2023-03-13 23:18:00,hka-aqm-am211,21.80,42.57,520,918,5,2,1,0,...,-130,-10.8,drag-lps8-03,3,8,125000,5101,2023-03-13 23:15:00,"[0.9922691903639718, 0.9637525767841478, 1.042...",431.707214
598588,2023-03-13 23:33:06,hka-aqm-am211,21.77,42.57,516,919,7,2,4,7,...,-65,8.2,drag-lps8-05,2,8,125000,5102,2023-03-13 23:30:00,"[0.9637525767841478, 1.0421732641286638, 1.070...",419.417938


In [None]:
df.describe()

Unnamed: 0,tmp,hum,CO2,VOC,vis,IR,WIFI,BLE,rssi,channel_rssi,channel_index,spreading_factor,bandwidth,f_cnt
count,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0,608036.0
mean,25.142613,36.72473,607.723873,925.523094,801.135497,227.006169,2.476212,5.361508,-95.298145,-95.298145,3.497678,7.568132,125000.0,7406.347787
std,4.123532,9.568186,1058.630803,1161.608153,3225.759145,1355.515628,1.74345,11.378924,21.379657,21.379657,2.292534,0.683923,0.0,7658.187112
min,6.92,7.76,264.0,447.0,0.0,0.0,0.0,0.0,-139.0,-139.0,0.0,7.0,125000.0,1.0
25%,22.73,29.65,421.0,611.0,8.0,1.0,1.0,0.0,-113.0,-113.0,1.0,7.0,125000.0,1221.0
50%,24.65,36.12,448.0,675.0,121.0,28.0,2.0,1.0,-99.0,-99.0,3.0,7.0,125000.0,4477.0
75%,27.6,43.03,506.0,858.0,634.0,159.0,4.0,5.0,-79.0,-79.0,5.0,8.0,125000.0,11820.0
max,583.72,622.15,24001.0,21930.0,65535.0,46896.0,13.0,128.0,-32.0,-32.0,7.0,12.0,125000.0,37271.0


In [None]:
df.columns

Index(['date_time', 'device_id', 'tmp', 'hum', 'CO2', 'VOC', 'vis', 'IR',
       'WIFI', 'BLE', 'rssi', 'channel_rssi', 'snr', 'gateway',
       'channel_index', 'spreading_factor', 'bandwidth', 'f_cnt'],
      dtype='object')

In [None]:
df.date_time = pd.to_datetime(df.date_time)
# sort by date ascendin, get 80% percentil
df.sort_values(by='date_time', ascending=True).reset_index(drop=True)['date_time'].quantile(0.8)

Timestamp('2023-07-17 12:26:11')

In [None]:
# Data Preprocessing
start_id = max(df[df['obs_num'] == 0].index.tolist()) + 1
df = df[start_id:].copy()
df['sn_value'] = df['sn_value'].astype(float)
df_train = df[df['year'] < 2000]
df_test = df[df['year'] >= 2000]

spots_train = df_train['sn_value'].to_numpy().reshape(-1, 1)
spots_test = df_test['sn_value'].to_numpy().reshape(-1, 1)

scaler = StandardScaler()
spots_train = scaler.fit_transform(spots_train).flatten().tolist()
spots_test = scaler.transform(spots_test).flatten().tolist()

KeyError: 'obs_num'