In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch 

In [2]:
df = pd.read_csv('aggregated_hourly.csv')
#for every distinct device_id create a new column target with the value of WIFI shiftied by 1
df['target'] = 0
for i in df['device_id'].unique():
    df.loc[df['device_id'] == i, 'target'] = df.loc[df['device_id'] == i, 'CO2'].shift(-1)
df.sort_values(by=['date_time'], inplace=True)
#drop
df = df.dropna()   
#onehot encode device_id into int
df = pd.get_dummies(df, columns=['device_id'], prefix = 'device_id')
#delete date_time column and device_id column
df = df.drop(columns=['date_time'])
df

Unnamed: 0,tmp,hum,snr,CO2,VOC,vis,IR,WIFI,BLE,rssi,...,device_id_hka-aqm-am201a,device_id_hka-aqm-am201b,device_id_hka-aqm-am204,device_id_hka-aqm-am205,device_id_hka-aqm-am209,device_id_hka-aqm-am210,device_id_hka-aqm-am211,device_id_hka-aqm-am301,device_id_hka-aqm-am307,device_id_hka-aqm-am308
46711,25.080000,44.9700,-16.800000,754,558,379,64,4,0,-131,...,False,False,False,False,False,False,False,False,False,False
54194,23.900000,52.1100,-15.200000,686,593,255,35,5,0,-135,...,False,False,False,False,False,False,False,False,False,False
54195,24.137500,51.8300,-11.625000,800,633,256,36,3,0,-125,...,False,False,False,False,False,False,False,False,False,False
54196,24.432500,52.0000,-15.650000,902,825,289,55,5,2,-125,...,False,False,False,False,False,False,False,False,False,False
24671,24.908000,52.3160,-2.550000,1128,450,213,91,3,1,-115,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24669,24.595000,45.5850,-14.900000,469,790,5,1,2,1,-125,...,False,False,False,False,False,False,False,False,False,False
84148,27.086667,38.9300,9.066667,441,836,32,4,5,0,-102,...,False,False,False,False,False,False,False,False,False,False
61166,24.035000,48.9550,6.000000,10026,9997,12,3,2,2,-96,...,False,False,False,False,False,False,False,False,False,False
7473,24.492500,45.9275,-9.700000,445,865,8,2,1,4,-120,...,False,False,False,False,False,False,False,False,False,False


In [3]:
X = df.drop(columns=['target']).astype('float32')
y = df['target'].astype('float32')
X = torch.tensor(X.values).float()
y = torch.tensor(y.values).float()
print(X.shape)  
print(y.shape)

torch.Size([143802, 58])
torch.Size([143802])


In [4]:
X_new = []
window_size = 500
#for i in range(0, len(X), window_size):
for i in range(0, len(X)):
    if i < window_size:
        number_of_padding = window_size - i
        padding = torch.zeros(number_of_padding, X.shape[1])
        X_new.append(torch.cat((padding, X[:i])))
    else:
        X_new.append(X[i-window_size:i])
X_new = torch.stack(X_new)
X_new = X_new[1:]
X_new.shape

torch.Size([143801, 500, 58])

In [5]:
from torch.utils.data import DataLoader, TensorDataset
batch_size=6
# Create a TensorDataset
y = y[1:]
data = TensorDataset(X_new, y)

from torch.utils.data import DataLoader, TensorDataset
# Create a DataLoader
data_loader = DataLoader(data, batch_size=batch_size, shuffle=True)

In [6]:
class LSTM(torch.nn.Module):
    def __init__(self, input_size, hidden_layer_size, output_size, device,batch_size):
        super().__init__()
        self.output_size = output_size
        self.hidden_layer_size = hidden_layer_size
        self.lstm = torch.nn.LSTM(input_size=input_size,hidden_size=hidden_layer_size,batch_first=True).to(device)
        self.linear = torch.nn.Linear(hidden_layer_size, output_size).to(device)
        self.hidden_cell = (torch.zeros(1,batch_size,self.hidden_layer_size).to(device),
                            torch.zeros(1,batch_size,self.hidden_layer_size).to(device))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq, self.hidden_cell)
        predictions = self.linear(lstm_out)
        return predictions[:,-1].squeeze()

In [10]:
import torch.optim as optim

# Define the loss function
criterion = torch.nn.MSELoss()
# Check if CUDA is available and set PyTorch to use GPU or CPU accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

# Define the LSTM model
# input_size = X.shape[1]
# hidden_layer_size = 64
# output_size = 1
model = LSTM(input_size=X_new.shape[2], hidden_layer_size=26, output_size=1, device=device,batch_size=batch_size).to(device)  # Move model to GPU

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Set the number of epochs
num_epochs = 100

In [8]:
#how many batches in dataloader
num_batches = len(data_loader)
print("Number of batches:", num_batches)

Number of batches: 23967


In [11]:
# Train the LSTM model
model.train()
for epoch in range(num_epochs):
    # Clear the gradients
    optimizer.zero_grad()
    counter = 0
    for inputs, labels in data_loader:
        # Move data to GPU
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(inputs)

        # Calculate the loss
        loss = criterion(outputs, labels)
        # Backward pass and optimize
        loss.backward(retain_graph=True)
        counter += 1
        if (counter+1) % 200 == 0:
            print(f'Epoch: {counter+1}, Loss: {loss.item()}')

    optimizer.step()
    optimizer.zero_grad()
    
    # Print the loss for every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

Epoch: 200, Loss: 284635.03125
Epoch: 400, Loss: 245178.5625
Epoch: 600, Loss: 354613.21875


KeyboardInterrupt: 

In [38]:
X_new.shape

torch.Size([143801, 500, 58])

In [30]:
inputs[:10].shape[0]

10

In [22]:
labels.shape

torch.Size([64])