In [1]:
import numpy as np
import pandas as pd
from src.prepare_dataset import prepare_data
from utils import data_wrapper
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import joblib

In [2]:
prepare_data("data/final_dataset_aggregated.csv", "prep_data", scale_method = "min_max")

[32m2024-07-28 23:54:52.810[0m | [1mINFO    [0m | [36msrc.prepare_dataset[0m:[36mprepare_data[0m:[36m17[0m - [1mArgs:  data/final_dataset_aggregated.csv ; prep_data ; 0.7 ; 0.1 ; min_max[0m


data/final_dataset_aggregated.csv


[32m2024-07-28 23:54:53.207[0m | [1mINFO    [0m | [36msrc.prepare_dataset[0m:[36mprepare_data[0m:[36m34[0m - [1mDropped the unnecessary columns[0m
[32m2024-07-28 23:54:53.208[0m | [1mINFO    [0m | [36msrc.prepare_dataset[0m:[36mprepare_data[0m:[36m39[0m - [1mDataset size BEFORE dropping the rows by date: 28299[0m
[32m2024-07-28 23:54:53.267[0m | [1mINFO    [0m | [36msrc.prepare_dataset[0m:[36mprepare_data[0m:[36m41[0m - [1mDataset size AFTER dropping the rows by date: 28026[0m
[32m2024-07-28 23:54:53.303[0m | [1mINFO    [0m | [36msrc.prepare_dataset[0m:[36mprepare_data[0m:[36m58[0m - [1mOverall encoded 5 cateforical features[0m
[32m2024-07-28 23:54:53.303[0m | [1mINFO    [0m | [36msrc.prepare_dataset[0m:[36mprepare_data[0m:[36m59[0m - [1m['classroom_type', 'break_time', 'school_day', 'school_hours', 'power_on'][0m
[32m2024-07-28 23:54:53.332[0m | [1mINFO    [0m | [36msrc.prepare_dataset[0m:[36mprepare_data[0m:[36m78[

In [3]:
train = pd.read_csv("prep_data/train.csv")
val = pd.read_csv("prep_data/valid.csv")
test = pd.read_csv("prep_data/test.csv")

In [4]:
train.head()

Unnamed: 0,classroom_type,measured_t_mean,measured_t_min,measured_t_max,school_day,school_hours,break_time,power_on,tmstamp,trackerwm_avg_mean,...,uva_avg_mean,uva_avg_min,uva_avg_max,uvb_avg_mean,uvb_avg_min,uvb_avg_max,Month,Day,Hour,Minute
0,0,0.647059,0.649485,0.639456,1,0,0,1,2023-03-16 07:43:00,0.011026,...,0.008539,0.008333,0.008796,0.093667,0.095624,0.088407,3,16,7,43
1,1,0.638739,0.632302,0.642857,1,0,0,1,2023-03-16 07:55:00,0.011313,...,0.008752,0.008179,0.009105,0.093545,0.095624,0.088187,3,16,7,55
2,2,0.679674,0.680412,0.666667,1,0,0,1,2023-03-16 07:55:00,0.033126,...,0.008796,0.008488,0.009105,0.093511,0.095624,0.087967,3,16,7,55
3,4,0.578334,0.594502,0.554422,1,0,0,1,2023-03-16 07:35:00,0.001889,...,0.009259,0.009259,0.009259,0.093511,0.095624,0.087967,3,16,7,35
4,0,0.690157,0.701031,0.666667,1,1,0,1,2023-03-16 08:47:00,0.464367,...,0.008796,0.008333,0.009259,0.093511,0.095624,0.087967,3,16,8,47


In [5]:
def rearrange(data):
    features = data.drop(['measured_t_mean', 'measured_t_max', 'measured_t_min', 'tmstamp'], axis=1)
    target = data['measured_t_max']
    combined = pd.concat([features, target], axis=1)
    return combined

In [6]:
train = rearrange(train)
val = rearrange(val)
test = rearrange(test)

In [7]:
train.head()

Unnamed: 0,classroom_type,school_day,school_hours,break_time,power_on,trackerwm_avg_mean,trackerwm_avg_min,trackerwm_avg_max,shadowwm_avg_mean,shadowwm_avg_min,...,uva_avg_min,uva_avg_max,uvb_avg_mean,uvb_avg_min,uvb_avg_max,Month,Day,Hour,Minute,measured_t_max
0,0,1,0,0,1,0.011026,0.0,0.031166,0.120501,0.09553,...,0.008333,0.008796,0.093667,0.095624,0.088407,3,16,7,43,0.639456
1,1,1,0,0,1,0.011313,0.0,0.058837,0.145285,0.097057,...,0.008179,0.009105,0.093545,0.095624,0.088187,3,16,7,55,0.642857
2,2,1,0,0,1,0.033126,0.007153,0.058837,0.180489,0.191298,...,0.008488,0.009105,0.093511,0.095624,0.087967,3,16,7,55,0.666667
3,4,1,0,0,1,0.001889,0.001889,0.001881,0.152176,0.171735,...,0.009259,0.009259,0.093511,0.095624,0.087967,3,16,7,35,0.554422
4,0,1,1,0,1,0.464367,0.070713,0.72513,0.192743,0.198293,...,0.008333,0.009259,0.093511,0.095624,0.087967,3,16,8,47,0.666667


In [8]:
def create_sequences(data, seq_length):
    xs = []
    ys = []
    for i in range(len(data)-seq_length):
        x = data.iloc[i:(i+seq_length), :-1].values
        y = data.iloc[i+seq_length, -1]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

In [9]:
seq_length = 10
X_train, y_train = create_sequences(train, seq_length)
X_test, y_test = create_sequences(test, seq_length)
X_val, y_val = create_sequences(val, seq_length)

In [10]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [11]:
# Hyperparameters
input_size = X_train.shape[2]
hidden_size = 50
num_layers = 2
output_size = 1
num_epochs = 10
batch_size = 32
learning_rate = 0.001

In [12]:
# Create DataLoader
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [13]:
# Initialize the model, loss function and optimizer
device = torch.device('cpu')
model = RNN(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()    

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


Epoch [1/10], Loss: 0.1052
Epoch [2/10], Loss: 0.0823
Epoch [3/10], Loss: 0.0768
Epoch [4/10], Loss: 0.0960
Epoch [5/10], Loss: 0.1193
Epoch [6/10], Loss: 0.0899
Epoch [7/10], Loss: 0.1077
Epoch [8/10], Loss: 0.1079
Epoch [9/10], Loss: 0.1219
Epoch [10/10], Loss: 0.0962


In [15]:
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(device)

In [23]:
model.eval()
with torch.no_grad():
    predictions = model(X_val_tensor).cpu().numpy()

In [36]:
scaler =joblib.load("prep_data/assets/min_max_scaler_y.joblib",mmap_mode=None)

In [41]:
predictions_rescaled = scaler.inverse_transform(predictions.reshape(-1, 1))
y_val_rescaled = scaler.inverse_transform(y_val_tensor.cpu().numpy().reshape(-1, 1))

In [42]:
np.mean(np.abs(predictions_rescaled - y_val_rescaled))

2.310457