In [35]:
%load_ext autoreload
%autoreload 2

from tweedejaars_project import *
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
# Input columns handed to get_splits(); len(features) is also used as the RNN input size.
# Commented-out entries are columns that are currently disabled.
features = [
    "import_capacity",
    # "min_price_published",
    "mid_price_published",
    # "max_price_published",
    # "upward_dispatch_published",
    # "downward_dispatch_published",
    # 'min_ptu_price_known',
    # "max_ptu_price_known",
    # "settlement_price_bestguess",
    'PTU',
    # 'forecast_wind',
    # 'forecast_solar',
    # 'forecast_demand',
    'time_since_last_two_sided',
    'two_sided_daily_count',
    'ptu_id'
]
# Name of the label column.
# NOTE(review): the original note here read "already used"; `target` is not referenced
# in the visible cells, so presumably get_splits() selects the label itself — TODO confirm.
target = 'target_two_sided_ptu'

In [37]:
# load_df / get_splits are not defined in this notebook; presumably they come from the
# `tweedejaars_project` star import — TODO confirm.
df = load_df()
# Later cells read splits['train'] and index it with 'in' (features) and 'out' (labels).
splits = get_splits(df, features)

In [38]:
# Hyperparameters
ptu_window = 5 # history length counted in PTUs (ptu_id values), not in rows
batch_size = 15000
input_size = len(features)
hidden_size = 16
num_layers = 2
output_size = 1
num_epochs = 1
learning_rate = 0.001


# Split the data in vars
train_data = splits['train']
# test_data = splits['test']

# NOTE(review): `rows` is not referenced by any cell visible in this notebook.
rows = 20
# Show the ptu_id column of the training inputs
print(train_data['in']['ptu_id'])


0            0
1            0
2            0
3            0
4            0
          ... 
123835    8255
123836    8255
123837    8255
123838    8255
123839    8255
Name: ptu_id, Length: 123840, dtype: int64


In [39]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset that yields one padded history window per input row.

    For every row the window covers the rows of the current PTU seen so far plus
    up to ``ptu_window`` preceding PTUs. PTU boundaries are derived purely from
    row counts (``ptu_length`` rows per PTU); the ``ptu_id`` column is not consulted.

    Args:
        data: mapping with an ``'in'`` entry (features, convertible to a DataFrame)
            and an ``'out'`` entry (labels, convertible to a Series).
        ptu_window: number of full PTUs of history to keep.
        ptu_length: number of rows that make up one PTU.

    Each item is ``(sequence, target, length)``:
        * ``sequence`` is ordered newest row first and padded at the end with NaN.
        * ``target`` is the label of the newest row.
        * ``length`` is ``row_idx - start_idx``, i.e. one less than the number of
          real (non-padding) rows in ``sequence``.
    """

    def __init__(self, data, ptu_window, ptu_length=15):
        self.data_in = pd.DataFrame(data['in']).astype(np.float32)
        self.data_out = pd.Series(data['out']).astype(np.float32)
        self.ptu_window = ptu_window
        self.ptu_length = ptu_length

        # Number of rows covered by the full history window
        self.ptu_history = ptu_window * ptu_length

        windows, spans = self.create_sequences()
        self.sequence_indices = windows
        self.sequence_lengths = spans

    def create_sequences(self):
        """Return ``(indices, lengths)`` for every row.

        ``indices[i]`` is ``(start_idx, row_idx)`` (both inclusive row positions) and
        ``lengths[i]`` is ``row_idx - start_idx``.
        """
        windows = []
        first_row = 0  # oldest row still inside the window
        rows_left_in_ptu = self.ptu_length

        for current_row in range(len(self.data_in)):
            windows.append((first_row, current_row))

            # The window only starts sliding once the initial history is filled.
            if current_row < self.ptu_history:
                continue

            rows_left_in_ptu -= 1
            if rows_left_in_ptu == 0:
                # A PTU has been completed: drop the oldest PTU from the window.
                first_row += self.ptu_length
                rows_left_in_ptu = self.ptu_length

        spans = [last - first for first, last in windows]
        return windows, spans

    def __len__(self):
        return len(self.sequence_indices)

    def __getitem__(self, idx):
        first_row, current_row = self.sequence_indices[idx]
        span = self.sequence_lengths[idx]

        # Rows first_row..current_row (inclusive) and the label of the newest row
        window = self.data_in.iloc[first_row:current_row + 1].to_numpy()
        label = self.data_out.iloc[current_row]

        # Reverse so that the newest row comes first
        newest_first = torch.flip(torch.tensor(window, dtype=torch.float32), dims=(0,))

        warming_up = first_row == 0 and span <= self.ptu_history
        if warming_up:
            # History is still incomplete: pad for the missing history rows
            # plus the not-yet-seen rows of the current PTU.
            n_pad = (self.ptu_history - span) + self.ptu_length - 1
        else:
            # Full history available: pad only for the rows of the current PTU
            # that have not been seen yet.
            rows_missing = self.ptu_length - ((current_row - first_row) % self.ptu_length)
            n_pad = rows_missing - 1

        padded = F.pad(newest_first, (0, 0, 0, n_pad), mode='constant', value=np.nan)

        return padded, torch.tensor(label, dtype=torch.float32), span


In [40]:
class RNNModel(nn.Module):
    """Multi-layer vanilla RNN followed by a linear head on the last valid timestep.

    Args:
        input_size: number of features per timestep.
        hidden_size: width of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        output_size: width of the linear head's output (raw logits, no activation).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        """Run the RNN over the valid prefix of every sequence.

        Args:
            x: batch-first padded input of shape (batch, max_len, input_size).
                Positions at or beyond ``lengths[i]`` are never fed to the RNN,
                so they may hold any padding value (including NaN).
            lengths: number of valid timesteps per sequence, as a 1-D integer
                tensor or a plain sequence of ints. Every entry must be >= 1.

        Returns:
            Tensor of shape (batch, output_size) computed from the hidden output
            at the last valid timestep of each sequence.
        """
        # Accept lists as well as tensors; pack_padded_sequence needs CPU int64 lengths.
        lengths = torch.as_tensor(lengths, dtype=torch.int64)
        packed_input = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_output, _ = self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Pick the output at index (length - 1) for each sequence. The gather index
        # has to live on the same device as `output`, which may differ from the
        # (CPU) lengths tensor.
        last_step = (lengths - 1).to(output.device).view(-1, 1, 1)
        last_step = last_step.expand(-1, 1, output.size(2))
        output = output.gather(1, last_step).squeeze(1)

        return self.fc(output)


In [41]:
def prepare_data_loader(data, ptu_window, batch_size):
    """Wrap ``data`` in a TimeSeriesDataset and return an in-order DataLoader.

    Args:
        data: split mapping passed straight to TimeSeriesDataset.
        ptu_window: number of PTUs of history per sample.
        batch_size: number of samples per batch.

    Returns:
        A DataLoader that iterates the dataset without shuffling.
    """
    windows = TimeSeriesDataset(data, ptu_window)
    print('done')
    loader = DataLoader(windows, batch_size=batch_size, shuffle=False)
    return loader

In [42]:
# Build the (unshuffled) loader over the training split
train_loader = prepare_data_loader(
    data=train_data,
    ptu_window=ptu_window,
    batch_size=batch_size,
)


done


In [43]:
# TODO: is the first item of the first batch made up purely of NaNs?
# TODO: Special NaN value for all inputs of NaN
# Pull a single batch to sanity-check tensor shapes and the raw (not yet +1) lengths.
train_features, train_labels, lenghts = next(iter(train_loader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
print(f'lenghts: {lenghts}')


Feature batch shape: torch.Size([15000, 90, 6])
Labels batch shape: torch.Size([15000])
lenghts: tensor([ 0,  1,  2,  ..., 87, 88, 89])


In [44]:
# Inspect one hand-picked sample of the first batch.
x = 104
feature_tensor = train_features[x]
# `lenghts` (sic) is the name bound by the batch-inspection cell above.
lenght = lenghts[x].item()
# Display the padded feature window together with its stored length
feature_tensor, lenght

(tensor([[7.0950e+02, 4.0390e+01, 1.1000e+01, 6.0000e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.9333e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.8667e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.8000e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.7333e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.6667e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.6000e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.5333e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.4667e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.4000e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.3333e+00, 1.0000e+00, 6.0000e+00],
         [7.0950e+02, 4.0390e+01, 1.1000e+01, 5.2667e+00, 1.0000e+00, 6.0000e+00],
    

In [45]:
# Walk over every batch once and report its tensor shapes and per-sample lengths.
for batch_idx, (sequences, targets, lengths) in enumerate(train_loader):
    print(f"Batch: {batch_idx}")
    print(f"Sequence shape: {sequences.shape}")
    print(f"Targets shape: {targets.shape}")

    # The loader yields lengths that are one short of the number of real rows.
    lengths += 1

    # pack_padded_sequence needs a 1-D int64 tensor on the CPU
    lengths = lengths.long().cpu()
    print(lengths)


Batch: 0
Sequence shape: torch.Size([15000, 90, 6])
Targets shape: torch.Size([15000])
tensor([ 1,  2,  3,  ..., 88, 89, 90])
Batch: 1
Sequence shape: torch.Size([15000, 90, 6])
Targets shape: torch.Size([15000])
tensor([76, 77, 78,  ..., 88, 89, 90])
Batch: 2
Sequence shape: torch.Size([15000, 90, 6])
Targets shape: torch.Size([15000])
tensor([76, 77, 78,  ..., 88, 89, 90])
Batch: 3
Sequence shape: torch.Size([15000, 90, 6])
Targets shape: torch.Size([15000])
tensor([76, 77, 78,  ..., 88, 89, 90])
Batch: 4
Sequence shape: torch.Size([15000, 90, 6])
Targets shape: torch.Size([15000])
tensor([76, 77, 78,  ..., 88, 89, 90])
Batch: 5
Sequence shape: torch.Size([15000, 90, 6])
Targets shape: torch.Size([15000])
tensor([76, 77, 78,  ..., 88, 89, 90])
Batch: 6
Sequence shape: torch.Size([15000, 90, 6])
Targets shape: torch.Size([15000])
tensor([76, 77, 78,  ..., 88, 89, 90])
Batch: 7
Sequence shape: torch.Size([15000, 90, 6])
Targets shape: torch.Size([15000])
tensor([76, 77, 78,  ..., 88, 8

In [46]:
def train_rnn(model, train_loader, criterion, optimizer, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: module called as ``model(sequences, lengths)`` and returning
            one output row per sample.
        train_loader: sized iterable yielding ``(sequences, targets, lengths)``
            batches. The yielded lengths are expected to be one short of the
            number of valid timesteps; this function adds the missing 1.
        criterion: loss called as ``criterion(outputs, targets)`` with targets
            reshaped to (batch, 1).
        optimizer: optimizer over ``model``'s parameters.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        None. The model is updated in place; progress is printed.
    """
    model.train()
    for epoch in range(num_epochs):
        for batch_idx, (sequences, targets, lengths) in enumerate(train_loader):
            # Prepare the data
            sequences = sequences.float()
            targets = targets.float()

            # Build the corrected lengths out-of-place: an in-place `+= 1` would
            # modify the loader's own tensor and, for loaders that hand out the
            # same batch objects every epoch, shift the lengths a bit further on
            # each pass. The result must be a 1-D CPU int64 tensor for packing.
            lengths = (lengths + 1).to(torch.int64).cpu()

            # Debugging print statements
            print(f"Batch {batch_idx}, Sequence shape: {sequences.shape}, Lengths: {lengths}")

            # Forward pass
            outputs = model(sequences, lengths)

            # Compute loss; targets need the shape (batch_size, 1) to match the outputs
            targets = targets.unsqueeze(1)
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (batch_idx + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


# Build a fresh model, loss and optimiser, then run the training loop.
model = RNNModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)
criterion = nn.BCEWithLogitsLoss()  # operates on raw logits; the model applies no sigmoid
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_rnn(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
)


Batch 0, Sequence shape: torch.Size([15000, 90, 6]), Lengths: tensor([ 1,  2,  3,  ..., 88, 89, 90])
Batch 1, Sequence shape: torch.Size([15000, 90, 6]), Lengths: tensor([76, 77, 78,  ..., 88, 89, 90])
Batch 2, Sequence shape: torch.Size([15000, 90, 6]), Lengths: tensor([76, 77, 78,  ..., 88, 89, 90])
Batch 3, Sequence shape: torch.Size([15000, 90, 6]), Lengths: tensor([76, 77, 78,  ..., 88, 89, 90])
Batch 4, Sequence shape: torch.Size([15000, 90, 6]), Lengths: tensor([76, 77, 78,  ..., 88, 89, 90])
Batch 5, Sequence shape: torch.Size([15000, 90, 6]), Lengths: tensor([76, 77, 78,  ..., 88, 89, 90])
Batch 6, Sequence shape: torch.Size([15000, 90, 6]), Lengths: tensor([76, 77, 78,  ..., 88, 89, 90])
Batch 7, Sequence shape: torch.Size([15000, 90, 6]), Lengths: tensor([76, 77, 78,  ..., 88, 89, 90])
Batch 8, Sequence shape: torch.Size([3840, 90, 6]), Lengths: tensor([76, 77, 78,  ..., 88, 89, 90])


In [47]:
# save_model is not defined in this notebook (presumably from tweedejaars_project — TODO confirm).
# The same two string arguments are passed to load_model in the evaluation cell.
save_model(model, 'markov_2', 'rnn')

In [48]:
def test_model(model, data_loader):
    model.eval()
    all_outputs = []
    all_targets = []
    with torch.no_grad():
        for sequences, targets, lengths in data_loader:
            sequences = sequences.float()
            targets = targets.float()
            lengths += 1

            # Ensure lengths is a 1D CPU int64 tensor
            lengths = lengths.to(torch.int64).cpu()

            # Forward pass
            outputs = model(sequences, lengths)
            all_outputs.append(outputs)
            all_targets.append(targets)
            
    return torch.cat(all_outputs), torch.cat(all_targets)

# Reload the persisted model and score it on the TRAINING data (not a held-out split).
loaded_model = load_model('markov_2', 'rnn')
outputs, targets = test_model(loaded_model, train_loader)

# Logits -> probabilities -> hard 0/1 predictions at a 0.5 threshold
probabilities = outputs.sigmoid()
predictions = probabilities.gt(0.5).float()

# targets gets a trailing dimension so it lines up with the predictions.
# NOTE(review): plain accuracy can look high when one class dominates — consider
# also reporting precision/recall.
accuracy = predictions.eq(targets.unsqueeze(1)).float().mean()
print(f'Accuracy on training set: {accuracy:.4f}')


Accuracy on training set: 0.9492


In [60]:
# Number of samples predicted as positive (predictions are 0.0 / 1.0)
predictions.sum()


tensor(1215.)

In [50]:
# model = RNNModel(input_size, hidden_size, num_layers, output_size)
# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [51]:
# for batch_idx, (sequences, targets) in enumerate(train_loader):
#     print("Batch:", batch_idx)
#     print("Sequence shape:", sequences.shape)  # Print the shape of the input sequences
#     print("Targets shape:", targets.shape)  # Print the shape of the targets


In [52]:
# # Training loop (placeholder, implement training logic)
# for epoch in range(1):
#     for sequences, targets in train_loader:
#         lengths = [min(len(seq), sequence_length) for seq in sequences]
#         optimizer.zero_grad()
#         outputs = model(sequences, lengths)
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer.step()
