In [1]:
%load_ext autoreload
%autoreload 2

from tweedejaars_project import *
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence




[32m2024-06-13 12:32:23.519[0m | [1mINFO    [0m | [36mtweedejaars_project.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/nordin/Desktop/project/tweedejaars_project[0m


In [2]:
features = [
    'import_capacity',
    # 'min_price_published',
    'mid_price_published',
    # 'max_price_published',
    # 'min_ptu_price_known',
    # 'max_ptu_price_known',
    # 'settlement_price_bestguess',
    'time_since_last_two_sided',
    'two_sided_daily_count',
    'PTU',
    'naive_strategy_action',
    'forecast_wind',
    'forecast_solar',
    'forecast_demand',
    'ptu_id',
    'fix_two_sided_ptu_realtime'
]

# already used
target = 'target_two_sided_ptu'

In [3]:
df = load_df()
splits = get_splits(df, features)

In [4]:
# Hyperparameters
sequence_length = 5 # Dit is het PTU aantal dus ptu_id's niet row aantal
batch_size = 15000
input_size = len(features)
hidden_size = 16
num_layers = 2
output_size = 1
num_epochs = 1
learning_rate = 0.001


# Split the data in vars
train_data = splits['train']
# test_data = splits['test']

rows = 20
print(train_data['in']['ptu_id'])


0            0
1            0
2            0
3            0
4            0
          ... 
123835    8255
123836    8255
123837    8255
123838    8255
123839    8255
Name: ptu_id, Length: 123840, dtype: int64


In [5]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, seq_length, ptu_length=15, nan_val1 = 100000, nan_val2 = -100000):
        self.data_in = data['in'].astype(np.float32)
        self.data_out = data['out'].astype(np.float32)
        self.seq_length = seq_length
        self.ptu_length = ptu_length
        self.max_length = (self.seq_length * self.ptu_length)
        self.data_in_padded = self.create_padding()

        self.sequence_indices = self.create_sequences()

        # # DIT IS EEN HACK DUS WSS VERANDEREN
        # self.data_in = self.data_in.fillna({'min_price_published':nan_val1, 'max_price_published':nan_val2}).astype(np.float32)

    def create_padding(self, padding_value=np.nan):
        new_rows = pd.DataFrame([[padding_value] * len(self.data_in.columns)] * self.max_length, columns=self.data_in.columns)
        return pd.concat([new_rows, self.data_in], ignore_index=True)

    def create_sequences(self):
        sequence_indices = []        
        first_idx = 0
        last_idx = self.max_length

        counter = self.ptu_length
        for _ in range(len(self.data_in)):
            sequence_indices.append((first_idx, last_idx))
            counter -= 1
            if counter == 0:
                first_idx += self.ptu_length
                counter = self.ptu_length
            
            last_idx += 1

        return sequence_indices
    
    def __len__(self):
        return len(self.sequence_indices)
    
    def __getitem__(self, idx):
        start_idx, end_idx = self.sequence_indices[idx]
        
        # np array sequence and target
        sequence = self.data_in_padded.iloc[start_idx:end_idx].values
        target = self.data_out.iloc[idx]

        sequence = torch.tensor(sequence, dtype=torch.float32)        
        current_seq_len = self.ptu_length - ((end_idx - start_idx) % self.ptu_length)
        corrected_sequence = F.pad(sequence, (0, 0, current_seq_len, 0), mode='constant', value=np.nan)
                
        return corrected_sequence, torch.tensor(target, dtype=torch.float32)





In [31]:
import torch
import torch.nn.functional as F

seq_batch = [torch.tensor([[1, 1],
                           [2, 2],
                           [3, 3],
                           [4, 4],
                           [5, 5]]),
             torch.tensor([[10, 10],
                           [20, 20]])]

seq_lens = [5, 2]
padded_seq_batch = torch.nn.utils.rnn.pad_sequence(seq_batch, batch_first=True)
print(padded_seq_batch)
print('')
packed_seq_batch = torch.nn.utils.rnn.pack_padded_sequence(padded_seq_batch, lengths=seq_lens, batch_first=True)
print(padded_seq_batch)
lstm = nn.LSTM(input_size=2, hidden_size=3, batch_first=True)
output, (hn, cn) = lstm(packed_seq_batch.float()) # pass float tensor instead long tensor.
output
# padded_output, output_lens = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True, total_length=5)
# padded_output, output_lens

tensor([[[ 1,  1],
         [ 2,  2],
         [ 3,  3],
         [ 4,  4],
         [ 5,  5]],

        [[10, 10],
         [20, 20],
         [ 0,  0],
         [ 0,  0],
         [ 0,  0]]])

tensor([[[ 1,  1],
         [ 2,  2],
         [ 3,  3],
         [ 4,  4],
         [ 5,  5]],

        [[10, 10],
         [20, 20],
         [ 0,  0],
         [ 0,  0],
         [ 0,  0]]])


PackedSequence(data=tensor([[ 0.0827, -0.0810,  0.0157],
        [ 0.3794, -0.4931, -0.1602],
        [ 0.1901, -0.1844,  0.0018],
        [ 0.4819, -0.6494, -0.2332],
        [ 0.2595, -0.2653, -0.0329],
        [ 0.2934, -0.3286, -0.0866],
        [ 0.3108, -0.3795, -0.1488]], grad_fn=<CatBackward0>), batch_sizes=tensor([2, 2, 1, 1, 1]), sorted_indices=None, unsorted_indices=None)

In [37]:
seq1 = torch.tensor([1, 2, 3, 4])    # Length 4
seq2 = torch.tensor([5, 6, 7])       # Length 3
seq3 = torch.tensor([8, 9])          # Length 2

# List of sequences
sequences = [seq1, seq2, seq3]

# Desired fixed length to pad sequences to
desired_length = 5

# Pad sequences to the desired length
padded_sequences = []
lengths = []

for seq in sequences:
    length = len(seq)
    lengths.append(length)
    if length < desired_length:
        # Pad sequence with zeros (or another value if needed)
        padded_seq = F.pad(seq, (0, desired_length - length), "constant", 0)
    else:
        padded_seq = seq
    padded_sequences.append(padded_seq)

# Stack the padded sequences into a single tensor
padded_sequences = torch.stack(padded_sequences)
print(padded_sequences)

# Convert lengths to a tensor and sort them in descending order
lengths = torch.tensor(lengths)
lengths, perm_idx = lengths.sort(0, descending=True)
padded_sequences = padded_sequences[perm_idx]

# Pack the padded sequences
packed_sequences = pack_padded_sequence(padded_sequences, lengths, batch_first=True, enforce_sorted=True)

print("Padded sequences:\n", padded_sequences)
print("Packed sequences:\n", packed_sequences)


tensor([[1, 2, 3, 4, 0],
        [5, 6, 7, 0, 0],
        [8, 9, 0, 0, 0]])
Padded sequences:
 tensor([[1, 2, 3, 4, 0],
        [5, 6, 7, 0, 0],
        [8, 9, 0, 0, 0]])
Packed sequences:
 PackedSequence(data=tensor([1, 5, 8, 2, 6, 9, 3, 7, 4]), batch_sizes=tensor([3, 3, 2, 1]), sorted_indices=None, unsorted_indices=None)


In [6]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        
        # Use the last valid output for each sequence
        idx = (lengths - 1).view(-1, 1).expand(output.size(0), output.size(2)).unsqueeze(1)
        output = output.gather(1, idx).squeeze(1)
        
        output = self.fc(output)
        return output


In [7]:
def prepare_data_loader(data, sequence_length, batch_size):
    dataset = TimeSeriesDataset(data, sequence_length)
    # all_sequences, all_targets = dataset.get_all_sequences()
    print('done')
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [8]:
train_loader = prepare_data_loader(train_data, sequence_length, batch_size)


done


  return pd.concat([new_rows, self.data_in], ignore_index=True)


In [40]:
# TODO: eerste batch eerste item is puur nans?
# TODO: Special NaN value for all inputs of NaN
train_features, train_labels = next(iter(train_loader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
img = train_features[14000]
label = train_labels[0]
img

Feature batch shape: torch.Size([15000, 90, 11])
Labels batch shape: torch.Size([15000])


tensor([[1.3769e+03, 8.6870e+01, 1.9067e+01, 1.0000e+00, 6.9000e+01, 0.0000e+00,
         4.9066e+03, 0.0000e+00, 1.4695e+04, 9.2800e+02, 0.0000e+00],
        [1.3769e+03, 8.6870e+01, 1.9133e+01, 1.0000e+00, 6.9000e+01, 0.0000e+00,
         4.9066e+03, 0.0000e+00, 1.4695e+04, 9.2800e+02, 0.0000e+00],
        [1.3769e+03, 8.5090e+01, 1.9200e+01, 1.0000e+00, 6.9000e+01, 0.0000e+00,
         4.9066e+03, 0.0000e+00, 1.4695e+04, 9.2800e+02, 0.0000e+00],
        [1.3769e+03, 8.5090e+01, 1.9267e+01, 1.0000e+00, 6.9000e+01, 0.0000e+00,
         4.9066e+03, 0.0000e+00, 1.4695e+04, 9.2800e+02, 0.0000e+00],
        [1.3769e+03, 8.5090e+01, 1.9333e+01, 1.0000e+00, 6.9000e+01, 0.0000e+00,
         4.9066e+03, 0.0000e+00, 1.4695e+04, 9.2800e+02, 0.0000e+00],
        [1.3769e+03, 8.5090e+01, 1.9400e+01, 1.0000e+00, 6.9000e+01, 0.0000e+00,
         4.9066e+03, 0.0000e+00, 1.4695e+04, 9.2800e+02, 0.0000e+00],
        [1.3769e+03, 8.5090e+01, 1.9467e+01, 1.0000e+00, 6.9000e+01, 0.0000e+00,
         4.90

In [20]:
for batch_idx, (sequences, targets) in enumerate(train_loader):
    print("Batch:", batch_idx)
    print("Sequence shape:", sequences.shape)  # Print the shape of the input sequences
    print("Targets shape:", targets.shape)  # Print the shape of the targets


Batch: 0
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 1
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 2
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 3
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 4
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 5
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 6
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 7
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 8
Sequence shape: torch.Size([3840, 90, 11])
Targets shape: torch.Size([3840])


In [21]:
def train_rnn(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        for batch_idx, (sequences, targets) in enumerate(train_loader):
            # Prepare the data
            sequences = sequences.float()
            targets = targets.float()
            lengths = torch.tensor([len(seq) for seq in sequences])
            
            # Debugging print statements
            print(f"Batch {batch_idx}, Sequence shape: {sequences.shape}, Lengths: {lengths}")
            
            # Forward pass
            outputs = model(sequences, lengths)
            
            # Compute loss
            targets = targets.unsqueeze(1)  # Ensure targets have the shape (batch_size, 1)
            loss = criterion(outputs, targets)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (batch_idx + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


model = RNNModel(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_rnn(model, train_loader, criterion, optimizer, num_epochs)


Batch 0, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 1, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 2, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 3, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 4, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 5, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 6, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 7, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 8, Sequence shape: torch.Size([3840, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])


In [25]:
save_model(model, 'markov', 'rnn')

In [9]:
def test_model(model, data_loader):
    model.eval()
    all_outputs = []
    all_targets = []
    with torch.no_grad():
        for sequences, targets in data_loader:
            sequences = sequences.float()
            targets = targets.float()
            lengths = torch.tensor([len(seq) for seq in sequences])
            
            # Forward pass
            outputs = model(sequences, lengths)
            all_outputs.append(outputs)
            all_targets.append(targets)
            
    return torch.cat(all_outputs), torch.cat(all_targets)

loaded_model = load_model('markov', 'rnn')

# Test the model on the training set
outputs, targets = test_model(loaded_model, train_loader)

# Convert outputs to probabilities
probabilities = torch.sigmoid(outputs)

# Convert probabilities to binary predictions
predictions = (probabilities > 0.5).float()

# Evaluate the predictions
accuracy = (predictions == targets.unsqueeze(1)).float().mean()
print(f'Accuracy on training set: {accuracy:.4f}')


Accuracy on training set: 0.9553


In [13]:
probabilities.unique

<bound method Tensor.unique of tensor([[nan],
        [nan],
        [nan],
        ...,
        [nan],
        [nan],
        [nan]])>

In [22]:
# model = RNNModel(input_size, hidden_size, num_layers, output_size)
# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [23]:
# for batch_idx, (sequences, targets) in enumerate(train_loader):
#     print("Batch:", batch_idx)
#     print("Sequence shape:", sequences.shape)  # Print the shape of the input sequences
#     print("Targets shape:", targets.shape)  # Print the shape of the targets


In [24]:
# # Training loop (placeholder, implement training logic)
# for epoch in range(1):
#     for sequences, targets in train_loader:
#         lengths = [min(len(seq), sequence_length) for seq in sequences]
#         optimizer.zero_grad()
#         outputs = model(sequences, lengths)
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer.step()
