In [12]:
%load_ext autoreload
%autoreload 2

from tweedejaars_project import *
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
features = [
    'import_capacity',
    # 'min_price_published',
    'mid_price_published',
    # 'max_price_published',
    # 'min_ptu_price_known',
    # 'max_ptu_price_known',
    # 'settlement_price_bestguess',
    'time_since_last_two_sided',
    'two_sided_daily_count',
    'PTU',
    'naive_strategy_action',
    'forecast_wind',
    'forecast_solar',
    'forecast_demand',
    'ptu_id',
    'fix_two_sided_ptu_realtime'
]

# already used
target = 'target_two_sided_ptu'

In [14]:
df = load_df()
splits = get_splits(df, features)

In [15]:
# Hyperparameters
sequence_length = 5 # Dit is het PTU aantal dus ptu_id's niet row aantal
batch_size = 15000
input_size = len(features)
hidden_size = 16
num_layers = 2
output_size = 1
num_epochs = 1
learning_rate = 0.001


# Split the data in vars
train_data = splits['train']
# test_data = splits['test']

rows = 20
print(train_data['in']['ptu_id'])


0            0
1            0
2            0
3            0
4            0
          ... 
123835    8255
123836    8255
123837    8255
123838    8255
123839    8255
Name: ptu_id, Length: 123840, dtype: int64


In [16]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, seq_length, ptu_length=15, nan_val1 = 100000, nan_val2 = -100000):
        self.data_in = data['in'].astype(np.float32)
        self.data_out = data['out'].astype(np.float32)
        self.seq_length = seq_length
        self.ptu_length = ptu_length
        self.max_length = (self.seq_length * self.ptu_length)
        self.data_in_padded = self.create_padding()

        self.sequence_indices = self.create_sequences()

        # # DIT IS EEN HACK DUS WSS VERANDEREN
        # self.data_in = self.data_in.fillna({'min_price_published':nan_val1, 'max_price_published':nan_val2}).astype(np.float32)

    def create_padding(self, padding_value=np.nan):
        new_rows = pd.DataFrame([[padding_value] * len(self.data_in.columns)] * self.max_length, columns=self.data_in.columns)
        return pd.concat([new_rows, self.data_in], ignore_index=True)

    def create_sequences(self):
        sequence_indices = []        
        first_idx = 0
        last_idx = self.max_length

        counter = self.ptu_length
        for _ in range(len(self.data_in)):
            sequence_indices.append((first_idx, last_idx))
            counter -= 1
            if counter == 0:
                first_idx += self.ptu_length
                counter = self.ptu_length
            
            last_idx += 1

        return sequence_indices
    
    def __len__(self):
        return len(self.sequence_indices)
    
    def __getitem__(self, idx):
        start_idx, end_idx = self.sequence_indices[idx]
        
        # np array sequence and target
        sequence = self.data_in_padded.iloc[start_idx:end_idx].values
        target = self.data_out.iloc[idx]

        sequence = torch.tensor(sequence, dtype=torch.float32)        
        current_seq_len = self.ptu_length - ((end_idx - start_idx) % self.ptu_length)
        corrected_sequence = F.pad(sequence, (0, 0, 0, current_seq_len), mode='constant', value=np.nan)
                
        return corrected_sequence, torch.tensor(target, dtype=torch.float32)

    
    # def get_all_sequences(self):
    #     all_sequences = []
    #     all_targets = []
    #     for index in range(len(self.data_in)):
    #         sequence, target = self.__getitem__(index)
    #         all_sequences.append(sequence)
    #         all_targets.append(target)
    #     return all_sequences, all_targets





In [17]:
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        
        # Use the last valid output for each sequence
        idx = (lengths - 1).view(-1, 1).expand(output.size(0), output.size(2)).unsqueeze(1)
        output = output.gather(1, idx).squeeze(1)
        
        output = self.fc(output)
        return output


In [18]:
def prepare_data_loader(data, sequence_length, batch_size):
    dataset = TimeSeriesDataset(data, sequence_length)
    # all_sequences, all_targets = dataset.get_all_sequences()
    print('done')
    return DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [19]:
train_loader = prepare_data_loader(train_data, sequence_length, batch_size)


done


  return pd.concat([new_rows, self.data_in], ignore_index=True)


In [20]:
for batch_idx, (sequences, targets) in enumerate(train_loader):
    print("Batch:", batch_idx)
    print("Sequence shape:", sequences.shape)  # Print the shape of the input sequences
    print("Targets shape:", targets.shape)  # Print the shape of the targets


Batch: 0
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 1
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 2
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 3
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 4
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 5
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 6
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 7
Sequence shape: torch.Size([15000, 90, 11])
Targets shape: torch.Size([15000])
Batch: 8
Sequence shape: torch.Size([3840, 90, 11])
Targets shape: torch.Size([3840])


In [21]:
def train_rnn(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        for batch_idx, (sequences, targets) in enumerate(train_loader):
            # Prepare the data
            sequences = sequences.float()
            targets = targets.float()
            lengths = torch.tensor([len(seq) for seq in sequences])
            
            # Debugging print statements
            print(f"Batch {batch_idx}, Sequence shape: {sequences.shape}, Lengths: {lengths}")
            
            # Forward pass
            outputs = model(sequences, lengths)
            
            # Compute loss
            targets = targets.unsqueeze(1)  # Ensure targets have the shape (batch_size, 1)
            loss = criterion(outputs, targets)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (batch_idx + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


model = RNNModel(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_rnn(model, train_loader, criterion, optimizer, num_epochs)


Batch 0, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 1, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 2, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 3, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 4, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 5, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 6, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 7, Sequence shape: torch.Size([15000, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])
Batch 8, Sequence shape: torch.Size([3840, 90, 11]), Lengths: tensor([90, 90, 90,  ..., 90, 90, 90])


In [25]:
save_model(model, 'markov', 'rnn')

In [26]:
def test_model(model, data_loader):
    model.eval()
    all_outputs = []
    all_targets = []
    with torch.no_grad():
        for sequences, targets in data_loader:
            sequences = sequences.float()
            targets = targets.float()
            lengths = torch.tensor([len(seq) for seq in sequences])
            
            # Forward pass
            outputs = model(sequences, lengths)
            all_outputs.append(outputs)
            all_targets.append(targets)
            
    return torch.cat(all_outputs), torch.cat(all_targets)

loaded_model = load_model('markov', 'rnn')

# Test the model on the training set
outputs, targets = test_model(loaded_model, train_loader)

# Convert outputs to probabilities
probabilities = torch.sigmoid(outputs)

# Convert probabilities to binary predictions
predictions = (probabilities > 0.5).float()

# Evaluate the predictions
accuracy = (predictions == targets.unsqueeze(1)).float().mean()
print(f'Accuracy on training set: {accuracy:.4f}')


Accuracy on training set: 0.9553


In [22]:
# model = RNNModel(input_size, hidden_size, num_layers, output_size)
# criterion = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [23]:
# for batch_idx, (sequences, targets) in enumerate(train_loader):
#     print("Batch:", batch_idx)
#     print("Sequence shape:", sequences.shape)  # Print the shape of the input sequences
#     print("Targets shape:", targets.shape)  # Print the shape of the targets


In [24]:
# # Training loop (placeholder, implement training logic)
# for epoch in range(1):
#     for sequences, targets in train_loader:
#         lengths = [min(len(seq), sequence_length) for seq in sequences]
#         optimizer.zero_grad()
#         outputs = model(sequences, lengths)
#         loss = criterion(outputs, targets)
#         loss.backward()
#         optimizer.step()
