In [101]:
%load_ext autoreload
%autoreload 2

from tweedejaars_project import *
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import torch
import torch.nn as nn
import torch.optim as optim

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [102]:
# def custom_scorer(estimator, X, y):
#     try:
#         return f1_score(y, estimator.predict(X))
#     except Exception as e:
#         print(f"Error during scoring: {e}")
#         return np.nan

# def train(features: pd.DataFrame, target: pd.DataFrame, model, params={}, splits=10):
#     """Train a model with grid search"""
#     cv = TimeSeriesSplit(n_splits=splits)

#     model = GridSearchCV(estimator=model, param_grid=params, scoring='f1',
#                          refit=True, cv=cv.split(features), n_jobs=-1, verbose=1)
#     model.fit(features, target)
#     best_model = model.best_estimator_
#     print(model.best_params_)
#     train_predictions = best_model.predict(features)

#     return np.array(train_predictions), best_model

In [103]:
features = [
    'import_capacity',
    'min_price_published',
    'mid_price_published',
    'max_price_published',
    'min_ptu_price_known',
    'max_ptu_price_known',
    'settlement_price_bestguess',
    'time_since_last_two_sided',
    'two_sided_daily_count',
    'PTU',
    'naive_strategy_action',
    'forecast_wind',
    'forecast_solar',
    'forecast_demand',
    'ptu_id'
]

# already used
target = 'target_two_sided_ptu'

In [104]:
df = load_df()
splits = get_splits(df, features)

In [105]:
# Hyperparameters
# Dit is het PTU aantal dus ptu_id's
sequence_length = 50

# Split the data in vars
train_data = splits['train']
# test_data = splits['test']

train_data['in']['ptu_id']

0            0
1            0
2            0
3            0
4            0
          ... 
123835    8255
123836    8255
123837    8255
123838    8255
123839    8255
Name: ptu_id, Length: 123840, dtype: int64

In [110]:
# Markov assumption

def create_sequences(data:dict, seq_length:int, ptu_length:int = 15) -> list:
    sequence_indices = []

    ptu_ids_total = data['in']['ptu_id']
    ptu_ids = ptu_ids_total.unique()

    for ptu_id in ptu_ids:
        last_idx_start_ptu = np.searchsorted(ptu_ids_total, ptu_id, side='left')
        
        # bereken eerste index vanaf huidige
        first_idx = last_idx_start_ptu - (seq_length * ptu_length)
        if first_idx < 0:
            first_idx = 0

        # ga over alle rows in de ptu
        for row_idx in range(ptu_length):
            last_idx = last_idx_start_ptu + row_idx

            sequence_indices.append((first_idx, last_idx))

    return sequence_indices

# Example usage:
indices = create_sequences(train_data, sequence_length)
indices

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (0, 10),
 (0, 11),
 (0, 12),
 (0, 13),
 (0, 14),
 (0, 15),
 (0, 16),
 (0, 17),
 (0, 18),
 (0, 19),
 (0, 20),
 (0, 21),
 (0, 22),
 (0, 23),
 (0, 24),
 (0, 25),
 (0, 26),
 (0, 27),
 (0, 28),
 (0, 29),
 (0, 30),
 (0, 31),
 (0, 32),
 (0, 33),
 (0, 34),
 (0, 35),
 (0, 36),
 (0, 37),
 (0, 38),
 (0, 39),
 (0, 40),
 (0, 41),
 (0, 42),
 (0, 43),
 (0, 44),
 (0, 45),
 (0, 46),
 (0, 47),
 (0, 48),
 (0, 49),
 (0, 50),
 (0, 51),
 (0, 52),
 (0, 53),
 (0, 54),
 (0, 55),
 (0, 56),
 (0, 57),
 (0, 58),
 (0, 59),
 (0, 60),
 (0, 61),
 (0, 62),
 (0, 63),
 (0, 64),
 (0, 65),
 (0, 66),
 (0, 67),
 (0, 68),
 (0, 69),
 (0, 70),
 (0, 71),
 (0, 72),
 (0, 73),
 (0, 74),
 (0, 75),
 (0, 76),
 (0, 77),
 (0, 78),
 (0, 79),
 (0, 80),
 (0, 81),
 (0, 82),
 (0, 83),
 (0, 84),
 (0, 85),
 (0, 86),
 (0, 87),
 (0, 88),
 (0, 89),
 (0, 90),
 (0, 91),
 (0, 92),
 (0, 93),
 (0, 94),
 (0, 95),
 (0, 96),
 (0, 97),
 (0, 98),
 (0, 99),
 (0, 100),

tensor([])

In [None]:



def create_sequences(data, seq_length):
    sequences = []
    labels = []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
        labels.append(data[i + seq_length])
    return torch.FloatTensor(sequences), torch.FloatTensor(labels)

X_train, y_train = create_sequences(train_data, sequence_length)
X_test, y_test = create_sequences(test_data, sequence_length)

# Step 2: Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Choice of RNN cell: nn.RNN, nn.LSTM, nn.GRU
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # Initialize hidden state and cell state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate RNN
        out, _ = self.rnn(x, (h0, c0))
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
input_size = 1
hidden_size = 64  # Number of features in hidden state
num_layers = 2  # Number of stacked RNN layers
output_size = 1
num_epochs = 20
learning_rate = 0.001

# Step 3: Instantiate the model, define the loss function and the optimizer
model = RNNModel(input_size, hidden_size, num_layers, output_size).to('cuda')
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Step 4: Train the model
for epoch in range(num_epochs):
    model.train()
    outputs = model(X_train.to('cuda'))
    optimizer.zero_grad()
    loss = criterion(outputs, y_train.to('cuda'))
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Step 5: Evaluate the model
model.eval()
with torch.no_grad():
    test_outputs = model(X_test.to('cuda'))
    test_loss = criterion(test_outputs, y_test.to('cuda'))
    print(f'Test Loss: {test_loss.item():.4f}')

# Step 6: Save the model to a pickle file
model_filename = 'rnn_model.pkl'
with open(model_filename, 'wb') as f:
    pickle.dump(model.state_dict(), f)

print(f'Model saved to {model_filename}')
