#### Initialise dataset and config object

In [1]:
from config import cfg
from dataset_loaders import TweetXPriceY, SentimentPriceXPriceY
from torch.utils.data import DataLoader

# One dataset per split. All three use the dataset class and shared keyword
# arguments supplied by the config; only the date window differs. They are
# built in the order train -> eval -> test.
train_data = cfg.dataloader(start_date=cfg.train_start_date, end_date=cfg.train_end_date, **cfg.dataset_loader_args)
eval_data = cfg.dataloader(start_date=cfg.eval_start_date, end_date=cfg.eval_end_date, **cfg.dataset_loader_args)
test_data = cfg.dataloader(start_date=cfg.test_start_date, end_date=cfg.test_end_date, **cfg.dataset_loader_args)

# Batch size and worker count are identical for every split; only the
# training loader shuffles its samples.
loader_options = {"batch_size": cfg.BATCH_SIZE, "num_workers": cfg.num_workers}

train_dataloader = DataLoader(train_data, shuffle=True, **loader_options)
eval_dataloader = DataLoader(eval_data, shuffle=False, **loader_options)
test_dataloader = DataLoader(test_data, shuffle=False, **loader_options)



Normalization fitting done for feature Sentiment
Normalization fitting done for feature Twitter_Volume
Normalization fitting done for feature Movement_Percent
Normalization fitting done for feature Open
Normalization fitting done for feature High
Normalization fitting done for feature Low
Normalization fitting done for feature Volume
Normalization fitting done for feature Sentiment
Normalization fitting done for feature Twitter_Volume
Normalization fitting done for feature Movement_Percent
Normalization fitting done for feature Open
Normalization fitting done for feature High
Normalization fitting done for feature Low
Normalization fitting done for feature Volume
Normalization fitting done for feature Sentiment
Normalization fitting done for feature Twitter_Volume
Normalization fitting done for feature Movement_Percent
Normalization fitting done for feature Open
Normalization fitting done for feature High
Normalization fitting done for feature Low
Normalization fitting done for feature

In [2]:
from tqdm import tqdm

# Debug switch: set to True to step through the training batches by hand.
inspect_dataset = False
if inspect_dataset:
    for batch_idx, (x, y) in enumerate(tqdm(train_dataloader)):
        # input() blocks until Enter is pressed, so one batch is shown at a time.
        input(f'Length train set: {len(train_data)}, Length Eval set: {len(eval_data)}')
        first_input, second_input = x[0], x[1]
        print(f"Sentiment batch shape: {first_input.size()}")
        print(f"Price Feature batch shape: {second_input.size()}")
        print(f"Labels batch shape: {y.size()}")
        print(first_input, second_input)

## Create the model

In [3]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt


class RNN_simple(nn.Module):
    """Plain RNN followed by a linear layer that emits two values per sample.

    All sizes are read from the module-level ``cfg`` object: the RNN input
    width is ``tweets_per_day * words_per_tweet`` (from
    ``cfg.dataset_loader_args``), the hidden width is ``cfg.rnn_hidden_size``
    and the number of stacked layers is ``cfg.rnn_hidden_layers``.
    """

    def __init__(self) -> None:
        super().__init__()
        loader_args = cfg.dataset_loader_args
        features_per_step = loader_args['tweets_per_day'] * loader_args['words_per_tweet']
        self.rnn = nn.RNN(
            input_size=features_per_step,
            hidden_size=cfg.rnn_hidden_size,
            num_layers=cfg.rnn_hidden_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(cfg.rnn_hidden_size, 2)

    def forward(self, x):
        """Run the RNN from an all-zero hidden state and project its last step.

        The initial hidden state is sized from ``x.size(0)`` and created on
        the same device as ``x``.
        """
        batch_size = x.size(0)
        initial_hidden = torch.zeros(cfg.rnn_hidden_layers, batch_size, cfg.rnn_hidden_size).to(x.device)
        sequence_out, _ = self.rnn(x, initial_hidden)
        # Keep only the output of the final time step.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)
    


class RNN_simple_v2(nn.Module):
    """Second variant of the plain RNN classifier (currently the same layers).

    Sizes are read from the module-level ``cfg`` object: the RNN input width
    is ``tweets_per_day * words_per_tweet`` (from ``cfg.dataset_loader_args``),
    the hidden width is ``cfg.rnn_hidden_size`` and the number of stacked
    layers is ``cfg.rnn_hidden_layers``. The final linear layer has two
    outputs.
    """

    def __init__(self) -> None:
        # Zero-argument super() always binds to this class. The previous
        # super(RNN_simple, self) raised TypeError on instantiation because
        # RNN_simple_v2 does not inherit from RNN_simple.
        super().__init__()
        #self.embedding = nn.Embedding(cfg.vocab_size, embedding_dim=50)
        self.rnn = nn.RNN(
            input_size=cfg.dataset_loader_args['tweets_per_day']*cfg.dataset_loader_args['words_per_tweet'],
            hidden_size=cfg.rnn_hidden_size,
            num_layers=cfg.rnn_hidden_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(cfg.rnn_hidden_size, 2)

    def forward(self, x):
        """Run the RNN from an all-zero hidden state and project its last step."""
        #x = self.embedding(x)
        # Initial hidden state sized from the batch and placed on x's device.
        h0 = torch.zeros(cfg.rnn_hidden_layers, x.size(0), cfg.rnn_hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = out[:, -1, :]  # Use the output from the last time step
        out = self.fc(out)
        return out
    



In [4]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt


class LSTM_v1(nn.Module):
    """Embedding -> LSTM -> dropout -> linear head that returns two raw logits.

    The vocabulary size, LSTM hidden width and LSTM depth come from the
    module-level ``cfg`` object; the embedding width (128) and both dropout
    rates (0.3) are fixed here.
    """

    def __init__(self) -> None:
        super().__init__()
        embedding_dim = 128
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(cfg.vocab_size, embedding_dim=embedding_dim, padding_idx=0)
        # The LSTM consumes the embedding vectors, so its input width equals
        # the embedding width. Per the PyTorch docs its `dropout` argument
        # acts between stacked layers only.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=cfg.rnn_hidden_size,
            num_layers=cfg.rnn_hidden_layers,
            batch_first=True,
            dropout=0.3,
        )
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(cfg.rnn_hidden_size, 2)

    def forward(self, x):
        """Cast ``x`` to integer indices, encode it and classify the last step."""
        token_ids = x.long()
        embedded = self.embedding(token_ids)
        lstm_out, _ = self.lstm(embedded)
        # Only the final time step feeds the classifier head.
        final_step = lstm_out[:, -1, :]
        return self.fc(self.dropout(final_step))  # Raw logits
    

from inital_models import Depth_First_GRU2
from models import GRU_Shallow_1fc_AntiOverfit

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def evaluate_model(dataloader, model):
    """Evaluate ``model`` on every batch of ``dataloader`` without gradients.

    The model is switched to evaluation mode and is left in that mode when
    the function returns. Batches are moved to the module-level ``device``.

    Args:
        dataloader: Iterable yielding ``(x, y)`` batches. ``x`` is either a
            list of tensors (each moved to ``device`` and cast to float) or a
            single tensor (cast to float and flattened to
            ``(x.size(0), -1)``).
        model: Module to evaluate. Its output is argmax-ed over dimension 1
            to obtain the predicted class.

    Returns:
        Tuple ``(accuracy, y, y_hat, y_hat_logits)``: the fraction of correct
        predictions, followed by NumPy arrays of the targets, the predicted
        classes and the raw model outputs concatenated over all batches.

    Raises:
        ZeroDivisionError: If the dataloader yields no samples.
    """
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    all_logits = []
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for x, y in tqdm(dataloader):
            if isinstance(x, list):
                x = [t.to(device).float() for t in x]
                y = y.to(device)
            else:
                x, y = x.to(device), y.to(device)
                x = x.float().view(x.size(0), -1)

            # Single forward pass per batch. The earlier version ran the
            # model twice and threw the first result away.
            outputs = model(x)  # Outputs are raw logits of shape (batch_size, num_classes)
            _, predicted = torch.max(outputs, 1)

            total += y.size(0)
            correct += (predicted == y).sum().item()

            # Collect per-batch results on the CPU for the caller
            all_logits.append(outputs.cpu())
            all_preds.append(predicted.cpu())
            all_targets.append(y.cpu())

    accuracy = correct / total

    # Concatenate all the collected batches and transform to numpy arrays
    y_hat_logits = torch.cat(all_logits).numpy()
    y_hat = torch.cat(all_preds).numpy()
    y = torch.cat(all_targets).numpy()

    return accuracy, y, y_hat, y_hat_logits

### Train loop cell

In [None]:

from time import time

# Training cell: builds the model, trains it for cfg.EPOCHS epochs, evaluates
# on the eval and test loaders after every epoch, and collects the per-epoch
# metrics into `log_object`.

# The model receives the config and the value of test_data.get_input_size().
model = GRU_Shallow_1fc_AntiOverfit(cfg, test_data.get_input_size())

# Loss and optimizer factories are supplied by the config object.
criterion = cfg.loss_func()
optimizer = cfg.optimizer(model.parameters(), lr=cfg.LEARNING_RATE, weight_decay=1e-4)      #Added weight decay

EPOCHS = cfg.EPOCHS
# `device` is the module-level device defined together with evaluate_model.
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Per-epoch histories that end up in the log object.
eval_accuracy_across_epochs = []
train_accuracy_across_epochs = []
test_accuracy_across_epochs = []
loss_across_epochs = []
training_time = []  # seconds spent in the training part of each epoch

for epoch in range(EPOCHS):
    epoch_start_time = time()
    epoch_loss = 0
    total, correct = 0, 0  # running counts for the training accuracy
    model.train()

    for batch_idx, (x, y) in enumerate(tqdm(train_dataloader)):
        # Same input handling as evaluate_model: a list of tensors is moved
        # and cast element-wise, a single tensor is cast and flattened.
        if isinstance(x, list):
            x = [t.to(device) for t in x]
            x = [t.float() for t in x]
            y = y.to(device)
        else:
            x, y = x.to(device), y.to(device)

            x = x.float()
            #x = x.view(x.size(0), x.size(1), -1)  # Ensure input shape is (batch_size, seq_length, input_size)
            x = x.view(x.size(0), -1)

        # NOTE(review): .squeeze() removes every size-1 dimension. If a batch
        # ever holds a single sample the batch dimension would be dropped too
        # and torch.max(..., 1) below would fail. evaluate_model takes its
        # predictions from model(x) without squeeze - confirm whether the
        # squeeze is needed here.
        outputs = model(x).squeeze()
        # Predicted class per sample, used for the running *training* accuracy
        _, predicted = torch.max(outputs.data, 1)

        total += y.size(0)
        correct += (predicted == y).sum().item()


        loss = criterion(outputs, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Added Gradient clipping
        optimizer.step()

        # Accumulate loss
        epoch_loss += loss.item()

        # Progress printing is currently disabled; the branch is a no-op.
        if (batch_idx + 1) % 100 == 0:
            #print(f"Epoch [{epoch+1}/{EPOCHS}], Step [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")
            pass

    # Only the training batches are timed; the evaluations below are excluded.
    epoch_end_time = time()
    training_time.append(epoch_end_time-epoch_start_time)

    print(f"Epoch [{epoch+1}/{EPOCHS}], Average Loss: {epoch_loss/len(train_dataloader):.4f}")

    # Mean of the per-batch losses for this epoch
    average_epoch_loss = epoch_loss/len(train_dataloader)
    loss_across_epochs.append(average_epoch_loss)

    # Training accuracy comes from the predictions made while training (model
    # in train mode, weights changing between batches); eval and test accuracy
    # come from a full pass with evaluate_model.
    train_accuracy = correct/total
    eval_accuracy, _, _, _ = evaluate_model(eval_dataloader, model)
    test_accuracy, _, _, _ = evaluate_model(test_dataloader, model)
    # evaluate_model leaves the model in eval mode; switch back to train mode.
    model.train()
    print(f'Train Accuracy: {train_accuracy}')
    print(f'Eval Accuracy: {eval_accuracy}')
    print(f'Test Accuracy: {test_accuracy}')
    train_accuracy_across_epochs.append(train_accuracy)

    eval_accuracy_across_epochs.append(eval_accuracy)

    test_accuracy_across_epochs.append(test_accuracy)

# Split the summed training time into hours / minutes / seconds.
# NOTE(review): time() returns floats, so h, m and s are floats and the
# formatted string below will contain decimals (e.g. "0.0h 3.0m ...").
total_training_time = sum(training_time)
h, rem = divmod(total_training_time, 3600)
m, s = divmod(rem, 60)


# Log record for this run: dataset class name, model class name and the
# per-epoch training report.
log_object = {
    'Dataclass': type(train_data).__name__,
    'Model': type(model).__name__,
    'Report from Training': {
        'training_time': f'{h}h {m}m {s}s',
        'loss_across_epochs': loss_across_epochs,
        'eval_accuracy_per_epoch': eval_accuracy_across_epochs,
        'train_accuracy_per_epoch': train_accuracy_across_epochs,
        # NOTE(review): key is misspelled ("accuarcy"); left unchanged because
        # code that reads the stored logs may depend on it.
        'test_accuarcy_per_epoch': test_accuracy_across_epochs
    }
}



100%|██████████| 619/619 [00:14<00:00, 43.49it/s]


Epoch [1/60], Average Loss: 0.7010


100%|██████████| 64/64 [00:01<00:00, 35.75it/s]


Accuracy: 0.5162%


100%|██████████| 97/97 [00:03<00:00, 31.83it/s]


Accuracy: 0.5378%
Train Accuracy: 0.5019948487450129


100%|██████████| 619/619 [00:18<00:00, 33.16it/s]


Epoch [2/60], Average Loss: 0.6950


100%|██████████| 64/64 [00:02<00:00, 28.63it/s]


Accuracy: 0.5284%


100%|██████████| 97/97 [00:03<00:00, 29.94it/s]


Accuracy: 0.5430%
Train Accuracy: 0.5073986162315035


100%|██████████| 619/619 [00:18<00:00, 33.67it/s] 


Epoch [3/60], Average Loss: 0.6944


100%|██████████| 64/64 [01:20<00:00,  1.26s/it]


Accuracy: 0.5162%


100%|██████████| 97/97 [-1:58:45<00:00, -1.29it/s]


Accuracy: 0.5209%
Train Accuracy: 0.5069440937326397


100%|██████████| 619/619 [00:19<00:00, 32.56it/s]


Epoch [4/60], Average Loss: 0.6933


100%|██████████| 64/64 [00:02<00:00, 27.44it/s]


Accuracy: 0.4990%


100%|██████████| 97/97 [00:03<00:00, 29.81it/s]


Accuracy: 0.5304%
Train Accuracy: 0.512398363719004


100%|██████████| 619/619 [00:19<00:00, 32.38it/s]


Epoch [5/60], Average Loss: 0.6928


100%|██████████| 64/64 [00:02<00:00, 27.78it/s]


Accuracy: 0.4873%


100%|██████████| 97/97 [00:03<00:00, 29.08it/s]


Accuracy: 0.5248%
Train Accuracy: 0.5179031362052422


100%|██████████| 619/619 [00:20<00:00, 30.94it/s]


Epoch [6/60], Average Loss: 0.6927


100%|██████████| 64/64 [01:20<00:00,  1.26s/it]


Accuracy: 0.4691%


100%|██████████| 97/97 [-1:58:45<00:00, -1.29it/s]


Accuracy: 0.5015%
Train Accuracy: 0.5197717287005706


 77%|███████▋  | 474/619 [01:34<00:04, 31.55it/s] 

## Logging cell

In [None]:
import os
import json
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score



# Final pass over all three splits with the trained model. Only the test-set
# call keeps its targets, predictions and logits; the other two calls keep
# the accuracy alone.
test_accuracy, y, y_hat, y_hat_logits = evaluate_model(test_dataloader, model)
accuracy_train, _, _, _ = evaluate_model(train_dataloader, model)
accuracy_eval, _, _, _ = evaluate_model(eval_dataloader, model)

# Classification metrics computed from the test-set targets and predictions.
F1, precision, recall = (
    metric(y, y_hat) for metric in (f1_score, precision_score, recall_score)
)

# NOTE(review): every "*_eval" key below except 'accuracy_eval' is filled from
# the test-set outputs above, not from the eval split - confirm the intended
# naming before relying on these fields.
log_object['Results'] = dict(
    accuracy_test=test_accuracy,
    accuracy_eval=accuracy_eval,
    accuracy_train=accuracy_train,
    F1_eval=F1,
    precision_eval=precision,
    recall_eval=recall,
    y_eval=y.tolist(),
    y_hat_eval=y_hat.tolist(),
    y_hat_logits_eval=y_hat_logits.tolist(),
)





In [None]:
def write_log_to_file(experiment_name:str, log_obj:dict):
    """Write ``log_obj`` as JSON into the next free numbered file of an experiment.

    Files are stored as ``results_final_day/<experiment_name>/<id>.json``,
    where ``<id>`` is a zero-padded four-digit counter: one more than the
    largest id already present in the experiment directory, or ``0000`` when
    there is none. Missing directories (including the root) are created.

    Args:
        experiment_name: Name of the experiment directory. Suggested as
            ``f"model_{model_class}_dataset_{dataset_class}"``.
        log_obj: JSON-serialisable dictionary to store.

    Returns:
        The path of the file that was written.
    """
    root = 'results_final_day/'
    result_dir = os.path.join(root, experiment_name)

    # exist_ok makes this safe on the first run (root missing) as well as on
    # later runs (directory already there).
    os.makedirs(result_dir, exist_ok=True)

    # An entry's id is the part of its name before the extension and before
    # the first underscore. Entries without a numeric id (notes, OS metadata
    # files, ...) are skipped instead of aborting the whole write.
    existing_ids = []
    for entry in os.listdir(result_dir):
        prefix = os.path.splitext(entry)[0].split('_')[0]
        if prefix.isdecimal():
            existing_ids.append(int(prefix))

    next_id = max(existing_ids) + 1 if existing_ids else 0
    file_id = str(next_id).zfill(4)  # pad to at least four digits

    target_file = os.path.join(result_dir, f'{file_id}.json')
    with open(target_file, 'w') as f:
        json.dump(log_obj, f, indent=4)

    return target_file

import inspect
def log_config(log_object, config):
    """Add a snapshot of ``config``'s attributes to the log under ``'Config'``.

    Every attribute in ``vars(config)`` is copied. Classes and plain
    functions are replaced by their ``__name__``, NumPy arrays by nested
    lists, and all other values are stored unchanged.

    Args:
        log_object: Log dictionary. It is updated in place with the
            ``'Config'`` entry.
        config: Object whose attributes (``vars(config)``) are logged.

    Returns:
        A new dictionary with the same entries as ``log_object`` in which
        ``'Config'`` sits directly after the first two other keys.
    """
    config_to_log = {}
    for key, value in vars(config).items():
        if inspect.isclass(value) or inspect.isfunction(value):
            config_to_log[key] = value.__name__  # Log the class/function name
        elif isinstance(value, np.ndarray):
            config_to_log[key] = value.tolist()
        else:
            config_to_log[key] = value  # Log the value directly
    log_object['Config'] = config_to_log

    # Rearrange so the config comes after the first two entries (dataset and
    # model). The order is built from the non-config keys, so it is also
    # correct when 'Config' was already present (e.g. the cell is run twice).
    # The previous slicing assumed 'Config' was the last key and dropped the
    # final entry otherwise.
    other_keys = [key for key in log_object if key != 'Config']
    ordered_keys = other_keys[:2] + ['Config'] + other_keys[2:]
    return {key: log_object[key] for key in ordered_keys}


# Attach the config snapshot to the run log, write the log to disk, then hand
# the written file to the plotting helper.
log_object = log_config(log_object, cfg)

experiment_name = f"model_{type(model).__name__}_dataset_{type(train_data).__name__}"
created_file_path = write_log_to_file(experiment_name, log_object)

from result_dataprocessing import generate_training_plot_from_file

generate_training_plot_from_file(created_file_path)

## Hyperparameter Tuning