In [1]:
import math
from typing import Tuple
from transformers import BertTokenizer, BertModel, AdamW
import numpy as np
import torch
from torch import nn, Tensor
from torch.utils.data import dataset
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torch.nn as nn
import numpy as np

# Define a simple model using PyTorch's TransformerDecoder
class SimpleTransformerDecoderModel(nn.Module):
    """Transformer decoder that maps token ids to per-position vocabulary logits.

    Target ids are embedded, scaled by sqrt(d_model), passed through
    ``PositionalEncoding`` and then through a stack of
    ``nn.TransformerDecoderLayer`` blocks that cross-attend to ``memory``.
    The layers are built with ``batch_first=True``, so every tensor is laid
    out as (batch, sequence, feature).

    Args:
        vocab_size: number of distinct token ids (embedding rows / output logits).
        d_model: embedding width; ``memory`` must have this feature size too.
        nhead: number of attention heads (``d_model`` must be divisible by it).
        num_layers: number of stacked decoder layers.
        dim_feedforward: hidden width of each layer's feed-forward sub-block.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        decoder_layer = nn.TransformerDecoderLayer(
            d_model, nhead, dim_feedforward, batch_first=True
        )
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, tgt, memory, tgt_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """Compute vocabulary logits for every target position.

        Args:
            tgt: integer token ids, (batch, tgt_len).
            memory: encoder states to attend to, (batch, src_len, d_model).
            tgt_mask: optional (tgt_len, tgt_len) attention mask. Pass a causal
                mask (e.g. ``nn.Transformer.generate_square_subsequent_mask``)
                to stop positions from attending to later ones. With the
                default ``None`` every position sees the whole target.
            tgt_key_padding_mask: optional (batch, tgt_len) boolean mask,
                True where the target position is padding.
            memory_key_padding_mask: optional (batch, src_len) boolean mask,
                True where the memory position is padding.

        Returns:
            Float tensor of logits, (batch, tgt_len, vocab_size).
        """
        # Scale embeddings before positional information is added.
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        tgt = self.pos_encoder(tgt)
        output = self.transformer_decoder(
            tgt,
            memory,
            tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.fc_out(output)

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        d_model: feature size of the embeddings.
        max_len: longest sequence length that can be encoded.
        batch_first: layout of the input to ``forward``. ``True`` (default)
            means (batch, seq_len, d_model), which is what the
            ``batch_first=True`` decoder layers in this notebook use;
            ``False`` means (seq_len, batch, d_model).
    """

    def __init__(self, d_model, max_len=5000, batch_first=True):
        super().__init__()
        self.batch_first = batch_first

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns.
        table[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Put the broadcast (size-1) axis where the batch axis of the input is.
        table = table.unsqueeze(0) if batch_first else table.unsqueeze(1)

        # A buffer follows the module across .to(device)/.cuda(); it is
        # non-persistent so state_dict keys stay the same as before.
        self.register_buffer("encoding", table, persistent=False)

    def forward(self, x):
        """Return ``x`` with the encoding of each sequence position added."""
        if self.batch_first:
            # Slice along the sequence axis (dim 1) and broadcast over the batch.
            return x + self.encoding[:, : x.size(1)]
        return x + self.encoding[: x.size(0)]


In [3]:
class BERTSentenceEncoder:
    """Embeds sentences with a pretrained BERT tokenizer/model pair."""

    def __init__(self, model_name='bert-base-uncased'):
        """Load the tokenizer and the model published under ``model_name``."""
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)

    def encode_sentences(self, input_sentences):
        """Tokenize and encode sentences without tracking gradients.

        Args:
            input_sentences: sentence(s) handed straight to the tokenizer,
                which pads and truncates them into one batch.

        Returns:
            A 3-tuple ``(tokenized_input, encoded_sentences, context_vector)``:
            the tokenizer output, the model's ``last_hidden_state``, and the
            slice of that state at the first token position (the [CLS] token).
        """
        batch = self.tokenizer(
            input_sentences,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )

        # Inference only: no autograd graph is built for the BERT forward pass.
        with torch.no_grad():
            hidden_states = self.model(**batch).last_hidden_state

        # Keep only the first token ([CLS]) of every sentence.
        cls_vectors = hidden_states[:, 0, :]
        return batch, hidden_states, cls_vectors

In [4]:
# Vocabulary: note label -> integer class id.
#   '-1' -> 0        (previously 100, which collided with 'D8' and made
#                     reverse_note_mapping drop 'D8'; 0 was unused)
#   '0'  -> 1        (presumably silence — confirm against the data)
#   'C0' .. 'B8' -> 2 .. 109, twelve semitones per octave, written as
#                     <letter><octave>[#], e.g. 'F4#'
# Code that pads label sequences should use note_mapping['-1'] rather than a
# literal id so it keeps following this table.
_PITCH_CLASSES = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')

note_mapping = {
    '0': 1,
    **{
        # The octave digit sits between the letter and the optional sharp.
        f"{pitch[0]}{octave}{pitch[1:]}": 2 + 12 * octave + step
        for octave in range(9)
        for step, pitch in enumerate(_PITCH_CLASSES)
    },
    '-1': 0,
}

# Inverse lookup (class id -> note label); one-to-one now that ids are unique.
reverse_note_mapping = {v: k for k, v in note_mapping.items()}

In [5]:
# Load one training spreadsheet; its first column is used as the index.
tmp_file = pd.read_excel(
    '/speech/dbwork/mul/spielwiese4/students/desengus/dry_crepe_pesto/excels/train/0.xlsx',
    index_col=0,
)

# Sentences: turn the ';' separators into spaces.
training_words = [text.replace(';', ' ') for text in tmp_file['words']]
# Mark the start of the first sentence and the end of the last one.
training_words[0] = '<BOS> ' + training_words[0]
training_words[-1] = training_words[-1] + ' <EOS>'

# Labels: each row is a ' ; '-separated list of note names, mapped to class ids.
training_labels = []
for note_sequence in tmp_file['mean_note_crepe']:
    training_labels.append([note_mapping[name] for name in note_sequence.split(' ; ')])


In [6]:
# Pad every label sequence to one common length so that they stack into a
# single (num_sequences, length) integer tensor.
MAX_LABEL_LEN = 200                # minimum padded length
PAD_NOTE_ID = note_mapping['-1']   # id used to fill the tail of short sequences

# Guard: once the labels are a tensor, re-running this cell changes nothing.
if not torch.is_tensor(training_labels):
    # Grow past MAX_LABEL_LEN if some sequence is longer, instead of building
    # a ragged list that torch.tensor would reject.
    padded_len = max([MAX_LABEL_LEN] + [len(seq) for seq in training_labels])
    training_labels = torch.tensor(
        [seq + [PAD_NOTE_ID] * (padded_len - len(seq)) for seq in training_labels]
    )



In [7]:
# for item_no,i in enumerate(training_labels):
#     # print((training_labels[item_no]))
#     # print(type((training_labels[item_no])))
#     if len(i)<200:
#         if len(training_labels[item_no]) < 200:
#             training_labels[item_no].extend([1] * (200 - len(training_labels[item_no])))
# training_labels = torch.tensor(training_labels)



In [8]:
# Parameters
# The embedding and output layers are indexed by note id, so size them from
# the largest id rather than from the number of dictionary keys.
vocab_size = max(note_mapping.values()) + 1
d_model = 768  # must equal the BERT hidden size: BERT outputs are the decoder memory
nhead = 8  # ensure d_model is divisible by nhead
num_layers = 3
dim_feedforward = 768

# Model
encoder = BERTSentenceEncoder()
decoder = SimpleTransformerDecoderModel(vocab_size, d_model, nhead, num_layers, dim_feedforward)

# Set up optimizer and loss function
# Only the decoder's parameters are optimised. torch.optim.AdamW replaces the
# deprecated transformers.AdamW; eps and weight_decay are set to that class's
# defaults so the update rule stays as close as possible.
optimizer = torch.optim.AdamW(decoder.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
criterion = torch.nn.CrossEntropyLoss()




In [9]:
# Encode all training sentences with BERT. encode_sentences returns
# (tokenized_input, encoded_sentences, context_vector); only the per-token
# hidden states are kept here, and they are later passed to the decoder as
# its memory.
_, encoded, _ = encoder.encode_sentences(training_words)

In [10]:
# Only the last expression of a cell is displayed, so this cell shows
# vocab_size; encoded.shape is evaluated but its value is not shown.
encoded.shape
vocab_size

110

In [11]:
# Run the decoder over the padded label ids, attending to the BERT states.
# NOTE(review): the full label sequence is fed in without a causal mask, and
# the loss computed later compares against the same, unshifted labels —
# confirm this is intended, since each position can see its own target.
output = decoder(training_labels, encoded)

In [12]:
# Sanity check, one value per line: number of sentences, shape of the BERT
# states, shape of the decoder logits.
print(len(training_words), encoded.shape, output.shape, sep='\n')


10
torch.Size([10, 56, 768])
torch.Size([10, 200, 110])


In [13]:
# Flatten labels to (N,) and logits to (N, vocab_size) for CrossEntropyLoss.
# The logits are detached from the autograd graph first, so this loss is a
# reported number only and cannot be back-propagated.
targets = training_labels.view(-1)
predictions = output.detach().view(-1, vocab_size)

# Fresh criterion; this rebinds the module-level name `criterion`.
criterion = nn.CrossEntropyLoss()

# Mean cross-entropy over every position.
loss = criterion(predictions, targets)

print("Categorical CrossEntropy Loss:", loss.item())

Categorical CrossEntropy Loss: 4.696470260620117
