In [3]:

class NoteGenerator:
    def __init__(self, notes, length=8):
        self.notes = notes  # List of possible notes, like ['A', 'B', 'C', ...]
        self.store = []     # Stores generated sequences
        self.genre = "rock" # Default genre
        self.length = length  # Desired length of generated sequence
        
        # Define genre patterns
        self.patterns = {
            "rock": ["C", "C", "G", "G", "A", "A", "G"],
            "pop": ["C", "G", "A", "F"],
            "blues": ["E", "E", "G", "G", "A", "Bb", "B"]
        }
        
    def set_genre(self, genre):
        # Set the genre and validate if it exists in patterns
        if genre.lower() in self.patterns:
            self.genre = genre.lower()
        else:
            raise ValueError("Unsupported genre. Available genres: rock, pop, blues")
    
    def generate(self):
        # Fetch pattern based on the genre
        base_pattern = self.patterns.get(self.genre, [])
        
        if not base_pattern:
            raise ValueError(f"No pattern defined for genre '{self.genre}'")
        
        # Repeat the base pattern to reach the specified length
        generated_sequence = []
        for _ in range(self.length // len(base_pattern) + 1):
            generated_sequence.extend(base_pattern)
        
        # Trim the generated sequence to the specified length
        generated_sequence = generated_sequence[:self.length]
        
        # Store the generated sequence in `store`
        self.store.append(generated_sequence)
        
        return generated_sequence

    def add_genre(self, genre_name, pattern):
        """
        Adds a new genre with a specified note pattern.
        """
        self.patterns[genre_name.lower()] = pattern





In [28]:
# Usage
notes = ['C', 'D', 'E', 'F', 'G', 'A', 'B']  # Just a list of possible notes for initialization
note_generator = NoteGenerator(notes, length=1000)

# Set genre to rock and generate a sequence
note_generator.set_genre("rock")
#print("Rock sequence:", note_generator.generate())

rock_notes = ['<rock>'] + note_generator.generate()

# Set genre to pop and generate a sequence
note_generator.set_genre("pop")
#print("Pop sequence:", note_generator.generate())
pop_notes = ['<pop>'] + note_generator.generate()

# # Adding and generating a sequence for a custom genre
# note_generator.add_genre("custom", ["D", "D", "G", "A", "C"])
# note_generator.set_genre("custom")
# print("Custom sequence:", note_generator.generate())

# Set genre to pop and generate a sequence
note_generator.set_genre("blues")
#print("Pop sequence:", note_generator.generate())
blues_notes = ['<blues>'] + note_generator.generate()

In [29]:
print(len(pop_notes))

1001


In [30]:
all_notes = rock_notes + pop_notes + blues_notes

In [182]:
# Example list of notes


# Open a text file in write mode
with open('all_notes.txt', mode='w') as file:
    # Write each note on a new line
    for note in all_notes:
        file.write(f"{note}\n")

In [181]:
print(all_notes)

['<rock>', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C',

In [179]:
#let create decoder only model

import torch
import torch.nn as nn
import torch.nn.functional as F

class Magic(nn.Module):
    def __init__(self, d_model, n_heads, d_ff):
        super(Magic, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.d_ff = d_ff

        # Multi-Head Attention components
        self.Q_linear = nn.Linear(d_model, d_model)
        self.K_linear = nn.Linear(d_model, d_model)
        self.V_linear = nn.Linear(d_model, d_model)

        # Output projection
        self.out_linear = nn.Linear(d_model, d_model)

        # Feedforward Network (FFN)
        self.ffn1 = nn.Linear(d_model, d_ff)
        self.ffn2 = nn.Linear(d_ff, d_model)

        # Layer Normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()

        # Compute Q, K, V
        Q = self.Q_linear(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        K = self.K_linear(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        V = self.V_linear(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)

        # Compute attention scores: Q @ K.T and apply scaling
        attn = torch.matmul(Q, K.transpose(-2, -1)) / self.d_k ** 0.5  # Scale attention scores

        # Create a mask to prevent attending to future tokens
        mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()
        attn = attn.masked_fill(mask, float('-inf'))

        # Apply softmax to normalized scores
        attn = F.softmax(attn, dim=-1)

        # Compute the attention output
        output = torch.matmul(attn, V).transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        output = self.out_linear(output)

        # Add & Normalize
        x = self.norm1(x + output)

        # Feed-Forward Network (FFN)
        ffn_out = F.relu(self.ffn1(x))
        ffn_out = self.ffn2(ffn_out)

        # Add & Normalize
        x = self.norm2(x + ffn_out)

        return x


class TransDecoder(nn.Module):
    def __init__(self, vocab_size, batch_size, d_model=80, n_heads=2, d_ff=2048, max_length=10000):
        super(TransDecoder, self).__init__()
        self.batch_size = batch_size
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.max_length = max_length
        self.emb = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = self.create_positional_encoding(max_length, d_model)
        self.magics = nn.ModuleList([Magic(d_model, n_heads, d_ff) for _ in range(3)])
        self.output_layer = nn.Linear(d_model, vocab_size)


    def create_positional_encoding(self, seq_len, embed_dim):
        position = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)
        dim = torch.arange(embed_dim, dtype=torch.float).unsqueeze(0)
        angles = position / (10000 ** (dim / embed_dim))
        pos_encoding = torch.zeros(seq_len, embed_dim)
        pos_encoding[:, 0::2] = torch.sin(angles[:, 0::2])
        pos_encoding[:, 1::2] = torch.cos(angles[:, 1::2])
        return pos_encoding

    def forward(self, inputs):
        # Adjust for potential singleton middle dimension in testing
        if inputs.dim() == 3 and inputs.size(1) == 1:
            inputs = inputs.squeeze(1)  # Squeeze the middle dimension if it’s just [batch_size, 1, seq_len]

        batch_size, seq_len = inputs.size()
        embs = self.emb(inputs)  # Shape: [batch_size, seq_len, d_model]
        print(f"embs shape: {embs.shape}")



        pos_enc = self.positional_encoding[:seq_len, :].unsqueeze(0).expand(batch_size, -1, -1).to(embs.device)
        print(f"pos_enc shape: {pos_enc.shape}")

        embs = embs + pos_enc
        print(f"embs shape after adding pos_enc: {embs.shape}")

        for magic in self.magics:
            embs = magic(embs)

        logits = self.output_layer(embs)
        return logits

        # # Create positional encoding with the correct batch and sequence dimensions
        # pos_enc = self.create_positional_encoding(seq_len, self.d_model).unsqueeze(0).expand(batch_size, -1, -1).to(embs.device)
        # print(f"pos_enc shape: {pos_enc.shape}")

        # # Add positional encoding to input embeddings
        # embs = embs + pos_enc
        # print(f"embs shape after adding pos_enc: {embs.shape}")

        # for magic in self.magics:
        #     embs = magic(embs)

        # logits = self.output_layer(embs)
        # return logits

if __name__ == "__main__":
    main()

input shape is torch.Size([3, 6])
embs shape: torch.Size([3, 6, 40])
pos_enc shape: torch.Size([3, 6, 40])
embs shape after adding pos_enc: torch.Size([3, 6, 40])
torch.Size([3, 6, 12])
tensor([[[-8.1787e-01, -1.2786e-01, -2.6768e-01,  1.5458e-02,  6.9480e-02,
          -8.9042e-01,  2.4884e-01, -3.2139e-01, -5.8098e-01,  2.1978e-01,
          -1.1136e-01,  6.9214e-01],
         [-1.2042e+00, -4.0555e-01, -5.0192e-02,  2.4976e-01, -1.1256e+00,
          -4.4741e-02,  3.4121e-01, -3.8747e-01, -7.6250e-02, -4.4969e-01,
           3.4417e-01, -4.4099e-02],
         [-3.8031e-01, -7.3198e-02, -1.1705e-01,  1.4237e-01, -2.0626e-01,
           1.7949e-01, -1.9510e-01, -1.5474e-01, -8.0424e-01,  2.9966e-01,
          -2.0770e-01,  9.0650e-01],
         [-9.8126e-01, -1.3597e-01, -1.7067e-01,  6.4180e-01,  2.0481e-01,
          -4.2721e-01,  3.7970e-01, -1.0480e+00, -6.9986e-01,  7.6501e-01,
           5.6108e-04,  5.5375e-01],
         [-7.5134e-01, -2.3798e-01,  8.3690e-01,  1.6542e+00, -5.1

In [177]:
def main():
    # Example usage
    vocab_size = 12
    d_model = 40
    n_heads = 2
    d_ff = 80
    max_length = 512
    batch_size = 1

    model = TransDecoder(vocab_size, batch_size, d_model, n_heads, d_ff, max_length)
    inputs = torch.tensor([[6, 0, 8, 11, 3, 3], [6, 4, 8, 11, 3, 2], [6, 5, 10, 11, 3, 7]])
    print("input shape is", inputs.shape)
    outputs = model(inputs)
    print(outputs.shape)
    print(outputs)

if __name__ == "__main__":
    main()

input shape is torch.Size([3, 6])
embs shape: torch.Size([3, 6, 40])
pos_enc shape: torch.Size([3, 6, 40])
embs shape after adding pos_enc: torch.Size([3, 6, 40])
torch.Size([3, 6, 12])
tensor([[[ 2.3131e-01, -4.8160e-01, -3.5713e-01, -3.9369e-01,  2.0665e-01,
          -5.0323e-01,  8.4316e-01,  9.7482e-01, -7.8712e-01,  2.9079e-03,
          -1.2800e+00,  2.7560e-01],
         [ 4.7081e-01, -1.4443e+00,  6.3215e-01,  9.8232e-01,  2.9088e-01,
           3.6610e-02,  4.2767e-01,  9.4300e-01, -4.1064e-01, -4.3242e-02,
          -7.6113e-01,  3.1141e-01],
         [ 4.8520e-01, -8.0273e-01,  6.2346e-01,  4.8529e-01,  2.4321e-02,
          -2.8222e-01,  3.7729e-01,  5.5123e-01, -2.5428e-01, -4.5207e-01,
          -9.7834e-01, -2.4100e-01],
         [ 3.4002e-02,  8.7531e-02,  1.9515e-01,  9.1648e-02, -1.1729e+00,
          -1.1199e-01, -4.2721e-01, -2.1157e-01, -6.0469e-01, -5.0898e-01,
          -3.9545e-01,  4.7066e-01],
         [ 3.2034e-01, -1.1637e+00,  3.5254e-01,  6.9355e-01, -2.7

In [143]:
# Create a test case
batch_size = 1
seq_len = 40
d_model = 80
vocab_size = 1000
batch_size = 1

inputs = torch.randint(0, vocab_size, (batch_size, seq_len), dtype=torch.long)

model = TransDecoder(vocab_size, batch_size, d_model=d_model)

# Run the test case
outputs = model(inputs)
print(f"outputs shape: {outputs.shape}")

embs shape: torch.Size([1, 40, 80])
pos_enc shape: torch.Size([1, 40, 80])
embs shape after adding pos_enc: torch.Size([1, 40, 80])
outputs shape: torch.Size([1, 40, 1000])


In [46]:
print(all_notes)

['<rock>', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C', 'G', 'G', 'A', 'A', 'G', 'C', 'C',

In [154]:
#let make training loop
# Define the dataset
data = all_notes

# Create the vocab dictionary, including start and end tokens
special_tokens = ['<s>', '</s>']
vocab = {char: idx for idx, char in enumerate(set(data).union(special_tokens))}
vocab_size = len(vocab)  # Total number of unique tokens

# Function to get indices of data
def get_indices(natural_data):
    return [vocab[char] for char in natural_data]

# Encode the data with start and end tokens
rock_train_data = [vocab['<s>']] + get_indices(rock_notes)
pop_train_data = [vocab['<s>']] + get_indices(pop_notes)
blues_train_data = [vocab['<s>']] + get_indices(blues_notes)

rock_target_data = get_indices(rock_notes) + [vocab['</s>']]
pop_target_data = get_indices(pop_notes) + [vocab['</s>']]
blues_target_data = get_indices(blues_notes) + [vocab['</s>']]

print(vocab_size)
print(vocab)
print(len(data))
print(len(rock_train_data))
print(rock_train_data[:3])
print(pop_target_data)

12
{'<rock>': 0, 'B': 1, 'F': 2, 'A': 3, '<pop>': 4, '<blues>': 5, '<s>': 6, 'Bb': 7, 'C': 8, '</s>': 9, 'E': 10, 'G': 11}
3003
1002
[6, 0, 8]
[4, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3, 2, 8, 11, 3

In [155]:
print(vocab)

{'<rock>': 0, 'B': 1, 'F': 2, 'A': 3, '<pop>': 4, '<blues>': 5, '<s>': 6, 'Bb': 7, 'C': 8, '</s>': 9, 'E': 10, 'G': 11}


In [165]:

#turn into tensors

rock_input = torch.tensor(rock_train_data, dtype=torch.long)
rock_target = torch.tensor(rock_target_data, dtype=torch.long)

# For Pop genre
pop_input = torch.tensor(pop_train_data, dtype=torch.long)
pop_target = torch.tensor(pop_target_data, dtype=torch.long)

# For Blues genre
blues_input = torch.tensor(blues_train_data, dtype=torch.long)
blues_target = torch.tensor(blues_target_data, dtype=torch.long)

# Organize these as lists of inputs and targets per genre
inputs = [rock_input, pop_input, blues_input]
targets = [rock_target, pop_target, blues_target]




print(inputs)


[tensor([ 6,  0,  8,  ..., 11,  3,  3]), tensor([ 6,  4,  8,  ..., 11,  3,  2]), tensor([ 6,  5, 10,  ..., 11,  3,  7])]


In [166]:
print(len(inputs[0]))
print(len(targets[0]))

1002
1002


In [158]:
d_model = 40  # Dimension of embeddings, can be adjusted
n_heads = 2  # Number of attention heads
d_ff = 80  # Dimension of feed-forward network
batch_size = 1  # Number of sequences processed in parallel

model = TransDecoder(vocab_size=vocab_size, batch_size=batch_size, d_model=d_model, n_heads=n_heads, d_ff=d_ff)

In [None]:
# # Reshape the input and target tensors to include batch dimension
# rock_input = rock_input.unsqueeze(0)  # Shape: [batch_size, seq_len]
# rock_target = rock_target.unsqueeze(0)  # Shape: [batch_size, seq_len]

In [178]:
import torch.optim as optim
import torch.nn.functional as F
import wandb

# Define the batch size
batch_size = 1

model = TransDecoder(vocab_size, batch_size=batch_size)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# Initialize W&B
wandb.init(project="transformers", name="transforming_decoder")

epochs = 150  # Number of training epochs

# Reshape the input and target tensors to include batch dimension
rock_input = rock_input.unsqueeze(0)  # Shape: [batch_size, seq_len]
rock_target = rock_target.unsqueeze(0)  # Shape: [batch_size, seq_len]

print(f"Training data shapes - Input: {rock_input.shape}, Target: {rock_target.shape}")

for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    # Forward pass
    predictions = model(rock_input)  # Shape: [batch_size, seq_len, vocab_size]
    
    # Compute the loss
    loss = criterion(predictions.view(-1, predictions.size(-1)), rock_target.view(-1))
    total_loss += loss.item()

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Calculate average loss
    avg_loss = total_loss
    wandb.log({'average_loss': avg_loss})

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")

wandb.finish()

CommError: Run initialization has timed out after 90.0 sec. 
Please refer to the documentation for additional information: https://docs.wandb.ai/guides/track/tracking-faq#initstarterror-error-communicating-with-wandb-process-

In [175]:
# #decoder with unzipping



import torch.optim as optim
import torch.nn.functional as F
import wandb



# Initialize W&B
wandb.init(project="transformers", name="transforming_decoder")
epochs = 150  # Number of training epochs

d_model = 50  # Dimension of embeddings, can be adjusted
n_heads = 2  # Number of attention heads
d_ff = 80  # Dimension of feed-forward network
batch_size = 1  # Number of sequences processed in parallel

model2 = TransDecoder(vocab_size=vocab_size, batch_size=batch_size, d_model=d_model, n_heads=n_heads, d_ff=d_ff)

optimizer = optim.Adam(model2.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    model2.train()
    total_loss = 0
    
    for genre_inputs, genre_targets in zip(inputs, targets):
        # Ensure genre inputs and targets are tensors
        genre_inputs = torch.tensor(genre_inputs, dtype=torch.long).unsqueeze(0)  # Add batch dimension
        print("genere inputs shape is", genre_inputs.shape, "genre  len is", len(genre_inputs))
        genre_targets = torch.tensor(genre_targets, dtype=torch.long).unsqueeze(0)  # Add batch dimension
        print(genre_targets.shape)  
        

        # Forward pass
        predictions = model(genre_inputs)  # Shape: (batch_size, seq_len, vocab_size)

        # Compute the loss
        #loss = criterion(predictions.view(-1, predictions.size(-1)), genre_targets.view(-1))
        loss = criterion(predictions.view(-1, vocab_size), genre_targets.view(-1))
        #total_loss += loss.item()
        avg_loss = total_loss / (epoch + 1)  # Average loss over epochs

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the loss value
        avg_loss = total_loss / len(inputs)
        wandb.log({'average_loss': avg_loss})

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")

wandb.finish()

# model2 = TransDecoder(vocab_size=vocab_size, batch_size=batch_size, d_model=d_model, n_heads=n_heads, d_ff=d_ff)
# optimizer = optim.Adam(model2.parameters(), lr=0.001)
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # Learning rate scheduler
# criterion = nn.CrossEntropyLoss()

# # Initialize W&B
# #wandb.init(project="transformers", name="transforming_decoder")
# epochs = 100

# d_model = 50
# n_heads = 2
# d_ff = 80
# batch_size = 1



# for epoch in range(epochs):
#     model2.train()
#     total_loss = 0

#     for genre_inputs, genre_targets in zip(inputs, targets):
#         genre_inputs = torch.tensor(genre_inputs, dtype=torch.long).unsqueeze(0)  # Shape: [1, seq_len]
#         genre_targets = torch.tensor(genre_targets, dtype=torch.long).unsqueeze(0)

#         # Forward pass
#         predictions = model2(genre_inputs)  # Shape: [batch_size, seq_len, vocab_size]

#         # Compute the loss
#         loss = criterion(predictions.view(-1, vocab_size), genre_targets.view(-1))  # Ensure correct shape
#         total_loss += loss.item()

#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     avg_loss = total_loss / len(inputs)  # Average over the genre batches
#     wandb.log({'average_loss': avg_loss})

#     if epoch % 10 == 0:
#         print(f"Epoch {epoch}, Loss: {avg_loss:.4f}")

#     # Step the learning rate scheduler
#     scheduler.step()

# wandb.finish()



CommError: Run initialization has timed out after 90.0 sec. 
Please refer to the documentation for additional information: https://docs.wandb.ai/guides/track/tracking-faq#initstarterror-error-communicating-with-wandb-process-

In [161]:
def generate_sequence(model, input_seq, max_len=20):
    model.eval()  # Set the model to evaluation mode
    input_tensor = torch.tensor(input_seq).unsqueeze(0)  # Shape: [1, seq_len]
    predictions = []  # Store predicted tokens

    # Iterate to generate the sequence
    for _ in range(max_len):
        with torch.no_grad():
            output = model(input_tensor)  # Shape: [batch_size, seq_len, vocab_size]
        
        # Take the last token's logits (for next token prediction)
        last_token_logits = output[0, -1, :]  # Get logits for the last token in the sequence
        
        # Get the predicted token by sampling (you could use argmax or a sampling method)
        next_token = torch.argmax(last_token_logits).item()  # Take the token with the highest logit
        
        predictions.append(next_token)  # Append to the list of predictions
        
        # Add the predicted token to the input sequence for next prediction
        input_tensor = torch.cat((input_tensor, torch.tensor([[next_token]]).to(input_tensor.device)), dim=1)
    
    return predictions




embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding pos_enc: torch.Size([1, 6, 80])
embs shape: torch.Size([1, 7, 80])
pos_enc shape: torch.Size([1, 7, 80])
embs shape after adding pos_enc: torch.Size([1, 7, 80])
embs shape: torch.Size([1, 8, 80])
pos_enc shape: torch.Size([1, 8, 80])
embs shape after adding pos_enc: torch.Size([1, 8, 80])
embs shape: torch.Size([1, 9, 80])
pos_enc shape: torch.Size([1, 9, 80])
embs shape after adding pos_enc: torch.Size([1, 9, 80])
embs shape: torch.Size([1, 10, 80])
pos_enc shape: torch.Size([1, 10, 80])
embs shape after adding pos_enc: torch.Size([1, 10, 80])
embs shape: torch.Size([1, 11, 80])
pos_enc shape: torch.Size([1, 11, 80])
embs shape after adding pos_enc: torch.Size([1, 11, 80])
embs shape: torch.Size([1, 12, 80])
pos_enc shape: torch.Size([1, 12, 80])
embs shape after adding pos_enc: torch.Size([1, 12, 80])
embs shape: torch.Size([1, 13, 80])
pos_enc shape: torch.Size([1, 13, 80])
embs shape af

In [None]:
# Example usage:
input_seq = [6, 0, 8, 11, 3, 3]  # Example input sequence
generated_output = generate_sequence(model, input_seq, max_len=10)
print(f"Generated Sequence: {generated_output}")

In [42]:
def predict_next(model, input_seq, vocab):
    model.eval()
    input_tensor = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        output = model(input_tensor)
        last_output = output[:, -1, :]  # Get the last time step output
        predicted_idx = torch.argmax(last_output, dim=-1).item()
        for char, idx in vocab.items():
            if idx == predicted_idx:
                return char

In [170]:
# Predict the next 20 characters
predicted = []
start_seq = ["<rock>", "C", "C", "G", "G", "A"]

while len(predicted) < 10:
    input_seq_tokens = [vocab.get(token) for token in start_seq]
    predicted_char = predict_next(model, input_seq_tokens, vocab)
    predicted.append(predicted_char)
    start_seq.append(predicted_char)  # Append character itself, not its vocab ID
    start_seq = start_seq[1:]  # Keep the length of start_seq fixed

print("Predicted sequence:", predicted)




    #"blues": ["E", "E", "G", "G", "A", "Bb", "B"]
    #  "rock": ["C", "C", "G", "G", "A", "A", "G"],
    #         "pop": ["C", "G", "A", "F"],
    #         "blues": ["E", "E", "G", "G", "A", "Bb", "B"]

embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding pos_enc: torch.Size([1, 6, 80])
embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding pos_enc: torch.Size([1, 6, 80])
embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding pos_enc: torch.Size([1, 6, 80])
embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding pos_enc: torch.Size([1, 6, 80])
embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding pos_enc: torch.Size([1, 6, 80])
embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding pos_enc: torch.Size([1, 6, 80])
embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding pos_enc: torch.Size([1, 6, 80])
embs shape: torch.Size([1, 6, 80])
pos_enc shape: torch.Size([1, 6, 80])
embs shape after adding 

In [192]:


# Add the '<UNK>' token with a unique index
vocab['<UNK>'] = len(vocab)
int_to_vocab = {idx: word for word, idx in vocab.items()}

print(int_to_vocab)

def infer(model, input_sequence, vocab_to_int, int_to_vocab):
    """
    Inference function to predict the next sequence based on the model.
    
    Args:
    - model: The trained model for inference.
    - input_sequence: The input string to infer from.
    - vocab_to_int: A dictionary mapping vocab words to integer IDs.
    - int_to_vocab: A dictionary mapping integer IDs back to vocab words.
    
    Returns:
    - A string with the predicted output sequence.
    """
    model.eval()  # Set the model to evaluation mode
    tokens = [vocab_to_int.get(char, vocab_to_int['<UNK>']) for char in input_sequence]  # Convert input characters to tokens
    
    # Add start token
    tokens = [vocab_to_int['<s>']] + tokens  # Add start token at the beginning of the sequence
    # Check if tokens have any out-of-range values
    vocab_size = len(vocab_to_int)
    if any(token >= vocab_size for token in tokens):
        print(f"Warning: Out-of-range token detected in input {input_sequence}. Tokens: {tokens}")


    # Convert tokens to tensor and reshape for batch processing
    #tokens_tensor = torch.tensor(tokens).unsqueeze(0).to(model.device)  # Shape: [1, seq_len]
    tokens_tensor = torch.tensor(tokens).unsqueeze(0).to(next(model.parameters()).device)  # Shape: [1, seq_len]
    
    with torch.no_grad():
        logits = model(tokens_tensor)  # Get model output (logits)

    # Apply softmax to get probabilities
    probabilities = F.softmax(logits, dim=-1)

    # Get the predicted token indices (most probable)
    predicted_tokens = torch.argmax(probabilities, dim=-1)

    # Convert the predicted token indices back to characters using int_to_vocab
    predicted_tokens = predicted_tokens.squeeze(0).cpu().numpy()  # Squeeze the batch dimension and move to CPU

    # Handle out-of-range predictions gracefully
    output_tokens = [int_to_vocab.get(token, '<UNK>') for token in predicted_tokens]
    
    # Join the output tokens into a single string (excluding the start token)
    return ''.join(output_tokens[1:])  # Skip the start token

# Example test strings
test_strings = ['aabb','a','aa']

# Assuming vocab_to_int and int_to_vocab are dictionaries you've already defined
for test_string in test_strings:
    result = infer(model, test_string, vocab, int_to_vocab)
    print(f"Result for {test_string}: {result}")

{0: '<rock>', 1: 'B', 2: 'F', 3: 'A', 4: '<pop>', 5: '<blues>', 6: '<s>', 7: 'Bb', 8: 'C', 9: '</s>', 10: 'E', 11: 'G', 13: '<UNK>'}


IndexError: index out of range in self