# Libraries

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset, random_split
from torch.optim import Adam
import torch.nn.functional as F
import numpy as np

# for teacher
import random

from sklearn.model_selection import train_test_split

# Side variables
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
base_type = np.float32
torch_type = torch.float32
batch_size = 64
window_length = 1024 # Data points will hold 1024 tokens of observations (should be 1/4 average song length in tokens, remember to remove outliers)
window_step_size = 32 # Sliding window will move this much each time (higher numbers means less data, but less overfitting to similar data)


# Load Data / Tokenize

In [2]:
# load
data = []
with open("./out.txt", "r") as file:
    data = file.read().splitlines()

print(data[5:10])

['rest', 'len_3', 'bar', 'rest', 'len_3']


In [3]:
# now tokenize
unique_tokens = set(list(data))
print(len(unique_tokens))
# mapping
note_to_token = {note: idx for idx, note in enumerate(unique_tokens)}
token_to_note = {idx: note for note, idx in note_to_token.items()}


464


# Data Preprocessing

In [4]:
data = [note_to_token[tok] for tok in data]


# Assume already tokenized (this is for transformer, must be adapted for other models)
class MusicDataset(Dataset):
    def __init__(self, data, window_length, step_size):
        self.temp_data = data
        self.window_length = window_length
        self.step_size = step_size
        self.final_data = self.apply_window()

    def apply_window(self):
        # return sliding window data + labels
        train_examples = []
        # cycle through each window configuration, calculating start index and end index
        for start_idx in range(0, len(self.temp_data) - self.window_length + 1, self.step_size):
            end_idx = start_idx + self.window_length
            train_example = self.temp_data[start_idx:end_idx] # training of length window_length
            train_examples.append(train_example)
            
        return train_examples
                

    def __len__(self):
        return len(self.final_data)

    def __getitem__(self, idx):
        window = self.final_data[idx]

        return torch.tensor(window).to(device)
    

dataset = MusicDataset(data, window_length, window_step_size)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

training_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


# RNN

In [53]:
# RNN Variables
vocab_len = len(unique_tokens)
seq_len = 256
embed_size = 512 # larger embed size may require larger dropout
dropout = 0.2
lr = 1e-4
epochs = 20

In [54]:
class Music_RNN(nn.Module):
    def __init__(self, vocab_len, input_size, hidden_size, dropout):
        super(Music_RNN, self).__init__()
        self.embed = nn.Embedding(vocab_len, input_size)
        self.RNN = nn.RNN(input_size, hidden_size, batch_first=True, dropout=dropout)
        self.ff = nn.Linear(hidden_size, vocab_len) # predicting next input autoregressively

    def forward(self, x):
        # x should be (batch_size, seq_len)
        x = self.embed(x) # (batch_size, seq_len, input_size)
        x, _ = self.RNN(x) # (batch_size, seq_len, hidden_size)
        x_pred = self.ff(x) # batch_size, seq_len, vocab_len)

        return x_pred 

In [55]:
model_rnn = Music_RNN(vocab_len, seq_len, embed_size, dropout).to(device)



In [57]:
# adapt loss and optimizer as needed
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model_rnn.parameters(), lr=lr)

# train
for epoch in range(epochs):
    # Set to train
    model_rnn.train()
    # keep cumalitive losses
    total_losses = 0.0

    for batch in training_dataloader:
        inp = batch[:, :-1] # grabs all but last
        tgt = batch[:, 1:] # grabs all except first (shifted once)

        optimizer.zero_grad()

        preds = model_rnn.forward(inp) # make sure dimensions line up

        loss = criterion(preds.view(-1, vocab_len), tgt.reshape(-1))
        loss.backward()

        optimizer.step()

        total_losses += loss.item()

    print(f"Epoch: {epoch}, Loss: {total_losses / len(training_dataloader)}")


Epoch: 0, Loss: 2.6006146458242214


In [58]:
# Validation

model_rnn.eval()

total_losses = 0.0

# validation
with torch.no_grad():
    for batch in validation_dataloader:
        inp = batch[:, :-1] # grabs all but last
        tgt = batch[:, 1:] # grabs all except first (shifted once)

        preds = model_rnn.forward(inp) # make sure dimensions line up

        loss = criterion(preds.view(-1, vocab_len), tgt.reshape(-1))
        total_losses += loss.item()

    print(f"Loss: {total_losses / len(validation_dataloader)}")

Loss: 2.1302893652635464


# LSTM

# Transformer

YAYYYYYYYYY Transformer Time WOOOOOOOOO

In [5]:
#  Model variables
d_model = 128 # Embed Dim
n_decoder_layers = 4 # lower if not that complex and wanting speedup
n_heads = 4 # Number of Attention Heads
d_ff = 512 # Feed Forward Dimensionality (AIAYN paper reccomends 4 times d_model size)
learning_rate = 1e-5 # Maybe increase?
num_epochs = 1 # Change this later so it doesn't take 10 years to run

In [16]:
# Standard positonal encoding used here, could also try time encoding since notes have different timestamps
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, win_len):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(win_len, d_model) # (win_len, d_model)
        position = torch.arange(0, win_len, dtype=torch_type).unsqueeze(1) # (win_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()*(-torch.log(torch.tensor(10000.0))/d_model)) # (d_model/2)
        pe[:, 0::2] = torch.sin(position*div_term)
        pe[:, 1::2] = torch.cos(position*div_term)
        pe = pe.unsqueeze(0) # (1, win_len, d_model)
        self.pe = pe.to(device)

    def forward(self, x):
        print(x.size(1))
        x = x + self.pe[:, :x.size(1), :]
        return x


# mask
def generate_causal_mask(size):
    # size should be target size
    mask = torch.triu(torch.ones(size, size, device=device), diagonal=1)
    mask = mask.masked_fill(mask == 1, float('-inf'))
    '''
    [0, -inf, -inf, -inf]
    [0,   0,  -inf, -inf]
    [0,   0,    0,  -inf]
    [0,   0,    0,    0 ]
    Yay for triangle masking
    '''
    return mask


In [17]:
# Actual decoder
class MusicTransformer(nn.Module):
    def __init__(self, num_tokens, d_model=d_model, nhead=n_heads, dim_ff=d_ff, win_len=window_length, layers=n_decoder_layers):
        super(MusicTransformer, self).__init__()
        self.d_model = d_model
        self.nheads = nhead
        self.dim_ff = dim_ff
        self.win_len = win_len
        self.layers = layers

        # pre-transformer
        self.tok_embed = nn.Embedding(num_tokens, d_model)
        self.pos_enc = PositionalEncoding(d_model, win_len)
        self.out_proj = nn.Linear(d_model, num_tokens) # final projection for token prediction

        # transformer part
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_ff, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=self.layers)


    def forward(self, x, tgt_mask=None):
        '''
        x is the tokenized notes  # (batch_size, win_len)
        tgt_mask masks the predictions
        '''

        x_seq = self.tok_embed(x) # (batch_size, 1, d_model)
        x_seq = self.pos_enc(x_seq) # (batch_size, 1, d_model)

        decoder_output = self.decoder(x_seq, x_seq, tgt_mask=tgt_mask) # (batch_size, win_len, d_model)
        pred = self.out_proj(decoder_output) # (batch, win_len, num_tokens) 
      
        return pred



In [18]:
my_mask = generate_causal_mask(window_length) # win_len by win_len mask

# Create model
num_tokens = len(set(list(data)))
model = MusicTransformer(num_tokens, d_model, n_heads, d_ff, window_length, n_decoder_layers).to(device)

# Use Adam cause he's so cool
optimizer = Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0 # cumalative loss

    # Cycle through each batch
    for batch in training_dataloader:
        inp = batch[:, :-1] # grabs all but last
        tgt = batch[:, 1:] # grabs all except first (shifted once)


        optimizer.zero_grad()
        # Pass through model
        output = model(inp)
        # Determine loss
        loss = criterion(output.view(-1, num_tokens), tgt.reshape(-1))
        # Update weights
        loss.backward()
        optimizer.step()
        # Add to epoch_loss
        epoch_loss += loss.item()

    # Now show average loss for epoch
    avg_epoch_loss = epoch_loss / len(training_dataloader)
    print(f"Epoch: [{epoch+1}/{num_epochs}]   Epoch Average Loss: {avg_epoch_loss}")


1023
1023
1023
1023
1023
1023
1023
1023
1023
1023
1023
1023


KeyboardInterrupt: 

In [None]:
# generate music
# for now, generate a fixed length, if we add start and end tokens then we can also do that
def generate_music(model, start_tokens, temperature=1.0, num_to_generate=window_length):
    model.eval()

    predicted_tokens = torch.tensor(start_tokens).unsqueeze(0) # (1, len(start_tokens))
    # iteratively predict then add to start_tokens
    for i in range(num_to_generate):
        with torch.no_grad():
            logits = model(predicted_tokens) # (1, len(predicted_tokens), num_tokens)
            logits = logits[:, -1, :] # grabs last token 
            probs = torch.softmax(logits / temperature, dim=-1) # (1, num_tokens)
            pred_token = torch.multinomial(probs, num_samples=1) # (1, 1)

        predicted_tokens = torch.cat([predicted_tokens, pred_token], dim=1) # (1, len(current sequence length))

    return predicted_tokens.squeeze(0).to_list()




In [22]:
for batch in validation_dataloader:
    bleh = generate_music(model, batch[0][:4], temperature=1.0)
    print(bleh)
    break

  predicted_tokens = torch.tensor(start_tokens).unsqueeze(0) # (1, len(start_tokens))


4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
27

RuntimeError: The size of tensor a (1025) must match the size of tensor b (1024) at non-singleton dimension 1