In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Authenticate the Colab user, then fetch the default Google credentials.
from google.colab import auth
auth.authenticate_user()
import gspread
from google.auth import default
creds, _ = default()

# gspread client authorized with the credentials obtained above.
gc = gspread.authorize(creds)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Previous Transformer

In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class TransformerModel(nn.Module):
    """Transformer encoder over token ids with a linear head producing ``ntoken`` logits.

    Token ids are embedded (scaled by sqrt(ninp)), passed through
    ``PositionalEncoding``, a stack of ``nlayers`` encoder layers, and finally a
    linear projection back to the vocabulary size.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        single_layer = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(single_layer, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0 on and below the diagonal, -inf above it."""
        blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32)
        # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes the rest.
        return torch.triu(blocked, diagonal=1)

    def init_weights(self):
        """Uniformly initialise embedding and decoder weights; zero the decoder bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src, src_mask):
        """Return vocabulary logits for the token ids in ``src`` under ``src_mask``."""
        embedded = self.encoder(src) * math.sqrt(self.ninp)
        encoded = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(encoded)

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added along dim 0 of the input.

    Args:
        d_model: size of the last (feature) dimension of the inputs.
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions precomputed in the ``pe`` buffer.

    The buffer ``pe`` has shape (max_len, 1, d_model); even channels hold sines
    and odd channels hold cosines of the position times a geometric frequency.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one fewer cosine column than sine column,
        # so trim div_term to the number of odd-indexed channels.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over dim 1.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding for positions 0..x.size(0)-1 to ``x`` and apply dropout.

        Positions are indexed by dim 0 of ``x``; the encoding broadcasts over dim 1.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Get the data

In [None]:
import numpy as np
import pandas as pd
import json
import itertools


In [None]:
main_path = '/content/drive/MyDrive/MyProject/Moonboard/Videos/' # To modify with your path
my_path = '/content/drive/MyDrive/MyProject/Moonboard/' # To modify with your path
move_seq_path = my_path + 'MoveSeqs/'
holds_seq_path = my_path + 'HoldsSeqs/'

In [None]:
worksheet = pd.read_csv(my_path + 'videos.csv')
worksheet = worksheet.iloc[:,1:]
worksheet

Unnamed: 0,name
0,0000.mp4
1,0001.mp4
2,0002.mp4
3,0003.mp4
4,0004.mp4
5,0005.mp4
6,0006.mp4
7,0007.mp4
8,0008.mp4
9,0009.mp4


### Test with one sequence

In [None]:
# Take the first entry of the worksheet and read its move-sequence CSV,
# dropping the first two CSV columns.
video = worksheet.iloc[0,0]
target = pd.read_csv(move_seq_path + video + '_MOVE_SEQ.csv').iloc[:,2:]

# First two remaining columns are used as coordinates, the third as tokens.
target_coords = target.iloc[:,:2]
target_tokens = target.iloc[:,2]

In [None]:
# target_coords

We discretize the coordinates on a grid by rounding them to `nb_decimals` decimal places

In [None]:
nb_decimals = 2


def standardize_df(df):
    """Return a new frame with every value of ``df`` rounded to ``nb_decimals`` decimals."""
    rounded = df.round(decimals=nb_decimals)
    return rounded

In [None]:
target_coords = standardize_df(target_coords)
target_coords

Unnamed: 0,x,y
0,0.24,0.63
1,0.42,0.57
2,0.42,0.4
3,0.21,0.66
4,0.74,0.32
5,0.46,0.57
6,0.4,0.41


We define a position vocabulary for the discretized coordinates

In [None]:
# Map each grid value k / 10**nb_decimals to the integer token k; the extra
# key -1 (dummy coordinate) maps to the additional token 10**nb_decimals.
position_vocabulary = {i/(10**nb_decimals) : i for i in range(10**nb_decimals)}
position_vocabulary[-1] = 10**nb_decimals

# Convert the coordinates using the above defined tokenizer.
# The columns are reassigned instead of calling replace(inplace=True) on
# target_coords["x"]: that chained in-place edit acts on a temporary Series and
# does not update the frame when pandas Copy-on-Write is active.
target_coords = target_coords.assign(
    x=target_coords["x"].replace(position_vocabulary),
    y=target_coords["y"].replace(position_vocabulary),
)

target_coords

Unnamed: 0,x,y
0,24.0,63.0
1,42.0,57.0
2,42.0,40.0
3,21.0,66.0
4,74.0,32.0
5,46.0,57.0
6,40.0,41.0


In order to work with this sequence, we need to put it in a tensor

In [None]:
def convert_df_into_tensor(df):
    """Convert a numeric dataframe into a float32 tensor of the same shape.

    Row ``i`` of the result holds the values of row ``i`` of ``df``.
    WARNING: the column names are not registered, so the column order has to be
    implicitly respected by the caller.

    Raises:
        TypeError: if the frame holds values that cannot be converted to float.
    """
    try:
        # One vectorised conversion instead of copying the frame row by row.
        values = df.to_numpy(dtype=np.float32)
    except (TypeError, ValueError) as exc:
        raise TypeError(f"dataframe cannot be converted to a float tensor: {exc}") from exc

    return torch.tensor(values)

target_coords_tensor = convert_df_into_tensor(target_coords)

Now we generate permutations of the target sequence to get the input sequences. We also generate the token sequence (0,1,2...), which will be shuffled accordingly to get the target output sequence. We also generate the trivial token vocabulary

In [None]:
MAX_LENGTH = 20


def generate_token_sequences(nb_seqs, length = MAX_LENGTH):
    """Return an int64 tensor of shape (nb_seqs, length) whose rows are 0, 1, ..., length - 1."""
    single_row = torch.arange(length, dtype=torch.int64)
    return single_row.repeat(nb_seqs, 1)


# Index -> name and name -> index tables for hold_0 ... hold_MAX_LENGTH;
# the last one is the additional token that will be used for padding.
inverse_token_vocabulary = dict(enumerate(f'hold_{i}' for i in range(MAX_LENGTH + 1)))
token_vocabulary = {name: idx for idx, name in inverse_token_vocabulary.items()}

In [None]:
def pad_target_coords(target_coords_tensor):
    """Pad a coordinate tensor to MAX_LENGTH rows by repeating its last row.

    Returns:
        padded_seq: (MAX_LENGTH, 2) tensor holding the input rows followed by
            copies of the last input row (the last hold, not an imaginary one).
        padded_indices: (MAX_LENGTH, 1) integer tensor with 0..n-1 for the n
            input rows and the value n for every padded row.
    """
    n_holds = target_coords_tensor.shape[0]

    padded_seq = torch.empty((MAX_LENGTH, 2))
    padded_seq[n_holds:] = target_coords_tensor[-1]
    padded_seq[:n_holds] = target_coords_tensor

    # Real rows keep their own index; clamping gives every padded row the index n.
    padded_indices = torch.arange(MAX_LENGTH).clamp(max=n_holds).view(-1, 1)

    return padded_seq, padded_indices

In [None]:
# From the target sequence, generate randomly shuffled hold sequences for input
def generate_input_sequences(target, nb_perms = 2):
    """Build ``nb_perms`` shuffled copies of ``target`` together with their index rows.

    Every hold except the last one is shuffled; the last hold and the padding
    rows produced by ``pad_target_coords`` are appended unshuffled.

    Returns:
        input_seqs: list of ``nb_perms`` shuffled-and-padded coordinate tensors.
        output_seqs: list of ``nb_perms`` references to the padded target.
        a tensor stacking, per permutation, the original index of the hold found
        at each position of the shuffled sequence.

    NOTE(review): the stacked index rows are the permutation itself; confirm this
    is the intended training target rather than its inverse.
    """
    n_holds = target.shape[0]
    last_index = n_holds - 1
    target_padded, padded_indices = pad_target_coords(target)

    # Identical for every permutation: last hold plus padding, with their indices.
    fixed_tail = torch.cat((target_padded[last_index:], padded_indices[last_index:]), 1)

    token_rows = generate_token_sequences(nb_seqs = nb_perms, length = n_holds)

    input_seqs, output_seqs, shuffled_token_rows = [], [], []
    for tokens in token_rows:
        # Attach each hold's original index as a third column before shuffling.
        indexed_target = torch.cat((target, tokens.view(-1, 1)), 1)
        shuffled_head = torch.Tensor(np.random.permutation(indexed_target[:-1]))
        shuffled = torch.cat((shuffled_head, fixed_tail), 0)

        input_seqs.append(torch.Tensor(shuffled[:, :2]))
        shuffled_token_rows.append(torch.Tensor(shuffled[:, 2:]).view(-1))
        output_seqs.append(target_padded)

    return input_seqs, output_seqs, torch.vstack(shuffled_token_rows)

# target_coords_tensor = pad_target_coords(target_coords_tensor)
input_seqs, output_seqs, output_token_seqs = generate_input_sequences(target_coords_tensor)

In [None]:
output_seqs 

[tensor([[24., 63.],
         [42., 57.],
         [42., 40.],
         [21., 66.],
         [74., 32.],
         [46., 57.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.]]), tensor([[24., 63.],
         [42., 57.],
         [42., 40.],
         [21., 66.],
         [74., 32.],
         [46., 57.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.]])]

Finally, we have to pad the sequences

In [None]:
# Dummy input sequence of length MAX_LENGTH, used when padding the batch
def generate_dummy_sequence(dummy_char = -1):
    """Return a (MAX_LENGTH, 2) tensor in which every entry equals ``dummy_char``."""
    dummy_shape = (MAX_LENGTH, 2)
    return torch.full(dummy_shape, fill_value = dummy_char)

dummy_seq = generate_dummy_sequence()
outputs = output_seqs + [dummy_seq]
inputs = input_seqs + [dummy_seq]

In [None]:
# Now we use this function to pad the coordinates sequences
inputs_coords = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=position_vocabulary[-1])
outputs_coords = torch.nn.utils.rnn.pad_sequence(outputs, batch_first=True, padding_value=position_vocabulary[-1])


# print(inputs_coords)

In [None]:
# To get the full input token sequences, we just generate them with length = MAX_LENGTH
inputs_token_seqs = generate_token_sequences(nb_seqs = inputs_coords.shape[0] - 1, length = MAX_LENGTH)
inputs_token_seqs[:,output_token_seqs.shape[1]:] = MAX_LENGTH
# To get the full target token sequences, we take the input ones and replace the first part (non padded) by the target sequences previously generated
# output_token_seqs = torch.where()

We now have 4 lists:
- output_seqs (nb_perms, len(sequence), 2) containing the coordinates and the limbs of the target holds sequence, ordered, repeated nb_perms times
- input_seqs (nb_perms, len(sequence), 2), same, but the nb_perms objects are permuted version of the target sequence
- input_token_seqs (nb_perms, len(sequence)) containing the numbers from 0 to len(sequence) - 1, representing the holds in the input sequence
- output_token_seqs (nb_perms, len(sequence)), same, but they are permuted in the same way as the coordinates, so that we have the actual target sequence to look for

In [None]:
print(inputs_token_seqs.shape)

torch.Size([2, 20])


In [None]:
# Clone first: a plain assignment would alias inputs_token_seqs, so writing the
# shuffled tokens below would overwrite the input token sequences as well.
outputs_token_seqs = inputs_token_seqs.clone()
outputs_token_seqs[:,:output_token_seqs.shape[1]] = output_token_seqs
print(outputs_token_seqs.shape)

torch.Size([2, 20])


Now, in order to work (?), the Transformer should complete the sequences by looking at a concatenation of the inputs and outputs. We do so by obtaining 2 sequences of length 39, one made of the 20 inputs and first 19 outputs, this will be our INPUT. The other is made of the last 19 inputs and the 20 outputs, this is our OUTPUT.

In [None]:
inputs_tokens_cat = torch.cat((inputs_token_seqs, outputs_token_seqs[:,:-1]), dim=1)
outputs_tokens_cat = torch.cat((inputs_token_seqs[:,1:], outputs_token_seqs), dim=1)
inputs_coords_cat = torch.cat((inputs_coords, outputs_coords[:,:-1]), dim=1)
outputs_coords_cat = torch.cat((inputs_coords[:,1:], outputs_coords), dim=1)

In [None]:
inputs_coords_cat

tensor([[[21., 66.],
         [46., 57.],
         [42., 57.],
         [42., 40.],
         [24., 63.],
         [74., 32.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [24., 63.],
         [42., 57.],
         [42., 40.],
         [21., 66.],
         [74., 32.],
         [46., 57.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.],
         [40., 41.]],

        [[42., 57.],
         [74., 32.],
         [46., 57.],
         [42., 40.],
         [21., 66.],
         [24., 63.],
         [40., 41.],
         [40., 41.],
         [4

### Now that we have all of these functions, we can process the entire dataframe by reading the data for all sequences.

In [None]:
def prepare_data(sheet, nb_perms = 30):
    """Build the concatenated coordinate and token sequences for every climb in ``sheet``.

    For each video named in the first column of ``sheet``, the move-sequence CSV
    is read, its coordinates are discretized, and ``nb_perms`` shuffled input
    sequences are generated. A video is skipped when its CSV is missing, when it
    has 4 holds or fewer, or when processing it raises ``TypeError``.

    Returns:
        inputs_coords_cat: shuffled input coordinates followed by all but the
            last target coordinate of each sequence.
        outputs_coords_cat: all but the first input coordinate followed by the
            target coordinates of each sequence.
        inputs_tokens: token rows laid out like ``inputs_coords_cat``.
        outputs_tokens: token rows laid out like ``outputs_coords_cat``.
    """
    input_seqs = []
    output_seqs = []
    inputs_tokens_chunks = []
    outputs_tokens_chunks = []

    nb_videos = sheet.shape[0]
    for i in range(nb_videos):
        if i%10 == 0:
            print(f'Preparing data for video {i}/{nb_videos - 1}')
        video = sheet.iloc[i,0]
        try:
            target = pd.read_csv(move_seq_path + video + '_MOVE_SEQ.csv').iloc[:,2:]
        except FileNotFoundError:
            # No move sequence file exists for this video: nothing to prepare.
            continue

        target_coords = target.iloc[:,:2]
        if target_coords.shape[0] <= 4:
            continue

        try:
            target_coords = standardize_df(target_coords)
            # Reassign the columns instead of a chained replace(inplace=True),
            # which does not update the frame under pandas Copy-on-Write.
            target_coords = target_coords.assign(
                x=target_coords["x"].replace(position_vocabulary),
                y=target_coords["y"].replace(position_vocabulary),
            )

            target_coords_tensor = convert_df_into_tensor(target_coords)

            input_seqs_video, output_seqs_video, output_token_seqs_video = generate_input_sequences(target_coords_tensor, nb_perms=nb_perms)

            # Copy the shuffled token sequences
            nb_tokens = output_token_seqs_video.shape[1]
            input_token_seqs = generate_token_sequences(nb_perms)
            input_token_seqs[:,nb_tokens:] = MAX_LENGTH # the last ones are padded with the additional token
            outputs_token_seqs = torch.clone(input_token_seqs)
            outputs_token_seqs[:,:nb_tokens] = output_token_seqs_video
        except TypeError as err:
            # Best effort: report the video instead of dropping it silently.
            print(f'Skipping video {video}: {err}')
            continue

        # Only record a video once every step succeeded, so that coordinates and
        # tokens always stay aligned.
        input_seqs += input_seqs_video
        output_seqs += output_seqs_video
        inputs_tokens_chunks.append(torch.cat((input_token_seqs, outputs_token_seqs[:,:-1]), dim=1))
        outputs_tokens_chunks.append(torch.cat((input_token_seqs[:,1:], outputs_token_seqs), dim=1))

    # Concatenate once; the leading empty float tensor keeps the result's dtype
    # (and the empty result when no video was usable) as before.
    inputs_tokens = torch.cat([torch.Tensor()] + inputs_tokens_chunks)
    outputs_tokens = torch.cat([torch.Tensor()] + outputs_tokens_chunks)

    # Pad the sequences
    dummy_seq = generate_dummy_sequence()
    outputs = output_seqs + [dummy_seq]
    inputs = input_seqs + [dummy_seq]

    inputs_coords = torch.nn.utils.rnn.pad_sequence(inputs, batch_first=True, padding_value=position_vocabulary[-1])
    outputs_coords = torch.nn.utils.rnn.pad_sequence(outputs, batch_first=True, padding_value=position_vocabulary[-1])

    # Remove the dummy sequence
    nb_seqs = inputs_coords.shape[0] - 1
    inputs_coords = inputs_coords[:nb_seqs]
    outputs_coords = outputs_coords[:nb_seqs]

    inputs_coords_cat = torch.cat((inputs_coords, outputs_coords[:,:-1]), dim=1)
    outputs_coords_cat = torch.cat((inputs_coords[:,1:], outputs_coords), dim=1)

    return inputs_coords_cat, outputs_coords_cat, inputs_tokens, outputs_tokens

inputs_coords, outputs_coords, inputs_tokens, outputs_tokens = prepare_data(worksheet, nb_perms=30)
print(f'Prepared sequences of shape {inputs_coords.shape}')

Preparing data for video 0/19
Preparing data for video 10/19
Prepared sequences of shape torch.Size([600, 39, 2])


In [None]:
print(inputs_coords.shape)
print(outputs_coords.shape)
print(inputs_tokens.shape)
print(outputs_tokens.shape)

torch.Size([600, 39, 2])
torch.Size([600, 39, 2])
torch.Size([600, 39])
torch.Size([600, 39])


In [None]:
inputs_coords[1]

tensor([[42., 40.],
        [46., 57.],
        [74., 32.],
        [42., 57.],
        [24., 63.],
        [21., 66.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [24., 63.],
        [42., 57.],
        [42., 40.],
        [21., 66.],
        [74., 32.],
        [46., 57.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.],
        [40., 41.]])

In [None]:
inputs_tokens[1]

tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15., 16., 17., 18., 19.,  2.,  5.,  4.,  1.,  0.,  3.,  6.,  7.,
         7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.])

In [None]:
outputs_tokens[1]

tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
        15., 16., 17., 18., 19.,  2.,  5.,  4.,  1.,  0.,  3.,  6.,  7.,  7.,
         7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.,  7.])

Finally we concatenate all the data to create the input and target tensors

In [None]:
# Length of one concatenated sequence.
sequence_length = 2 * MAX_LENGTH - 1
# Append the token id as a third feature next to the two coordinates.
# NOTE: `input` shadows the Python builtin of the same name for the rest of the notebook.
input = torch.cat((inputs_coords, inputs_tokens.view(-1, sequence_length, 1)), 2)
# Flatten all target tokens into one vector; this rebinds `target`.
target = outputs_tokens.view(-1)

In [None]:
print(input.shape)
print(target.shape)

torch.Size([600, 39, 3])
torch.Size([23400])


# At that point, we have all the input and output sequences prepared, both for coordinates and tokens. We now have to feed them into the Transformer.

First, we adapt the position embedding to work with our data format

In [None]:
class PositionalEncoding_modified(nn.Module):
    """Sinusoidal encoding of 2-D coordinates, added to already-embedded tokens.

    Channels come in groups of four: sin and cos of the first coordinate,
    followed by sin and cos of the second coordinate, each scaled by a
    geometric frequency. Within a group the first coordinate uses the
    even-indexed frequency and the second coordinate the odd-indexed one.

    Args:
        d_model: number of channels of the encoding (and of the embeddings).
        dropout: dropout probability applied after adding the encoding.
        max_len: unused; kept so the signature matches ``PositionalEncoding``.
    """

    def __init__(self, d_model, dropout=0.1, max_len = 5000):
        super(PositionalEncoding_modified, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model

    def pe(self, position):
        """Return the encoding for ``position``, indexed as [:, :, 0] and [:, :, 1].

        The result has shape (position.shape[0], position.shape[1], d_model) and
        lives on the same device as ``position``.
        """
        # Work on the input's own device: no dependency on a global `device`,
        # and the frequency vector no longer stays behind on the CPU.
        dev = position.device

        # (An earlier experiment that added both scaled coordinates together was
        # judged "NOT GOOD" by the author and is not used.)
        half = torch.exp(
            torch.arange(0, self.d_model, 2, device=dev).float() * (-math.log(10000.0) / self.d_model)
        )
        # Each frequency is used by two consecutive channels: f0, f0, f1, f1, ...
        # Truncating handles an odd d_model.
        div_term = half.repeat_interleave(2)[: self.d_model]

        # Broadcasting (B, S, 1) * (d_model,) yields one scaled copy per channel.
        pos_x_scaled = position[:, :, 0:1] * div_term
        pos_y_scaled = position[:, :, 1:2] * div_term

        pe = torch.zeros(position.shape[0], position.shape[1], self.d_model, device=dev)

        # Embed x
        pe[:, :, 0::4] = torch.sin(pos_x_scaled[:, :, 0::4])
        pe[:, :, 1::4] = torch.cos(pos_x_scaled[:, :, 1::4])

        # Embed y
        pe[:, :, 2::4] = torch.sin(pos_y_scaled[:, :, 2::4])
        pe[:, :, 3::4] = torch.cos(pos_y_scaled[:, :, 3::4])

        return pe

    def forward(self, x_encoded, coords):
        """Add the coordinate encoding of ``coords`` to ``x_encoded`` and apply dropout."""
        pos_emb = self.pe(coords)
        y = x_encoded + pos_emb
        return self.dropout(y)

In [None]:
# # Test the position embedding with no token embedding for now (keeping the 30 holds sequence)
# x = input[0].view(1,30,3).to(device)
# # print(x)
# pos_model = PositionalEncoding_modified(MAX_LENGTH)
# pos_emb = pos_model(x[:,:,2], x[:,:,:2])
# # pos_emb

In [None]:
class TransformerModel_modified(nn.Module):
    """Transformer encoder over (x, y, token) triples.

    The token id in ``src[:, :, 2]`` is embedded, the coordinates in
    ``src[:, :, :2]`` are added through ``PositionalEncoding_modified``, a regular
    ``PositionalEncoding`` is added on top, and the encoder output is projected
    to ``ntoken`` logits.

    NOTE(review): the encoder layers are built without ``batch_first`` and the
    regular positional encoder indexes positions along dim 0, so dim 0 of ``src``
    is what acts as the sequence axis here. Confirm callers do not pass
    batch-first tensors.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding_modified(ninp, dropout)
        self.regular_pos_encoder = PositionalEncoding(ninp, dropout)
        single_layer = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(single_layer, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0 on and below the diagonal, -inf above it."""
        blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32)
        # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes the rest.
        return torch.triu(blocked, diagonal=1)

    def init_weights(self):
        """Uniformly initialise embedding and decoder weights; zero the decoder bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src, src_mask):
        """Return vocabulary logits for ``src`` under the attention mask ``src_mask``."""
        token_ids = src[:, :, 2].int()
        coords = src[:, :, :2]

        embedded = self.encoder(token_ids) * math.sqrt(self.ninp)
        # Position embedding based on the coordinates
        with_coords = self.pos_encoder(embedded, coords)
        # Regular (index-based) position embedding on top
        with_order = self.regular_pos_encoder(with_coords)

        hidden = self.transformer_encoder(with_order, src_mask)
        return self.decoder(hidden)

Actual use of the transformer

In [None]:
# Fractions of the samples used for training and validation; the rest is the test set.
train_per, val_per = 0.6, 0.2
train_size = int(train_per * input.shape[0])
# Targets are indexed in flat form: input.shape[1] consecutive entries per sample.
target_train_size= train_size * input.shape[1]
val_size = int(val_per * input.shape[0])
target_val_size= val_size * input.shape[1]

# Device on which all splits (and later the model) are placed.
device = torch.device("cpu")
# Contiguous, unshuffled slices: train first, then validation, then test.
train_data = input[:train_size].to(device)
train_target = target[:target_train_size].to(device)
val_data = input[train_size: train_size + val_size].to(device)
val_target = target[target_train_size: target_train_size + target_val_size].to(device)
test_data = input[train_size + val_size:].to(device)
test_target = target[target_train_size + target_val_size:].to(device)

In [None]:
bptt = 100


def get_batch(input_source, target_source, i):
    """Return up to ``bptt`` rows of ``input_source`` from row ``i`` and their flat targets.

    ``target_source`` is read as a flat vector holding ``input_source.shape[1]``
    entries per row. The last row of ``input_source`` is never included in a
    batch. Targets are returned as int64.
    """
    width = input_source.shape[1]
    target_start = width * i

    n_rows = min(bptt, len(input_source) - 1 - i)
    n_targets = min(bptt * width, len(target_source) - width - target_start)

    batch_rows = input_source[i:i + n_rows]
    batch_targets = target_source[target_start:target_start + n_targets].long()
    return batch_rows, batch_targets

data, targets = get_batch(train_data, train_target,  100)
print(data.shape, targets.shape)

torch.Size([100, 39, 3]) torch.Size([3900])


In [None]:
ntokens = MAX_LENGTH + 1# the size of vocabulary
emsize = 512 # embedding dimension
nhid = 512 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 4 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel_modified(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

In [None]:
bptt = 100
# Just a test to check the whole Transformer's application
src_mask = model.generate_square_subsequent_mask(bptt).to(device)
src, targets = get_batch(train_data, train_target, 0)
# Report the batch fetched above rather than a `data` variable left over from
# an earlier cell.
print("data ", src.shape)
src_encoded = model.encoder(src[:,:,2].int()) * math.sqrt(model.ninp)
print("encoded ", src_encoded.shape)
src = model.pos_encoder(src_encoded, src[:,:,:2])
output = model.transformer_encoder(src, src_mask)
output = model.decoder(output)

data  torch.Size([100, 39, 3])
encoded  torch.Size([100, 39, 512])


In [None]:
import time

# Loss, optimizer and learning-rate schedule shared by train() and evaluate().
criterion = nn.CrossEntropyLoss()
lr = 4.5*10**-4 # learning rate
# weight_decay was written `4.5**-2`, which evaluates to 1 / 4.5**2 (about 0.0494);
# given the way the learning rate is written, 4.5e-2 is presumably what was
# meant. NOTE(review): confirm the intended value.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.96), eps=10**(-8), weight_decay=4.5e-2)
# Multiply the learning rate by 0.95 after every scheduler step.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train():
    """Run one training epoch over ``train_data`` / ``train_target``.

    Relies on the notebook globals ``model``, ``optimizer``, ``criterion``,
    ``scheduler``, ``bptt``, ``ntokens``, ``device`` and, for logging only, on
    the ``epoch`` counter of the surrounding loop.
    """
    model.train() # Turn on the train mode
    log_interval = 1
    total_loss = 0.
    batches_since_log = 0
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, train_target, i)
        optimizer.zero_grad()
        if data.size(0) != bptt:
            # Shorter final batch: the mask has to match its size.
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        batches_since_log += 1
        if batch % log_interval == 0 and batch > 0:
            # Average over the batches actually accumulated: batch 0 is never
            # logged on its own, so the first report covers more than
            # log_interval batches.
            cur_loss = total_loss / batches_since_log
            elapsed = time.time() - start_time
            # Scientific notation for lr: a fixed two-decimal format prints 0.00
            # for learning rates of this magnitude.
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:.2e} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                    elapsed * 1000 / batches_since_log,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            batches_since_log = 0
            start_time = time.time()

def evaluate(eval_model, data_source, target_source):
    """Return the mean loss of ``eval_model`` over ``data_source`` / ``target_source``.

    Each batch loss is weighted by its number of rows and the sum is divided by
    ``len(data_source) - 1``, because ``get_batch`` never serves the last row.
    Relies on the notebook globals ``criterion``, ``bptt``, ``ntokens`` and ``device``.
    """
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    # Build the masks from the model being evaluated, not from the global `model`.
    src_mask = eval_model.generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, target_source, i)
            if data.size(0) != bptt:
                src_mask = eval_model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [None]:
import copy

best_val_loss = float("inf")
epochs = 10 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data, val_target)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot the model: `best_model = model` would only alias the live
        # model, which keeps training, so "best" would always be the final weights.
        best_model = copy.deepcopy(model)

    scheduler.step()

| epoch   1 |     1/    3 batches | lr 0.00 | ms/batch 5525.71 | loss  7.18 | ppl  1312.65
| epoch   1 |     2/    3 batches | lr 0.00 | ms/batch 3280.56 | loss  2.44 | ppl    11.50
| epoch   1 |     3/    3 batches | lr 0.00 | ms/batch 2647.08 | loss  2.62 | ppl    13.71
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 13.23s | valid loss  2.58 | valid ppl    13.16
-----------------------------------------------------------------------------------------
| epoch   2 |     1/    3 batches | lr 0.00 | ms/batch 10673.17 | loss  3.40 | ppl    30.04
| epoch   2 |     2/    3 batches | lr 0.00 | ms/batch 4094.71 | loss  1.70 | ppl     5.45
| epoch   2 |     3/    3 batches | lr 0.00 | ms/batch 1529.45 | loss  1.08 | ppl     2.94
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 17.28s | valid loss  2.53 | valid ppl    12.49
-----------------------------------

In [None]:
test_loss = evaluate(best_model, test_data, test_target)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| End of training | test loss  2.15 | test ppl     8.56


In [None]:
# Just to stop the execution if execute all
# br

In [None]:
def tensor_to_tokens(my_tensor):
    """Return the hold name of every entry of ``my_tensor``.

    Each entry is cast to ``int`` and looked up in ``inverse_token_vocabulary``.
    """
    return [inverse_token_vocabulary[int(value)] for value in my_tensor]

In [None]:
def write_sentence(xx):
    """Print the hold names of the flattened tensor ``xx``, each followed by a space."""
    words = tensor_to_tokens(xx.reshape(-1))
    print("".join(f"{word} " for word in words))
    
def complete_sentence(xx, length, src_mask):
    """Greedily predict ``length`` tokens with the global ``model``, one step at a time.

    At step ``i`` the model is run on ``xx``, the arg-max label at position
    ``MAX_LENGTH + i - 1`` is taken as the prediction, and the row of ``xx``
    whose token (feature index 2) equals that label is copied into the next
    position of ``xx``. ``xx`` is therefore modified in place.

    Returns:
        sentence: ``"|"`` followed by the predicted hold names, space separated.
        output_tensor: tensor of size ``MAX_LENGTH + length - 1`` holding the
            tokens ``xx[0, 1:MAX_LENGTH, 2]`` followed by the predicted labels.

    NOTE(review): the model is called without ``torch.no_grad()`` and without
    switching to eval mode here; presumably the caller is expected to have done so.
    """
    sentence = ""
    # print(xx)
    # for word in tensor_to_tokens(xx[0,:,2]):
    #     sentence+= word +" "
    sentence += "|"
    # create new tokens
    output_tensor = torch.zeros(MAX_LENGTH + length - 1)
    output_tensor[:MAX_LENGTH-1] = xx[0,1:MAX_LENGTH,2]
    for i in range(length):
        # Compute output of the model from the current sequence
        out = model(xx.to(device), src_mask)[0]
        # Take the label with max probability
        labels = torch.argmax(out, 1).view(-1)
        # Extract the corresponding token and append it to the sequence
        # IMPORTANT: we have to take the token corresponding to the actual hold, so the i-th member of the output
        # which is placed at the MAX_LENGTH + i index of the full concatenation
        idx_new_pred = MAX_LENGTH+i-1
        # print('idx', idx_new_pred)
        new_token = tensor_to_tokens(labels.reshape(-1))[idx_new_pred]
        # print('token', new_token)
        sentence += new_token + " "

        # Now we want to update the current sequence by appending the information 
        # corresponding to the computed token

        # Compute the index of the corresponding token in the input sequence
        output_tensor[idx_new_pred] = labels.reshape(-1)[idx_new_pred].item()
        # First position of xx whose token equals the predicted label; indexing
        # the argwhere result with [0] fails if no position matches.
        index_xx_hold = torch.argwhere(xx[:,:,2] == labels.reshape(-1)[idx_new_pred].item())[0][-1]
        prediction = xx[:,index_xx_hold,:].reshape(1,1,3)

        # We update the corresponding hold in the sentence
        if (xx.shape[1] > idx_new_pred + 1):
          xx[:, idx_new_pred+1, :] = prediction
          # print(xx)
          # The mask is rebuilt with size len(xx), i.e. the size of dim 0 of xx.
          src_mask = model.generate_square_subsequent_mask(len(xx)).to(device)
    # print(sentence[sentence.find('|'):])

    return sentence, output_tensor
    
    
# t = "At the time of his marriage, William's father, John Yeats, was studying law, but would later pursue art studies at Heatherley School of Fine Art, in London. William's mother, Susan Mary Pollexfen, came from Sligo, from a wealthy merchant family, which owned a"
# t = torch.tensor(vocab(tokenizer(t)))
# src_mask = model.generate_square_subsequent_mask(len(t)).to(device)
# t = t.reshape([-1, 1]).to(device)

# complete_sentence(t, 100, src_mask)  

In [None]:
# Compare the model's greedy completion with the target tokens for a few sequences.
idx_to_check = range(100,120)
for i in idx_to_check:
  # Single sequence reshaped to (1, sequence length, features).
  t = input[i].view(1, input.shape[1],-1)
  t_input = t.clone()
  t_input[:,MAX_LENGTH:,:] = t_input[:,0,:].repeat(1,MAX_LENGTH-1,1) # Replace last parts of the input with garbage, it is anyway not used at inference
  # len(t_input) is the size of dim 0 of t_input, which is 1 here.
  src_mask = model.generate_square_subsequent_mask(len(t_input)).to(device)
  t_input = t_input.to(device)

  sentence, output_tensor = complete_sentence(t_input, MAX_LENGTH, src_mask) 
  print("=================================================================")
  print(f"Checking sequence {i}")
  # Only the predicted part (from index MAX_LENGTH - 1 on) is compared.
  print('OUTPUT:', output_tensor[MAX_LENGTH-1:])
  print('TARGET:', outputs_tokens[i][MAX_LENGTH-1:])
# print(t_input)

Checking sequence 100
OUTPUT: tensor([1., 2., 3., 4., 5., 6., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
        7., 7.])
TARGET: tensor([12.,  0.,  7.,  4., 11.,  8., 10.,  5.,  9.,  3.,  1.,  2.,  6., 13.,
        14., 14., 14., 14., 14., 14.])
Checking sequence 101
OUTPUT: tensor([1., 2., 3., 4., 5., 6., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
        7., 7.])
TARGET: tensor([11.,  1.,  7.,  4.,  6.,  5., 12., 10.,  0.,  9.,  2.,  8.,  3., 13.,
        14., 14., 14., 14., 14., 14.])
Checking sequence 102
OUTPUT: tensor([1., 2., 3., 4., 5., 6., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
        7., 7.])
TARGET: tensor([ 9.,  2., 10.,  6., 12.,  4.,  0.,  8.,  5., 11.,  7.,  3.,  1., 13.,
        14., 14., 14., 14., 14., 14.])
Checking sequence 103
OUTPUT: tensor([1., 2., 3., 4., 5., 6., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7., 7.,
        7., 7.])
TARGET: tensor([ 7.,  4.,  1.,  0., 11.,  9.,  2.,  6.,  3.,  8., 12., 10.,  5., 13.,
        14., 14., 14., 14., 14.

In [None]:
# idx_to_check = 869
# t = input[idx_to_check].view(1, input.shape[1],-1)
# t_input = t.clone()
# # t_input[:,MAX_LENGTH:,:] = t_input[:,0,:].repeat(1,MAX_LENGTH-1,1)
# print(t_input)

In [None]:
# src_mask = model.generate_square_subsequent_mask(len(t_input)).to(device)
# t_input = t_input.to(device)

# sentence = complete_sentence(t_input, 60, src_mask) 

In [None]:
# outputs_tokens[idx_to_check]