In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import json
import regex
import os
from tqdm.notebook import tqdm
import math
import sys
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import *


In [2]:
# Read Character to Ordinal Encoding Mapping
with open('/kaggle/input/asl-fingerspelling/character_to_prediction_index.json') as json_file:
    char2ord = json.load(json_file)

display(pd.Series(char2ord).to_frame('Ordinal Encoding'))

Unnamed: 0,Ordinal Encoding
,0
!,1
#,2
$,3
%,4
&,5
',6
(,7
),8
*,9


In [3]:
frames=np.load('/kaggle/input/aslr-hands/X.npy', allow_pickle=True)
phrases = np.load('/kaggle/input/aslr-hands/Y.npy', allow_pickle=True)

# GLOBAL CONFIG

In [4]:
# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

In [5]:
MAX_SEQUENCE_LENGTH = 500
NUM_EPOCHS = 50
START_TOKEN = '<START>'
END_TOKEN = '<END>'
PADDING_TOKEN = '<PAD>'
BATCH_SIZE =32
d_model = 512
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 2
learning_rate = 1e-4
k=5
load_model = False
num_columns = frames[0].shape[1]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
sos_eos_pad= {START_TOKEN: len(char2ord), END_TOKEN: len(char2ord)+1, PADDING_TOKEN: len(char2ord)+2}
char2ord.update(sos_eos_pad)

phrase_vocab_size = len(char2ord)

display(char2ord)

{' ': 0,
 '!': 1,
 '#': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '=': 27,
 '?': 28,
 '@': 29,
 '[': 30,
 '_': 31,
 'a': 32,
 'b': 33,
 'c': 34,
 'd': 35,
 'e': 36,
 'f': 37,
 'g': 38,
 'h': 39,
 'i': 40,
 'j': 41,
 'k': 42,
 'l': 43,
 'm': 44,
 'n': 45,
 'o': 46,
 'p': 47,
 'q': 48,
 'r': 49,
 's': 50,
 't': 51,
 'u': 52,
 'v': 53,
 'w': 54,
 'x': 55,
 'y': 56,
 'z': 57,
 '~': 58,
 '<START>': 59,
 '<END>': 60,
 '<PAD>': 61}

# Preprocess

In [7]:
def sequence_pad(frame):
    if len(frame)<MAX_SEQUENCE_LENGTH:
        # Specify the number of empty rows to add
        num_empty_rows = MAX_SEQUENCE_LENGTH-len(frame)
        # Create empty rows filled with zeros
        empty_rows = torch.zeros((num_empty_rows, frame.shape[1])).to(device)

        # Concatenate the empty rows to the original array
        frame = torch.cat((frame, empty_rows), dim=0)
    else:
        frame = frame[:MAX_SEQUENCE_LENGTH]
        
    # Check for NaN values in the tensor
    nan_mask = torch.isnan(frame)

    # Replace NaN values with 0 using torch.where
    frame = torch.where(nan_mask, torch.tensor(0.0), frame)
    return(frame)

def sequence_normalize(frame):
    # Check for NaN values in the tensor
    nan_mask = torch.isnan(frame)

    # Replace NaN values with 0 using torch.where
    frame = torch.where(nan_mask, torch.tensor(0.0), frame)

    # Calculate the mean and standard deviation along the desired dimension (column-wise)
    mean = torch.mean(frame, dim=0)
    std = torch.std(frame, dim=0)

    # Normalize the data by subtracting the mean and dividing by the standard deviation
    normalized_frame = (frame-mean)/std
    return normalized_frame

# Masking

In [8]:
NEG_INFTY = -1e9

def create_masks(frame, phrase):
    look_ahead_mask = torch.full([MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH] , True) # Creates a tensor with all values = True
    #print(look_ahead_mask)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1) # Upper traingle = True only
    #print(look_ahead_mask)
    encoder_padding_mask = torch.full([MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH] , False)
    decoder_padding_mask_self_attention = torch.full([MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH] , False)
    decoder_padding_mask_cross_attention = torch.full([MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH] , False)
    #print(encoder_padding_mask)

    frame_length, eng_sentence_length = len(frame), len(phrase)
    frame_chars_to_padding_mask = np.arange(frame_length + 1, MAX_SEQUENCE_LENGTH)
    eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, MAX_SEQUENCE_LENGTH)
    encoder_padding_mask[:, frame_chars_to_padding_mask] = True
    encoder_padding_mask[frame_chars_to_padding_mask, :] = True
    decoder_padding_mask_self_attention[:, eng_chars_to_padding_mask] = True
    decoder_padding_mask_self_attention[eng_chars_to_padding_mask, :] = True
    decoder_padding_mask_cross_attention[:, eng_chars_to_padding_mask] = True
    decoder_padding_mask_cross_attention[eng_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [9]:
class CustomDataset():
    def __init__(self, frames, targets):
        self.frames = frames
        self.targets = targets
    
    def __len__(self):
        return self.frames.shape[0]
    
    def tokenize_and_pad(self, target, use_start):
        sentence_word_indices = np.array([char2ord[token] for token in list(target.lower())])
        if use_start:
            sentence_word_indices= np.insert(sentence_word_indices, 0, char2ord[START_TOKEN])
        sentence_word_indices = np.append(sentence_word_indices, char2ord[END_TOKEN])
        for _ in range(len(sentence_word_indices), MAX_SEQUENCE_LENGTH):
            sentence_word_indices = np.append(sentence_word_indices, char2ord[PADDING_TOKEN])

        return sentence_word_indices

    def __getitem__(self, idx):
        frame = torch.tensor(np.array(self.frames[idx])).to(device)
        phrase = self.targets[idx]
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(frame, phrase)
        frame = sequence_normalize(frame)
        frame = sequence_pad(frame)
        input_phrase = self.tokenize_and_pad(phrase, use_start = True)
        target = self.tokenize_and_pad(phrase, use_start = False)
        
        return {
            'frame': frame,
            'input_phrase': torch.tensor(input_phrase).to(device),
            'target': torch.tensor(target).to(device),
            'encoder_self_attention_mask' : encoder_self_attention_mask.to(device), 
            'decoder_self_attention_mask' : decoder_self_attention_mask.to(device),
            'decoder_cross_attention_mask' : decoder_cross_attention_mask.to(device) 
            
        }

In [10]:
dataset = CustomDataset(frames, phrases)
print('Processed frame shape:',dataset[0]['frame'].shape)
print('Processed input_phrase shape:',dataset[0]['input_phrase'].shape)
print('Processed target shape:',dataset[0]['target'].shape)
print('encoder_self_attention_mask shape:',dataset[0]['encoder_self_attention_mask'].shape)
print('decoder_self_attention_mask shape:',dataset[0]['decoder_self_attention_mask'].shape)
print('decoder_cross_attention_mask shape:',dataset[0]['decoder_cross_attention_mask'].shape)

Processed frame shape: torch.Size([500, 84])
Processed input_phrase shape: torch.Size([500])
Processed target shape: torch.Size([500])
encoder_self_attention_mask shape: torch.Size([500, 500])
decoder_self_attention_mask shape: torch.Size([500, 500])
decoder_cross_attention_mask shape: torch.Size([500, 500])


In [11]:
dataloader = DataLoader(dataset=dataset, batch_size = BATCH_SIZE)

In [12]:
for batch in dataloader:
    print('Frames shape of a batch:',batch['frame'].shape)
    print('Input phrase shape of a batch:',batch['input_phrase'].shape)
    break

Frames shape of a batch: torch.Size([32, 500, 84])
Input phrase shape of a batch: torch.Size([32, 500])


# Model

In [13]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self):
        even_i = torch.arange(0, self.d_model, 2).float()
        denominator = torch.pow(10000, even_i/self.d_model)
        position = (torch.arange(self.max_sequence_length)
                          .reshape(self.max_sequence_length, 1))
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE

In [14]:
class CustomEmbedding(nn.Module):
    def __init__(self, max_sequence_length, num_columns, d_model, encoder_embed):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.num_columns = num_columns
        if encoder_embed:
            #self.embedding = nn.Linear(num_columns, d_model)
            self.conv1 = nn.Conv1d(in_channels=num_columns, out_channels = 1024, kernel_size = 3, stride=1, padding = 'same').to(device)
            self.conv2 = nn.Conv1d(in_channels=1024, out_channels = d_model, kernel_size = 3, stride=1, padding = 'same').to(device)
        else:
            self.embedding = nn.Embedding(max_sequence_length, d_model).to(device)

        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1).to(device)
    
    def forward(self, x, encoder_embed):
        if encoder_embed:
            x = torch.transpose(x, -2, -1)
            x = self.conv1(x)
            x = self.conv2(x)
            x = torch.transpose(x, -2, -1).to(device)
        else:
            x = self.embedding(x)
        pos = self.position_encoder().to(device)
        x = self.dropout(x + pos)
        return x

In [15]:
'''class CustomEmbedding(nn.Module):
    def __init__(self, max_sequence_length, num_columns, d_model, encoder_embed):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.num_columns = num_columns
        if encoder_embed:
            self.embedding = nn.Linear(num_columns, d_model).to(device)
        else:
            self.embedding = nn.Embedding(max_sequence_length, d_model).to(device)
        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)
        self.dropout = nn.Dropout(p=0.1)
        
    def forward(self, x):
        x = self.embedding(x)
        pos = self.position_encoder().to(device)
        x = self.dropout(x + pos)
        return x'''

'class CustomEmbedding(nn.Module):\n    def __init__(self, max_sequence_length, num_columns, d_model, encoder_embed):\n        super().__init__()\n        self.max_sequence_length = max_sequence_length\n        self.num_columns = num_columns\n        if encoder_embed:\n            self.embedding = nn.Linear(num_columns, d_model).to(device)\n        else:\n            self.embedding = nn.Embedding(max_sequence_length, d_model).to(device)\n        self.position_encoder = PositionalEncoding(d_model, max_sequence_length)\n        self.dropout = nn.Dropout(p=0.1)\n        \n    def forward(self, x):\n        x = self.embedding(x)\n        pos = self.position_encoder().to(device)\n        x = self.dropout(x + pos)\n        return x'

In [16]:
custom = CustomEmbedding(500, 84, 512,True)
input = torch.randn(32,500,84).to(device)
custom(input, True).shape

torch.Size([32, 500, 512])

In [17]:
class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [18]:
def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    scaled = torch.matmul(q, k.transpose(-1, -2)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float))
    if mask is not None:
        scaled = scaled.permute(1, 0, 2, 3) + mask
        scaled = scaled.permute(1, 0, 2, 3)
    attention = F.softmax(scaled, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        batch_size, sequence_length, d_model = x.size()
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask)
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out

In [19]:
class MultiHeadCrossAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.kv_layer = nn.Linear(d_model , 2 * d_model)
        self.q_layer = nn.Linear(d_model , d_model)
        self.linear_layer = nn.Linear(d_model, d_model)

    def forward(self, x, y, mask):
        batch_size, sequence_length, d_model = x.size() # in practice, this is the same for both languages...so we can technically combine with normal attention
        kv = self.kv_layer(x)
        q = self.q_layer(y)
        kv = kv.reshape(batch_size, sequence_length, self.num_heads, 2 * self.head_dim)
        q = q.reshape(batch_size, sequence_length, self.num_heads, self.head_dim)
        kv = kv.permute(0, 2, 1, 3)
        q = q.permute(0, 2, 1, 3)
        k, v = kv.chunk(2, dim=-1)
        values, attention = scaled_dot_product(q, k, v, mask) # We don't need the mask for cross attention, removing in outer function!
        values = values.permute(0, 2, 1, 3).reshape(batch_size, sequence_length, d_model)
        out = self.linear_layer(values)
        return out

In [20]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, self_attention_mask):
        residual_x = x.clone()
        x = self.attention(x, mask=self_attention_mask)
        x = self.dropout1(x)
        x = self.norm1(x + residual_x)
        residual_x = x.clone()
        x = self.ffn(x)
        x = self.dropout2(x)
        x = self.norm2(x + residual_x)
        return x
    
class SequentialEncoder(nn.Sequential):
    def forward(self, *inputs):
        x, self_attention_mask  = inputs
        for module in self._modules.values():
            x = module(x, self_attention_mask)
        return x
        
class Encoder(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, num_columns, encoder_embed=True):
        super().__init__()
        self.encoder_embedding = CustomEmbedding(max_sequence_length, num_columns, d_model, encoder_embed=True)
        self.layers = SequentialEncoder(*[EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
                                      for _ in range(num_layers)])
    
    def forward(self, x, self_attention_mask):
        x = self.encoder_embedding(x, encoder_embed = True)
        x = self.layers(x, self_attention_mask)
        return x

In [21]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.encoder_decoder_attention = MultiHeadCrossAttention(d_model=d_model, num_heads=num_heads)
        self.layer_norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.layer_norm3 = LayerNormalization(parameters_shape=[d_model])
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        _y = y.clone()
        y = self.self_attention(y, mask=self_attention_mask)
        y = self.dropout1(y)
        y = self.layer_norm1(y + _y)

        _y = y.clone()
        y = self.encoder_decoder_attention(x, y, mask=cross_attention_mask)
        y = self.dropout2(y)
        y = self.layer_norm2(y + _y)

        _y = y.clone()
        y = self.ffn(y)
        y = self.dropout3(y)
        y = self.layer_norm3(y + _y)
        return y


class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        x, y, self_attention_mask, cross_attention_mask = inputs
        for module in self._modules.values():
            y = module(x, y, self_attention_mask, cross_attention_mask)
        return y

class Decoder(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, num_columns, encoder_embed=False):
        super().__init__()
        self.decoder_embedding = CustomEmbedding(max_sequence_length, num_columns, d_model, encoder_embed=False)
        self.layers = SequentialDecoder(*[DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])

    def forward(self, x, y, self_attention_mask, cross_attention_mask):
        y = self.decoder_embedding(y, encoder_embed = False)
        y = self.layers(x, y, self_attention_mask, cross_attention_mask)
        return y

In [22]:
class Transformer(nn.Module):
    def __init__(self,d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_layers,
                          max_sequence_length,
                          num_columns,
                          phrase_vocab_size):
        super().__init__()
        self.encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, num_columns, encoder_embed=True)
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers, max_sequence_length, num_columns, encoder_embed=False)
        self.linear = nn.Linear(d_model, phrase_vocab_size)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    def forward(self,
                x,
                y,
                encoder_self_attention_mask=None,
                decoder_self_attention_mask=None,
                decoder_cross_attention_mask=None): # x, y are batch of sentences
        x = self.encoder(x, encoder_self_attention_mask)
        out = self.decoder(x, y, decoder_self_attention_mask, decoder_cross_attention_mask)
        out = self.linear(out)
        return out

In [23]:
transformer = Transformer(d_model,ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_layers,
                          MAX_SEQUENCE_LENGTH,
                         num_columns,
                        phrase_vocab_size)
display(transformer)

Transformer(
  (encoder): Encoder(
    (encoder_embedding): CustomEmbedding(
      (conv1): Conv1d(84, 1024, kernel_size=(3,), stride=(1,), padding=same)
      (conv2): Conv1d(1024, 512, kernel_size=(3,), stride=(1,), padding=same)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiHeadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNormalization()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNormalization()
 

In [24]:
criterion = nn.CrossEntropyLoss(ignore_index=char2ord[PADDING_TOKEN],
                              reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optimizer = torch.optim.Adam(transformer.parameters(), lr=learning_rate)

In [25]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    steps = len(dataloader)
    total_train_epoch_loss = 0
    total_train_epoch_acc = 0
    transformer.train()
    for batch_idx, batch in tqdm(enumerate(dataloader)):
        frame = batch['frame']
        input_phrase = batch['input_phrase']
        targets = batch['target'].view(-1)
        encoder_self_attention_mask = batch['encoder_self_attention_mask']
        decoder_self_attention_mask = batch['decoder_self_attention_mask']
        decoder_cross_attention_mask = batch['decoder_cross_attention_mask']
        output = model(frame, input_phrase, encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask)

        batch_loss = criterion(output.view(-1, phrase_vocab_size).to(device), targets.to(device)).to(device)
        valid_indices = torch.where(targets == char2ord[PADDING_TOKEN], False, True)
        batch_loss = batch_loss.sum() / valid_indices.sum()

        optimizer.zero_grad()
        
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # Backpropagation
        batch_loss.backward() # Gradients calculation
        optimizer.step() # Updation of weights
        batch_acc = (output.argmax(dim=2) == batch['target']).sum() / valid_indices.sum()

        total_train_epoch_loss += batch_loss
        total_train_epoch_acc += batch_acc

        #total_train_samples+= len(batch['target'].view(-1))

        #print(f'Batch train Accuracy: {batch_acc} | Batch train Loss: {batch_loss} | Batch:{batch_idx}')
    train_epoch_acc = total_train_epoch_acc / steps
    train_epoch_loss = total_train_epoch_loss / steps
    return train_epoch_acc, train_epoch_loss

In [26]:
def eval_model(model, dataloader, device):
    steps = len(dataloader)
    total_val_epoch_loss = 0
    total_val_epoch_acc = 0
    total_val_samples = 0
    transformer.eval()
    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(dataloader)):
            frame = batch['frame']
            input_phrase = batch['input_phrase']
            targets = batch['target'].view(-1)
            encoder_self_attention_mask = batch['encoder_self_attention_mask']
            decoder_self_attention_mask = batch['decoder_self_attention_mask']
            decoder_cross_attention_mask = batch['decoder_cross_attention_mask']
            output = model(frame, input_phrase, encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask)

            batch_loss = criterion(output.view(-1, phrase_vocab_size).to(device), targets.to(device)).to(device)
            valid_indices = torch.where(targets == char2ord[PADDING_TOKEN], False, True)
            batch_loss = batch_loss.sum() / valid_indices.sum()

            batch_acc = (output.argmax(dim=2) == batch['target']).sum() / valid_indices.sum()

            total_val_epoch_loss += batch_loss
            total_val_epoch_acc += batch_acc
    
        #total_val_samples+= len(batch['target'].view(-1))

            #print(f'Batch val Accuracy: {batch_acc} | Batch val Loss: {batch_loss} | Batch:{batch_idx}')
    val_epoch_acc = total_val_epoch_acc / steps
    val_epoch_loss = total_val_epoch_loss / steps
    return val_epoch_acc, val_epoch_loss

In [27]:
def save_checkpoint (state, filename="/kaggle/working/my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)
    
def load_checkpoint (model, optimizer, checkpoint):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

if load_model:
    load_checkpoint (transformer, optimizer, torch.load("/kaggle/working/my_checkpoint.pth.tar"))

In [None]:
transformer.to(device)
kf = KFold(n_splits=k)
# Iterate over the folds
for fold, (train_index, val_index) in enumerate(kf.split(dataset)):
    print(f"Training on fold {fold+1}/{k}...")

    # Create data loaders for training and validation
    train_dataset = Subset(dataset, train_index)
    val_dataset = Subset(dataset, val_index)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    for epoch in range(NUM_EPOCHS):
        print(f'Epoch {epoch + 1}/{NUM_EPOCHS}')
        
        checkpoint = {'state_dict' : transformer.state_dict(), 'optimizer': optimizer.state_dict()}
        save_checkpoint (checkpoint)
        train_epoch_acc, train_epoch_loss = train_one_epoch(transformer, train_loader, criterion, optimizer, device)
        print('-----------------------------------------------------------------')
        print()
        print(f'Epoch train Accuracy:{train_epoch_acc} | Epoch train Loss:{train_epoch_loss} | Epoch:{epoch + 1}')
        
        val_epoch_acc, val_epoch_loss = eval_model(transformer, val_loader, device)
        print(f'Epoch val Accuracy:{val_epoch_acc} | Epoch val Loss:{val_epoch_loss} | Epoch:{epoch + 1}')
        print()

Training on fold 1/5...
Epoch 1/50
=> Saving checkpoint


0it [00:00, ?it/s]

-----------------------------------------------------------------

Epoch train Accuracy:0.2443910539150238 | Epoch train Loss:2.5944783687591553 | Epoch:1


0it [00:00, ?it/s]

Epoch val Accuracy:0.32722100615501404 | Epoch val Loss:2.2152366638183594 | Epoch:1

Epoch 2/50
=> Saving checkpoint


0it [00:00, ?it/s]

-----------------------------------------------------------------

Epoch train Accuracy:0.36517980694770813 | Epoch train Loss:2.0930991172790527 | Epoch:2


0it [00:00, ?it/s]

Epoch val Accuracy:0.4434088170528412 | Epoch val Loss:1.8328925371170044 | Epoch:2

Epoch 3/50
=> Saving checkpoint


0it [00:00, ?it/s]

-----------------------------------------------------------------

Epoch train Accuracy:0.47233662009239197 | Epoch train Loss:1.7414511442184448 | Epoch:3


0it [00:00, ?it/s]

Epoch val Accuracy:0.5325650572776794 | Epoch val Loss:1.5448418855667114 | Epoch:3

Epoch 4/50
=> Saving checkpoint


0it [00:00, ?it/s]

-----------------------------------------------------------------

Epoch train Accuracy:0.539319634437561 | Epoch train Loss:1.5252717733383179 | Epoch:4


0it [00:00, ?it/s]

Epoch val Accuracy:0.5865300297737122 | Epoch val Loss:1.3785041570663452 | Epoch:4

Epoch 5/50
=> Saving checkpoint


0it [00:00, ?it/s]

In [None]:
'''import shutil
shutil.rmtree('/kaggle/working')'''