In [22]:
import numpy as np # linear algebra

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import tensorflow as tf
from sklearn.model_selection import train_test_split
import h5py
import os
import math
import pyarrow.parquet as pq
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from rotary_embedding_tensorflow import apply_rotary_emb, RotaryEmbedding

char_to_idx = {
    "0": 15,
    "1": 16,
    "2": 17,
    "3": 18,
    "4": 19,
    "5": 20,
    "6": 21,
    "7": 22,
    "8": 23,
    "9": 24,
    " ": 0,
    "!": 1,
    "#": 2,
    "$": 3,
    "%": 4,
    "&": 5,
    "'": 6,
    "(": 7,
    ")": 8,
    "*": 9,
    "+": 10,
    ",": 11,
    "-": 12,
    ".": 13,
    "/": 14,
    ":": 25,
    ";": 26,
    "=": 27,
    "?": 28,
    "@": 29,
    "[": 30,
    "_": 31,
    "a": 32,
    "b": 33,
    "c": 34,
    "d": 35,
    "e": 36,
    "f": 37,
    "g": 38,
    "h": 39,
    "i": 40,
    "j": 41,
    "k": 42,
    "l": 43,
    "m": 44,
    "n": 45,
    "o": 46,
    "p": 47,
    "q": 48,
    "r": 49,
    "s": 50,
    "t": 51,
    "u": 52,
    "v": 53,
    "w": 54,
    "x": 55,
    "y": 56,
    "z": 57,
    "~": 58,
    "<SOS>": 60, # start of sequence
    "<EOS>": 61, # end of sequence
    "<PAD>": 59  # sequence padding token
}
LANDMARK_GROUPS = {
    'face': list(range(0, 468)),
    'left_hand': list(range(468, 489)),
    'pose': list(range(489, 522)),
    'right_hand': list(range(522, 543))
}
SOS_TOKEN = 60  # New class for Start of Sequence
EOS_TOKEN = 61  # New class for End of Sequence
PAD_TOKEN = 59
NUM_CLASSES = 62 # Total number of classes including 3 special tokens
TIME = 128 # How many frames are we cutting down to? Some sequences go to 300-600 frames, so if hardware allows, increase this
# Get all physical devices (CPU and GPU)
gpus = tf.config.list_physical_devices('GPU')
# Enable GPU device only
if gpus:
    tf.config.set_visible_devices(gpus, 'GPU')  # Only use the first GPU
print(gpus)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [23]:
train_dataset = tf.data.Dataset.load("/kaggle/input/train-and-validation-datasets-use-tfsave/train_ds")
val_dataset = tf.data.Dataset.load("/kaggle/input/train-and-validation-datasets-use-tfsave/val_ds")

In [24]:
# Set max length for padding (adjust based on your data)
max_len = 64 # Including EOS and SOS
# Add SOS, EOS, and PAD tokens to the labels
# Set max length for padding (adjust based on your data)
# Add SOS, EOS, and PAD tokens to the labels

batch_size = 64


In [25]:
for sample_inputs, sample_outputs in train_dataset.take(1):
    # Access the first few elements from the batch
    print("Phrase (Decoder Inputs):", sample_inputs[0])
    print("Keypoints (Shape):", sample_inputs.shape)
    print("Decoder Outputs:", sample_outputs[0])

Phrase (Decoder Inputs): tf.Tensor(
[[-1.2144066   1.808098    2.1426094  ...  0.42330128  0.61128515
  -1.8838092 ]
 [-1.5048839   2.035796    1.7815506  ...  0.39550194  0.62154907
  -1.5194979 ]
 [-1.491255    2.0297668   1.612138   ...  0.42516923  0.6213869
  -1.3991629 ]
 ...
 [ 0.          0.          0.         ...  0.33687034  0.78608394
  -1.8386133 ]
 [ 0.          0.          0.         ...  0.37190276  0.7497251
  -1.9831192 ]
 [ 0.          0.          0.         ...  0.40220946  0.6746319
  -2.052107  ]], shape=(128, 78), dtype=float32)
Keypoints (Shape): (32, 128, 78)
Decoder Outputs: tf.Tensor(
[60 21 22 24 12 24 20 21 12 22 15 17 20 61 59 59 59 59 59 59 59 59 59 59
 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59], shape=(64,), dtype=int32)


In [26]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow.keras.callbacks import LearningRateScheduler


CONV2DLAYER = 256
DENSELAYER = 128
ENCODER_DIM = 64
BLOCKS = 2
DECODER_DIM = 208
DECODER_DENSE_FFN_DIM = 208
FEATURE_EXTRACTOR_DIM = 52
MULTIPLICATION_FACTOR_ENCODER = 4
NUM_ENCODER_BLOCKS = 2 # Set to either 7 or 14


class LandmarkEmbedding(layers.Layer):
    def __init__(self, num_hid=200, maxlen=TIME):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(
            num_hid, 11, padding="same", activation="relu"
        )
        self.conv2 = tf.keras.layers.Conv1D(
            num_hid, 11, padding="same", activation="relu"
        )
        self.conv3 = tf.keras.layers.Conv1D(
            num_hid, 11, padding="same", activation="relu"
        )
        self.pos_emb = self.positional_encoding(maxlen, num_hid)
        self.maxlen = maxlen
        self.num_hid = num_hid

    def call(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        
        x = tf.math.multiply(x, tf.math.sqrt(tf.cast(self.num_hid, tf.float32)))
        x = x + self.pos_emb
        
        return x
    
    def positional_encoding(self, maxlen, num_hid):
        depth = num_hid/2
        positions = tf.range(maxlen, dtype = tf.float32)[..., tf.newaxis]
        depths = tf.range(depth, dtype = tf.float32)[np.newaxis, :]/depth
        angle_rates = tf.math.divide(1, tf.math.pow(tf.cast(10000, tf.float32), depths))
        angle_rads = tf.linalg.matmul(positions, angle_rates)
        pos_encoding = tf.concat(
          [tf.math.sin(angle_rads), tf.math.cos(angle_rads)],
          axis=-1) 
        return pos_encoding


class PositionalEncoding(layers.Layer):
    def __init__(self, num_vocab=NUM_CLASSES, maxlen=max_len, num_hid=DECODER_DIM):
        super().__init__()
        self.num_hid = num_hid
        self.emb = tf.keras.layers.Embedding(num_vocab, num_hid)
        #self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)
        '''
        self.pos_emb = tf.math.divide(
            self.positional_encoding(maxlen-1, num_hid),
            tf.math.sqrt(tf.cast(num_hid, tf.float32)))
        '''
        self.pos_emb = self.positional_encoding(maxlen, num_hid)

    def call(self, x):
        seq_len = tf.shape(x)[1]  # Sequence length of the input
        x = self.emb(x)  # Shape: [batch_size, seq_len, num_hid]
        x *= tf.math.sqrt(tf.cast(self.num_hid, tf.float32))  # Scale embeddings
        
        # Add positional encoding (broadcast to match input shape)
        pos_encoding = self.pos_emb[:seq_len, :]  # Shape: [seq_len, num_hid]
        pos_encoding = tf.expand_dims(pos_encoding, axis=0)  # Shape: [1, seq_len, num_hid]
        return x + pos_encoding  # Broadcasting happens here

    
    def positional_encoding(self, maxlen, num_hid):
        depth = num_hid/2
        positions = tf.range(maxlen, dtype = tf.float32)[..., tf.newaxis]
        depths = tf.range(depth, dtype = tf.float32)[np.newaxis, :]/depth
        angle_rates = tf.math.divide(1, tf.math.pow(tf.cast(10000, tf.float32), depths))
        angle_rads = tf.linalg.matmul(positions, angle_rates)
        pos_encoding = tf.concat(
          [tf.math.sin(angle_rads), tf.math.cos(angle_rads)],
          axis=-1) 
        return pos_encoding
# Adapted from github -- This code was NOT written by me
def shape_list(x, out_type=tf.int32): # This function is used to easily get the shape of a tensor
    static = x.shape.as_list()
    dynamic = tf.shape(x, out_type=out_type)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]

# Feed-Forward Network with dropout and swish activation
class FFN(tf.keras.layers.Layer):
    def __init__(self, out, ffmult=4):
        super(FFN, self).__init__()
        self.dense1 = layers.Dense(ffmult * out, activation='swish')
        self.dense2 = layers.Dense(out)
        self.dropout1 = layers.Dropout(0.1)
        self.dropout2 = layers.Dropout(0.1)
    
    def call(self, inputs, training=True):
        x = self.dense1(inputs)
        x = self.dropout1(x, training=training)
        x = self.dense2(x)
        x = self.dropout2(x, training=training)
        return x

# Gated Linear Unit activation, which is more stable than ReLU
class FastGLU(tf.keras.layers.Layer):
    def __init__(self, in_size):
        super(FastGLU, self).__init__()
        self.in_size = in_size
        self.linear = tf.keras.layers.Dense(in_size * 2)  # Linear layer that outputs 2 * in_size 
        # Relu  

    def call(self, X):
        out = self.linear(X)  # Apply the linear layer
        # Split the output into two parts
        linear_part, gate_part = tf.split(out, num_or_size_splits=2, axis=-1)
        # Add sigmoid
        gated_output = linear_part * tf.nn.sigmoid(gate_part)
        return gated_output

# Convolution block, used in the encoder
class Conv(tf.keras.layers.Layer):

    def __init__(self, in_size, kernel_size=31, stride=1, exp=2):
        # We're using a 1x1 convolution to expand the channels, then a 31x1 convolution, then a 1x1 convolution to reduce the channels back to the original size
        # All with dropout, bn, and GLU activation
        super(Conv, self).__init__()
        self.conv1 = tf.keras.layers.Conv2D(exp * in_size, kernel_size=1, strides=stride, padding='valid')
        self.conv2 = tf.keras.layers.Conv2D(in_size, kernel_size=1, strides=stride, padding='valid')
        self.bn = tf.keras.layers.BatchNormalization(momentum=0.9, epsilon=1e-5)
        self.act = tf.keras.layers.Activation(tf.nn.swish)
        self.glu = FastGLU(in_size=exp * in_size)  
        self.conv_dw = tf.keras.layers.DepthwiseConv2D(kernel_size=(kernel_size, 1), strides=stride, padding='same', depth_multiplier=1)
        self.dropout = tf.keras.layers.Dropout(0.1)

    def call(self, x, training=True):
        B, T, S = shape_list(x)
        x = tf.reshape(x, (B, T, 1, S))   
        x = self.conv1(x)  
        x = self.glu(x)
        x = self.conv_dw(x)
        x = self.bn(x)
        x = self.act(x)  
        x = self.conv2(x)
        x = tf.reshape(x, (B, T, S))
        x = self.dropout(x, training=training)
        return x  

# LLaMA Attention does not need positional embeddings which have to be calculated for each input, speeding up training time by 250% 
# This allows us to use more parameters
# This LLaMA is half-adapted half origimal, as the original is in pytorch
class LLaMAAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads, head_dim):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scaling = 1 / tf.math.sqrt(tf.cast(head_dim, tf.float32))
        self.dense_qkv = tf.keras.layers.Dense(3 * self.num_heads * self.head_dim, use_bias=False)
        self.out_proj = tf.keras.layers.Dense(num_heads * head_dim, use_bias=False)
    
    
    def apply_attention(self, query, key, value):
        # query, key, value shape: [batch, num_heads, seq_len, head_dim]
        
        # Calculate attention scores
        scores = tf.matmul(query, key, transpose_b=True) * self.scaling  # [batch, num_heads, seq_len, seq_len]

        # Apply softmax
        attention_weights = tf.nn.softmax(scores, axis=-1)
        
        # Apply attention to values
        output = tf.matmul(attention_weights, value)  # [batch, num_heads, seq_len, head_dim]
        return output
    
    def call(self, inputs, freqs):
        batch_size, seq_len, _ = tf.unstack(tf.shape(inputs))

        # Compute QK and V
        qkv = self.dense_qkv(inputs)
        qkv = tf.reshape(qkv, [batch_size, seq_len, 3, self.num_heads, self.head_dim])
        query, key, value = tf.unstack(qkv, axis=2)
        
        # Reshape freqs to match the shape of query and key
        freqs = tf.reshape(freqs, [1, seq_len, 1, self.head_dim])
        freqs = tf.repeat(freqs, repeats=self.num_heads, axis=2)
        
        # Apply rotary positional embeddings
        query = apply_rotary_emb(freqs, query)
        key = apply_rotary_emb(freqs, key)
        
        # Transpose for attention calculation
        # QKV are all going to be the same here because it is self attention
        query = tf.transpose(query, [0, 2, 1, 3])  # [batch, num_heads, seq_len, head_dim]
        key = tf.transpose(key, [0, 2, 1, 3])
        value = tf.transpose(value, [0, 2, 1, 3])
 
        # Apply custom attention
        attn_output = self.apply_attention(
            query=query,
            key=key,
            value=value
        )
        
        # Reshape attention output
        attn_output = tf.transpose(attn_output, [0, 2, 1, 3])  # [batch, seq_len, num_heads, head_dim]
        attn_output = tf.reshape(attn_output, [batch_size, seq_len, self.num_heads * self.head_dim])
        
        # Project back to how it was
        output = self.out_proj(attn_output)
        
        return output



# LLaMA Attention uses rotary embeddings, which are more computationally efficient than the standard positional embeddings.
class RotaryPositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, d_model, max_seq_len):
        super(RotaryPositionalEmbedding, self).__init__()
        self.d_model = d_model
        self.max_seq_len = max_seq_len

    def build(self, _):
        # Create positional embedding matrix
        position = tf.range(self.max_seq_len, dtype=tf.float32)[:, tf.newaxis]
        div_term = tf.exp(tf.range(0, self.d_model, 2, dtype=tf.float32) * -(tf.math.log(10000.0) / self.d_model))
        pos_encoding = tf.concat([
            tf.sin(position * div_term),
            tf.cos(position * div_term)
        ], axis=-1)
        self.positional_embedding = tf.Variable(pos_encoding, trainable=False)

    def call(self, x):
        seq_len = tf.shape(x)[1]
        return self.positional_embedding[:seq_len, :]
    

# The SqueezeFormer is better than the original encoder for this case because the conv paired with attention can capture relationships between the 
# long number of frames we are dealing with per sequence
# It's also computationally quicker
class SqueezeformerBlock(tf.keras.layers.Layer):
    def __init__(self, ff_dim, num_heads, multiplication_factor):
        super(SqueezeformerBlock, self).__init__()
        self.ff_dim = ff_dim
        self.num_heads = num_heads
        self.multiplication_factor = multiplication_factor

        # LLama 
        self.llama_attention = LLaMAAttention(num_heads=num_heads, head_dim=ff_dim // num_heads)

        # FFN
        self.ffn1 = FFN(self.ff_dim, self.multiplication_factor)
        self.ffn2 = FFN(self.ff_dim, self.multiplication_factor)

        # Conv Module
        self.conv = Conv(ff_dim)

        # Layer Normalizations
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-5)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-5)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-5)

    def call(self, inputs, freqs):
        attn_output = self.llama_attention(inputs, freqs)
        x = self.layernorm1(inputs + attn_output)

        
        ff_output = self.ffn1(x)
        x = self.layernorm2(x + ff_output)  # Residual connection

        conv_output = self.conv(x)
        x = self.layernorm3(x + conv_output)  # Residual connection

        ff_output = self.ffn2(x)
        x = self.layernorm3(x + ff_output)  # Residual connection
        return x




class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, num_blocks=NUM_ENCODER_BLOCKS, num_heads=4, ff_dim=200, ml=TIME):
        super(EncoderBlock, self).__init__()
        self.blocks = [SqueezeformerBlock(ff_dim, num_heads, MULTIPLICATION_FACTOR_ENCODER) for _ in range(num_blocks)]
        self.rope = RotaryPositionalEmbedding(d_model=ff_dim // num_heads, max_seq_len=ml)

    def call(self, inputs):
        freqs = self.rope(inputs) # Calculate the rotary embeddings once, and we can reuse them for each block as they're all the same length (padded)
        x = inputs
        for block in self.blocks:
            x = block(x, freqs)
        
        return x


class BasicEncoder(layers.Layer):
    def __init__(self, embed_dim=ENCODER_DIM, num_heads=4, feed_forward_dim=400, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=256)
        self.ffn = keras.Sequential(
            [
                layers.Dense(400, activation="relu"),
                layers.Dense(200),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=True):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def get_causal_attention_mask(labels):
    # Get the shape dynamically using shape_list
    shape = shape_list(labels)
    batch_size, seq_length = shape[0], shape[1]
    
    # Create a look-ahead mask for a single sequence (shape: [seq_length, seq_length])
    causal_mask = tf.linalg.band_part(tf.ones((seq_length, seq_length)), -1, 0)
    
    # Expand causal mask to (1, seq_length, seq_length) for broadcasting
    causal_mask = tf.expand_dims(causal_mask, axis=0)
    
    # Tile the mask along batch dimension: (batch_size, seq_length, seq_length)
    causal_mask = tf.tile(causal_mask, [batch_size, 1, 1])
    
    return tf.cast(causal_mask, tf.float32)


class Decoder(layers.Layer):
    def __init__(self, embed_dim=DECODER_DIM, num_heads=8, feed_forward_dim=DECODER_DENSE_FFN_DIM, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=256
        )
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=200)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(0.1)
        self.ffn_dropout = layers.Dropout(0.1)
        self.ffn = keras.Sequential(
            [
                layers.Dense(400, activation="relu"),
                layers.Dense(200),
            ]
        )
    def call(self, target, enc_out, mask=None, training=True):
        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = get_causal_attention_mask(target)
        target_att = self.self_att(target, target, attention_mask=causal_mask)
        target_norm = self.layernorm1(target + self.self_dropout(target_att, training = training))
        enc_out = self.enc_att(target_norm, enc_out)
        enc_out_norm = self.layernorm2(self.enc_dropout(enc_out, training = training) + target_norm)
        ffn_out = self.ffn(enc_out_norm)
        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out, training = training))
        return ffn_out_norm
        
def shape_list(x, out_type=tf.int32): # This function is used to easily get the shape of a tensor
    static = x.shape.as_list()
    dynamic = tf.shape(x, out_type=out_type)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]


@tf.keras.utils.register_keras_serializable('model', 'ASLFingerspellingModel')
class ASLFingerspellingModel(keras.Model):
    def __init__(self, num_classes, **kwargs):
        self.decoder = Decoder()
        self.le = LandmarkEmbedding()
        self.enc = keras.Sequential(
            [self.le]
            + [
                EncoderBlock()
            ]
        )
        self.concat = layers.Concatenate(axis=2) 
        self.add = layers.Add()
        self.dropout = layers.Dropout(0.1)
        # no softmax? Let's try it...
        self.output_layer = layers.Dense(NUM_CLASSES, activation="softmax") # This should be NUM_CLASSES?? Even if we're not predicting the SOS token...
        self.pos_encoding = PositionalEncoding(NUM_CLASSES, max_len - 1, 200)  # These need to be calculated EACH TIME
        self.num_classes = num_classes
        super(ASLFingerspellingModel, self).__init__(**kwargs)


    def call(self, inputs, training=True):
        inputs, outputs = inputs[0], inputs[1]
        DI = outputs[:, :-1]
        encoded = self.enc(inputs)

        decoded = self.decoder(self.pos_encoding(DI), encoded, mask=None)

        decoded = self.dropout(decoded, training=training)
        return self.output_layer(decoded)   

    def train_step(self, data):
        inp, out = data
        with tf.GradientTape() as tape:
            preds = self((inp, out), training=True)
            loss = self.compiled_loss(out[:, 1:], preds, regularization_losses=self.losses)
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        self.compiled_metrics.update_state(out[:, 1:], preds)
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        inp, out = data
        preds = self((inp, out), training=False)
        loss = self.compiled_loss(out[:, 1:], preds, regularization_losses=self.losses)
        self.compiled_metrics.update_state(out[:, 1:], preds)
        return {m.name: m.result() for m in self.metrics}
    

    def predict(self, x, training=False):
        # Pass through the encoder
        enc = self.enc(x, training=training)
    
        # Initialize the decoder input with SOS_TOKEN
        batch_size = tf.shape(x)[0]
        dec_input = tf.ones((batch_size, 1), dtype=tf.int32) * SOS_TOKEN
        # Greedy decoding
        for _ in range(max_len - 1):
            # Decode using the current input
            dec_out = self.decoder(self.pos_encoding(dec_input), enc, mask=None, training=training)
            
            # Compute logits and predict the next token
            logits = self.output_layer(dec_out[:, -1:])  # Focus only on the last timestep
            next_token = tf.argmax(logits, axis=-1, output_type=tf.int32)
    
            # Stop if all sequences predict EOS
            if tf.reduce_all(next_token == EOS_TOKEN):
                break
    
            # Append the predicted token to the decoder input
            dec_input = tf.concat([dec_input, next_token], axis=-1)
    
        # Return the generated sequence without the initial SOS token
        return dec_input[:, 1:]


model = ASLFingerspellingModel(NUM_CLASSES)
# Custom CCE because we want to ignore padding tokens
class MaskedSCCE(tf.keras.losses.Loss): 
    def __init__(self, num_classes=NUM_CLASSES, from_logits=False, **kwargs): # from logits = FALSE because our output dense has softmax
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.from_logits = from_logits

    def call(self, y_true, y_pred):
        print(y_true, y_pred)
        # Cast y_true to integer type
        y_true = tf.cast(y_true, tf.int32)
        
        # Create a mask where padding tokens are ignored
        mask = tf.cast(y_true != PAD_TOKEN, tf.float32)  # Shape: [batch_size, seq_len]
        # One-hot encode y_true
        y_true_one_hot = tf.one_hot(y_true, self.num_classes, axis=-1, dtype=tf.float32)  # Shape: [batch_size, seq_len, num_classes]
        
        # Calculate categorical cross-entropy loss
        loss = tf.keras.losses.categorical_crossentropy(y_true_one_hot, y_pred, from_logits=self.from_logits)  # Shape: [batch_size, seq_len]
        
        # Apply the mask to ignore padding positions in the loss
        loss = loss * mask  # Masked loss, Shape: [batch_size, seq_len]
        
        valid_tokens = tf.reduce_sum(mask)
        valid_tokens = tf.maximum(valid_tokens, 1.0)
        
        # Calculate the final loss by summing and normalizing over the valid tokens
        loss = tf.reduce_sum(loss) / valid_tokens
        
        return loss

    def get_config(self):
        # To be able to be saved
        config = super().get_config()
        config.update({
            "num_classes": self.num_classes,
            "from_logits": self.from_logits
        })
        return config


loss = MaskedSCCE()
adam = keras.optimizers.Adam(learning_rate=1e-4)


model.compile(
    optimizer=adam,
    loss=loss,
    metrics=['accuracy'])
EPOCHS = 315

# adapted and modified
def lrfn(current_step, num_warmup_steps, lr_max, num_cycles=0.50, num_training_steps=EPOCHS):
    WARMUP_METHOD = "exp"
    if current_step < num_warmup_steps:
        if WARMUP_METHOD == 'log':
            return lr_max * 0.10 ** (num_warmup_steps - current_step)
        else:
            return lr_max * 2 ** -(num_warmup_steps - current_step)
    else:
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))

        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) * lr_max


LR_SCHEDULE = [lrfn(step, num_warmup_steps=5, lr_max=1e-3, num_cycles=0.50) for step in range(EPOCHS)]
lr_callback = tf.keras.callbacks.LearningRateScheduler(lambda step: LR_SCHEDULE[step], verbose=0)

class printCB(tf.keras.callbacks.Callback):
    def __init__(self, batch):
        self.batch = batch
        inp, out = batch
        print(out[0:1])
    def on_epoch_end(self, epoch, logs=None):
        inputs, outputs = self.batch
        src = inputs[0:1]
        tgt = outputs[0:1]
        tf.print("tgt", tgt)
        # Autoregressive inference (assuming this method works)

        output_sequence = self.model.predict(src)[0]
        # Map from token indices back to characters
        d = {v: k for k, v in char_to_idx.items()}
        chars = [d[ix.numpy()] for ix in output_sequence if ix.numpy() != EOS_TOKEN]
        predicted_word = ''.join(chars)

        # Print true labels and predictions
        print("\nY True: ", "".join([d[i.numpy()] for i in tgt[0]]))
        print("Y pred inference: ", predicted_word)

        # Run inference with the current model
        y_pred_training = self.model((src,tgt), training=False)
        y_pred_training = tf.argmax(y_pred_training, axis=-1)

        # Flatten the predicted sequence tensor and filter out EOS_TOKEN directly in TensorFlow
        y_pred_training_flat = tf.reshape(y_pred_training, [-1])  # Flatten the tensor
        filtered_pred = tf.boolean_mask(y_pred_training_flat, y_pred_training_flat != EOS_TOKEN)  # Remove EOS tokens
        
        # Convert indices to characters and join them into the final string
        predicted_word_training = ''.join([d[ix.numpy()] for ix in filtered_pred])
        print("Y pred training: ", predicted_word_training)


cb = printCB(next(iter(val_dataset)))


tf.Tensor(
[[60 18  0 34 49 36 36 42 39 46 52 50 36 61 59 59 59 59 59 59 59 59 59 59
  59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59
  59 59 59 59 59 59 59 59 59 59 59 59 59 59 59 59]], shape=(1, 64), dtype=int32)


In [27]:
history = model.fit(train_dataset, epochs=80, validation_data=(val_dataset), verbose=1, callbacks=[cb])
model.save("/kaggle/working/model.keras")

Epoch 1/80




Tensor("compile_loss/masked_scce_3/Cast:0", shape=(None, None), dtype=float32) Tensor("asl_fingerspelling_model_3_1/dense_67_1/Softmax:0", shape=(None, None, 62), dtype=float32)
Tensor("compile_loss/masked_scce_3/Cast:0", shape=(None, None), dtype=float32) Tensor("asl_fingerspelling_model_3_1/dense_67_1/Softmax:0", shape=(None, None, 62), dtype=float32)
[1m1520/1520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.0629 - loss: 0.0161Tensor("compile_loss/masked_scce_3/Cast:0", shape=(None, None), dtype=float32) Tensor("asl_fingerspelling_model_3_1/dense_67_1/Softmax:0", shape=(None, None, 62), dtype=float32)
tgt [[60 18 0 ... 59 59 59]]

Y True:  <SOS>3 creekhouse<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>
Y pred inference:  3 cher hores
Y pred trai


KeyboardInterrupt



In [28]:
model.save("/kaggle/working/model.keras")