In [None]:
!pip uninstall -y tensorflow

Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install tensorflow-cpu

Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow-cpu)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow-cpu)
  Downloading keras-3.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting ml-dtypes<0.5.0,>=0.4.0 (from tensorflow-cpu)
  Downloading ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting namex (from keras>=3.5.0->tensorflow-cpu)
  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)
Collecting optree (from keras>=3.5.0->tensorflow-cpu)
  Downloading optree-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.8/47.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading tensorflow_cpu-2.18.0-cp310-cp310-manylinux_2_17_x86_64.man

In [None]:
import os
import numpy as np
import pandas as pd
import pickle

# Define paths
data_path = "/content/drive/MyDrive/train/home/usuaris/imatge/ltarres/wicv2023/how2sign/i3d_features/train"
translation_file_path = "/content/drive/MyDrive/id_translation_subset.pkl"

# Load the translations from the .pkl file
with open(translation_file_path, 'rb') as file:
    translation_data = pickle.load(file)

# Convert translation data to a DataFrame
translation_df = pd.DataFrame(translation_data)

# Get the list of files in the directory and select the first 20 files
file_list = os.listdir(data_path)[:]


translations = []
numpy_arrays = []
ids = []

# Process the first 100 files
for npy_file in file_list:

    file_id = npy_file.replace('.npy', '')


    if file_id in translation_df['id'].values:
        # Get the corresponding translation
        translation = translation_df.loc[translation_df['id'] == file_id, 'translation'].values[0]

        # Load the .npy file
        npy_filepath = os.path.join(data_path, npy_file)
        numpy_array = np.load(npy_filepath)

        # Append to lists
        translations.append(translation)
        numpy_arrays.append(numpy_array)
        ids.append(file_id)

# Create a new DataFrame for the results
results_df = pd.DataFrame({
    'id': ids,
    'translation': translations,
    'numpy_array': numpy_arrays
})

# Display the new DataFrame
print(results_df.head())


                          id  \
0  eS8QaBYoDU0_1-9-rgb_front   
1  b9nWwzf0C5E_9-5-rgb_front   
2  -fyFTnt9w9Q_4-5-rgb_front   
3  f8ShD9YwEfo_5-2-rgb_front   
4  5Oq-F-EC_pU_9-8-rgb_front   

                                         translation  \
0  When the ball is picked up by your defender, w...   
1  Now, from this position pick up your foot and ...   
2                So just think of your backhand now.   
3  But what it'll do is it'll bring up a window o...   
4                            They also fold back up.   

                                         numpy_array  
0  [[0.0, 0.0054244995, 0.010101318, 0.034423828,...  
1  [[0.0, 0.00022244453, 0.24072266, 0.0, 0.02577...  
2  [[0.030654907, 0.020111084, 0.03955078, 0.1687...  
3  [[0.056274414, 0.0, 0.0, 0.068237305, 0.0, 0.3...  
4  [[0.0, 0.14453125, 0.0, 0.3569336, 0.004905700...  


In [None]:
results_df.shape

(30384, 3)

In [None]:
def analyze_sequence_lengths(results_df):
    # Calculate sequence lengths for each entry
    sequence_lengths = [arr.shape[0] for arr in results_df['numpy_array']]

    # Create a distribution analysis
    length_stats = {
        'min': np.min(sequence_lengths),
        'max': np.max(sequence_lengths),
        'mean': np.mean(sequence_lengths),
        'median': np.median(sequence_lengths),
        'std': np.std(sequence_lengths)
    }

    return sequence_lengths, length_stats

def filter_short_sequences(results_df, max_length_threshold=300):
    # Get original sequence lengths
    sequence_lengths = [arr.shape[0] for arr in results_df['numpy_array']]

    # Create mask for short sequences
    short_sequence_mask = [length <= max_length_threshold for length in sequence_lengths]

    # Filter dataframe
    filtered_df = results_df[short_sequence_mask].copy()

    # Print statistics
    print(f"Original dataset size: {len(results_df)}")
    print(f"Filtered dataset size: {len(filtered_df)}")
    print(f"Kept {len(filtered_df)/len(results_df)*100:.2f}% of the data")

    return filtered_df

def analyze_translation_lengths(filtered_df):
    # Analyze word counts in translations
    word_counts = [len(text.split()) for text in filtered_df['translation']]

    return {
        'min_words': min(word_counts),
        'max_words': max(word_counts),
        'avg_words': np.mean(word_counts),
        'median_words': np.median(word_counts)
    }

# Implementation
def prepare_filtered_dataset():
    # 1. First analyze the data
    lengths, stats = analyze_sequence_lengths(results_df)
    print("Original sequence statistics:", stats)

    # 2. Filter short sequences
    filtered_df = filter_short_sequences(results_df, max_length_threshold=1000)

    # 3. Analyze translation complexity
    translation_stats = analyze_translation_lengths(filtered_df)
    print("Translation statistics:", translation_stats)

    return filtered_df

In [None]:
analyze_sequence_lengths(results_df)

([372,
  114,
  37,
  80,
  71,
  212,
  499,
  107,
  50,
  187,
  152,
  594,
  110,
  149,
  99,
  286,
  126,
  144,
  36,
  265,
  274,
  115,
  110,
  304,
  81,
  107,
  246,
  358,
  83,
  90,
  142,
  131,
  132,
  107,
  117,
  113,
  47,
  193,
  153,
  155,
  277,
  45,
  57,
  326,
  27,
  154,
  241,
  56,
  33,
  58,
  365,
  48,
  132,
  25,
  120,
  52,
  79,
  94,
  225,
  202,
  367,
  454,
  180,
  531,
  232,
  311,
  179,
  399,
  448,
  32,
  41,
  33,
  101,
  50,
  168,
  145,
  380,
  192,
  213,
  61,
  212,
  243,
  409,
  77,
  160,
  95,
  149,
  42,
  57,
  37,
  107,
  429,
  186,
  65,
  230,
  11,
  135,
  87,
  220,
  322,
  77,
  119,
  125,
  35,
  149,
  117,
  222,
  123,
  73,
  1101,
  187,
  51,
  62,
  115,
  228,
  23,
  278,
  308,
  299,
  76,
  169,
  199,
  201,
  51,
  69,
  160,
  163,
  262,
  64,
  174,
  467,
  45,
  50,
  170,
  254,
  35,
  289,
  175,
  184,
  149,
  194,
  46,
  206,
  112,
  252,
  137,
  128,
  16,
  102,
  282

In [None]:
filtered_df = filter_short_sequences(results_df, max_length_threshold=300)

Original dataset size: 30384
Filtered dataset size: 26247
Kept 86.38% of the data


In [None]:
filtered_df['numpy_array'][1][1]

array([0.        , 0.0007391 , 0.25756836, ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [None]:
def analyze_translation_lengths(filtered_df):
    # Analyze word counts in translations
    word_counts = [len(text.split()) for text in filtered_df['translation']]

    return {
        'min_words': min(word_counts),
        'max_words': max(word_counts),
        'avg_words': np.mean(word_counts),
        'median_words': np.median(word_counts)
    }

In [None]:
analyze_translation_lengths(filtered_df)

{'min_words': 1,
 'max_words': 383,
 'avg_words': 14.576789728349906,
 'median_words': 13.0}

In [None]:
def prepare_filtered_dataset():
    # 1. First analyze the data
    lengths, stats = analyze_sequence_lengths(results_df)
    print("Original sequence statistics:", stats)

    # 2. Filter short sequences
    filtered_df = filter_short_sequences(results_df, max_length_threshold=300)

    # 3. Analyze translation complexity
    translation_stats = analyze_translation_lengths(filtered_df)
    print("Translation statistics:", translation_stats)

    return filtered_df

In [None]:
prepare_filtered_dataset()

Original sequence statistics: {'min': 1, 'max': 3577, 'mean': 172.94638625592418, 'median': 133.0, 'std': 158.48273234081685}
Original dataset size: 30384
Filtered dataset size: 26247
Kept 86.38% of the data
Translation statistics: {'min_words': 1, 'max_words': 383, 'avg_words': 14.576789728349906, 'median_words': 13.0}


Unnamed: 0,id,translation,numpy_array
1,b9nWwzf0C5E_9-5-rgb_front,"Now, from this position pick up your foot and ...","[[0.0, 0.00022244453, 0.24072266, 0.0, 0.02577..."
2,-fyFTnt9w9Q_4-5-rgb_front,So just think of your backhand now.,"[[0.030654907, 0.020111084, 0.03955078, 0.1687..."
3,f8ShD9YwEfo_5-2-rgb_front,But what it'll do is it'll bring up a window o...,"[[0.056274414, 0.0, 0.0, 0.068237305, 0.0, 0.3..."
4,5Oq-F-EC_pU_9-8-rgb_front,They also fold back up.,"[[0.0, 0.14453125, 0.0, 0.3569336, 0.004905700..."
5,1cUIexb0ELM_8-8-rgb_front,You're only one swing thought away from hookin...,"[[0.0, 0.0, 0.068725586, 0.019378662, 0.0, 0.0..."
...,...,...,...
30376,15bYoBr7BWs_20-8-rgb_front,"Now, these are really easy to turn on.","[[0.0, 0.057556152, 0.0, 0.0, 0.0010938644, 0...."
30378,2WTWOS8bF7A_7-3-rgb_front,"There you go, so now she has the super glossy ...","[[0.0, 0.0, 0.00047302246, 0.0, 0.0, 0.0544433..."
30379,c6difzHesqQ_6-8-rgb_front,"Hi, hey, pause.","[[0.04119873, 0.0, 0.0, 0.03503418, 0.0, 0.163..."
30380,c2KLK-rr89U_20-8-rgb_front,"And automatically, our cat looks angry.","[[0.0, 0.17492676, 0.0, 0.0, 0.14465332, 0.0, ..."


In [None]:
import re

In [None]:
def preprocess_translation(text):
    # Remove special characters (except spaces)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Convert to lowercase
    text = text.lower()
    # Add <START> and <END> tags
    #text = f"<START> {text.strip()} <END>"
    return text


In [None]:
filtered_df['translation'] = filtered_df['translation'].apply(preprocess_translation)

# Display the updated DataFrame
print(filtered_df[['id', 'translation']])

                               id  \
1       b9nWwzf0C5E_9-5-rgb_front   
2       -fyFTnt9w9Q_4-5-rgb_front   
3       f8ShD9YwEfo_5-2-rgb_front   
4       5Oq-F-EC_pU_9-8-rgb_front   
5       1cUIexb0ELM_8-8-rgb_front   
...                           ...   
30376  15bYoBr7BWs_20-8-rgb_front   
30378   2WTWOS8bF7A_7-3-rgb_front   
30379   c6difzHesqQ_6-8-rgb_front   
30380  c2KLK-rr89U_20-8-rgb_front   
30383   -EdUkSqns3U_6-3-rgb_front   

                                             translation  
1      now from this position pick up your foot and t...  
2                     so just think of your backhand now  
3      but what itll do is itll bring up a window on ...  
4                                 they also fold back up  
5      youre only one swing thought away from hooking...  
...                                                  ...  
30376               now these are really easy to turn on  
30378  there you go so now she has the super glossy lips  
30379                   

In [None]:
import numpy as np

# Define the target frame size
target_frame_size = 300
feature_dim = 1024  # Feature size per frame (already fixed)

def pad_or_truncate(array, target_size, feature_dim):
    num_frames = array.shape[0]
    if num_frames < target_size:
        # Pad with zeros
        padding = np.zeros((target_size - num_frames, feature_dim))
        padded_array = np.vstack((array, padding))
    else:
        # Truncate to target size
        padded_array = array[:target_size]
    return padded_array

# Apply to the DataFrame
filtered_df['padded_numpy_array'] = filtered_df['numpy_array'].apply(
    lambda x: pad_or_truncate(x, target_frame_size, feature_dim)
)

# Check the result
print(filtered_df['padded_numpy_array'].iloc[0].shape)  # Should be (300, 1024)


(300, 1024)


In [None]:
from transformers import T5Tokenizer

# Initialize the tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize translations
filtered_df['tokenized'] = filtered_df['translation'].apply(
    lambda x: tokenizer(x, padding="max_length", truncation=True, max_length=25, return_tensors="pt")
)

# Split tokenized results into input IDs and attention masks
filtered_df['input_ids'] = filtered_df['tokenized'].apply(lambda x: x['input_ids'].squeeze(0))
filtered_df['attention_mask'] = filtered_df['tokenized'].apply(lambda x: x['attention_mask'].squeeze(0))

# Drop the intermediate 'tokenized' column
filtered_df = filtered_df.drop(columns=['tokenized'])

# Verify the result
print(filtered_df[['input_ids', 'attention_mask']].head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


                                           input_ids  \
1  [tensor(230), tensor(45), tensor(48), tensor(1...   
2  [tensor(78), tensor(131), tensor(317), tensor(...   
3  [tensor(68), tensor(125), tensor(34), tensor(1...   
4  [tensor(79), tensor(92), tensor(11750), tensor...   
5  [tensor(39), tensor(15), tensor(163), tensor(8...   

                                      attention_mask  
1  [tensor(1), tensor(1), tensor(1), tensor(1), t...  
2  [tensor(1), tensor(1), tensor(1), tensor(1), t...  
3  [tensor(1), tensor(1), tensor(1), tensor(1), t...  
4  [tensor(1), tensor(1), tensor(1), tensor(1), t...  
5  [tensor(1), tensor(1), tensor(1), tensor(1), t...  


In [None]:
from sklearn.model_selection import train_test_split

# Split the data
train_df, val_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")


Train size: 20997, Validation size: 2625, Test size: 2625


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

class SignLanguageDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {
            'video_features': torch.tensor(row['padded_numpy_array'], dtype=torch.float32),
            'input_ids': torch.tensor(row['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long)
        }

# Create datasets
train_dataset = SignLanguageDataset(train_df)
val_dataset = SignLanguageDataset(val_df)
test_dataset = SignLanguageDataset(test_df)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [None]:
# Inspect the first item of the train dataset to check if data is loaded correctly
sample = train_dataset[0]
print(f"Video Features Shape: {sample['video_features'].shape}")
print(f"Input IDs Shape: {sample['input_ids'].shape}")
print(f"Attention Mask Shape: {sample['attention_mask'].shape}")


Video Features Shape: torch.Size([300, 1024])
Input IDs Shape: torch.Size([25])
Attention Mask Shape: torch.Size([25])


  'input_ids': torch.tensor(row['input_ids'], dtype=torch.long),
  'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long)


In [None]:
# Get the first batch from the train_loader to check the data
batch = next(iter(train_loader))
print(f"Video Features Batch Shape: {batch['video_features'].shape}")
print(f"Input IDs Batch Shape: {batch['input_ids'].shape}")
print(f"Attention Mask Batch Shape: {batch['attention_mask'].shape}")

Video Features Batch Shape: torch.Size([8, 300, 1024])
Input IDs Batch Shape: torch.Size([8, 25])
Attention Mask Batch Shape: torch.Size([8, 25])


  'input_ids': torch.tensor(row['input_ids'], dtype=torch.long),
  'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long)


In [None]:
from transformers import T5ForConditionalGeneration

# Load the pretrained T5 model
decoder = T5ForConditionalGeneration.from_pretrained("t5-small")


In [None]:
import torch
import torch.nn as nn
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput

class I3DEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, lstm_hidden_dim, num_layers=1, bidirectional=False):
        super(I3DEncoder, self).__init__()
        self.linear_proj = nn.Linear(input_dim, hidden_dim)  # Linear projection to match LSTM input size
        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional
        )
        # Update the hidden dimension after LSTM (doubled if bidirectional)
        self.hidden_dim = lstm_hidden_dim * (2 if bidirectional else 1)

    def forward(self, x):
        # x: [batch_size, seq_len, input_dim] (I3D features)
        x = self.linear_proj(x)  # Shape: [batch_size, seq_len, hidden_dim]
        lstm_out, _ = self.lstm(x)  # Shape: [batch_size, seq_len, lstm_hidden_dim]
        return lstm_out


In [None]:
!pip install transformers



In [None]:
class SignLanguageRecognitionModel(nn.Module):
    def __init__(self, encoder, decoder):
        super(SignLanguageRecognitionModel, self).__init__()
        self.encoder = encoder  # I3D Encoder
        self.decoder = decoder  # T5 Decoder

        # Add a linear layer to project encoder outputs to the correct dimension for the decoder
        self.projection = nn.Linear(encoder.hidden_dim, decoder.config.d_model)

    def forward(self, video_features, decoder_input_ids, attention_mask):
        # Step 1: Pass video features through the encoder
        encoder_outputs = self.encoder(video_features)

        # Step 2: Project encoder outputs to the correct dimension
        encoder_outputs = self.projection(encoder_outputs)

        # Wrap encoder outputs in BaseModelOutput
        encoder_outputs = BaseModelOutput(last_hidden_state=encoder_outputs)

        # Step 3: Pass the projected features to the T5 decoder
        decoder_outputs = self.decoder(
            input_ids=None,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            encoder_outputs=encoder_outputs  # Pass BaseModelOutput to the decoder
        )
        return decoder_outputs

In [None]:
# Define parameters
input_dim = 1024  # I3D feature dimension
hidden_dim = 512  # Dimension after linear projection
lstm_hidden_dim = 512  # LSTM hidden state size
num_layers = 2  # Number of LSTM layers
bidirectional = True  # Use bidirectional LSTM
decoder_hidden_dim = 512  # T5 decoder expects this hidden_dim

# Initialize the LSTM-based I3DEncoder
encoder = I3DEncoder(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    lstm_hidden_dim=lstm_hidden_dim,
    num_layers=num_layers,
    bidirectional=bidirectional
)

# Load the pre-trained T5 decoder
decoder = T5ForConditionalGeneration.from_pretrained('t5-small')

# Create the Sign Language Recognition Model
model = SignLanguageRecognitionModel(encoder, decoder)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


SignLanguageRecognitionModel(
  (encoder): I3DEncoder(
    (linear_proj): Linear(in_features=1024, out_features=512, bias=True)
    (lstm): LSTM(512, 512, num_layers=2, batch_first=True, bidirectional=True)
  )
  (decoder): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
        

In [None]:
from torch.optim import AdamW

# Define the optimizer
optimizer = AdamW(model_transformers.parameters(), lr=5e-5)

# Define the loss function
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [None]:
num_epochs = 50
patience = 3  # Number of epochs to wait before stopping if no improvement
best_val_loss = float('inf')  # Initialize the best validation loss to a high value
patience_counter = 0  # Counter to track the patience

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Move data to the device
        video_features = batch['video_features'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        labels = input_ids[:, 1:].contiguous()

        # Create an attention mask for the video features
        video_attention_mask = torch.ones(video_features.shape[:2], dtype=torch.long, device=device)

        # Forward pass
        outputs = model(video_features, input_ids, video_attention_mask)
        logits = outputs.logits  # Predicted token logits

        # Get predictions excluding the last token (usually a special token like EOS)
        predicted_tokens = logits[:, :-1, :].contiguous()

        # Calculate loss
        loss = loss_fn(predicted_tokens.view(-1, predicted_tokens.size(-1)), labels.view(-1))
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Calculate average training loss for this epoch
    avg_train_loss = total_loss / len(train_loader)

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            # Move data to the device
            video_features = batch['video_features'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            labels = input_ids[:, 1:].contiguous()
            video_attention_mask = torch.ones(video_features.shape[:2], dtype=torch.long, device=device)

            # Forward pass
            outputs = model(video_features, input_ids, video_attention_mask)
            logits = outputs.logits
            predicted_tokens = logits[:, :-1, :].contiguous()

            # Calculate loss
            loss = loss_fn(predicted_tokens.view(-1, predicted_tokens.size(-1)), labels.view(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

    # Check for improvement
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0  # Reset the patience counter
        # Optionally, save the model
        torch.save(model.state_dict(), "/content/drive/MyDrive/best_model2_with LSTM.pth")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break


  'input_ids': torch.tensor(row['input_ids'], dtype=torch.long),
  'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/50, Training Loss: 4.3463, Validation Loss: 3.8806
Epoch 2/50, Training Loss: 3.9872, Validation Loss: 3.7113
Epoch 3/50, Training Loss: 3.8142, Validation Loss: 3.6206
Epoch 4/50, Training Loss: 3.6815, Validation Loss: 3.5280
Epoch 5/50, Training Loss: 3.5594, Validation Loss: 3.4556
Epoch 6/50, Training Loss: 3.4308, Validation Loss: 3.3800
Epoch 7/50, Training Loss: 3.3091, Validation Loss: 3.3310
Epoch 8/50, Training Loss: 3.1915, Validation Loss: 3.2907
Epoch 9/50, Training Loss: 3.0761, Validation Loss: 3.2729
Epoch 10/50, Training Loss: 2.9642, Validation Loss: 3.2479
Epoch 11/50, Training Loss: 2.8558, Validation Loss: 3.2364
Epoch 12/50, Training Loss: 2.7458, Validation Loss: 3.2476
Epoch 13/50, Training Loss: 2.6437, Validation Loss: 3.2668
Epoch 14/50, Training Loss: 2.5403, Validation Loss: 3.2879
Early stopping triggered!


In [None]:
# Load the saved weights into the model
model.load_state_dict(torch.load("/content/drive/MyDrive/best_model2_with LSTM.pth"))
model.to(device)


  model.load_state_dict(torch.load("/content/drive/MyDrive/best_model2_with LSTM.pth"))


SignLanguageRecognitionModel(
  (encoder): I3DEncoder(
    (linear_proj): Linear(in_features=1024, out_features=512, bias=True)
    (lstm): LSTM(512, 512, num_layers=2, batch_first=True, bidirectional=True)
  )
  (decoder): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
        

In [None]:
def predict(model, tokenizer, video_features, max_length=25):
    """
    Generate predictions for given video features.

    Args:
        model (SignLanguageRecognitionModel): Trained model instance.
        tokenizer (T5Tokenizer): Tokenizer used during preprocessing and training.
        video_features (Union[np.ndarray, torch.Tensor]): Input video features of shape (seq_len, feature_dim).
        max_length (int): Maximum length of the output sequence.

    Returns:
        str: Decoded prediction as text.
    """
    model.eval()  # Set the model to evaluation mode

    with torch.no_grad():
        # Convert video_features to a PyTorch tensor if it's a NumPy array
        if isinstance(video_features, np.ndarray):
            video_features = torch.from_numpy(video_features).float()

        # Ensure video_features is on the correct device
        video_features = video_features.to(device)

        # Add a batch dimension to video features
        video_features = video_features.unsqueeze(0)
        print(video_features.shape)

        # Create an attention mask for video features
        video_attention_mask = torch.ones(video_features.shape[:2], dtype=torch.long, device=device)
        print(video_attention_mask.shape)

        # Generate the output sequence using the decoder
        outputs = model.decoder.generate(
            input_ids=None,  # No input sequence for T5 since it's decoder-only generation
            encoder_outputs=BaseModelOutput(
                last_hidden_state=model.projection(model.encoder(video_features))
            ),
            attention_mask=video_attention_mask,
            max_length=max_length,
            num_beams=2,  # Beam search for better predictions
            early_stopping=True
        )
        print(outputs)

        # Decode the generated token IDs to a string
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return prediction


In [None]:
# Example video features tensor
sample_video_features = filtered_df['padded_numpy_array'][4]  # Replace with actual video features

# Predict the translation
translation = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {translation}")

torch.Size([1, 300, 1024])
torch.Size([1, 300])
tensor([[   0,    3,   99,   39,   15,  352,   12,  241,   12, 3197,   34,   91,
           13,    8, 2182,    1]])
Predicted Translation: if youre going to want to pull it out of the bag


In [None]:
filtered_df['translation'][4]

'they also fold back up'

In [None]:
# Example video features tensor
sample_video_features = filtered_df['padded_numpy_array'][8932]  # Replace with actual video features

# Predict the translation
translation = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {translation}")

torch.Size([1, 300, 1024])
torch.Size([1, 300])
tensor([[    0,     3,    23,    43,     3,     9, 15305,     1]])
Predicted Translation: i have a zoom


In [None]:
filtered_df['translation'][8932]

'the force and the velocity combined makes the ball go further'

In [None]:
# Example video features tensor
sample_video_features = filtered_df['padded_numpy_array'][791]  # Replace with actual video features

# Predict the translation
translation = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {translation}")

torch.Size([1, 300, 1024])
torch.Size([1, 300])
tensor([[   0,    3,   88,    7,    3,    9,  779, 7523,   24,   54,  199,  376,
          369,  223,   12,    8, 6476,    7,   11,    3,   88,   54,  199,  376,
          369]])
Predicted Translation: hes a major developer that can help him come back to the knees and he can help him come


In [None]:
filtered_df['translation'][791]

'and mainly besides just the warming and the holding of the joints it can help make them feel loved and wanted'

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def predict(model, tokenizer, video_features, max_length=25):
    """
    Generate predictions for given video features.

    Args:
        model (SignLanguageRecognitionModel): Trained model instance.
        tokenizer (T5Tokenizer): Tokenizer used during preprocessing and training.
        video_features (Union[np.ndarray, torch.Tensor]): Input video features of shape (seq_len, feature_dim).
        max_length (int): Maximum length of the output sequence.

    Returns:
        str: Decoded prediction as text.
    """
    model.eval()

    with torch.no_grad():
        # Convert video_features to a PyTorch tensor
        if isinstance(video_features, np.ndarray):
            video_features = torch.from_numpy(video_features).float()

        # Ensure video_features is on the correct device
        video_features = video_features.to(device)

        # Add a batch dimension to video features
        video_features = video_features.unsqueeze(0)

        # Create an attention mask for video features
        video_attention_mask = torch.ones(video_features.shape[:2], dtype=torch.long, device=device)

        # Generate the output sequence using the decoder
        outputs = model.decoder.generate(
            input_ids=None,
            encoder_outputs=BaseModelOutput(
                last_hidden_state=model.projection(model.encoder(video_features))
            ),
            attention_mask=video_attention_mask,
            max_length=max_length,
            num_beams=2,  # Beam search for better predictions
            early_stopping=True
        )

        # Decode the generated token IDs to a string
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return prediction


sample_video_features = filtered_df['padded_numpy_array'][14]

# Predict the translation
prediction = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][14]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: youre going to be talking about how you can do it
Ground Truth Translation: but depending on how you cook it you can tenderize it
BLEU score: 0.05637560315259291


In [None]:
sample_video_features = filtered_df['padded_numpy_array'][5]

# Predict the translation
prediction = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][5]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")

Predicted Translation: if youre not going to hit the ball and you want to be able to take it out
Ground Truth Translation: youre only one swing thought away from hooking the ball and losing your slice and this could be it
BLEU score: 0.0601429426464788


In [None]:
sample_video_features = filtered_df['padded_numpy_array'][13]

# Predict the translation
prediction = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][13]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")

Predicted Translation: i think its important to remember to keep the thread on the other side so that you can stay on the other side
Ground Truth Translation: so the rules get a little bit convoluted but its important to remember first of all to stay on the strip
BLEU score: 0.1629944673128894


In [None]:
sample_video_features = filtered_df['padded_numpy_array'][3]

# Predict the translation
prediction = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][3]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")

Predicted Translation: i have a process that you can do on your computer
Ground Truth Translation: but what itll do is itll bring up a window on your computer that brings up the task manager
BLEU score: 0.05361218207146106


In [None]:
sample_video_features = filtered_df['padded_numpy_array'][16]

# Predict the translation
prediction = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][16]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")

Predicted Translation: i am going to talk about the power swings and the balance of the chest
Ground Truth Translation: dont worry about your power dont worry about getting everything perfect just snap off a lot of punches
BLEU score: 0.014242474285751547


In [None]:
sample_video_features = filtered_df['padded_numpy_array'][17]

# Predict the translation
prediction = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][17]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")

Predicted Translation: a lot of people are going to need a car that has a little bit of a little bit of
Ground Truth Translation: and doing this could be a little harder in smaller vehicle but it still could be done
BLEU score: 0.020364851292391


In [None]:
sample_video_features = filtered_df['padded_numpy_array'][9]

# Predict the translation
prediction = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][9]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")

Predicted Translation: i have a squat so i have a squat so i have
Ground Truth Translation: but a good player that has the strokes once they get the wheel chair down youre in trouble
BLEU score: 0.01033114956441737


In [None]:
sample_video_features = filtered_df['padded_numpy_array'][18]

# Predict the translation
prediction = predict(model, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][18]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")

Predicted Translation: i need to get it done
Ground Truth Translation: this is a flamingo catch
BLEU score: 0


In [None]:
import torch
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import pandas as pd
import numpy as np

def calculate_dataset_bleu(model, tokenizer, dataloader, device):
    """
    Calculate BLEU scores for an entire dataset

    Args:
        model: The trained model
        tokenizer: The tokenizer used for text processing
        dataloader: DataLoader containing the dataset
        device: Device to run the model on

    Returns:
        dict: Dictionary containing BLEU scores and predictions
    """
    model.eval()
    all_bleu_scores = []
    all_predictions = []
    all_references = []
    smoothing = SmoothingFunction().method1

    with torch.no_grad():
        # Use tqdm for progress bar
        for batch in tqdm(dataloader, desc="Calculating BLEU scores"):
            # Get video features and reference text
            video_features = batch['video_features'].to(device)

            # Get reference texts by decoding input_ids
            references = [tokenizer.decode(ids, skip_special_tokens=True)
                         for ids in batch['input_ids']]

            # Generate predictions for the batch
            for i in range(len(video_features)):
                video_feature = video_features[i].unsqueeze(0)
                video_attention_mask = torch.ones(video_feature.shape[:2],
                                               dtype=torch.long,
                                               device=device)

                # Generate prediction
                outputs = model.decoder.generate(
                    input_ids=None,
                    encoder_outputs=BaseModelOutput(
                        last_hidden_state=model.projection(model.encoder(video_feature))
                    ),
                    attention_mask=video_attention_mask,
                    max_length=25,
                    num_beams=2,
                    early_stopping=True
                )

                # Decode prediction
                prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
                reference = references[i]

                # Calculate BLEU score
                reference_tokens = reference.split()
                prediction_tokens = prediction.split()

                bleu_score = sentence_bleu([reference_tokens],
                                         prediction_tokens,
                                         smoothing_function=smoothing)

                all_bleu_scores.append(bleu_score)
                all_predictions.append(prediction)
                all_references.append(reference)

    # Calculate statistics
    results = {
        'mean_bleu': np.mean(all_bleu_scores),
        'median_bleu': np.median(all_bleu_scores),
        'std_bleu': np.std(all_bleu_scores),
        'min_bleu': np.min(all_bleu_scores),
        'max_bleu': np.max(all_bleu_scores),
        'bleu_scores': all_bleu_scores,

    }

    return results

def evaluate_all_splits(model, tokenizer, train_loader, val_loader, test_loader, device):
    """
    Evaluate BLEU scores for all data splits
    """
    print("Evaluating training set...")
    train_results = calculate_dataset_bleu(model, tokenizer, train_loader, device)

    print("Evaluating validation set...")
    val_results = calculate_dataset_bleu(model, tokenizer, val_loader, device)

    print("Evaluating test set...")
    test_results = calculate_dataset_bleu(model, tokenizer, test_loader, device)

    # Create summary DataFrame
    summary_data = {
        'Metric': ['Mean BLEU', 'Median BLEU', 'Std BLEU', 'Min BLEU', 'Max BLEU'],
        'Train': [train_results['mean_bleu'], train_results['median_bleu'],
                 train_results['std_bleu'], train_results['min_bleu'],
                 train_results['max_bleu']],
        'Validation': [val_results['mean_bleu'], val_results['median_bleu'],
                      val_results['std_bleu'], val_results['min_bleu'],
                      val_results['max_bleu']],
        'Test': [test_results['mean_bleu'], test_results['median_bleu'],
                test_results['std_bleu'], test_results['min_bleu'],
                test_results['max_bleu']]
    }

    summary_df = pd.DataFrame(summary_data)

    # Save detailed results
    all_results = {
        'train': train_results,
        'validation': val_results,
        'test': test_results,
        'summary': summary_df
    }

    return all_results

# Function to save results to CSV files
def save_results(results, output_dir='bleu_results'):
    """
    Save evaluation results to CSV files
    """
    import os
    os.makedirs(output_dir, exist_ok=True)

    # Save summary
    results['summary'].to_csv(f'{output_dir}/summary_metrics.csv', index=False)

    # Save detailed results for each split
    for split in ['train', 'validation', 'test']:
        detailed_df = pd.DataFrame({
            'bleu_score': results[split]['bleu_scores']
        })
        detailed_df.to_csv(f'{output_dir}/{split}_detailed_results.csv', index=False)

In [None]:
results = evaluate_all_splits(model, tokenizer, train_loader, val_loader, test_loader, device)

# Save the results to CSV
save_results(results, output_dir='bleu_results')

Evaluating training set...


  'input_ids': torch.tensor(row['input_ids'], dtype=torch.long),
  'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long)
Calculating BLEU scores: 100%|██████████| 2625/2625 [1:50:49<00:00,  2.53s/it]


Evaluating validation set...


Calculating BLEU scores: 100%|██████████| 329/329 [13:57<00:00,  2.54s/it]


Evaluating test set...


Calculating BLEU scores: 100%|██████████| 329/329 [14:01<00:00,  2.56s/it]


In [None]:
summery_df  = results['summary']

In [None]:
summery_df

Unnamed: 0,Metric,Train,Validation,Test
0,Mean BLEU,0.071204,0.047343,0.049382
1,Median BLEU,0.025607,0.022024,0.021459
2,Std BLEU,0.122937,0.08079,0.088023
3,Min BLEU,0.0,0.0,0.0
4,Max BLEU,1.0,0.846482,1.0


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
from tqdm import tqdm
import pandas as pd

def calculate_bleu_scores(df, model, tokenizer, split_name=""):
    """
    Calculate BLEU scores for all entries in a dataframe

    Args:
        df (pd.DataFrame): DataFrame containing the data
        model: The trained model
        tokenizer: The tokenizer
        split_name (str): Name of the data split for printing purposes

    Returns:
        list: List of BLEU scores
        dict: Dictionary containing statistics
    """
    bleu_scores = []
    predictions = []
    references = []
    smoothing_function = SmoothingFunction().method1

    print(f"\nProcessing {split_name} split...")

    # Process each sample in the dataset
    for idx in tqdm(range(len(df))):
        # Get video features and ground truth
        video_features = df['padded_numpy_array'].iloc[idx]
        reference = df['translation'].iloc[idx]

        # Get prediction
        prediction = predict(model, tokenizer, video_features)

        # Store predictions and references
        predictions.append(prediction)
        references.append(reference)

        # Calculate BLEU score
        reference_tokens = reference.split()
        prediction_tokens = prediction.split()

        try:
            bleu = sentence_bleu([reference_tokens], prediction_tokens,
                               smoothing_function=smoothing_function)
            bleu_scores.append(bleu)
        except Exception as e:
            print(f"Error calculating BLEU score for index {idx}: {e}")
            bleu_scores.append(0.0)

    # Calculate statistics
    stats = {
        'mean_bleu': np.mean(bleu_scores),
        'median_bleu': np.median(bleu_scores),
        'std_bleu': np.std(bleu_scores),
        'min_bleu': np.min(bleu_scores),
        'max_bleu': np.max(bleu_scores)
    }

    # Create a results DataFrame
    results_df = pd.DataFrame({
        'reference': references,
        'prediction': predictions,
        'bleu_score': bleu_scores
    })

    return bleu_scores, stats, results_df

# Calculate BLEU scores for each split
train_bleu, train_stats, train_results = calculate_bleu_scores(train_df, model, tokenizer, "Training")
val_bleu, val_stats, val_results = calculate_bleu_scores(val_df, model, tokenizer, "Validation")
test_bleu, test_stats, test_results = calculate_bleu_scores(test_df, model, tokenizer, "Test")

# Print statistics for each split
print("\nTraining Set Statistics:")
for metric, value in train_stats.items():
    print(f"{metric}: {value:.4f}")

print("\nValidation Set Statistics:")
for metric, value in val_stats.items():
    print(f"{metric}: {value:.4f}")

print("\nTest Set Statistics:")
for metric, value in test_stats.items():
    print(f"{metric}: {value:.4f}")



# Save results to CSV files
train_results.to_csv('train_results_lstm.csv', index=False)
val_results.to_csv('val_results.csv_lstm', index=False)
test_results.to_csv('test_results.csv_lstm', index=False)



Processing Training split...


100%|██████████| 20997/20997 [1:55:04<00:00,  3.04it/s]



Processing Validation split...


100%|██████████| 2625/2625 [14:04<00:00,  3.11it/s]



Processing Test split...


100%|██████████| 2625/2625 [14:07<00:00,  3.10it/s]


Training Set Statistics:
mean_bleu: 0.0689
median_bleu: 0.0247
std_bleu: 0.1214
min_bleu: 0.0000
max_bleu: 1.0000

Validation Set Statistics:
mean_bleu: 0.0459
median_bleu: 0.0207
std_bleu: 0.0800
min_bleu: 0.0000
max_bleu: 0.8465

Test Set Statistics:
mean_bleu: 0.0478
median_bleu: 0.0202
std_bleu: 0.0870
min_bleu: 0.0000
max_bleu: 1.0000





In [None]:
# Save results to CSV files
train_results.to_csv('train_results_lstm.csv', index=False)
val_results.to_csv('val_results_lstm.csv', index=False)
test_results.to_csv('test_results_lstm.csv', index=False)

In [None]:
import torch
import torch.nn as nn


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput

Transformer Encoder

In [None]:
class I3DEncoder2(nn.Module):
    def __init__(self, input_dim, hidden_dim, nhead, num_layers, ff_dim, dropout=0.1, max_seq_len=300):
        # Change I3DEncoder to I3DEncoder2 to correctly call the superclass's __init__ method
        super(I3DEncoder2, self).__init__()
        self.linear_proj = nn.Linear(input_dim, hidden_dim)

        # Positional encoding to provide sequence order information
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_len, hidden_dim))

        # Define Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=nhead,
            dim_feedforward=ff_dim,
            dropout=dropout
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        # x: [batch_size, seq_len, input_dim] (I3D features)
        x = self.linear_proj(x)  # Project to Transformer input dimension: [batch_size, seq_len, hidden_dim]
        x = x + self.positional_encoding[:, :x.size(1), :]  # Add positional encoding

        # Pass through Transformer Encoder (requires [seq_len, batch_size, hidden_dim])
        x = self.transformer(x.permute(1, 0, 2))  # [seq_len, batch_size, hidden_dim]
        x = x.permute(1, 0, 2)  # Back to [batch_size, seq_len, hidden_dim]
        return x

In [None]:
# Define parameters for Transformer Encoder
input_dim = 1024  # I3D feature dimension
hidden_dim = 512  # Hidden size for Transformer and T5 compatibility
nhead = 8         # Number of attention heads
num_layers = 4    # Number of Transformer layers
ff_dim = 2048     # Feedforward network dimension
dropout = 0.2     # Dropout rate

# Initialize the Transformer-based I3DEncoder
encoder = I3DEncoder2(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    nhead=nhead,
    num_layers=num_layers,
    ff_dim=ff_dim,
    dropout=dropout,
    max_seq_len=300  # Assuming maximum sequence length is 300
)

# Load the pre-trained T5 decoder
decoder = T5ForConditionalGeneration.from_pretrained('t5-small')

# Create the Sign Language Recognition Model
model_transformers = SignLanguageRecognitionModel(encoder, decoder)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_transformers.to(device)


SignLanguageRecognitionModel(
  (encoder): I3DEncoder2(
    (linear_proj): Linear(in_features=1024, out_features=512, bias=True)
    (transformer): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.2, inplace=False)
          (dropout2): Dropout(p=0.2, inplace=False)
        )
      )
    )
  )
  (decoder): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): E

In [None]:
num_epochs = 50
patience = 2  # Number of epochs to wait before stopping if no improvement
best_val_loss = float('inf')  # Initialize the best validation loss to a high value
patience_counter = 0  # Counter to track the patience

for epoch in range(num_epochs):
    model_transformers.train()
    total_loss = 0

    for batch in train_loader:
        # Move data to the device
        video_features = batch['video_features'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        labels = input_ids[:, 1:].contiguous()

        # Create an attention mask for the video features
        video_attention_mask = torch.ones(video_features.shape[:2], dtype=torch.long, device=device)

        # Forward pass
        outputs = model_transformers(video_features, input_ids, video_attention_mask)
        logits = outputs.logits  # Predicted token logits

        # Get predictions excluding the last token (usually a special token like EOS)
        predicted_tokens = logits[:, :-1, :].contiguous()

        # Calculate loss
        loss = loss_fn(predicted_tokens.view(-1, predicted_tokens.size(-1)), labels.view(-1))
        total_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Calculate average training loss for this epoch
    avg_train_loss = total_loss / len(train_loader)

    # Validation phase
    model_transformers.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            # Move data to the device
            video_features = batch['video_features'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            labels = input_ids[:, 1:].contiguous()
            video_attention_mask = torch.ones(video_features.shape[:2], dtype=torch.long, device=device)

            # Forward pass
            outputs = model_transformers(video_features, input_ids, video_attention_mask)
            logits = outputs.logits
            predicted_tokens = logits[:, :-1, :].contiguous()

            # Calculate loss
            loss = loss_fn(predicted_tokens.view(-1, predicted_tokens.size(-1)), labels.view(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

    # Check for improvement
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0  # Reset the patience counter
        # Optionally, save the model
        torch.save(model_transformers.state_dict(), "/content/drive/MyDrive/best_model_with_Transformers.pth")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break


  'input_ids': torch.tensor(row['input_ids'], dtype=torch.long),
  'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long)


Epoch 1/50, Training Loss: 4.1133, Validation Loss: 3.8548
Epoch 2/50, Training Loss: 3.9313, Validation Loss: 3.7490
Epoch 3/50, Training Loss: 3.7823, Validation Loss: 3.7227
Epoch 4/50, Training Loss: 3.6495, Validation Loss: 3.6544
Epoch 5/50, Training Loss: 3.5157, Validation Loss: 3.6482
Epoch 6/50, Training Loss: 3.3846, Validation Loss: 3.6139
Epoch 7/50, Training Loss: 3.2514, Validation Loss: 3.6311
Epoch 8/50, Training Loss: 3.1168, Validation Loss: 3.6505
Early stopping triggered!


In [None]:
# Load the saved weights into the model
model_transformers.load_state_dict(torch.load("/content/drive/MyDrive/best_model_with_Transformers.pth"))
model_transformers.to(device)


  model_transformers.load_state_dict(torch.load("/content/drive/MyDrive/best_model_with_Transformers.pth"))


SignLanguageRecognitionModel(
  (encoder): I3DEncoder2(
    (linear_proj): Linear(in_features=1024, out_features=512, bias=True)
    (transformer): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.2, inplace=False)
          (dropout2): Dropout(p=0.2, inplace=False)
        )
      )
    )
  )
  (decoder): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): E

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def predict(model, tokenizer, video_features, max_length=25):
    """
    Generate predictions for given video features.

    Args:
        model (SignLanguageRecognitionModel): Trained model instance.
        tokenizer (T5Tokenizer): Tokenizer used during preprocessing and training.
        video_features (Union[np.ndarray, torch.Tensor]): Input video features of shape (seq_len, feature_dim).
        max_length (int): Maximum length of the output sequence.

    Returns:
        str: Decoded prediction as text.
    """
    model.eval()

    with torch.no_grad():
        # Convert video_features to a PyTorch tensor
        if isinstance(video_features, np.ndarray):
            video_features = torch.from_numpy(video_features).float()

        # Ensure video_features is on the correct device
        video_features = video_features.to(device)

        # Add a batch dimension to video features
        video_features = video_features.unsqueeze(0)

        # Create an attention mask for video features
        video_attention_mask = torch.ones(video_features.shape[:2], dtype=torch.long, device=device)

        # Generate the output sequence using the decoder
        outputs = model.decoder.generate(
            input_ids=None,
            encoder_outputs=BaseModelOutput(
                last_hidden_state=model.projection(model.encoder(video_features))
            ),
            attention_mask=video_attention_mask,
            max_length=max_length,
            num_beams=2,  # Beam search for better predictions
            early_stopping=True
        )

        # Decode the generated token IDs to a string
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return prediction


sample_video_features = filtered_df['padded_numpy_array'][14]

# Predict the translation
prediction = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][14]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: i can roast it up with a little bit of a roaster
Ground Truth Translation: but depending on how you cook it you can tenderize it
BLEU score: 0.020255986027125642


In [None]:
# Example video features tensor
sample_video_features = filtered_df['padded_numpy_array'][4]  # Replace with actual video features

# Predict the translation
translation = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {translation}")

Predicted Translation: i have a sleeve and a sleeve


In [None]:
# Get the ground truth translation
reference = filtered_df['translation'][4]
print(f"Ground Truth Translation: {reference}")

Ground Truth Translation: they also fold back up


In [None]:
# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


BLEU score: 0.017033186037639283


In [None]:

sample_video_features = filtered_df['padded_numpy_array'][5]

# Predict the translation
prediction = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][5]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: if you want to hit the ball you want to take your swing to the right side of the ball so you can
Ground Truth Translation: youre only one swing thought away from hooking the ball and losing your slice and this could be it
BLEU score: 0.020828838183973034


In [None]:

sample_video_features = filtered_df['padded_numpy_array'][13]

# Predict the translation
prediction = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][13]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: a lot of times it is very important to keep in mind that it is important to keep in mind that it is
Ground Truth Translation: so the rules get a little bit convoluted but its important to remember first of all to stay on the strip
BLEU score: 0.022023814946586635


In [None]:

sample_video_features = filtered_df['padded_numpy_array'][3]

# Predict the translation
prediction = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][3]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: i have a computer that has a computer that has a computer that has a computer that has
Ground Truth Translation: but what itll do is itll bring up a window on your computer that brings up the task manager
BLEU score: 0.023914960914330066


In [None]:

sample_video_features = filtered_df['padded_numpy_array'][16]

# Predict the translation
prediction = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][16]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: i really want to make sure that you have a good workout on the back of the bike
Ground Truth Translation: dont worry about your power dont worry about getting everything perfect just snap off a lot of punches
BLEU score: 0.01284618972676772


In [None]:

sample_video_features = filtered_df['padded_numpy_array'][17]

# Predict the translation
prediction = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][17]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: if youre going to be a little older you might be able to get a bigger car and you
Ground Truth Translation: and doing this could be a little harder in smaller vehicle but it still could be done
BLEU score: 0.057259987315337754


In [None]:

sample_video_features = filtered_df['padded_numpy_array'][9]

# Predict the translation
prediction = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][9]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: i have a good chance to get a good swing and a good swing if you have a
Ground Truth Translation: but a good player that has the strokes once they get the wheel chair down youre in trouble
BLEU score: 0.025281168697394947


In [None]:

sample_video_features = filtered_df['padded_numpy_array'][18]

# Predict the translation
prediction = predict(model_transformers, tokenizer, sample_video_features)
print(f"Predicted Translation: {prediction}")

# Get the ground truth translation
reference = filtered_df['translation'][18]
print(f"Ground Truth Translation: {reference}")

# Tokenize the reference and prediction
reference_tokens = reference.split()
prediction_tokens = prediction.split()  # Model prediction tokens

# Apply BLEU score calculation with smoothing
smoothing_function = SmoothingFunction().method1  # Smoothing to handle cases with zero n-gram matches
bleu_score = sentence_bleu([reference_tokens], prediction_tokens, smoothing_function=smoothing_function)

# Print BLEU score
print(f"BLEU score: {bleu_score}")


Predicted Translation: i need it
Ground Truth Translation: this is a flamingo catch
BLEU score: 0


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import numpy as np
import pandas as pd

def evaluate_split(model, tokenizer, df, split_name):
    bleu_scores = []
    predictions = []
    smoothing = SmoothingFunction().method1

    print(f"\nEvaluating {split_name} split...")
    for idx in tqdm(range(len(df))):
        video_features = df['padded_numpy_array'].iloc[idx]
        reference = df['translation'].iloc[idx]

        try:
            prediction = predict(model_transformers, tokenizer, video_features)
            bleu = sentence_bleu([reference.split()], prediction.split(), smoothing_function=smoothing)

            predictions.append({
                'id': df['id'].iloc[idx],
                'reference': reference,
                'prediction': prediction,
                'bleu': bleu
            })
            bleu_scores.append(bleu)
        except Exception as e:
            print(f"Error at index {idx}: {e}")
            continue

    results = pd.DataFrame(predictions)
    results.to_csv(f'{split_name.lower()}_predictions.csv', index=False)

    metrics = {
        'mean_bleu': np.mean(bleu_scores),
        'median_bleu': np.median(bleu_scores),
        'std_bleu': np.std(bleu_scores),
        'min_bleu': np.min(bleu_scores),
        'max_bleu': np.max(bleu_scores)
    }

    print(f"\n{split_name} Results:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    return metrics, results

# Evaluate all splits
train_metrics, train_results = evaluate_split(model_transformers, tokenizer, train_df, 'Train')
val_metrics, val_results = evaluate_split(model_transformers, tokenizer, val_df, 'Validation')
test_metrics, test_results = evaluate_split(model_transformers, tokenizer, test_df, 'Test')



Evaluating Train split...


100%|██████████| 20997/20997 [1:47:57<00:00,  3.24it/s]



Train Results:
mean_bleu: 0.0334
median_bleu: 0.0175
std_bleu: 0.0560
min_bleu: 0.0000
max_bleu: 1.0000

Evaluating Validation split...


100%|██████████| 2625/2625 [13:36<00:00,  3.21it/s]



Validation Results:
mean_bleu: 0.0258
median_bleu: 0.0141
std_bleu: 0.0473
min_bleu: 0.0000
max_bleu: 0.8155

Evaluating Test split...


100%|██████████| 2625/2625 [13:35<00:00,  3.22it/s]


Test Results:
mean_bleu: 0.0255
median_bleu: 0.0144
std_bleu: 0.0418
min_bleu: 0.0000
max_bleu: 0.4483



