In [1]:
import os
import pandas as pd
# Expose only physical GPU 2 to this process; set here, before `torch` is imported further down.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
MAX_FRAMES = 300  # passed to FeatureVectorDataset_Isign when the eval dataset is built
max_length_decoder = 128  # tokenizer padding/truncation length, GPT-2 n_positions and generation max_length
batch_size = 64  # DataLoader batch size
num_encoder_layers = 4  # BertConfig.num_hidden_layers
num_decoder_layers = 4  # GPT2Config.n_layer
encoder_hidden_size = 512  # BertConfig.hidden_size (also the FeatureProjection output width)
decoder_hidden_size = 512  # GPT2Config.n_embd
num_attention_heads = 8  # used for both the encoder and the decoder config
dropout = 0.1  # used for every dropout setting of both configs
num_keypoints = 152  # input_dim of FeatureProjection
WEIGTH_DECAY = 0.01  # AdamW weight_decay (misspelled name kept: it is referenced where the optimizer is built)
learning_rate = 3e-4  # AdamW lr
num_beams = 3  # beam width passed to model.generate



import re
import numpy as np
import torch
import wandb
import random
import gc
import collections
import math
import ast
import collections
import math
import sacrebleu

from tqdm import tqdm
from transformers import (
    BertConfig, BertModel,
    GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
    EncoderDecoderModel,
    PreTrainedTokenizerFast,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    get_constant_schedule_with_warmup
)
from datasets import Dataset
from torch.utils.data import DataLoader
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from bleu_cal import quick_bleu_metric
from dataloaders import FeatureVectorDataset, FeatureVectorDataset_Isign
from pose_format import Pose
from pose_format.pose_visualizer import PoseVisualizer
from itertools import cycle

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus all CUDA devices), then puts cuDNN into
    deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    # The generators are independent of each other, so the seeding order is irrelevant.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Set random seed for reproducibility
set_seed()

# iSign pose directory and the dataset options later handed to FeatureVectorDataset_Isign.
POSE_DIR_ISIGN = "/DATA7/vaibhav/isign/Data/iSign-poses_v1.1/"
STEP_FRAMES_ISIGN = None
ADD_NOISE_ISIGN = False

# The target-side vocabulary is learned from the 'text' column of both CSVs.
train_df2 = pd.read_csv("/DATA3/vaibhav/isign/PretrainingISL/isign_new.csv")
#train_df = pd.read_csv('/DATA3/vaibhav/isign/PretrainingISL/train_MT16M.csv')
train_df = pd.read_csv('/DATA3/vaibhav/isign/PretrainingISL/train_BLIMPCISLR.csv')
all_sequences_target = train_df['text'].tolist() + train_df2['text'].tolist()

# Initialize and train a whitespace-pre-tokenized BPE tokenizer
tokenizer_model = models.BPE()
tokenizer = Tokenizer(tokenizer_model)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

vocab_size_decoder = 15000

# The special tokens are listed first; the GPT2Config printed later in this
# notebook reports bos=0, pad=1, eos=2, i.e. their positions in this list.
trainer = trainers.BpeTrainer(
    vocab_size=vocab_size_decoder,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<PERSON>", "<UNKNOWN>"]
)

tokenizer.train_from_iterator(all_sequences_target, trainer=trainer)

# Save the tokenizer. exist_ok=True creates the directory only when needed and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('tokenizer_file', exist_ok=True)

tokenizer.save("tokenizer_file/target_tokenizer.json")

# Reload the saved file as a PreTrainedTokenizerFast and register the special-token roles
tokenizer_target = PreTrainedTokenizerFast(tokenizer_file="tokenizer_file/target_tokenizer.json")
tokenizer_target.add_special_tokens({
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
    'additional_special_tokens': ['<PERSON>', '<UNKNOWN>']
})




  from .autonotebook import tqdm as notebook_tqdm







0

In [2]:
# Checkpoint whose weights are evaluated by this notebook.
best_checkpoint_path_isignB4 = '/DATA3/vaibhav/isign/PretrainingISL/predictions_new/BLIMP_Pretraining_BlimpFrameMatchA100_Linear60kBPE0.85Threshold_PT1_best_model_checkpoint_isignB4.pth'

# Basename of the predictions CSV written by model_eval.
save_name = "IsignMLIMP_PT1"

# Validation split; switch to the commented line to score the test split instead.
eval_df2 = pd.read_csv('/DATA7/vaibhav/tokenization/val_split_unicode_filtered.csv')
#eval_df2 = pd.read_csv('/DATA7/vaibhav/tokenization/test_split_unicode_filtered.csv')

eval2_video_uids = eval_df2['uid'].tolist()

# Wrap every sentence in <s>...</s>, then encode to id lists of exactly max_length_decoder entries.
eval2_labels = tokenizer_target(
    [f'<s>{text}</s>' for text in eval_df2['text'].tolist()],
    padding="max_length",
    truncation=True,
    max_length=max_length_decoder,
)['input_ids']

eval2_dataset = FeatureVectorDataset_Isign(
    eval2_video_uids,
    tokenizer_target,
    MAX_FRAMES,
    POSE_DIR_ISIGN,
    eval2_labels,
    add_noise=ADD_NOISE_ISIGN,
    step_frames=STEP_FRAMES_ISIGN,
)
eval2_loader = DataLoader(
    eval2_dataset,
    batch_size=batch_size,
    pin_memory=True,
    num_workers=2,
    prefetch_factor=2,
)


In [3]:
# --- Encoder: BERT stack; every setting not listed here stays at the BertConfig default ---
encoder_config = BertConfig(
    num_hidden_layers=num_encoder_layers,
    hidden_size=encoder_hidden_size,
    num_attention_heads=num_attention_heads,
    attention_probs_dropout_prob=dropout,  # dropout on attention weights
    hidden_dropout_prob=dropout,  # dropout after fully connected layers
)
encoder = BertModel(encoder_config)
print(encoder_config)

# --- Decoder: GPT-2 language-model head over the target vocabulary ---
decoder_config = GPT2Config(
    vocab_size=len(tokenizer_target),
    n_positions=max_length_decoder,  # targets are padded/truncated to this length
    n_layer=num_decoder_layers,
    n_embd=decoder_hidden_size,
    n_head=num_attention_heads,
    add_cross_attention=True,  # lets the decoder attend to the encoder outputs
    bos_token_id=tokenizer_target.bos_token_id,
    eos_token_id=tokenizer_target.eos_token_id,
    pad_token_id=tokenizer_target.pad_token_id,
    resid_pdrop=dropout,  # dropout on residual connections
    attn_pdrop=dropout,  # dropout on attention probabilities
    embd_pdrop=dropout,  # dropout on embeddings
)
print(decoder_config)
decoder = GPT2LMHeadModel(decoder_config)

########################################################
#decoder.resize_token_embeddings(len(tokenizer_target))
########################################################

# Two-layer MLP that maps raw feature vectors to the encoder's hidden size
class FeatureProjection(torch.nn.Module):
    """Project feature vectors to the width expected by the encoder.

    Computes ``Linear(input_dim -> hidden_dims)``, GELU, then
    ``Linear(hidden_dims -> output_dim)`` over the last dimension of the input.

    Args:
        input_dim: Size of the last dimension of the incoming tensor.
        output_dim: Size of the last dimension of the result.
        hidden_dims: Width of the intermediate layer. Defaults to 1024.
    """

    def __init__(self, input_dim, output_dim, hidden_dims = 1024):
        super().__init__()
        # The attribute names become state_dict keys, so they must stay as they are.
        self.linear1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dims)
        self.linear2 = torch.nn.Linear(in_features=hidden_dims, out_features=output_dim)
        self.gelu = torch.nn.GELU()

    def forward(self, x):
        """Return ``linear2(gelu(linear1(x)))``."""
        return self.linear2(self.gelu(self.linear1(x)))


# Build the input projection (kept as a separate module) and wrap the encoder
# and decoder into a single EncoderDecoderModel.
feature_projection = FeatureProjection(num_keypoints, encoder_config.hidden_size)
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# Copy the special-token ids, vocabulary size and maximum target length onto
# the combined config. update() sets each entry as an attribute, in this order.
model.config.update({
    "decoder_start_token_id": tokenizer_target.bos_token_id,
    "eos_token_id": tokenizer_target.eos_token_id,
    "pad_token_id": tokenizer_target.pad_token_id,
    "vocab_size": decoder_config.vocab_size,
    "max_length": max_length_decoder,
})


BertConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 512,
  "n_head": 8,
  "n_inner": null,
  "n_layer": 4,
  "n_positions": 128,
  "pad_token_id": 1,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inv

In [4]:
def load_checkpoint(model, feature_projection, optimizer, scheduler, checkpoint_path):
    """Restore model, projection, optimizer and scheduler state from a checkpoint.

    Args:
        model: Module whose weights are loaded from ``'model_state_dict'``.
        feature_projection: Module loaded from ``'feature_projection_state_dict'``.
        optimizer: Optimizer restored from ``'optimizer_state_dict'``.
        scheduler: LR scheduler restored from ``'scheduler_state_dict'``.
        checkpoint_path: Path of the file written by ``torch.save``.

    Returns:
        Always a 7-tuple ``(start_epoch, best_val_B4, best_val_loss,
        best_val_B4_isign, best_val_loss_isign, best_val_B1_isign, epoch_steps)``.
        ``start_epoch`` is the stored epoch plus one. When ``checkpoint_path``
        does not exist nothing is loaded and fresh-start defaults are returned:
        ``(0, 0.0, inf, 0.0, inf, 0.0, 0)``.

    Raises:
        KeyError: If the checkpoint lacks one of the required entries.
    """
    if not os.path.exists(checkpoint_path):
        print("No checkpoint found, starting from scratch")
        # Same arity as the success path so callers can unpack unconditionally.
        return 0, 0.0, float('inf'), 0.0, float('inf'), 0.0, 0

    # Deserialize onto the CPU: load_state_dict() copies the values into the
    # tensors that already live on the target device, and this also works when
    # the file was saved from a GPU that is not visible in this process.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    feature_projection.load_state_dict(checkpoint['feature_projection_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    start_epoch = checkpoint['epoch'] + 1
    best_val_B4 = checkpoint['best_val_B4']
    best_val_loss = checkpoint.get('best_val_loss', float('inf'))  # Backwards compatibility
    best_val_B4_isign = checkpoint['best_val_B4_isign']
    best_val_B1_isign = checkpoint['best_val_B1_isign']
    best_val_loss_isign = checkpoint.get('best_val_loss_isign', float('inf'))  # Backwards compatibility
    epoch_steps = checkpoint['epoch_steps']

    print(f"Checkpoint loaded, resuming from epoch {start_epoch}")
    print("*"*50)
    return start_epoch, best_val_B4, best_val_loss, best_val_B4_isign, best_val_loss_isign, best_val_B1_isign, epoch_steps



In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
feature_projection.to(device)

# load_checkpoint() restores optimizer and scheduler state as well, so both
# objects are built here even though this notebook only evaluates.
optimizer = torch.optim.AdamW(
    list(model.parameters()) + list(feature_projection.parameters()),
    weight_decay=WEIGTH_DECAY,
    lr=learning_rate
)

# Fixed number of warmup steps; the learning rate stays constant afterwards.
warmup_steps = 100

scheduler = get_constant_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
)

# Bind every tracker up front so the names exist even when no checkpoint is
# found; otherwise the evaluation call would fail with a NameError.
start_epoch = 0
best_val_B4 = 0.0
best_val_loss = float('inf')
best_val_B4_isign = 0.0
best_val_B1_isign = 0.0
best_val_loss_isign = float('inf')
epoch_steps = 0

# Load checkpoint or pretrained weights
if os.path.exists(best_checkpoint_path_isignB4):
    start_epoch, best_val_B4, best_val_loss, best_val_B4_isign, best_val_loss_isign, best_val_B1_isign, epoch_steps = load_checkpoint(
        model, feature_projection, optimizer, scheduler, best_checkpoint_path_isignB4
    )
    # The stored epoch counter is deliberately discarded.
    start_epoch = 0
    print("Loaded best model checkpoint IsignB4")
    print(epoch_steps)
    print("*"*50)
else:
    # Make the situation explicit instead of continuing silently.
    print(f"WARNING: checkpoint not found at {best_checkpoint_path_isignB4}; "
          "the model keeps its random initial weights")



  checkpoint = torch.load(checkpoint_path)


Checkpoint loaded, resuming from epoch 1
**************************************************
Loaded best model checkpoint IsignB4
207500
**************************************************


In [6]:
epoch = 0  # module-level; model_eval reads it only to label its progress bar
def model_eval(eval_loader, log_what, best_val_B4,best_val_loss,best_val_B4_isign,
               best_val_B1_isign,best_val_loss_isign,counter, current_step,epoch_steps, save_model=False):
    """Evaluate the global model on ``eval_loader`` and write its predictions to CSV.

    For every batch the features are passed through ``feature_projection``,
    the teacher-forced loss is accumulated, and translations are produced with
    beam search. References and predictions are decoded with
    ``tokenizer_target`` and scored with ``quick_bleu_metric`` and sacreBLEU.
    All reference/prediction pairs are saved to
    ``.../helpers/test_csvs/{save_name}.csv``.

    Relies on these module-level names: ``model``, ``feature_projection``,
    ``device``, ``encoder_config``, ``tokenizer_target``, ``max_length_decoder``,
    ``num_beams``, ``save_name`` and ``epoch``.

    Args:
        eval_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors. Must support ``len()``.
        log_what: Label for this evaluation; metrics are printed only when it
            equals ``"ISIGN"``. Also used in the ``split`` name given to
            ``quick_bleu_metric``.
        best_val_B4, best_val_loss, best_val_B4_isign, best_val_B1_isign,
        best_val_loss_isign: Running best values; returned unchanged.
        counter, current_step, epoch_steps, save_model: Accepted for interface
            compatibility; not used by this function.

    Returns:
        ``(best_val_B4, best_val_loss, best_val_B4_isign, best_val_B1_isign,
        best_val_loss_isign)`` exactly as passed in.

    Notes:
        * The "Sacre BleuN" lines print ``bleu_sacre.precisions[N-1]``, i.e.
          sacreBLEU's per-order n-gram precisions.
        * Both modules are switched back to train mode and memory is released
          in a ``finally`` block, so this also happens when evaluation is
          interrupted or fails.
    """
    model.eval()
    feature_projection.eval()
    eval_loss = 0.0
    all_refs = []      # one [reference_tokens] list per sample
    sacre_refs = []    # detokenized reference strings for sacreBLEU
    sacre_preds = []   # detokenized prediction strings for sacreBLEU
    all_preds = []     # one prediction token list per sample

    try:
        # Loop-invariant: pad id used to replace -100 (ignore index) before decoding.
        pad_id = torch.tensor(tokenizer_target.pad_token_id, device=device)

        with torch.no_grad():
            eval_progress = tqdm(eval_loader, desc=f"Evaluating Epoch {epoch+1}")
            for eval_batch in eval_progress:
                input_ids = eval_batch['input_ids'].to(device)
                attention_mask = eval_batch['attention_mask'].to(device)
                labels = eval_batch['labels'].to(device)

                # Project the features and force shape (batch, -1, encoder hidden size).
                input_ids = feature_projection(input_ids)
                input_ids = input_ids.view(input_ids.size(0), -1, encoder_config.hidden_size)

                outputs = model(
                    inputs_embeds=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                eval_loss += outputs.loss.item()

                # Generate predictions with beam search
                generated_ids = model.generate(
                    inputs_embeds=input_ids,
                    attention_mask=attention_mask,
                    max_length=max_length_decoder,
                    num_beams=num_beams,
                    length_penalty=0.6,
                    no_repeat_ngram_size=3,
                    early_stopping=True
                )

                # -100 is not a valid token id; map it to <pad> so decoding works.
                generated_ids = torch.where(generated_ids == -100, pad_id, generated_ids)
                labels = torch.where(labels == -100, pad_id, labels)

                preds = tokenizer_target.batch_decode(generated_ids, skip_special_tokens=True)
                refs = tokenizer_target.batch_decode(labels, skip_special_tokens=True)

                sacre_refs.extend(str(ref) for ref in refs)
                sacre_preds.extend(str(pred) for pred in preds)

                ref_tokens = [ref.strip().split() for ref in refs]
                pred_tokens = [pred.strip().split() for pred in preds]

                all_refs.extend([ref] for ref in ref_tokens)
                all_preds.extend(pred_tokens)

        # Calculate metrics; an empty loader yields NaN instead of a ZeroDivisionError.
        num_batches = len(eval_loader)
        avg_eval_loss = eval_loss / num_batches if num_batches else float('nan')
        bleu1, bleu2, bleu3, bleu4 = quick_bleu_metric(all_refs, all_preds, split=f'{log_what }Validation')
        bleu_sacre = sacrebleu.corpus_bleu(sacre_preds, [sacre_refs])
        bleu_sacre1, bleu_sacre2, bleu_sacre3, bleu_sacre4 = bleu_sacre.precisions[:4]

        # Persist every reference/prediction pair for later inspection.
        df = pd.DataFrame({
                    'Reference': [' '.join(ref[0]) for ref in all_refs],
                    'Prediction': [' '.join(pred) for pred in all_preds]
                })
        df.to_csv(f'/DATA3/vaibhav/isign/PretrainingISL/helpers/test_csvs/{save_name}.csv', index=False)

        if log_what == "ISIGN":
            # The average loss used to be computed and then dropped; report it.
            print(f'Eval loss_Isign :{avg_eval_loss}')

            print(f'Sacre Bleu1_Isign :{bleu_sacre1}')
            print(f'Sacre Bleu2_Isign :{bleu_sacre2}')
            print(f'Sacre Bleu3_Isign :{bleu_sacre3}')
            print(f'Sacre Bleu4_Isign :{bleu_sacre4}')

            print(f'BLEU1_Isign :{bleu1 * 100}')
            print(f'BLEU2_Isign :{bleu2 * 100}')
            print(f'BLEU3_Isign :{bleu3 * 100}')
            print(f'BLEU4_Isign :{bleu4 * 100}')
    finally:
        # Clean up memory
        torch.cuda.empty_cache()
        gc.collect()

        # Resume training mode, even if evaluation was interrupted
        model.train()
        feature_projection.train()

    return best_val_B4, best_val_loss, best_val_B4_isign, best_val_B1_isign, best_val_loss_isign


In [7]:
# Run the iSign evaluation. model_eval returns the best_val_* trackers it was given.
(
    best_val_B4,
    best_val_loss,
    best_val_B4_isign,
    best_val_B1_isign,
    best_val_loss_isign,
) = model_eval(
    eval_loader=eval2_loader,
    log_what="ISIGN",
    best_val_B4=best_val_B4,
    best_val_loss=best_val_loss,
    best_val_B4_isign=best_val_B4_isign,
    best_val_B1_isign=best_val_B1_isign,
    best_val_loss_isign=best_val_loss_isign,
    counter=1,
    current_step=epoch_steps,
    epoch_steps=epoch_steps,
    save_model=True,
)

Evaluating Epoch 1:   0%|          | 0/89 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Evaluating Epoch 1: 100%|██████████| 89/89 [01:51<00:00,  1.25s/it]


BLEU ISIGNValidation >>> B1:16.42, B2:7.51, B3:4.62, B4:3.23


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


Sacre Bleu1_Isign :21.986722699311773
Sacre Bleu2_Isign :4.5882411742165505
Sacre Bleu3_Isign :2.3376432289271856
Sacre Bleu4_Isign :1.4756890996038832
BLEU1_Isign :16.423014999488693
BLEU2_Isign :7.508895888785383
BLEU3_Isign :4.62098957975436
BLEU4_Isign :3.231809095881845
