In [11]:
import os
import pandas as pd

# Define the paths to your video and caption directories
video_folder = '/kaggle/input/soccer/content/drive/MyDrive/SoccerNet_Captions/TrimmedVideos'  # Path to video folder
caption_folder = '/kaggle/input/soccer/content/drive/MyDrive/SoccerNet_Captions/TrimmedCaptions'  # Path to caption folder

# Get a list of video files (assuming they have extensions like .mp4)
video_files = [f for f in os.listdir(video_folder) if f.endswith('.mp4')]

# Create a list to store video paths and captions
video_paths = []
captions = []

# Iterate through each video file
for video_file in video_files:
    # Get the corresponding caption file name (same name, but with .txt extension)
    caption_file = video_file.replace('.mp4', '.txt')
    
    # Check if the caption file exists
    if caption_file in os.listdir(caption_folder):
        # Construct full path to video file
        video_paths.append(os.path.join(video_folder, video_file))
        
        # Read the caption from the caption file
        caption_path = os.path.join(caption_folder, caption_file)
        with open(caption_path, 'r') as file:
            caption = file.read().strip()  # Read and remove any extra spaces or newlines
        captions.append(caption)

# Create a DataFrame to store the video paths and captions
df = pd.DataFrame({
    'video_path': video_paths,
    'caption': captions
})

# Display the DataFrame
df

Unnamed: 0,video_path,caption
0,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Salomon Rondon (West Brom) receives a precise ...
1,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Aaron Cresswell (West Ham) swings a cross into...
2,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Gabriel Obertan (Newcastle Utd) fails to find ...
3,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Substitution. Roberto Firmino did his best and...
4,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Aaron Cresswell (West Ham) tries to slide the ...
...,...,...
610,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Cesc Fabregas (Chelsea) takes the free kick bu...
611,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Dusan Tadic (Southampton) flights in the cross...
612,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Wayne Rooney (Manchester United) escapes witho...
613,/kaggle/input/soccer/content/drive/MyDrive/Soc...,Cesc Fabregas (Chelsea) steps up to take the c...


In [6]:
pip install av

Note: you may need to restart the kernel to use updated packages.


In [5]:
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoImageProcessor

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoImageProcessor
import av
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Initialize tokenizer and image processor
tokenizer = AutoTokenizer.from_pretrained("gpt2")
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")

# Assign a padding token if not defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Define custom dataset
class VideoCaptionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, image_processor, device, clip_len=16, max_length=20):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.device = device
        self.clip_len = clip_len
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Get video path and caption
        video_path = self.dataframe.iloc[idx]['video_path']
        caption = self.dataframe.iloc[idx]['caption']
        
        # Process video frames
        container = av.open(video_path)
        seg_len = container.streams.video[0].frames
        indices = set(np.linspace(0, seg_len, num=self.clip_len, endpoint=False).astype(np.int64))
        frames = []

        container.seek(0)
        for i, frame in enumerate(container.decode(video=0)):
            if i in indices:
                frames.append(frame.to_ndarray(format="rgb24"))

        # Prepare pixel values using the image processor
        pixel_values = self.image_processor(frames, return_tensors="pt").pixel_values.to(self.device)

        # Tokenize caption with padding and truncation
        inputs = self.tokenizer(caption, return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)
        labels = inputs.input_ids.to(self.device)

        # Mask padding tokens in labels
        labels[labels == self.tokenizer.pad_token_id] = -100

        return torch.squeeze(pixel_values), torch.squeeze(labels), caption


# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize the dataset and DataLoader for train and test
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train DataLoader
train_dataset = VideoCaptionDataset(train_df, tokenizer, image_processor, device)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Test DataLoader
test_dataset = VideoCaptionDataset(test_df, tokenizer, image_processor, device)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Example usage of DataLoader (train)
for pixel_values, labels, _ in train_dataloader:
    print("Train - Pixel values shape:", pixel_values.shape)
    print("Train - Labels shape:", labels.shape)
    break  # Just to print one batch for now

# Example usage of DataLoader (test)
for pixel_values, labels, _ in test_dataloader:
    print("Test - Pixel values shape:", pixel_values.shape)
    print("Test - Labels shape:", labels.shape)
    break  # Just to print one batch for now


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Train - Pixel values shape: torch.Size([8, 16, 3, 224, 224])
Train - Labels shape: torch.Size([8, 20])
Test - Pixel values shape: torch.Size([8, 16, 3, 224, 224])
Test - Labels shape: torch.Size([8, 20])


In [8]:
model = VisionEncoderDecoderModel.from_pretrained("Neleac/timesformer-gpt2-video-captioning").to(device)

# Freeze encoder parameters
for param in model.encoder.parameters():
    param.requires_grad = False

config.json:   0%|          | 0.00/41.2k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.timesformer.modeling_timesformer.TimesformerModel'> is overwritten by shared encoder config: TimesformerConfig {
  "_name_or_path": "facebook/timesformer-base-finetuned-k600",
  "architectures": [
    "TimesformerForVideoClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "divided_space_time",
  "drop_path_rate": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "abseiling",
    "1": "acting in play",
    "2": "adjusting glasses",
    "3": "air drumming",
    "4": "alligator wrestling",
    "5": "answering questions",
    "6": "applauding",
    "7": "applying cream",
    "8": "archaeological excavation",
    "9": "archery",
    "10": "arguing",
    "11": "arm wrestling",
    "12": "arranging flowers",
    "13": "assembling bicycle",
    "14": "assembling computer",
    "15": "attending conference",
    "16": "auctioning",
    "17": "backflip (human)",
   

In [None]:
import torch
import os
import random
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
import shutil

# Function to train the model
def train_model(model, train_dataloader, optimizer, num_epochs=5, checkpoint_dir="/kaggle/working/checkpoints2"):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
        
    best_accuracy = 0  # Initialize the best accuracy as 0
    best_checkpoint_path = None

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        # Using tqdm for progress bar
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
            pixel_values, labels = batch[:2]  # Only use the first two components
            # Proceed with training logic

            optimizer.zero_grad()

            # Forward pass
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

        # Evaluate model and generate captions after every epoch
        if (epoch + 1) % 3 == 0:  # Evaluate every 2 epochs

            
            torch.save(model.state_dict(), "/kaggle/working/epoch"+str(epoch)+".pth")

            # checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch+1}accuracy{best_accuracy:.4f}.bin")
            # torch.save(model.state_dict(), checkpoint_path)
            # print(f"New best model saved at: {checkpoint_path}")
            
            # # Remove previous checkpoint if exists
            # if best_checkpoint_path and os.path.exists(best_checkpoint_path):
            #     os.remove(best_checkpoint_path)
            
            # best_checkpoint_path = checkpoint_path

            
            print("\nGenerating captions for a random video from the test set:")
            random_video_idx = random.choice(range(len(test_dataloader.dataset)))
            batch = test_dataloader.dataset[random_video_idx]
            pixel_values, labels = batch[:2]
            generate_caption(model, pixel_values)
            

# Function to evaluate the model
def evaluate_model(model, test_dataloader):
    model.eval()
    gen_kwargs = {
        "min_length": 10,
        "max_length": 20,
        "num_beams": 8,
    }
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for pixel_values, labels in test_dataloader:
            # Generate captions
            tokens = model.generate(pixel_values, **gen_kwargs)
            captions = tokenizer.batch_decode(tokens, skip_special_tokens=True)
            
            # Assuming the labels are text and we compare them directly for accuracy
            true_labels.append(labels)
            predicted_labels.append(captions)

            # We break after the first batch for now
            break

    # Assuming the labels and predictions are text, we can calculate accuracy (this can vary based on your actual task)
    accuracy = calculate_accuracy(true_labels, predicted_labels)
    print(f"Model accuracy: {accuracy:.4f}")
    return accuracy


# Function to generate captions on a random video
def generate_caption(model, pixel_values):
    model.eval()
    gen_kwargs = {
        "min_length": 10,
        "max_length": 20,
        "num_beams": 8,
    }

    with torch.no_grad():
        tokens = model.generate(pixel_values.unsqueeze(0), **gen_kwargs)  # Add batch dimension
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        print(f"Generated Caption: {caption}")

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Start the training process
train_model(model, train_dataloader, optimizer, num_epochs=30)

                                                           

Epoch [1/30], Loss: 2.2821


                                                           

Epoch [2/30], Loss: 1.7652


                                                           

Epoch [3/30], Loss: 1.4696

Generating captions for a random video from the test set:


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Caption: Cesc Fabregas (Chelsea) sends a cross into the box, but one of the


                                                           

Epoch [4/30], Loss: 1.2481


                                                           

Epoch [5/30], Loss: 1.0796


                                                           

Epoch [6/30], Loss: 0.9520

Generating captions for a random video from the test set:
Generated Caption: The referee stops play so that a substitution can be made and Aleksandar Mitrovic


                                                           

Epoch [7/30], Loss: 0.8188


                                                           

Epoch [8/30], Loss: 0.6976


                                                           

Epoch [9/30], Loss: 0.6272

Generating captions for a random video from the test set:
Generated Caption: Cesc Fabregas (Chelsea) floats the free kick into the box, but it's


                                                            

Epoch [10/30], Loss: 0.5605


                                                            

Epoch [11/30], Loss: 0.5113


                                                            

Epoch [12/30], Loss: 0.4575

Generating captions for a random video from the test set:
Generated Caption: Pedro (Chelsea) races towards goal but the defender gets back well to make a challenge.


                                                            

Epoch [13/30], Loss: 0.4232


                                                            

Epoch [14/30], Loss: 0.3837


                                                            

Epoch [15/30], Loss: 0.3548

Generating captions for a random video from the test set:
Generated Caption: A yellow card for a tackle by Mark Noble (West Ham). Andy Carroll (West Ham)


                                                            

Epoch [16/30], Loss: 0.3277


                                                            

Epoch [17/30], Loss: 0.3152


                                                            

Epoch [18/30], Loss: 0.3023

Generating captions for a random video from the test set:
Generated Caption: Eden Hazard (Chelsea) produces a killer pass onto Diego Costa, who loses the ball to


                                                            

Epoch [19/30], Loss: 0.2804


                                                            

Epoch [20/30], Loss: 0.2603


                                                            

Epoch [21/30], Loss: 0.2393

Generating captions for a random video from the test set:
Generated Caption: A player from Southampton takes his opponent down, the referee blows his whistle for a foul. It


                                                            

Epoch [22/30], Loss: 0.2476


                                                            

Epoch [23/30], Loss: 0.2342


                                                            

Epoch [24/30], Loss: 0.2288

Generating captions for a random video from the test set:
Generated Caption: Cesc Fabregas (Chelsea) floats the ball in from the corner but it's intercepted


                                                            

Epoch [25/30], Loss: 0.2120


                                                            

Epoch [26/30], Loss: 0.2096


                                                            

Epoch [27/30], Loss: 0.1891

Generating captions for a random video from the test set:
Generated Caption: Goal! Sadio Mane displays great vision and sends a pass to Graziano Pelle (


                                                            

Epoch [28/30], Loss: 0.1889


                                                            

Epoch [29/30], Loss: 0.1770


Epoch 30/30:  37%|███▋      | 23/62 [01:22<02:18,  3.55s/it]

In [8]:
pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b8191a70f41d5d6ade303df35767220b7c3a6e15a2fed9c2f3b94628108b66ac
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [29]:
import torch
import os
import random
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
import shutil
import evaluate

# Initialize metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# Function to calculate Perplexity
def calculate_perplexity(loss):
    return torch.exp(loss)

# Function to train the model
def train_model(model, train_dataloader, optimizer, num_epochs=5, checkpoint_dir="checkpoints"):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
        
    best_accuracy = 0  # Initialize the best accuracy as 0
    best_checkpoint_path = None

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        # Using tqdm for progress bar
        for pixel_values, labels, text in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):

            print("\nGenerating captions for a random video from the test set:")
            random_video_idx = random.choice(range(len(test_dataloader.dataset)))
            pixel_values, labels = train_dataloader.dataset[random_video_idx]
            print("original: ",text)
            caption = generate_caption(model, pixel_values)
    
            # Evaluate BLEU, ROUGE, and Perplexity
            print("\nEvaluating on the test set:")
            bleu_score, rouge_score, perplexity = evaluate_model(caption, text)
            print(f"BLEU score: {bleu_score:.4f}, ROUGE score: {rouge_score:.4f}, Perplexity: {perplexity:.4f}")

            optimizer.zero_grad()

            # Forward pass
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

        # Evaluate model and generate captions after every epoch
        # if (epoch + 1) % 2 == 0:  # Evaluate every 2 epochs
        checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch+1}_accuracy_{best_accuracy:.4f}.bin")
        torch.save(model.state_dict(), checkpoint_path)
        print(f"New best model saved at: {checkpoint_path}")
        
        # Remove previous checkpoint if exists
        if best_checkpoint_path and os.path.exists(best_checkpoint_path):
            os.remove(best_checkpoint_path)
        
        best_checkpoint_path = checkpoint_path

        print("\nGenerating captions for a random video from the test set:")
        random_video_idx = random.choice(range(len(test_dataloader.dataset)))
        pixel_values, labels = train_dataloader.dataset[random_video_idx]
        print("original: ",text)
        caption = generate_caption(model, pixel_values)

        # Evaluate BLEU, ROUGE, and Perplexity
        print("\nEvaluating on the test set:")
        bleu_score, rouge_score, perplexity = evaluate_model(caption, text)
        print(f"BLEU score: {bleu_score:.4f}, ROUGE score: {rouge_score:.4f}, Perplexity: {perplexity:.4f}")

# Function to evaluate the model
def evaluate_model(predicted_labels, true_labels):

    # Flatten the lists if necessary
    true_labels = [item for sublist in true_labels for item in sublist]
    predicted_labels = [item for sublist in predicted_labels for item in sublist]

    # Compute BLEU and ROUGE scores
    bleu_score = bleu_metric.compute(predictions=predicted_labels, references=true_labels)["bleu"]
    rouge_score = rouge_metric.compute(predictions=predicted_labels, references=true_labels)["rouge1"]

    # Calculate Perplexity
    perplexity = calculate_perplexity(total_loss / len(test_dataloader))

    return bleu_score, rouge_score, perplexity

# Function to generate captions on a random video
def generate_caption(model, pixel_values):
    model.eval()
    gen_kwargs = {
        "min_length": 10,
        "max_length": 20,
        "num_beams": 8,
    }

    with torch.no_grad():
        tokens = model.generate(pixel_values.unsqueeze(0), **gen_kwargs)  # Add batch dimension
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        print(f"Generated Caption: {caption}")
    return caption

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Start the training process
train_model(model, train_dataloader, optimizer, num_epochs=5)

                                                          

Epoch [1/5], Loss: 0.5813
New best model saved at: checkpoints/model_epoch_1_accuracy_0.0000.bin

Generating captions for a random video from the test set:
original:  Die
Generated Caption: Goal! Sadio Mane displays great vision and sends a pass to Graziano Pelle (


                                                          

Epoch [2/5], Loss: 0.5006
New best model saved at: checkpoints/model_epoch_2_accuracy_0.0000.bin

Generating captions for a random video from the test set:
original:  A
Generated Caption: A yellow card for a tackle by Vincent Kompany (Manchester City). Manuel Pellegr


                                                          

Epoch [3/5], Loss: 0.4407
New best model saved at: checkpoints/model_epoch_3_accuracy_0.0000.bin

Generating captions for a random video from the test set:
original:  The
Generated Caption: The referee stops play so that a substitution can be made and Aleksandar Mitrovic


                                                         

KeyboardInterrupt: 

In [56]:
import torch
import os
import random
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score
import shutil
import evaluate

# Initialize metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# Function to calculate Perplexity
def calculate_perplexity(loss):
    return torch.exp(loss)

# Function to train the model
def train_model(model, train_dataloader, optimizer, num_epochs=5, checkpoint_dir="checkpoints"):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
        
    best_accuracy = 0  # Initialize the best accuracy as 0
    best_checkpoint_path = None

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        # Using tqdm for progress bar
        for pixel_values, labels, text in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):

            loss = torch.Tensor([0.5])
            print("\nGenerating captions for a random video from the test set:")
            random_video_idx = random.choice(range(len(train_dataloader.dataset)))
            pixel_values, labels, text = train_dataloader.dataset[random_video_idx]
            print("original: ",text)
            caption = generate_caption(model, pixel_values)
    
            # Evaluate BLEU, ROUGE, and Perplexity
            print("\nEvaluating on the test set:")
            bleu_score, rouge_score, perplexity = evaluate_model(loss, caption, text)
            print(f"BLEU score: {bleu_score}, ROUGE score: {rouge_score}, Perplexity: {perplexity}")

            optimizer.zero_grad()

            # Forward pass
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

        # Evaluate model and generate captions after every epoch
        # if (epoch + 1) % 2 == 0:  # Evaluate every 2 epochs
        checkpoint_path = os.path.join(checkpoint_dir, f"model_epoch_{epoch+1}_accuracy_{best_accuracy:.4f}.bin")
        torch.save(model.state_dict(), checkpoint_path)
        print(f"New best model saved at: {checkpoint_path}")
        
        # Remove previous checkpoint if exists
        if best_checkpoint_path and os.path.exists(best_checkpoint_path):
            os.remove(best_checkpoint_path)
        
        best_checkpoint_path = checkpoint_path

        print("\nGenerating captions for a random video from the test set:")
        random_video_idx = random.choice(range(len(test_dataloader.dataset)))
        pixel_values, labels = train_dataloader.dataset[random_video_idx]
        print("original: ",text)
        caption = generate_caption(model, pixel_values)

        # Evaluate BLEU, ROUGE, and Perplexity
        print("\nEvaluating on the test set:")
        bleu_score, rouge_score, perplexity = evaluate_model(caption, text)
        print(f"BLEU score: {bleu_score:.4f}, ROUGE score: {rouge_score:.4f}, Perplexity: {perplexity:.4f}")

# Function to evaluate the model
def evaluate_model(total_loss, predicted_labels, true_labels):

    # Flatten the lists if necessary
    true_labels = [item for sublist in true_labels for item in sublist]
    predicted_labels = [item for sublist in predicted_labels for item in sublist][:len(true_labels)]
    true_labels = [item for sublist in true_labels for item in sublist][:len(predicted_labels)]
    

    # Compute BLEU and ROUGE scores
    bleu_score = bleu_metric.compute(predictions=predicted_labels, references=true_labels)["bleu"]
    rouge_score = rouge_metric.compute(predictions=predicted_labels, references=true_labels)["rouge1"]

    # Calculate Perplexity
    perplexity = calculate_perplexity(total_loss / len(test_dataloader))

    return bleu_score, rouge_score, perplexity

# Function to generate captions on a random video
def generate_caption(model, pixel_values):
    model.eval()
    gen_kwargs = {
        "min_length": 10,
        "max_length": 40,
        "num_beams": 8,
    }

    with torch.no_grad():
        tokens = model.generate(pixel_values.unsqueeze(0), **gen_kwargs)  # Add batch dimension
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        print(f"Generated Caption: {caption}")
    return caption

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Start the training process
train_model(model, train_dataloader, optimizer, num_epochs=5)

Epoch 1/5:   0%|          | 0/62 [00:00<?, ?it/s]


Generating captions for a random video from the test set:
original:  What a bad luck! Tyler Blackett (Manchester United) got in the way of a pass and unintentionally sent the ball past his own goalkeeper. The score is 1:1.
Generated Caption: Nacho Monreal (Arsenal) sends a lofted cross into the penalty area. Unfortunately for him, Mesut Ozil (Arsenal) found some space to play in on the edge of the

Evaluating on the test set:


                                                 

BLEU score: 0.0, ROUGE score: 0.026143790849673203, Perplexity: tensor([1.0317])




ValueError: not enough values to unpack (expected 5, got 4)

In [None]:
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load pretrained processor, tokenizer, and model
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained("Neleac/timesformer-gpt2-video-captioning").to(device)

# Freeze encoder parameters
for param in model.encoder.parameters():
    param.requires_grad = False

# Define optimizer and loss
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)

# Load and process video
video_path = "clip.mp4"
container = av.open(video_path)
seg_len = container.streams.video[0].frames
clip_len = model.config.encoder.num_frames
indices = set(np.linspace(0, seg_len, num=clip_len, endpoint=False).astype(np.int64))
frames = []

container.seek(0)
for i, frame in enumerate(container.decode(video=0)):
    if i in indices:
        frames.append(frame.to_ndarray(format="rgb24"))

# Prepare pixel values
pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(device)

# Example training captions
train_captions = ["A man and a woman are dancing on a stage in front of a mirror."]

# Assign a padding token if not defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize captions with padding and truncation
inputs = tokenizer(train_captions, return_tensors="pt", padding="max_length", max_length=20, truncation=True)
labels = inputs.input_ids.to(device)

# Mask padding tokens in labels
labels[labels == tokenizer.pad_token_id] = -100


# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = model(pixel_values, labels=labels)
    loss = outputs.loss

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")

# Evaluate and generate captions
model.eval()
gen_kwargs = {
    "min_length": 10,
    "max_length": 20,
    "num_beams": 8,
}
with torch.no_grad():
    tokens = model.generate(pixel_values, **gen_kwargs)
    caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
    print(f"Generated Caption: {caption}")