# **Importing Libraries**

In [None]:
!pip install bitsandbytes --q
!pip install peft --q
!pip install accelerate --q
!pip install huggingface_hub --q
!pip install av --q
!pip install albumentations --q
!pip install aiohttp --q
print("Library Installation done")

In [None]:
import asyncio
import json
import math
import os
import pathlib
from urllib.parse import urlparse

import aiohttp
import albumentations as A
import av
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import torch
import torch.nn as nn
import transformers
from huggingface_hub import hf_hub_download, login
from IPython.display import Video
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from PIL import Image
from torch.utils.data import Dataset , DataLoader
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig, AutoProcessor, XCLIPVisionModel, AutoModel , get_scheduler

# **HF LOGIN**

In [None]:
# Drop any Hugging Face credential cached from a previous session before logging in below.
!huggingface-cli logout


  pid, fd = os.forkpty()


Successfully logged out.


In [None]:
import getpass
import os

from huggingface_hub import login

# Never store an access token in the notebook: anyone who can read the file
# (or its history) can use it. The token that used to be hard-coded here must
# be considered leaked and should be revoked in the Hugging Face settings.
# The token is read from the HF_TOKEN environment variable when present,
# otherwise it is requested interactively without being echoed.
token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")
login(token=token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# **LLAMA QUANTIZATION AND INSERTIVE ADAPTER(LORA)**

In [None]:
# Use the GPU when one is visible; later cells reuse `device`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Preprocessor that belongs to the X-CLIP checkpoint.
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch16-16-frames")
MODEL_ID = "meta-llama/Llama-3.2-3B"

# QLoRA-style loading: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
quant_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# LoRA configuration
lora_config = LoraConfig(
    r=64,               # Rank of the update matrices
    lora_alpha=16,     # Scaling factor
    target_modules=["q_proj", "v_proj", 'k_proj'],  # Apply LoRA to the attention projections
    lora_dropout=0.05, # Dropout probability for LoRA
    bias="none",       # Bias configuration (none, all, or lora_only)
    task_type="CAUSAL_LM"  # For causal language modeling tasks
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side='right')
# No token is passed here: the credential stored by `login()` is picked up
# automatically (the tokenizer above is loaded from the same repository the
# same way). This removes the hard-coded secret and the deprecated
# `use_auth_token` argument.
llm_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, quantization_config=quant_config)

# Freeze the base model, prepare it for k-bit training, then wrap it with the
# LoRA adapters so that only the adapter weights are trainable.
for param in llm_model.parameters():
  param.requires_grad = False
llm_model = prepare_model_for_kbit_training(llm_model)
llm_model_with_lora = get_peft_model(llm_model, lora_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Print a parameter summary of the LoRA-wrapped model
# (total, trainable and non-trainable parameter counts).
from torchinfo import summary
summary(llm_model_with_lora)

Layer (type:depth-idx)                                            Param #
PeftModelForCausalLM                                              --
├─LoraModel: 1-1                                                  --
│    └─LlamaForCausalLM: 2-1                                      --
│    │    └─LlamaModel: 3-1                                       1,829,153,792
│    │    └─Linear: 3-2                                           (394,002,432)
Total params: 2,223,156,224
Trainable params: 25,690,112
Non-trainable params: 2,197,466,112

# **GIFS DOWNLOADING**

In [None]:
# Load the question/answer table. Later cells read its 'full_link',
# 'question' and 'answer' columns.
import pandas as pd
df = pd.read_csv('/kaggle/input/test-df-with-link-1/test_df_with_link.csv')

In [None]:
# Directory that receives the downloaded GIFs (relative to the working directory).
GIF_PATH = pathlib.Path('gifs')
# exist_ok=True makes the cell idempotent without a separate exists() check.
GIF_PATH.mkdir(exist_ok=True)

def is_valid_url(url):
    """Return True when `url` is a string with both a scheme and a network location.

    Non-string values (for example NaN cells from a DataFrame) and strings
    that `urlparse` refuses to parse are reported as invalid instead of
    raising.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on some malformed hosts, e.g. an unbalanced "[".
        return False
    return bool(parsed.scheme and parsed.netloc)

async def download_file(url):
    """Download `url` into GIF_PATH, keeping the last path component as file name.

    Nothing is done when the target file already exists. A non-200 response
    is not retried. Connection-level failures are retried up to three times
    with a one second pause after each failure.

    The body is streamed into a temporary "<name>.part" file that is renamed
    to the final name only after the transfer completed, so an interrupted
    download can never be mistaken for a finished file by a later run.

    Returns None in every case.
    """
    filename = url.split("/")[-1]
    filepath = GIF_PATH / filename

    # Skip work that a previous run already finished.
    if filepath.exists():
        return

    partial_path = filepath.with_name(filepath.name + ".part")
    max_retries = 3
    for _ in range(max_retries):
        try:
            async with aiohttp.ClientSession(trust_env=True) as session:
                async with session.get(url) as response:
                    if response.status != 200:
                        return
                    with open(partial_path, mode="wb") as file:
                        async for chunk in response.content.iter_chunked(64 * 1024):
                            file.write(chunk)
            # Publish the file only once it is complete.
            partial_path.replace(filepath)
            return
        except (aiohttp.ClientError, asyncio.TimeoutError, ConnectionResetError):
            # aiohttp.ClientError is the base class of the connector error the
            # original code caught; discard the partial data and try again.
            partial_path.unlink(missing_ok=True)
            await asyncio.sleep(1)  # Wait for a second before retrying

async def safe_request(semaphore, url):
    """Run `download_file(url)` while holding `semaphore` and return its result."""
    async with semaphore:
        outcome = await download_file(url)
        return outcome

async def main(url_col, parallel_processes):
    """Download every valid URL in `url_col`.

    Entries rejected by `is_valid_url` are ignored. At most
    `parallel_processes` downloads hold the semaphore at the same time.
    """
    limiter = asyncio.Semaphore(parallel_processes)

    pending = []
    for candidate in url_col:
        if is_valid_url(candidate):
            pending.append(asyncio.ensure_future(safe_request(limiter, candidate)))

    await asyncio.gather(*pending)



In [None]:
# Download the GIFs referenced by rows 43000-53999 of `df`, with at most 50
# downloads holding the semaphore at once. This is the same row range the
# training and validation datasets are built from.
await main(df['full_link'][43000:54000], 50)  # non-string entries are filtered out by is_valid_url

# **GENERATION CONFIG OF LLM**

In [None]:
# Register the padding token BEFORE building the generation configs: each
# config copies `tokenizer.pad_token_id` when it is constructed, so building
# them first would store the value the tokenizer had before this call.
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
llm_model_with_lora.config.pad_token_id = tokenizer.pad_token_id

# Sampling-based decoding.
# NOTE(review): `penalty_alpha` is a contrastive-search setting; with
# do_sample=True it is presumably not used - confirm whether it is intended.
generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,
      top_k=5,temperature=0.5,repetition_penalty=1.2,
      max_new_tokens=120, pad_token_id = tokenizer.pad_token_id)

# Beam search decoding.
generation_config_bs = GenerationConfig(
    num_beams=10,          # Number of beams for beam search
    max_new_tokens=50,        # Maximum number of generated tokens
    early_stopping=True,  # Stop the generation when all beams finish
    no_repeat_ngram_size=2,  # Prevent repetition of 2-grams
    length_penalty=0.7,   # Exponent applied to the sequence length when scoring beams
    num_return_sequences=1,  # Number of sequences to return
    pad_token_id = tokenizer.pad_token_id
)

# Greedy decoding. `early_stopping` is omitted on purpose: it only controls
# beam-based methods and has no effect when num_beams=1.
generation_config_greedy = GenerationConfig(
    max_new_tokens=120,        # Maximum number of generated tokens
    num_beams=1,          # Greedy decoding, no beam search (set num_beams=1)
    no_repeat_ngram_size=2,  # Optional: Avoid repetition of 2-grams
    do_sample=False,       # Disable sampling for greedy decoding
    pad_token_id = tokenizer.pad_token_id
)

# **MODEL**
Adapter :: MLP (768 -> 1024 -> 3072)  
Decoder :: Llama 3.2 3B with LoRA  
Visual encoder :: X-CLIP


In [None]:
class Connective_Adapter(nn.Module):
    """MLP that projects 768-dim video features to 3072-dim vectors.

    Pipeline: Linear(768 -> 1024) -> GELU -> Linear(1024 -> 3072) -> LayerNorm.

    Args:
        device: device on which all parameters are created. It is kept in
            `self.device` for backward compatibility.
    """

    def __init__(self, device):
        super().__init__()
        self.device = device
        self.Linear_1 = nn.Linear(768, 1024).to(device)
        self.Linear_3 = nn.Linear(1024, 3072).to(device)
        self.GELU = nn.GELU()
        # The LayerNorm has parameters too, so it must live on the same device
        # as the linear layers; previously it silently stayed on the CPU until
        # the whole parent module was moved.
        self.LayerNorm = nn.LayerNorm(normalized_shape=3072).to(device)

    def forward(self, x):
        """Project `x` (last dimension 768) to a tensor whose last dimension is 3072."""
        # Follow the parameters rather than the constructor argument, so the
        # adapter keeps working after a later `.to(...)` on the module.
        x = x.to(self.Linear_1.weight.device)

        x = self.Linear_1(x)
        x = self.GELU(x)
        x = self.Linear_3(x)
        x = self.LayerNorm(x)

        return x


class VideoQAModel(torch.nn.Module):
  """Video question answering model: video encoder -> adapter -> language model.

  The video encoder's pooled output is reshaped to 16 vectors of size 768 per
  sample, projected by `Connective_Adapter`, and placed in front of the
  question token embeddings before being fed to the language model through
  `inputs_embeds`.

  Args:
    video_model: video encoder; it must expose `.device` and return an object
      with a `pooler_output` attribute.
    llama_model: language model wrapper; `llama_model.model.get_input_embeddings()`
      provides the token embedding layer.
  """

  def __init__(self, video_model, llama_model):
    super(VideoQAModel, self).__init__()
    self.video_model = video_model
    self.llama_embeddings = llama_model.model.get_input_embeddings()
    self.llama_model = llama_model
    # Create the adapter next to the video encoder instead of relying on a
    # notebook-level `device` variable.
    self.adapter = Connective_Adapter(video_model.device)

  def _build_inputs_embeds(self, frames, question):
    """Return the video embeddings followed by the question embeddings."""
    frames = frames.to(self.video_model.device)
    outputs = self.video_model(frames.view(-1,3,224,224))
    video_embeddings = outputs.pooler_output.view((-1,16,768))
    video_embeddings_converted = self.adapter(video_embeddings).to(question.device)
    question_embeddings = self.llama_embeddings(question).to(video_embeddings_converted.device)
    # Concatenate along the sequence dimension: video positions come first.
    return torch.concat((video_embeddings_converted, question_embeddings), dim=-2)

  def forward(self, frames , question, attention_mask):
    """Run the language model on the combined sequence and return its output.

    `attention_mask` must cover the 16 video positions followed by the
    question tokens. The returned object is whatever the language model
    returns (callers read `.logits` from it).
    """
    additional_embeddings = self._build_inputs_embeds(frames, question)
    logits = self.llama_model(inputs_embeds=additional_embeddings, attention_mask=attention_mask)
    return logits

  def generate(self, frames , question, generation_config):
    """Generate token ids for `question` conditioned on `frames`."""
    # Generation never needs gradients; skipping the autograd graph for the
    # encoder/adapter pass saves memory.
    with torch.no_grad():
      additional_embeddings = self._build_inputs_embeds(frames, question)
      generated_seq = self.llama_model.generate(inputs_embeds=additional_embeddings, generation_config=generation_config)
    return generated_seq

# **CUSTOM IMAGE PROCESSOR FOR FASTER SPEEDS**

In [None]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
import torch

class AlbumentationsBatchProcessor:
    """Turn a GIF file into a fixed-length stack of preprocessed frame tensors.

    Calling the instance with a path samples `num_key_frames` frames (16 by
    default), resizes the shorter side to 224, center-crops to 224x224,
    normalizes the pixel values and stacks the frames into one tensor.
    """

    def __init__(self):
        # Define the transformation pipeline
        self.transform = A.Compose([
            A.SmallestMaxSize(max_size=224, interpolation=2),  # Resize (2 == cv2.INTER_CUBIC)
            A.CenterCrop(height=224, width=224),  # Center crop
            A.Normalize(mean=(0.485, 0.456, 0.406),
                        std=(0.229, 0.224, 0.225),
                        max_pixel_value=255.0),  # Normalize
            ToTensorV2()  # Convert to PyTorch Tensor
        ])

    @staticmethod
    def _frame_indices(n_frames, num_key_frames):
        """Return the frame indices to read from a clip with `n_frames` frames.

        Clips with at least `num_key_frames` frames are sampled with a constant
        integer stride. Shorter clips contribute every frame exactly once; the
        caller pads the remainder. (With the plain stride formula the stride
        would be 0 for short clips and frame 0 would be returned repeatedly.)
        """
        if num_key_frames <= 0:
            return []
        if n_frames < num_key_frames:
            return list(range(n_frames))
        stride = n_frames // num_key_frames
        return [stride * i for i in range(num_key_frames)]

    def frame_splitter(self, gif_path, num_key_frames=16):
        """Return a list of exactly `num_key_frames` RGB PIL images from `gif_path`.

        Missing frames are filled with black images of the same size as the
        first frame.

        Raises:
            ValueError: if frames were requested but none could be read.
        """
        frames_list = []
        with Image.open(gif_path) as im:
            # Single-frame images may not expose `n_frames`.
            n_frames = getattr(im, "n_frames", 1)
            for index in self._frame_indices(n_frames, num_key_frames):
                try:
                    im.seek(index)
                except EOFError:
                    # The file ended earlier than its header announced.
                    break
                # convert() returns a new image, so the frame stays valid
                # after the next seek() and after the file is closed.
                frames_list.append(im.convert('RGB'))

        # Padding with black (zero-value) frames if there are fewer than requested
        if len(frames_list) < num_key_frames:
            if not frames_list:
                raise ValueError(f"No frames could be read from {gif_path}")
            pad_length = num_key_frames - len(frames_list)
            width, height = frames_list[0].size
            zero_frame = Image.new("RGB", (width, height), (0, 0, 0))  # Create a black frame
            frames_list.extend([zero_frame] * pad_length)

        return frames_list

    def __call__(self, path):
        """Load `path` and return the transformed frames stacked into one tensor."""
        self.path = path
        images = self.frame_splitter(self.path)
        # Apply the transformation to each frame, then stack into a batch.
        processed_images = [
            self.transform(image=np.array(image))['image'] for image in images
        ]
        return torch.stack(processed_images)

# Shared processor instance used by the dataset and by predict().
A_processor = AlbumentationsBatchProcessor()

In [None]:
import time

# Time one call of the custom processor on a sample GIF.
# perf_counter() is the clock intended for measuring short intervals;
# time.time() follows the wall clock and can jump when it is adjusted.
start_time = time.perf_counter()

# Example usage
gif_path = 'gifs/tumblr_nakz3m797W1tg4t6vo1_250.gif'
pixel_values = A_processor(gif_path)
print(pixel_values.shape)

end_time = time.perf_counter()

# Calculate the difference
diff_time = end_time - start_time
print(f"Time taken: {diff_time} seconds")


torch.Size([16, 3, 224, 224])
Time taken: 0.03337860107421875 seconds


# **DATASET BUILDER AND DATALOADER**

In [None]:
class Dataset_builder(Dataset):
    """Dataset yielding `(frames, question, answer)` for every usable row of `df`.

    `df` must provide the columns 'full_link', 'question' and 'answer'.
    The GIF for a row is looked up in the local 'gifs/' folder under the last
    path component of its URL.

    `__getitem__` returns None for rows that cannot be used (missing URL,
    missing question/answer, missing file, unreadable GIF); the collate
    function is expected to drop those entries.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Skip if the URL is not a string (e.g. NaN)
        url = row['full_link']
        if not isinstance(url, str):
            print(f"Skipped: Invalid URL at index {idx}, found {url}")
            return None  # Skip this item

        # A missing question/answer is read as a float NaN; concatenating it
        # with the special tokens would raise TypeError and abort the whole
        # DataLoader, so such rows are skipped like the other unusable ones.
        question = row['question']
        answer = row['answer']
        if not isinstance(question, str) or not isinstance(answer, str):
            print(f"Skipped: Missing question/answer at index {idx}")
            return None  # Skip this item

        filename = url.split("/")[-1]
        filepath = 'gifs/' + filename

        # Check if the GIF exists at the given filepath
        if not os.path.exists(filepath):
            print(f"Skipped: {filepath} not found")
            return None  # Skip this item

        try:
            # Decode and preprocess the frames with the shared processor
            frames = A_processor(filepath)
        except Exception as e:
            print(f"Error processing {filepath}: {str(e)}")
            return None  # Skip this item in case of errors

        # Wrap the texts in the begin/end-of-text markers
        return frames, "<|begin_of_text|>" + question, answer + '<|end_of_text|>'

def collate_fn(batch):
    """Collate a batch after discarding the samples the dataset skipped.

    Entries equal to None are removed. The remaining samples go through
    PyTorch's default collation; None is returned when nothing is left.
    """
    kept = [sample for sample in batch if sample is not None]
    if not kept:
        return None
    return torch.utils.data.dataloader.default_collate(kept)

In [None]:
# Rows 43000-53499 are used for training, rows 53500-53999 for validation.
Train_Dataset = Dataset_builder(df[43000:53500])
Train_Data_Loader = DataLoader(Train_Dataset, batch_size=36, shuffle=True, collate_fn=collate_fn)
Val_Dataset = Dataset_builder(df[53500:54000])
# Validation does not need shuffling; a fixed order keeps runs comparable.
Val_Data_Loader = DataLoader(Val_Dataset, batch_size=36, shuffle=False, collate_fn=collate_fn)

# **INITIALIZING THE MODEL**

In [None]:
# Load the X-CLIP vision encoder on the device selected at the top of the
# notebook (instead of a hard-coded 'cuda') and freeze all of its weights.
Video_Model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch16-16-frames").to(device)
for param in Video_Model.parameters():
    param.requires_grad = False

EPOCHS = 1
# Combine the frozen video encoder with the LoRA-wrapped language model.
fin_model = VideoQAModel(Video_Model, llm_model_with_lora)
fin_model.to(device)

## **INITIALIZING THE OPTIMIZER AND CRITERION**

In [None]:

# Padding positions do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Only hand the trainable parameters to the optimizer; the frozen ones never
# receive gradients.
trainable_params = [p for p in fin_model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=5e-5)

## lr_scheduler
num_train_epochs = EPOCHS
num_update_steps_per_epoch = len(Train_Data_Loader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="cosine",  ##Cosine Annealing or any other
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # step counts are integers
    num_training_steps=num_training_steps
)

## **CUSTOM TRAINING LOOP**

In [None]:

def train(model, dataloader, criterion, optimizer, scheduler, device):
    """Train `model` for one pass over `dataloader` and return the mean loss.

    Each batch is expected to be `(frames, questions, answers)` or None when
    every sample of the batch was skipped by the collate function. Questions
    and answers are tokenized as text pairs with the notebook-level
    `tokenizer`. The loss is the next-token cross entropy over the text
    positions; the video positions at the start of the sequence are excluded.

    Returns:
        The loss averaged over the batches that were actually processed
        (0.0 if there were none).
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)  # Add tqdm progress bar

    for batch in progress_bar:
        # Unpack only after the None check: unpacking in the `for` header
        # raises TypeError as soon as one batch is empty.
        if batch is None:
            continue
        frames, questions, answers = batch

        encoded = tokenizer(questions, answers, padding=True, truncation=True, add_special_tokens=False, return_tensors='pt')
        frames = frames.to(device)
        input_ids = encoded['input_ids'].to(device)

        # The model emits one embedding per frame in front of the text, so the
        # mask is extended by that many always-visible positions.
        num_video_tokens = frames.shape[1]
        text_mask = encoded['attention_mask']
        video_mask = text_mask.new_ones((text_mask.shape[0], num_video_tokens))
        attention_mask = torch.concat((video_mask, text_mask), dim=-1).to(device)

        optimizer.zero_grad()
        output = model(frames, input_ids, attention_mask)
        logits = output.logits

        # Drop the video positions and shift by one so that the logits at text
        # position t are scored against token t + 1.
        shift_logits = logits[..., num_video_tokens:-1, :].contiguous().to(device)
        shift_labels = input_ids[..., 1:].contiguous()

        # Compute the loss
        loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        num_batches += 1

        # Update tqdm bar with the current loss
        progress_bar.set_postfix(loss=loss.item())

    # Average over processed batches only; skipped batches must not dilute it.
    return running_loss / max(num_batches, 1)

## **CUSTOM VALIDATION LOOP**

In [None]:
def validate(model, dataloader, criterion, device):
    """Evaluate `model` on `dataloader` without gradients and return the mean loss.

    Each batch is expected to be `(frames, questions, answers)` or None when
    every sample of the batch was skipped by the collate function. The loss is
    computed the same way as during training: next-token cross entropy over
    the text positions, excluding the video positions at the start.

    Returns:
        The loss averaged over the batches that were actually processed
        (0.0 if there were none).
    """
    model.eval()
    running_loss = 0.0
    num_batches = 0
    progress_bar = tqdm(dataloader, desc="Validating", leave=False)  # Add tqdm progress bar

    with torch.no_grad():
        for batch in progress_bar:
            # Unpack only after the None check: unpacking in the `for` header
            # raises TypeError as soon as one batch is empty.
            if batch is None:
                continue
            frames, questions, answers = batch

            encoded = tokenizer(questions, answers, padding=True, truncation=True, add_special_tokens=False, return_tensors='pt')
            frames = frames.to(device)
            input_ids = encoded['input_ids'].to(device)

            # One always-visible mask position per frame embedding.
            num_video_tokens = frames.shape[1]
            text_mask = encoded['attention_mask']
            video_mask = text_mask.new_ones((text_mask.shape[0], num_video_tokens))
            attention_mask = torch.concat((video_mask, text_mask), dim=-1).to(device)

            output = model(frames, input_ids, attention_mask)
            logits = output.logits

            # Drop the video positions and shift by one for next-token scoring.
            shift_logits = logits[..., num_video_tokens:-1, :].contiguous().to(device)
            shift_labels = input_ids[..., 1:].contiguous()

            loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            running_loss += loss.item()
            num_batches += 1

            # Update tqdm bar with the current loss
            progress_bar.set_postfix(loss=loss.item())

    # Average over processed batches only; skipped batches must not dilute it.
    return running_loss / max(num_batches, 1)


## **TRAINING LOOP**

In [None]:
# Train for EPOCHS epochs, validate after each epoch and write a checkpoint.
model_path = 'checkpoint.pth'

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    avg_loss = train(fin_model, Train_Data_Loader, loss_fn, optimizer, lr_scheduler, device)
    avg_val_loss = validate(fin_model,Val_Data_Loader, loss_fn, device)

    print(f"End of epoch {epoch+1}, average loss: {avg_loss}, average validation loss: {avg_val_loss}")

    # NOTE(review): the state dict covers the whole model, frozen weights
    # included; saving only the trainable tensors would be much smaller.
    torch.save({
                  'epoch': epoch,
                  'model_state_dict': fin_model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'loss': avg_loss,
    }, model_path)
    print(f"Model saved at epoch {epoch+1} to {model_path}")

Epoch 1/1


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Training:  39%|███▊      | 113/292 [35:33<56:03, 18.79s/it, loss=1.35]  

Skipped: Invalid URL at index 6079, found nan


                                                                        

TypeError: cannot unpack non-iterable float object

In [None]:
# Run the validation pass on its own and report the mean loss.
avg_val_loss = validate(fin_model,Val_Data_Loader, loss_fn, device)
print(f"average validation loss: {avg_val_loss}")

                                                                      

average validation loss: 1.5043692759105138




# **INFERENCE ON EXAMPLES**

In [None]:
def predict(question, gif_path, generation_config=None):
  """Answer `question` about the GIF stored at `gif_path`.

  Args:
    question: question text; the begin-of-text marker is added here.
    gif_path: path of the GIF file to load with the shared frame processor.
    generation_config: decoding settings; the beam-search configuration
      `generation_config_bs` is used when omitted.

  Returns:
    The generated text with special tokens removed.
  """
  if generation_config is None:
    generation_config = generation_config_bs

  fin_model.eval()
  # Add a batch dimension of size one in front of the frame stack.
  frames = A_processor(gif_path).unsqueeze(0).to(device)
  prompt = '<|begin_of_text|>' + question
  tokenized_question = tokenizer(prompt, add_special_tokens=False, return_tensors='pt')
  output_ids = fin_model.generate(frames, tokenized_question['input_ids'].to(device), generation_config)
  return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Ask one question about a sample GIF and print the generated answer.
question = 'What is in the video?'
gif_path = 'gifs/tumblr_nakz3m797W1tg4t6vo1_250.gif'
y_pred = predict(question,gif_path)
print(y_pred)

In [None]:
import shutil

# Specify the path to the directory you want to remove
directory_path = '/kaggle/working/gifs'

# Remove the directory and all its contents. The existence check keeps the
# cell re-runnable: without it a second execution raises FileNotFoundError.
if os.path.isdir(directory_path):
    shutil.rmtree(directory_path)
    print(f"Removed directory: {directory_path}")
else:
    print(f"Nothing to remove: {directory_path} does not exist")


Removed directory: /kaggle/working/gifs


In [None]:
# Delete an old checkpoint file; missing_ok=True keeps the cell re-runnable.
# NOTE(review): the training loop in this notebook writes 'checkpoint.pth',
# not this file name - confirm which file is meant to be removed.
pathlib.Path('/kaggle/working/checkpoint_epoch1_ds4_r64_2ffn.pth').unlink(missing_ok=True)