<a href="https://colab.research.google.com/github/SamsonWanjiku/DeepLearningExamples/blob/master/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import GPT2Model, GPT2Tokenizer
from torch.utils.data import DataLoader

# Load the pretrained GPT-2 tokenizer, then register the end-of-sequence
# token as the padding token so that batched calls using padding=True work.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
padding_spec = {'pad_token': tokenizer.eos_token}
tokenizer.add_special_tokens(padding_spec)

# Define the model architecture
class TextToVideoModel(nn.Module):
    """Fuse a GPT-2 text embedding with ResNet-50 image features.

    The text branch is a pretrained GPT-2 encoder; the visual branch is a
    randomly initialised ResNet-50 whose classifier output is used as the
    feature vector. Both are concatenated and projected to 512 dimensions.

    Note: ``forward`` tokenizes its input with the module-level ``tokenizer``.
    """

    def __init__(self):
        super().__init__()
        self.text_encoder = GPT2Model.from_pretrained("gpt2")
        # Default construction yields random weights and avoids the
        # deprecated ``pretrained`` flag.
        self.video_encoder = models.resnet50()
        # Derive widths from the encoders instead of hard-coding 768 + 1000.
        text_dim = self.text_encoder.config.hidden_size
        video_dim = self.video_encoder.fc.out_features
        self.fusion_layer = nn.Linear(text_dim + video_dim, 512)  # Fusion layer

    def forward(self, text, video):
        """Return the fused representation for a batch.

        Args:
            text: A string or a sequence of strings (one per sample).
            video: Image tensor fed to the ResNet-50 branch; the training
                script in this file passes (batch, 3, 224, 224).

        Returns:
            Tensor of shape (batch, 512).
        """
        device = self.fusion_layer.weight.device

        # DataLoader collates strings into a tuple; hand the tokenizer a list.
        if not isinstance(text, str):
            text = list(text)
        tokenized_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Keep the token tensors on the same device as the model weights.
        tokenized_text = tokenized_text.to(device)

        hidden_states = self.text_encoder(**tokenized_text).last_hidden_state
        # GPT-2 has no CLS token and attends causally, so position 0 only
        # encodes the first token. Mean-pool over real (non-padding) tokens
        # to summarise the whole prompt instead.
        mask = tokenized_text["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        text_encoding = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        video_features = self.video_encoder(video.to(device))
        flattened_video_features = video_features.flatten(start_dim=1)  # Flatten video features
        # Concatenate text and video features along the feature axis
        fused_features = torch.cat((text_encoding, flattened_video_features), dim=1)
        return self.fusion_layer(fused_features)

# Build the network, then its regression loss and Adam optimizer
model = TextToVideoModel()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Dummy dataset and DataLoader (Replace with your actual dataset)
class DummyDataset(torch.utils.data.Dataset):
    """Synthetic (text, image, target) triples for smoke-testing the pipeline.

    Args:
        num_samples: Number of examples to generate.
        output_dim: Width of each random regression target.
    """

    def __init__(self, num_samples: int = 32, output_dim: int = 512):
        # Dummy data; the leading dimension is the dataset size, not a batch.
        self.text_data = ["Sample text"] * num_samples
        self.video_data = torch.randn(num_samples, 3, 224, 224)  # Samples x Channels x Height x Width
        self.target = torch.randn(num_samples, output_dim)  # Samples x Output size

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.text_data)

    def __getitem__(self, idx):
        """Return the (text, image, target) triple stored at ``idx``."""
        return self.text_data[idx], self.video_data[idx], self.target[idx]

# Wrap the dummy dataset in a shuffling loader (batches of up to 32 samples)
dataset = DummyDataset()
data_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# Optimize for a fixed number of passes over the data
num_epochs = 10
for epoch in range(num_epochs):
    for batch in data_loader:
        text, video, target = batch
        output = model(text, video)
        loss = criterion(output, target)
        # Clear gradients left over from the previous step before backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Persist the learned parameters (state dict only)
trained_weights = model.state_dict()
torch.save(trained_weights, "text_to_video_model.pth")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

