In [1]:
! pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git --q
! pip install -q datasets bitsandbytes einops wandb --q
! pip install transformers==4.28.0 --q
! pip install --upgrade datasets transformers --q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.7/79.7 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.5 MB/s

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
import json

In [3]:
class Embed_Dataset(Dataset):

    def __init__(self, imageIDs, imageEmbeddings, annotations_data, tokenizer):
        self.imageIDs = imageIDs
        self.imageEmbeddings = imageEmbeddings
        self.annotations_data = annotations_data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.imageIDs)

    def __getitem__(self, index):
        max_len = 256

        image_name = self.imageIDs[index]
        image_embedding = self.imageEmbeddings[index]

        captions = get_captions(self.annotations_data, image_name)

        if len(captions) == 0:
            text = ""
        else:
            text = captions[0]

        caption_tokens = tokenizer.encode(text, add_special_tokens=True)
        padded_caption_tokens = caption_tokens + [tokenizer.pad_token_id] * (max_len - len(caption_tokens))

        return {
            'image_name': image_name,
            'caption_text': text,
            'image_embedding': image_embedding,
            'caption_tokens':torch.tensor(padded_caption_tokens)
        }

In [4]:
class ProjectionModel(nn.Module):
    def __init__(
        self,
        clip_embeddings : int = 512,
        token_embeddings : int = 2560,
        projection_tokens : int = 4,
        projection_layers : int = 4
    ):
        super().__init__()
        model_name = "microsoft/phi-2"

        self.projection_tokens = projection_tokens
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.vocab_size = len(tokenizer)
        self.tokenizer.pad_token = tokenizer.eos_token
        self.phi2Model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to("cuda")
        self.token_embedding = self.phi2Model.get_submodule('model.embed_tokens')

        self.layer1 = build_mlp_vector_projector(
            clip_embeddings, token_embeddings, 1, self.projection_tokens
        ).to("cuda")

        for name, param in self.phi2Model.named_parameters():
                param.requires_grad = False

    def generate_text_from_embeddings(self, logits):
        probabilities = logits.softmax(dim=2)  # Softmax along the last dimension
        predicted_indices = torch.argmax(probabilities, dim=2) # Find the index of the class with highest probability
        predicted_texts = [self.tokenizer.decode(seq) for seq in predicted_indices] # Decode each sequence

        return predicted_texts


    def forward(self, x, captions):
        x = self.layer1(x)
        caption_token_embeddings = self.token_embedding(captions)
        inputs = torch.concat((x, caption_token_embeddings), axis=-2)
        outputs = self.phi2Model(inputs_embeds=inputs)
        predictions = self.generate_text_from_embeddings(outputs.logits)

        loss = F.cross_entropy(
            outputs.logits[:, self.projection_tokens:, :].reshape(-1, outputs.logits.size(-1)), captions.reshape(-1)
        )

        return loss, predictions

In [5]:
class Projection_Model(nn.Module):
    def __init__(
        self, 
        input_hidden_size: int, 
        hidden_size: int, 
        num_layers: int, 
        width: int
    ):
        super(Projection_Model, self).__init__()
        self.layers = nn.ModuleList()

        for _ in range(width):
            layer = [nn.Linear(input_hidden_size, hidden_size)]

            for _ in range(1, num_layers):
                layer.append(nn.GELU())
                layer.append(nn.Linear(hidden_size, hidden_size))

            self.layers.append(nn.Sequential(*layer))

    def forward(self, x):
        return torch.cat([layer(x) for layer in self.layers], dim=-2)


def build_layer_vector_projector(
    input_hidden_size: int, 
    hidden_size: int, 
    num_layers: int, 
    num_tokens: int
):
    return Projection_Model(
        input_hidden_size, 
        hidden_size, 
        num_layers, 
        num_tokens
    )

In [6]:
file_data = 'captions_val2017.json'

with open(file_data, 'r') as f:
    data = json.load(f)

def get_captions(data, image_name):
    captions = []
    img = next((img for img in data['images'] if img['file_name'] == image_name), None)

    if img is not None:
        ids = [annotation['id'] for annotation in data['annotations'] if annotation['image_id'] == img['id']]
        captions = [annotation['caption'] for annotation in data['annotations'] if annotation['id'] in ids]

    return captions

In [7]:
clip_image_embed_dict = torch.load("img_embeddings.pth")
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

dataset = Embed_Dataset(
    list(clip_image_embed_dict.keys()), list(clip_image_embed_dict.values()), data, tokenizer
)

dataloader = DataLoader(
    dataset, batch_size=1, shuffle=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
model = ProjectionModel()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [9]:
num_epochs = 15
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

for epoch in range(num_epochs):
    model.train()

    for batch in dataloader:
        embeddings = batch['image_embedding'].to('cuda')
        captions = batch['caption_tokens'].to('cuda')
        loss, predictions = model(embeddings, captions)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Loss  : {loss.item()}")
    print(f"Epoch : {epoch + 1}/{num_epochs}")
    print("Caption   : ", batch['caption_text'])
    print("Predicted : ", [text.rstrip('\n') for text in predictions])
    print("----------------------------------------------")

Epoch : 1/15
Loss  : 7.161831855773926
Caption    :  ['A bedroom with a bed and small table near by.']
Prediction :  ['_, to\n\n, a view, a table. the.\nThe']
Epoch : 2/15
Loss  : 6.663324356079102
Caption    :  ['A man riding a brown horse in uniform next to tall green trees.']
Prediction :  [',_\n\n\n who a bike horse is a. to a buildings trees.\nA\nIN']
Epoch : 3/15
Loss  : 6.508380889892578
Caption    :  ['A couple of computer monitors sitting on top of a wooden desk.']
Prediction :  ['_\n\n otherux of years science are on a of each desk desk.\nA#']
Epoch : 4/15
Loss  : 6.26915168762207
Caption    :  ['A woman in a hat sitting next to luggage.']
Prediction :  [",_.ayactions's the red Online on to a.\nAThe\n\n\n\nThe"]
Epoch : 5/15
Loss  : 6.324067115783691
Caption    :  ['A child holding chocolate donut with both hands.']
Prediction :  ['_",\',.men\'s a\n\'t\n a hands.\nTheTheTheTheThe']
Epoch : 6/15
Loss  : 6.472545146942139
Caption    :  ['two zebras are standing together in the 