### **DEPENDENCIES**

In [None]:
# Install the notebook's third-party dependencies into the kernel environment.
# NOTE(review): no versions are pinned, so a later re-run may resolve to different releases — consider pinning.
%pip install torch transformers wandb timm einops bitsandbytes accelerate

In [2]:
# Secrets must never live in the notebook source: read them from the process
# environment instead. Each value is None when its variable is not set.
# NOTE(review): the tokens that were previously hardcoded in this cell were
# exposed to anyone with access to the notebook and should be revoked.
import os  # imported locally because this cell runs before the main import cell

hf_token = os.environ.get("HF_TOKEN")
wb_token = os.environ.get("WANDB_API_KEY")

In [4]:
import io
import math
import os

import matplotlib.pyplot as plt
import pandas as pd
import requests
import torch
import wandb
from bitsandbytes.optim import Adam8bit
from einops import rearrange
from huggingface_hub import notebook_login
from PIL import Image, ImageFilter
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive.
from google.colab import drive

drive.mount("/content/drive")

In [7]:
# Set CUDA_LAUNCH_BLOCKING=1 in the process environment.
# NOTE(review): this looks like a debugging aid — confirm it is still wanted for full training runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Device and dtype the model is loaded with further down.
DTYP = torch.float16
DEVC = "cuda"

# Interactive Hugging Face login; kept last so its widget is the cell output.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### **DATASET**

In [8]:
class DimensionsDataset(Dataset):
    """Image question/answer dataset backed by a CSV of remote image links.

    The CSV is expected to provide the columns ``image_link``,
    ``entity_name`` and ``entity_value``. Images are downloaded over HTTP
    each time an item is accessed.
    """

    # Seconds to wait on the image server before a download is abandoned;
    # without a timeout a single stalled request would hang the data loader.
    request_timeout = 30

    def __init__(self, csv_file_path, transform=None):
        """Read the CSV at ``csv_file_path``.

        Args:
            csv_file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV.
            transform: Optional callable applied to every loaded image.
        """
        self.data = pd.read_csv(csv_file_path)
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    @staticmethod
    def transform1(image):
        """Optional preprocessing: grayscale, brighten, binarize, then min-filter.

        Args:
            image: A PIL image.

        Returns:
            The processed single-channel PIL image.
        """
        image = image.convert("L")
        # Brighten by 20%.
        image = image.point(lambda p: p * 1.2)
        # Binarize: pixels brighter than 205 become white, the rest black.
        image = image.point(lambda p: 255 if p > 205 else 0)
        image = image.filter(ImageFilter.MinFilter(3))
        return image

    def __getitem__(self, idx):
        """Download the image for row ``idx`` and build its question/answer pair.

        Returns:
            A dict with the (optionally transformed) PIL image under
            ``"image"`` and a one-element list of ``{"question", "answer"}``
            dicts under ``"qa"``.

        Raises:
            requests.HTTPError: If the image server answers with an error status.
            requests.Timeout: If the download exceeds ``request_timeout`` seconds.
        """
        row = self.data.iloc[idx]

        # Download the whole body with a timeout and fail loudly on HTTP
        # errors, instead of handing PIL a raw, undecoded socket stream that
        # is never closed.
        response = requests.get(row["image_link"], timeout=self.request_timeout)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content))
        # Decode now so a corrupt file fails here rather than deep inside the
        # collate function.
        image.load()

        entity_name = row["entity_name"]
        entity_value = row["entity_value"]
        if self.transform:
            image = self.transform(image)
        return {
            "image": image,
            "qa": [
                {
                    "question": f"{entity_name} of object?",
                    "answer": entity_value,
                }
            ]
        }

In [9]:
dset = DimensionsDataset(
    "/content/train_dim.csv",
    transform=None,
)

# 80/20 train/validation split.
trns = int(0.8 * len(dset))

# Seed the split so the same rows land in train and validation on every run;
# an unseeded split makes validation losses from different runs incomparable.
split_seed = 42
trnd, vald = random_split(
    dset,
    [trns, len(dset) - trns],
    generator=torch.Generator().manual_seed(split_seed),
)

### **MODEL**

In [None]:
# Hub repository and the pinned revision shared by the tokenizer and the model.
mdid = "vikhyatk/moondream2"
mdrv = "2024-08-26"

tokenizer = AutoTokenizer.from_pretrained(mdid, revision=mdrv)

# Load the model in DTYP and map every module onto DEVC.
# trust_remote_code=True allows the model code shipped in the Hub repository to run.
moondream = AutoModelForCausalLM.from_pretrained(
    mdid,
    trust_remote_code=True,
    revision=mdrv,
    device_map={"": DEVC},
    torch_dtype=DTYP,
)

### **TRAINING**

In [11]:
# Batching: samples per micro-batch, and micro-batches accumulated per optimizer update.
batch_size = 4
grad_acc_s = 2

# Optimisation: peak learning rate used by lr_schedule, and passes over the training set.
learn_rate = 3e-5
max_epochs = 1

# Sequence layout: number of label positions collate_fn masks for the image
# embeddings, and the terminator string appended to every answer.
img_tokens = 729
answer_eos = "<|endoftext|>"

In [12]:
def collate_fn(batch):
    """Collate dataset samples into padded training tensors.

    Args:
        batch: List of samples, each a dict with an ``"image"`` and a ``"qa"``
            list of ``{"question", "answer"}`` dicts.

    Returns:
        A tuple ``(images, tokens, labels, attn_mask)``. ``images`` is the
        list of preprocessed images; the other three are stacked tensors.
        ``labels`` and ``attn_mask`` are ``img_tokens`` positions longer than
        ``tokens`` because they also cover the image embeddings that
        ``compute_loss`` splices in after the BOS token.
    """
    images = [
        moondream.vision_encoder.preprocess(sample["image"]) for sample in batch
    ]

    token_rows = []
    label_rows = []
    for sample in batch:
        row_tokens = [tokenizer.bos_token_id]
        # BOS and the image positions are excluded from the loss.
        row_labels = [-100] * (img_tokens + 1)

        for qa in sample["qa"]:
            # The prompt is fed to the model but masked out of the loss.
            question_ids = tokenizer(
                f"\n\nQuestion: {qa['question']}\n\nAnswer:", add_special_tokens=False
            ).input_ids
            row_tokens.extend(question_ids)
            row_labels.extend([-100] * len(question_ids))

            # The answer (with its terminator) is what the model is trained on.
            answer_ids = tokenizer(
                f" {qa['answer']}{answer_eos}", add_special_tokens=False
            ).input_ids
            row_tokens.extend(answer_ids)
            row_labels.extend(answer_ids)

        token_rows.append(row_tokens)
        label_rows.append(row_labels)

    # Right-pad every row to the longest label row in the batch.
    longest = max((len(row) for row in label_rows), default=-1)

    mask_rows = []
    for row_tokens, row_labels in zip(token_rows, label_rows):
        real_len = len(row_labels)
        fill = longest - real_len

        row_labels.extend([-100] * fill)
        row_tokens.extend([tokenizer.eos_token_id] * fill)
        mask_rows.append([1] * real_len + [0] * fill)

    tokens = torch.stack([torch.tensor(row, dtype=torch.long) for row in token_rows])
    labels = torch.stack([torch.tensor(row, dtype=torch.long) for row in label_rows])
    attn_mask = torch.stack([torch.tensor(row, dtype=torch.bool) for row in mask_rows])
    return images, tokens, labels, attn_mask


def compute_loss(batch):
    """Run one forward pass and return the language-model loss for ``batch``.

    Args:
        batch: The ``(images, tokens, labels, attn_mask)`` tuple produced by
            ``collate_fn``.

    Returns:
        The ``loss`` attribute of the text model's output.
    """
    images, tokens, labels, attn_mask = batch
    tokens, labels, attn_mask = (
        tensor.to(DEVC) for tensor in (tokens, labels, attn_mask)
    )

    # The image embeddings are computed without tracking gradients, so no
    # gradient reaches the vision encoder.
    with torch.no_grad():
        img_embs = moondream.vision_encoder(images)

    tok_embs = moondream.text_model.get_input_embeddings()(tokens)

    # Splice the image embeddings between the BOS embedding and the rest of
    # the text embeddings.
    bos_emb = tok_embs[:, :1, :]
    text_embs = tok_embs[:, 1:, :]
    inputs_embeds = torch.cat((bos_emb, img_embs, text_embs), dim=1)

    return moondream.text_model(
        inputs_embeds=inputs_embeds,
        labels=labels,
        attention_mask=attn_mask,
    ).loss


def lr_schedule(step, max_steps, base_lr=None):
    """Learning rate for ``step``: linear warm-up followed by cosine decay.

    During the first 10% of training the rate rises linearly from
    ``0.1 * base_lr`` to ``base_lr``; afterwards it follows a cosine curve
    downwards.

    Args:
        step: Current optimizer step (may be fractional).
        max_steps: Total number of optimizer steps planned for the run.
        base_lr: Peak learning rate. Defaults to the notebook-level
            ``learn_rate`` when omitted.

    Returns:
        The learning rate to use at ``step``.
    """
    if base_lr is None:
        base_lr = learn_rate

    # Training progress in [.., 1]. A run with no planned steps counts as
    # finished (avoids ZeroDivisionError), and steps past max_steps hold the
    # final rate rather than riding the cosine back up.
    x = min(step / max_steps, 1.0) if max_steps > 0 else 1.0

    if x < 0.1:
        return 0.1 * base_lr + 0.9 * base_lr * x / 0.1
    return (
        0.1 * base_lr
        + 0.9 * base_lr * (1 + math.cos(math.pi * (x - 0.1))) / 2
    )

In [13]:
# Training batches are reshuffled every epoch.
trn_loader = DataLoader(
    trnd,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

# Validation is not shuffled: the training loop only evaluates the first few
# batches, and a fixed order means every validation pass scores the same
# samples, so successive validation losses are comparable.
val_loader = DataLoader(
    vald,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [14]:
# Put the text model into training mode before fine-tuning.
moondream.text_model.train()
# Turn on gradient checkpointing for the transformer stack.
# NOTE(review): presumably this trades extra compute for lower activation memory — confirm against the model's remote code.
moondream.text_model.transformer.gradient_checkpointing_enable()

In [15]:
# Number of optimizer updates in the whole run: one per grad_acc_s micro-batches.
total_steps = (max_epochs * len(trn_loader)) // grad_acc_s

# 8-bit Adam over the text model's parameters only. The initial rate is 10% of
# learn_rate, which is what lr_schedule returns at step 0.
optimizer = Adam8bit(
    [{"params": moondream.text_model.parameters()}],
    eps=1e-6,
    betas=(0.9, 0.95),
    lr=0.1 * learn_rate,
)

In [16]:
# Start a Weights & Biases run and store the training hyperparameters with it.
# NOTE(review): wb_token is not passed here; the recorded output shows an interactive API-key prompt instead.
wandb.init(
    project="moondream-dimsextract-ft",
    config={
        "EPOCHS": max_epochs,
        "BATCH_SIZE": batch_size,
        "GRAD_ACCUM_STEPS": grad_acc_s,
        "LR": learn_rate,
    }
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [18]:
# Global micro-batch counter. It starts at 0 so the learning-rate schedule and
# the logged steps line up with the data actually seen; a hardcoded non-zero
# start would skip part of the warm-up on a fresh kernel.
i = 0
val_after_step = 240        # run validation every this many micro-batches
checkpoint_interval = 200   # write a checkpoint every this many micro-batches
max_val_batches = 16        # cap on batches scored per validation pass


def run_validation(loader, max_batches):
    """Return the mean loss over at most ``max_batches`` batches of ``loader``.

    The text model is switched to eval mode for the pass and always switched
    back to training mode afterwards, even if a batch raises. Returns ``None``
    when the loader yields no batches, so the caller never divides by zero.
    """
    moondream.text_model.eval()
    losses = []
    try:
        with torch.no_grad():
            for batch_idx, val_batch in enumerate(loader):
                if batch_idx >= max_batches:
                    break
                losses.append(compute_loss(val_batch).item())
    finally:
        moondream.text_model.train()
    return sum(losses) / len(losses) if losses else None


try:
    for epoch in range(max_epochs):
        for batch in tqdm(trn_loader, desc=f"Epoch {epoch + 1}/{max_epochs}"):
            i += 1

            loss = compute_loss(batch)
            loss.backward()

            # Gradients accumulate over grad_acc_s micro-batches per update.
            if i % grad_acc_s == 0:
                optimizer.step()
                optimizer.zero_grad()

                # The rate computed here applies to the next optimizer update.
                lr = lr_schedule(i / grad_acc_s, total_steps)
                for param_group in optimizer.param_groups:
                    param_group["lr"] = lr

            wandb.log({"loss/train": loss.item(), "lr": optimizer.param_groups[0]["lr"], "step": i})

            if i % val_after_step == 0:
                avg_val_loss = run_validation(val_loader, max_val_batches)
                if avg_val_loss is not None:
                    wandb.log({"loss/val": avg_val_loss, "step": i})

            if i % checkpoint_interval == 0:
                # NOTE(review): every checkpoint holds the full text-model and
                # optimizer state and none is ever removed — watch disk usage.
                checkpoint = {
                    'epoch': epoch,
                    'step': i,
                    'model_state_dict': moondream.text_model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss.item(),
                }
                torch.save(checkpoint, f'checkpoint_{i}.pt')
except BaseException:
    # Close the run and mark it as failed (e.g. CUDA out of memory or a manual
    # interrupt) instead of leaving it open.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

Epoch 1/1:   0%|          | 0/26582 [00:02<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 586.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 35.06 MiB is free. Process 4669 has 14.71 GiB memory in use. Of the allocated memory 14.30 GiB is allocated by PyTorch, and 284.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [19]:
# Write the whole model to a local directory.
# NOTE(review): the recorded training cell above ended in OutOfMemoryError, so confirm the weights saved here come from a completed run.
moondream.save_pretrained("checkpoints/moondream-dimsextract-ft")

In [20]:
# Upload the model to the Hugging Face Hub under the given repository id.
# NOTE(review): the tokenizer is not pushed alongside it — presumably consumers load it from the base repository; confirm.
moondream.push_to_hub("Meghnad/moondream2-dimextract-ft-200-4")

model.safetensors:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Meghnad/moondream2-dimextract-ft-200-4/commit/c9c1c0c9f5ef1f34929965fa90b6f94fb58b776d', commit_message='Upload Moondream', commit_description='', oid='c9c1c0c9f5ef1f34929965fa90b6f94fb58b776d', pr_url=None, pr_revision=None, pr_num=None)