Loading the WebSight dataset in streaming mode.

In [1]:
!pip install torchvision transformers datasets


from datasets import load_dataset
import itertools
from torch.utils.data import IterableDataset


train_stream = load_dataset("HuggingFaceM4/WebSight", split="train", streaming=True)
eval_stream  = load_dataset("HuggingFaceM4/WebSight", split="train", streaming=True)


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.1->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.1->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

  Loading BLIP's Processor and Model

In [2]:
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Using BLIP's built-in tokenizer for HTML tokenization.
html_tokenizer = processor.tokenizer
if html_tokenizer.pad_token is None:
    html_tokenizer.pad_token = html_tokenizer.eos_token

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [3]:
def tokenize_html(example):

    tokens = html_tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    example["html_tokens"] = tokens
    return example


Tokenizing HTML

In [4]:
import torch

def shift_tokens_right(input_ids, pad_token_id):

    shifted = input_ids.clone()
    shifted[0] = pad_token_id
    shifted[1:] = input_ids[:-1]
    return shifted

def preprocess_example(example):

    # Tokenizing HTML if not already tokenized
    if "html_tokens" not in example:
        example = tokenize_html(example)

    image = example["image"]
    html_tokens = example["html_tokens"]
    image_inputs = processor(images=image, return_tensors="pt")


    labels = torch.tensor(html_tokens["input_ids"])
    decoder_input_ids = shift_tokens_right(labels, html_tokenizer.pad_token_id)

    inputs = {
        "pixel_values": image_inputs["pixel_values"].squeeze(0),
        "input_ids": decoder_input_ids,
        "labels": labels,
        "attention_mask": torch.tensor(html_tokens["attention_mask"]),
        "decoder_attention_mask": torch.tensor(html_tokens["attention_mask"])
    }
    return inputs


 Creating an Iterable Dataset

In [5]:
class StreamingDataset(IterableDataset):
    def __init__(self, stream, num_samples):
        self.stream = stream
        self.num_samples = num_samples

    def __iter__(self):
        return (preprocess_example(tokenize_html(example)) for example in itertools.islice(self.stream, self.num_samples))

    def __len__(self):
        return self.num_samples

# Create streaming datasets without converting them into lists.
train_dataset = StreamingDataset(train_stream, num_samples=1000)
eval_dataset  = StreamingDataset(eval_stream, num_samples=100)

print("✅ Streaming train and eval datasets created successfully!")


✅ Streaming train and eval datasets created successfully!


Data Collator

In [6]:
def collate_fn(batch):

    batch_keys = batch[0].keys()
    collated = {}
    for key in batch_keys:
        collated[key] = torch.stack([example[key] for example in batch])
    return collated

Trainer and Training Arguments

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./blip_finetuned_html",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    max_steps=500,
    logging_steps=10,
    save_steps=50,
    learning_rate=5e-5,
    evaluation_strategy="epoch",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
)

# Training
print("\nStarting training...")
trainer.train()




Starting training...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchaussajjad[0m ([33mchaussajjad-thakur-college-of-engineering-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
0,1.8198,1.55122


TrainOutput(global_step=500, training_loss=2.4974287338256835, metrics={'train_runtime': 485.017, 'train_samples_per_second': 1.031, 'train_steps_per_second': 1.031, 'total_flos': 2.96711826112512e+17, 'train_loss': 2.4974287338256835, 'epoch': 0.5})

In [10]:
!pip install evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2

In [11]:
import evaluate

bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = processor.tokenizer.batch_decode(logits.argmax(-1), skip_special_tokens=True)
    labels = [[token if token != -100 else processor.tokenizer.pad_token_id for token in label] for label in labels]
    references = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

    bleu = bleu_metric.compute(predictions=[pred.split() for pred in predictions],
                               references=[[ref.split()] for ref in references])
    return {"bleu": bleu["score"]}


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [15]:
import torch

model.eval()
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=2)

all_predictions = []
all_references = []

for batch in eval_dataloader:
    inputs = {k: v.to(model.device) for k, v in batch.items() if k not in ["labels", "decoder_attention_mask"]}
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = processor.tokenizer.batch_decode(outputs.logits.argmax(-1), skip_special_tokens=True)
    labels = [[token.item() if token.item() != -100 else processor.tokenizer.pad_token_id for token in label_seq] for label_seq in batch["labels"]]
    references = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)

    all_predictions.extend([pred.split() for pred in predictions])
    all_references.extend([[ref.split()] for ref in references])

# Computing BLEU score
bleu = bleu_metric.compute(
    predictions=[" ".join(pred) for pred in all_predictions],
    references=[[" ".join(ref) for ref in refs] for refs in all_references]
)

print("BLEU Score:", bleu["score"])


BLEU Score: 40.25433538669091


In [16]:
torch.save(model.state_dict(), "model_weights.pth")


In [17]:
processor.save_pretrained("processor")


[]