In [1]:
pip install torch torchvision transformers datasets scikit-learn huggingface_hub

Collecting torch
  Downloading torch-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl (865.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hCollecting torchvision
  Downloading torchvision-0.22.0-cp310-cp310-manylinux_2_28_x86_64.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp31

In [2]:
from datasets import load_dataset  

# Fetch the bird-presence dataset and keep a handle on each split.
ds = load_dataset("ravisri/bird-presence-classification")
train_ds, test_ds = ds["train"], ds["test"]

# The BLIP processor is what turns raw images into `pixel_values` below.
import torch
from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

def preprocess(example):
    """Convert one dataset record into model inputs.

    Runs the module-level ``processor`` on ``example["image"]`` and keeps the
    first (only) entry of the returned ``pixel_values`` as a float tensor.

    Args:
        example: Mapping with an ``"image"`` entry and a ``"label"`` entry.

    Returns:
        dict with ``"pixel_values"`` (float tensor) and ``"labels"`` (int).
    """
    encoded = processor(images=example["image"], return_tensors="pt")
    first = encoded["pixel_values"][0]
    # Defensive: coerce to a tensor in case the processor handed back an array.
    pixel_tensor = first if isinstance(first, torch.Tensor) else torch.tensor(first)
    return {
        "pixel_values": pixel_tensor.float(),
        "labels": int(example["label"]),
    }

# Run `preprocess` over every example of both splits (train first, then test).
train_ds, test_ds = (split.map(preprocess) for split in (train_ds, test_ds))

README.md:   0%|          | 0.00/563 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/48.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2299 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/575 [00:00<?, ? examples/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/2299 [00:00<?, ? examples/s]

Map:   0%|          | 0/575 [00:00<?, ? examples/s]

In [3]:
from transformers import BlipModel, BlipProcessor  
import torch  
import torch.nn as nn  

class BlipForBinaryClassification(nn.Module):
    """BLIP backbone with a single-logit linear head for binary classification.

    Only the image branch of the backbone is used: ``forward`` calls
    ``get_image_features`` and feeds the result to ``classifier``.

    Args:
        blip_name: Hub id or local path passed to ``BlipModel.from_pretrained``.
    """

    def __init__(self, blip_name="Salesforce/blip-image-captioning-base"):
        super().__init__()
        self.base = BlipModel.from_pretrained(blip_name)
        # Size the head from the loaded config instead of hard-coding 512, so
        # the class keeps working for checkpoints with another projection width.
        hidden_size = self.base.config.projection_dim
        self.classifier = nn.Linear(hidden_size, 1)  # Binary classification head

    def forward(self, pixel_values):
        """Return one raw logit per image (apply a sigmoid for a probability).

        Args:
            pixel_values: Batch of preprocessed images.

        Returns:
            Tensor of logits with the trailing size-1 dimension squeezed away.
        """
        image_features = self.base.get_image_features(pixel_values=pixel_values)
        logits = self.classifier(image_features)
        return logits.squeeze(-1)

In [4]:
def collate_fn(batch):
    """Assemble a list of examples into one batch dictionary.

    Args:
        batch: Sequence of mappings, each with ``"pixel_values"`` (a tensor or
            something ``torch.tensor`` accepts) and ``"labels"``.

    Returns:
        dict with ``"pixel_values"`` stacked along a new leading dimension and
        ``"labels"`` as a float tensor.
    """
    def _as_tensor(value):
        # Tensors pass through untouched; anything else is converted.
        return value if isinstance(value, torch.Tensor) else torch.tensor(value)

    images = [_as_tensor(sample["pixel_values"]) for sample in batch]
    targets = [sample["labels"] for sample in batch]
    return {
        "pixel_values": torch.stack(images),
        "labels": torch.tensor(targets, dtype=torch.float),
    }

In [5]:
from torch.utils.data import DataLoader  

# Training batches are reshuffled every epoch; evaluation order stays fixed.
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)

In [6]:
# Seed torch's RNG before the model is built: the freshly initialised weights
# (classifier head and any backbone weights missing from the checkpoint) and
# the shuffled DataLoader order all draw from it. GPU kernels may still add
# small run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BlipForBinaryClassification().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()  # expects raw logits and float targets

# Fine-tune for NUM_EPOCHS passes over the training set (extend as needed).
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)
        logits = model(pixel_values)
        loss = loss_fn(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}: train loss = {total_loss/len(train_loader):.4f}")

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase.


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Some weights of BlipModel were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_model.encoder.layer.0.attention.self.value.bias', 'text_model.encoder.layer.0.attention.self.value.weight', 'text_model.encoder.layer.0.crossattention.output.LayerNorm.bias', 'text_model.

Epoch 1: train loss = 0.2885
Epoch 2: train loss = 0.1531
Epoch 3: train loss = 0.0933


In [8]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix  

# Score the fine-tuned model on the held-out split without tracking gradients.
model.eval()
all_labels, all_preds = [], []
with torch.no_grad():
    for batch in test_loader:
        labels = batch["labels"].cpu().numpy()
        pixel_values = batch["pixel_values"].to(device)
        logits = model(pixel_values)
        probs = torch.sigmoid(logits)
        # A probability above 0.5 counts as the positive class.
        preds = (probs > 0.5).cpu().numpy()
        all_labels.extend(labels)
        all_preds.extend(preds)

for title, metric in (
    ("Test Accuracy:", accuracy_score),
    ("Test F1 Score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(title, metric(all_labels, all_preds))

Test Accuracy: 0.8973913043478261
Test F1 Score: 0.8925318761384335
Confusion Matrix:
 [[271  17]
 [ 42 245]]


In [11]:
# Persist everything needed to rebuild the classifier into one directory.
# The backbone goes first: the torch.save call below writes into the same folder.
model.base.save_pretrained("blip-bird-classifier")
# The linear head is not part of `model.base`, so its weights are stored
# separately as a plain state dict next to the backbone files.
torch.save(model.classifier.state_dict(), "blip-bird-classifier/classifier.pt")
# Keep the processor alongside the weights so inference can reuse the same
# image preprocessing.
processor.save_pretrained('blip-bird-classifier')

[]

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; a later cell creates a repo and
# uploads the saved folder. The call is interactive, so never hard-code a token here.
login()  # Paste your HF token if needed

In [13]:
from transformers import BlipPreTrainedModel, BlipModel  
import torch  
import torch.nn as nn  

class BlipForCustomClassification(BlipPreTrainedModel):
    """``BlipPreTrainedModel`` subclass with a single-logit classification head.

    The backbone is built from ``config`` (not from pretrained weights) and a
    linear head maps the image features to one logit per example.

    NOTE(review): the notebook saves the backbone with
    ``model.base.save_pretrained(...)`` and the head in a separate
    ``classifier.pt``; confirm how ``from_pretrained`` maps those weights onto
    ``self.base`` / ``self.classifier`` before relying on this class for loading.
    """

    def __init__(self, config):
        super().__init__(config)
        self.base = BlipModel(config)
        # Take the head's input width from the config rather than assuming 512.
        self.classifier = nn.Linear(config.projection_dim, 1)
        self.post_init()  # Required for Hugging Face models

    def forward(self, pixel_values, labels=None):
        """Compute logits and, when labels are supplied, the BCE-with-logits loss.

        Args:
            pixel_values: Batch of preprocessed images.
            labels: Optional binary targets, one per example.

        Returns:
            dict with ``"logits"`` and ``"loss"`` (``None`` when ``labels`` is None).
        """
        features = self.base.get_image_features(pixel_values=pixel_values)
        logits = self.classifier(features).squeeze(-1)
        loss = None
        if labels is not None:
            loss_fn = nn.BCEWithLogitsLoss()
            # BCEWithLogitsLoss needs floating-point targets; integer class
            # labels would otherwise raise a dtype error.
            loss = loss_fn(logits, labels.to(logits.dtype))
        return {"logits": logits, "loss": loss}

In [15]:
# NOTE(review): this cell repeats the save performed in an earlier cell and
# rewrites the same files; it differs only in the confirmation message printed
# at the end.
# Save the BLIP backbone (also creates the folder the next call writes into)
model.base.save_pretrained('blip-bird-classifier')
# Save the classifier head weights as a separate state dict
torch.save(model.classifier.state_dict(), 'blip-bird-classifier/classifier.pt')
# Save the processor so the same preprocessing ships with the weights
processor.save_pretrained('blip-bird-classifier')

print("Model and processor saved in 'blip-bird-classifier/'")

Model and processor saved in 'blip-bird-classifier/'


In [17]:
from huggingface_hub import HfApi, HfFolder  

# Create the target repo on the Hub (requires a prior login).
# exist_ok=True makes this cell safe to re-run: without it, create_repo raises
# once the repository already exists.
api = HfApi()
api.create_repo('ravisri/blip-bird-classifier', exist_ok=True)

# Push every file in the local export directory to the model repo.
api.upload_folder(
    folder_path="blip-bird-classifier",
    repo_id="ravisri/blip-bird-classifier",
    repo_type="model"
)

classifier.pt:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/899M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ravisri/blip-bird-classifier/commit/7647f56a0a9810feec4ed28c5e934cc78e4443bc', commit_message='Upload folder using huggingface_hub', commit_description='', oid='7647f56a0a9810feec4ed28c5e934cc78e4443bc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ravisri/blip-bird-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='ravisri/blip-bird-classifier'), pr_revision=None, pr_num=None)

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
import numpy as np  

def print_metrics(y_true, y_pred, label_map={0: "Bird", 1: "No bird"}):
    """Print overall accuracy plus per-class precision, recall, F1 and TP/FP/FN.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        label_map: Mapping from label value to display name. Its key order
            fixes the row/column order of the confusion matrix. The default
            dict is only read, never mutated.
            NOTE(review): confirm that label 0 really means "Bird" in the
            dataset; the mapping is not verifiable from this notebook.

    Returns:
        None. All results are written to stdout.
    """
    acc = accuracy_score(y_true, y_pred)
    print(f"Overall Accuracy: {acc:.4f}\n")
    labels = list(label_map.keys())
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    report = classification_report(
        y_true, y_pred, labels=labels, output_dict=True, zero_division=0
    )

    # `cm` is ordered like `labels`, so index it by position rather than by
    # label value; the two only coincide when the keys are exactly 0..n-1.
    for idx, (label, label_name) in enumerate(label_map.items()):
        # A class missing from the report falls back to zeros.
        stats = report.get(str(label), {})
        p = stats.get('precision', 0.0)
        r = stats.get('recall', 0.0)
        f1 = stats.get('f1-score', 0.0)
        TP = cm[idx, idx]
        FP = cm[:, idx].sum() - TP  # predicted as this class, actually another
        FN = cm[idx, :].sum() - TP  # actually this class, predicted as another
        print(f"Class: {label_name} (label {label})")
        print(f"Precision: {p:.4f}")
        print(f"Recall: {r:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"TP: {TP}, FP: {FP}, FN: {FN}\n")

In [22]:
def evaluate_model(model, test_loader, device):
    """Collect labels and thresholded predictions over an entire loader.

    The model is switched to eval mode and run without gradient tracking.
    Its outputs are treated as logits: a sigmoid is applied and values above
    0.5 become prediction 1, everything else 0.

    Args:
        model: Callable module mapping a ``pixel_values`` batch to logits.
        test_loader: Iterable of dicts with ``"pixel_values"`` and ``"labels"``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(labels, predictions)`` of NumPy arrays, in loader order.
    """
    model.eval()
    label_chunks, pred_chunks = [], []
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch["pixel_values"].to(device)
            targets = batch["labels"].to(device)
            scores = torch.sigmoid(model(inputs))
            hard = scores.gt(0.5).long()
            pred_chunks.extend(hard.cpu().numpy())
            label_chunks.extend(targets.cpu().numpy())
    return np.array(label_chunks), np.array(pred_chunks)

In [23]:
from transformers import BlipModel, BlipProcessor  
import torch.nn as nn  

# Classifier wrapper used for the before/after comparison. Without `model_dir`
# the backbone comes from the public checkpoint and the head keeps its random
# initialisation; with `model_dir` both backbone and head are loaded from disk.
class BlipForBinaryClassification(nn.Module):
    """BLIP backbone plus single-logit head, optionally restored from a directory.

    Args:
        model_dir: Directory holding a saved backbone and ``classifier.pt``.
            When ``None``, ``Salesforce/blip-image-captioning-base`` is loaded
            and the head is left randomly initialised.
    """

    def __init__(self, model_dir=None):
        super().__init__()
        self.base = BlipModel.from_pretrained('Salesforce/blip-image-captioning-base' if model_dir is None else model_dir)
        # Read the feature width from the config instead of hard-coding 512.
        self.classifier = nn.Linear(self.base.config.projection_dim, 1)
        if model_dir is not None:
            # map_location="cpu" lets a head saved from a GPU run load on a
            # CPU-only machine; the caller moves the whole module afterwards.
            # weights_only=True restricts unpickling to tensor data.
            head_state = torch.load(
                f"{model_dir}/classifier.pt", map_location="cpu", weights_only=True
            )
            self.classifier.load_state_dict(head_state)

    def forward(self, pixel_values):
        """Return one raw logit per image in the batch."""
        features = self.base.get_image_features(pixel_values=pixel_values)
        return self.classifier(features).squeeze(-1)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Baseline: pretrained backbone with an untrained (randomly initialised) head.
base_model = BlipForBinaryClassification()
base_model = base_model.to(device)
base_labels, base_preds = evaluate_model(base_model, test_loader, device)
print("==== BEFORE finetuning ====")
print_metrics(base_labels, base_preds)

`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase.
Some weights of BlipModel were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['logit_scale', 'text_model.embeddings.LayerNorm.bias', 'text_model.embeddings.LayerNorm.weight', 'text_model.embeddings.position_embeddings.weight', 'text_model.embeddings.word_embeddings.weight', 'text_model.encoder.layer.0.attention.output.LayerNorm.bias', 'text_model.encoder.layer.0.attention.output.LayerNorm.weight', 'text_model.encoder.layer.0.attention.output.dense.bias', 'text_model.encoder.layer.0.attention.output.dense.weight', 'text_model.encoder.layer.0.attention.self.key.bias', 'text_model.encoder.layer.0.attention.self.key.weight', 'text_model.encoder.layer.0.attention.self.query.bias', 'text_model.encoder.layer.0.attention.self.query.weight', 'text_mo

==== BEFORE finetuning ====
Overall Accuracy: 0.4870

Class: Bird (label 0)
Precision: 0.1818
Recall: 0.0069
F1-score: 0.0134
TP: 2, FP: 9, FN: 286

Class: No bird (label 1)
Precision: 0.4929
Recall: 0.9686
F1-score: 0.6533
TP: 278, FP: 286, FN: 9



In [None]:
# Reload the fine-tuned backbone and head from disk and score them on the
# same test loader as the baseline.
finetuned_model = BlipForBinaryClassification("blip-bird-classifier")
finetuned_model = finetuned_model.to(device)
finetuned_labels, finetuned_preds = evaluate_model(finetuned_model, test_loader, device)
print("==== AFTER finetuning ====")
print_metrics(finetuned_labels, finetuned_preds)

`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase.
