In [1]:
pip install open_clip_torch datasets torch torchvision scikit-learn tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset  

# Fetch the bird-presence dataset and keep handles on both of its splits.
dataset = load_dataset("ravisri/bird-presence-classification")
train_ds, test_ds = dataset["train"], dataset["test"]

# Class names in label order (taken from the Dataset Viewer).
class_names = ["bird", "no_bird"]

README.md:   0%|          | 0.00/563 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/48.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2299 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/575 [00:00<?, ? examples/s]

In [3]:
from torchvision import transforms  
from torch.utils.data import Dataset, DataLoader  

# Per-channel normalisation statistics (values kept exactly as originally written).
NORM_MEAN = [0.48145466, 0.4578275, 0.40821073]
NORM_STD = [0.26862954, 0.26130258, 0.27577711]

# Centre-crop to the 224x224 model input size (the source images are 256x256),
# convert to a tensor, then normalise each channel.
preprocess = transforms.Compose([
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
])

class HFDataset(Dataset):
    """Adapt an indexable Hugging Face split to the torch ``Dataset`` protocol.

    Args:
        hf_ds: Indexable collection whose rows are mappings with an
            ``"image"`` entry and an integer ``"label"`` entry.
        transform: Callable applied to each image before it is returned.
    """

    def __init__(self, hf_ds, transform):
        self.ds = hf_ds
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the wrapped split."""
        return len(self.ds)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for row ``idx``."""
        row = self.ds[idx]
        img = row["image"]
        # The downstream Normalize uses three-channel statistics, so grayscale,
        # palette or RGBA images would fail (or be mis-normalised). Force RGB
        # whenever the image object reports a different mode.
        if getattr(img, "mode", "RGB") != "RGB":
            img = img.convert("RGB")
        label = row["label"]  # Already an integer (0: bird, 1: no_bird)
        return self.transform(img), label

# Settings shared by the training and evaluation loaders.
loader_kwargs = dict(batch_size=32, num_workers=2)

train_dataset = HFDataset(train_ds, preprocess)
test_dataset = HFDataset(test_ds, preprocess)

# Only the training split is shuffled; the evaluation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [4]:
import open_clip  
import torch  

MODEL_NAME = "ViT-B-32"
PRETRAINED_TAG = "laion2b_s34b_b79k"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# create_model_and_transforms returns three values; only the model is kept here.
model, _, _ = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED_TAG)
model = model.to(device).eval()
tokenizer = open_clip.get_tokenizer(MODEL_NAME)

# Width of the image embedding; later used to size the linear head.
FEATURE_DIM = model.visual.output_dim

open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [5]:
# One zero-shot text prompt per class; the list position is the integer label.
# NOTE(review): the recorded zero-shot run shows a bird recall of about 0.10;
# the negated wording may be a factor — worth trying alternative prompts.
BIRD_PROMPT = "There is a bird present in the picture."   # label 0, "bird"
NO_BIRD_PROMPT = "There is no bird in the picture."       # label 1, "no_bird"
prompts = [BIRD_PROMPT, NO_BIRD_PROMPT]

In [23]:
from tqdm import tqdm  
from sklearn.metrics import classification_report  

def evaluate_clip_base(model, tokenizer, loader, prompts, class_names, device='cuda'):
    """Zero-shot evaluation: label each image with its most similar text prompt.

    Args:
        model: Model exposing ``encode_text`` and ``encode_image``.
        tokenizer: Callable turning the list of prompts into a token tensor.
        loader: Iterable yielding ``(images, labels)`` batches.
        prompts: One text prompt per class, in label order.
        class_names: Display names for the report, in label order.
        device: Device the inputs are moved to before encoding.

    Prints a scikit-learn classification report; returns nothing.
    """
    model.eval()
    # Pin the label set so the report rows always line up with class_names,
    # even if a class never appears in the targets or the predictions.
    class_ids = list(range(len(class_names)))
    y_true, y_pred = [], []
    with torch.no_grad():
        # Encode the prompts under no_grad as well: this is inference only, so
        # no autograd graph should be built or kept alive for the text tower.
        text_tokens = tokenizer(prompts).to(device)
        text_features = model.encode_text(text_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        for images, labels in tqdm(loader, desc="Zero-shot CLIP"):
            images = images.to(device)
            image_features = model.encode_image(images)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Scaled cosine similarity between every image and every prompt.
            logits = 100. * image_features @ text_features.T
            y_pred.extend(logits.argmax(dim=1).cpu().tolist())
            y_true.extend(labels.tolist())
    print("\n=== ZERO-SHOT CLIP ===")
    print(classification_report(y_true, y_pred, labels=class_ids,
                                target_names=class_names, digits=4))

# Zero-shot baseline on the held-out split.
evaluate_clip_base(model, tokenizer, test_loader, prompts, class_names, device=device)

Zero-shot CLIP: 100%|██████████| 18/18 [00:01<00:00,  9.36it/s]


=== ZERO-SHOT CLIP ===
              precision    recall  f1-score   support

        bird     0.8056    0.1007    0.1790       288
     no_bird     0.5195    0.9756    0.6780       287

    accuracy                         0.5374       575
   macro avg     0.6625    0.5382    0.4285       575
weighted avg     0.6628    0.5374    0.4281       575






In [30]:
import torch.nn as nn  
import torch.optim as optim  

class CLIPWithHead(nn.Module):
    """Linear probe: a frozen image encoder followed by a trainable linear layer.

    Args:
        base_model: Module exposing ``encode_image(images)``. It is only ever
            run under ``torch.no_grad()``, so it receives no gradients.
        feature_dim: Size of the embedding returned by ``encode_image``.
        num_classes: Number of output logits.
    """

    def __init__(self, base_model, feature_dim, num_classes):
        super().__init__()
        self.base_model = base_model
        self.head = nn.Linear(feature_dim, num_classes)

    def train(self, mode=True):
        """Set the head's mode while always keeping the backbone in eval mode.

        The backbone is used purely as a fixed feature extractor, so any
        train-time behaviour it may have (dropout, batch-norm updates, ...)
        must stay disabled even when the wrapper is switched to training.
        """
        super().train(mode)
        self.base_model.eval()
        return self

    def forward(self, images):
        """Return class logits computed from L2-normalised image embeddings."""
        # No gradients flow into the backbone; only the head is trainable.
        with torch.no_grad():
            features = self.base_model.encode_image(images)
            features = features / features.norm(dim=-1, keepdim=True)
        return self.head(features)

# Wrap the loaded CLIP model with a two-class linear head and move it to the device.
probe = CLIPWithHead(base_model=model, feature_dim=FEATURE_DIM, num_classes=2)
probe = probe.to(device)

def train_linear_head(probe, train_loader, num_epochs=5, lr=1e-3, device='cuda'):
    """Fit the probe's linear head with Adam and cross-entropy loss.

    Args:
        probe: Module with a ``head`` submodule; only ``probe.head`` parameters
            are handed to the optimizer.
        train_loader: Iterable of ``(images, labels)`` batches supporting ``len``.
        num_epochs: Number of full passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device each batch is moved to.

    Prints the mean batch loss after every epoch and leaves ``probe`` in
    eval mode when finished. Returns nothing.
    """
    loss_fn = nn.CrossEntropyLoss()
    head_optimizer = optim.Adam(probe.head.parameters(), lr=lr)
    probe.train()
    for epoch_no in range(1, num_epochs + 1):
        epoch_loss = 0.
        progress = tqdm(train_loader, desc=f'Fine-tuning (Epoch {epoch_no})')
        for batch_images, batch_labels in progress:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            head_optimizer.zero_grad()
            batch_loss = loss_fn(probe(batch_images), batch_labels)
            batch_loss.backward()
            head_optimizer.step()
            epoch_loss += batch_loss.item()
        # Average over the number of batches, not the number of samples.
        print(f'Epoch {epoch_no} loss: {epoch_loss/len(train_loader):.4f}')
    probe.eval()

# Train only the linear head: 100 epochs at a learning rate of 1e-3.
train_linear_head(probe, train_loader, device=device, lr=1e-3, num_epochs=100)

Fine-tuning (Epoch 1): 100%|██████████| 72/72 [00:07<00:00, 10.12it/s]


Epoch 1 loss: 0.6353


Fine-tuning (Epoch 2): 100%|██████████| 72/72 [00:07<00:00, 10.03it/s]


Epoch 2 loss: 0.5409


Fine-tuning (Epoch 3): 100%|██████████| 72/72 [00:07<00:00,  9.87it/s]


Epoch 3 loss: 0.4788


Fine-tuning (Epoch 4): 100%|██████████| 72/72 [00:07<00:00,  9.83it/s]


Epoch 4 loss: 0.4357


Fine-tuning (Epoch 5): 100%|██████████| 72/72 [00:07<00:00,  9.96it/s]


Epoch 5 loss: 0.4052


Fine-tuning (Epoch 6): 100%|██████████| 72/72 [00:07<00:00, 10.09it/s]


Epoch 6 loss: 0.3829


Fine-tuning (Epoch 7): 100%|██████████| 72/72 [00:07<00:00, 10.12it/s]


Epoch 7 loss: 0.3662


Fine-tuning (Epoch 8): 100%|██████████| 72/72 [00:07<00:00, 10.19it/s]


Epoch 8 loss: 0.3528


Fine-tuning (Epoch 9): 100%|██████████| 72/72 [00:06<00:00, 10.33it/s]


Epoch 9 loss: 0.3420


Fine-tuning (Epoch 10): 100%|██████████| 72/72 [00:07<00:00, 10.27it/s]


Epoch 10 loss: 0.3333


Fine-tuning (Epoch 11): 100%|██████████| 72/72 [00:07<00:00, 10.19it/s]


Epoch 11 loss: 0.3256


Fine-tuning (Epoch 12): 100%|██████████| 72/72 [00:07<00:00, 10.19it/s]


Epoch 12 loss: 0.3192


Fine-tuning (Epoch 13): 100%|██████████| 72/72 [00:07<00:00, 10.17it/s]


Epoch 13 loss: 0.3133


Fine-tuning (Epoch 14): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 14 loss: 0.3085


Fine-tuning (Epoch 15): 100%|██████████| 72/72 [00:07<00:00, 10.07it/s]


Epoch 15 loss: 0.3045


Fine-tuning (Epoch 16): 100%|██████████| 72/72 [00:07<00:00, 10.06it/s]


Epoch 16 loss: 0.3003


Fine-tuning (Epoch 17): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 17 loss: 0.2969


Fine-tuning (Epoch 18): 100%|██████████| 72/72 [00:07<00:00, 10.10it/s]


Epoch 18 loss: 0.2937


Fine-tuning (Epoch 19): 100%|██████████| 72/72 [00:07<00:00, 10.10it/s]


Epoch 19 loss: 0.2903


Fine-tuning (Epoch 20): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 20 loss: 0.2874


Fine-tuning (Epoch 21): 100%|██████████| 72/72 [00:07<00:00, 10.22it/s]


Epoch 21 loss: 0.2848


Fine-tuning (Epoch 22): 100%|██████████| 72/72 [00:07<00:00, 10.23it/s]


Epoch 22 loss: 0.2824


Fine-tuning (Epoch 23): 100%|██████████| 72/72 [00:07<00:00, 10.21it/s]


Epoch 23 loss: 0.2799


Fine-tuning (Epoch 24): 100%|██████████| 72/72 [00:07<00:00, 10.23it/s]


Epoch 24 loss: 0.2780


Fine-tuning (Epoch 25): 100%|██████████| 72/72 [00:07<00:00, 10.16it/s]


Epoch 25 loss: 0.2764


Fine-tuning (Epoch 26): 100%|██████████| 72/72 [00:07<00:00, 10.15it/s]


Epoch 26 loss: 0.2738


Fine-tuning (Epoch 27): 100%|██████████| 72/72 [00:07<00:00, 10.17it/s]


Epoch 27 loss: 0.2718


Fine-tuning (Epoch 28): 100%|██████████| 72/72 [00:07<00:00, 10.12it/s]


Epoch 28 loss: 0.2701


Fine-tuning (Epoch 29): 100%|██████████| 72/72 [00:07<00:00, 10.11it/s]


Epoch 29 loss: 0.2685


Fine-tuning (Epoch 30): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 30 loss: 0.2665


Fine-tuning (Epoch 31): 100%|██████████| 72/72 [00:07<00:00, 10.22it/s]


Epoch 31 loss: 0.2653


Fine-tuning (Epoch 32): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 32 loss: 0.2638


Fine-tuning (Epoch 33): 100%|██████████| 72/72 [00:07<00:00, 10.17it/s]


Epoch 33 loss: 0.2619


Fine-tuning (Epoch 34): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 34 loss: 0.2610


Fine-tuning (Epoch 35): 100%|██████████| 72/72 [00:07<00:00, 10.16it/s]


Epoch 35 loss: 0.2595


Fine-tuning (Epoch 36): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 36 loss: 0.2576


Fine-tuning (Epoch 37): 100%|██████████| 72/72 [00:07<00:00, 10.15it/s]


Epoch 37 loss: 0.2574


Fine-tuning (Epoch 38): 100%|██████████| 72/72 [00:07<00:00, 10.15it/s]


Epoch 38 loss: 0.2557


Fine-tuning (Epoch 39): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 39 loss: 0.2542


Fine-tuning (Epoch 40): 100%|██████████| 72/72 [00:07<00:00, 10.17it/s]


Epoch 40 loss: 0.2531


Fine-tuning (Epoch 41): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 41 loss: 0.2521


Fine-tuning (Epoch 42): 100%|██████████| 72/72 [00:07<00:00, 10.12it/s]


Epoch 42 loss: 0.2511


Fine-tuning (Epoch 43): 100%|██████████| 72/72 [00:07<00:00, 10.11it/s]


Epoch 43 loss: 0.2501


Fine-tuning (Epoch 44): 100%|██████████| 72/72 [00:07<00:00, 10.14it/s]


Epoch 44 loss: 0.2496


Fine-tuning (Epoch 45): 100%|██████████| 72/72 [00:07<00:00, 10.12it/s]


Epoch 45 loss: 0.2478


Fine-tuning (Epoch 46): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 46 loss: 0.2472


Fine-tuning (Epoch 47): 100%|██████████| 72/72 [00:07<00:00, 10.15it/s]


Epoch 47 loss: 0.2461


Fine-tuning (Epoch 48): 100%|██████████| 72/72 [00:07<00:00, 10.14it/s]


Epoch 48 loss: 0.2451


Fine-tuning (Epoch 49): 100%|██████████| 72/72 [00:07<00:00, 10.17it/s]


Epoch 49 loss: 0.2440


Fine-tuning (Epoch 50): 100%|██████████| 72/72 [00:07<00:00, 10.12it/s]


Epoch 50 loss: 0.2436


Fine-tuning (Epoch 51): 100%|██████████| 72/72 [00:07<00:00, 10.09it/s]


Epoch 51 loss: 0.2433


Fine-tuning (Epoch 52): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 52 loss: 0.2416


Fine-tuning (Epoch 53): 100%|██████████| 72/72 [00:07<00:00, 10.19it/s]


Epoch 53 loss: 0.2409


Fine-tuning (Epoch 54): 100%|██████████| 72/72 [00:07<00:00, 10.18it/s]


Epoch 54 loss: 0.2400


Fine-tuning (Epoch 55): 100%|██████████| 72/72 [00:07<00:00, 10.08it/s]


Epoch 55 loss: 0.2395


Fine-tuning (Epoch 56): 100%|██████████| 72/72 [00:07<00:00, 10.17it/s]


Epoch 56 loss: 0.2388


Fine-tuning (Epoch 57): 100%|██████████| 72/72 [00:07<00:00, 10.16it/s]


Epoch 57 loss: 0.2378


Fine-tuning (Epoch 58): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 58 loss: 0.2376


Fine-tuning (Epoch 59): 100%|██████████| 72/72 [00:07<00:00, 10.21it/s]


Epoch 59 loss: 0.2370


Fine-tuning (Epoch 60): 100%|██████████| 72/72 [00:07<00:00, 10.18it/s]


Epoch 60 loss: 0.2357


Fine-tuning (Epoch 61): 100%|██████████| 72/72 [00:07<00:00, 10.21it/s]


Epoch 61 loss: 0.2349


Fine-tuning (Epoch 62): 100%|██████████| 72/72 [00:07<00:00, 10.16it/s]


Epoch 62 loss: 0.2344


Fine-tuning (Epoch 63): 100%|██████████| 72/72 [00:07<00:00, 10.12it/s]


Epoch 63 loss: 0.2340


Fine-tuning (Epoch 64): 100%|██████████| 72/72 [00:07<00:00, 10.16it/s]


Epoch 64 loss: 0.2332


Fine-tuning (Epoch 65): 100%|██████████| 72/72 [00:07<00:00, 10.11it/s]


Epoch 65 loss: 0.2324


Fine-tuning (Epoch 66): 100%|██████████| 72/72 [00:07<00:00, 10.19it/s]


Epoch 66 loss: 0.2318


Fine-tuning (Epoch 67): 100%|██████████| 72/72 [00:07<00:00, 10.15it/s]


Epoch 67 loss: 0.2309


Fine-tuning (Epoch 68): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 68 loss: 0.2308


Fine-tuning (Epoch 69): 100%|██████████| 72/72 [00:07<00:00, 10.08it/s]


Epoch 69 loss: 0.2301


Fine-tuning (Epoch 70): 100%|██████████| 72/72 [00:07<00:00, 10.19it/s]


Epoch 70 loss: 0.2294


Fine-tuning (Epoch 71): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 71 loss: 0.2291


Fine-tuning (Epoch 72): 100%|██████████| 72/72 [00:07<00:00, 10.21it/s]


Epoch 72 loss: 0.2286


Fine-tuning (Epoch 73): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 73 loss: 0.2279


Fine-tuning (Epoch 74): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 74 loss: 0.2275


Fine-tuning (Epoch 75): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 75 loss: 0.2267


Fine-tuning (Epoch 76): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 76 loss: 0.2267


Fine-tuning (Epoch 77): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 77 loss: 0.2258


Fine-tuning (Epoch 78): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 78 loss: 0.2253


Fine-tuning (Epoch 79): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 79 loss: 0.2248


Fine-tuning (Epoch 80): 100%|██████████| 72/72 [00:07<00:00, 10.16it/s]


Epoch 80 loss: 0.2242


Fine-tuning (Epoch 81): 100%|██████████| 72/72 [00:07<00:00, 10.18it/s]


Epoch 81 loss: 0.2242


Fine-tuning (Epoch 82): 100%|██████████| 72/72 [00:07<00:00, 10.14it/s]


Epoch 82 loss: 0.2233


Fine-tuning (Epoch 83): 100%|██████████| 72/72 [00:07<00:00, 10.19it/s]


Epoch 83 loss: 0.2232


Fine-tuning (Epoch 84): 100%|██████████| 72/72 [00:07<00:00, 10.19it/s]


Epoch 84 loss: 0.2228


Fine-tuning (Epoch 85): 100%|██████████| 72/72 [00:07<00:00, 10.22it/s]


Epoch 85 loss: 0.2221


Fine-tuning (Epoch 86): 100%|██████████| 72/72 [00:07<00:00, 10.11it/s]


Epoch 86 loss: 0.2220


Fine-tuning (Epoch 87): 100%|██████████| 72/72 [00:07<00:00, 10.11it/s]


Epoch 87 loss: 0.2213


Fine-tuning (Epoch 88): 100%|██████████| 72/72 [00:07<00:00, 10.12it/s]


Epoch 88 loss: 0.2206


Fine-tuning (Epoch 89): 100%|██████████| 72/72 [00:07<00:00, 10.14it/s]


Epoch 89 loss: 0.2205


Fine-tuning (Epoch 90): 100%|██████████| 72/72 [00:07<00:00, 10.20it/s]


Epoch 90 loss: 0.2198


Fine-tuning (Epoch 91): 100%|██████████| 72/72 [00:07<00:00, 10.06it/s]


Epoch 91 loss: 0.2199


Fine-tuning (Epoch 92): 100%|██████████| 72/72 [00:07<00:00, 10.14it/s]


Epoch 92 loss: 0.2192


Fine-tuning (Epoch 93): 100%|██████████| 72/72 [00:07<00:00, 10.17it/s]


Epoch 93 loss: 0.2195


Fine-tuning (Epoch 94): 100%|██████████| 72/72 [00:07<00:00, 10.14it/s]


Epoch 94 loss: 0.2187


Fine-tuning (Epoch 95): 100%|██████████| 72/72 [00:07<00:00, 10.09it/s]


Epoch 95 loss: 0.2178


Fine-tuning (Epoch 96): 100%|██████████| 72/72 [00:07<00:00, 10.11it/s]


Epoch 96 loss: 0.2178


Fine-tuning (Epoch 97): 100%|██████████| 72/72 [00:07<00:00, 10.16it/s]


Epoch 97 loss: 0.2174


Fine-tuning (Epoch 98): 100%|██████████| 72/72 [00:07<00:00, 10.13it/s]


Epoch 98 loss: 0.2168


Fine-tuning (Epoch 99): 100%|██████████| 72/72 [00:07<00:00, 10.16it/s]


Epoch 99 loss: 0.2163


Fine-tuning (Epoch 100): 100%|██████████| 72/72 [00:07<00:00, 10.09it/s]

Epoch 100 loss: 0.2163





In [31]:
CHECKPOINT_PATH = "clip_bird_probe.pt"

# NOTE(review): probe.state_dict() includes the frozen backbone's weights as well
# as the head, which is why the uploaded file is ~605M. Saving only
# probe.head.state_dict() would be far smaller, but would change the checkpoint
# format for anyone already loading this file — confirm before switching.
torch.save(probe.state_dict(), CHECKPOINT_PATH)

# To reload: weights_only=True restricts unpickling to tensor data (and avoids
# the warning torch.load emits when the argument is omitted); map_location lets
# a checkpoint saved on one device be restored on another.
probe.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device, weights_only=True))
probe.eval()

  probe.load_state_dict(torch.load("clip_bird_probe.pt"))


CLIPWithHead(
  (base_model): CLIP(
    (visual): VisionTransformer(
      (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (patch_dropout): Identity()
      (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (transformer): Transformer(
        (resblocks): ModuleList(
          (0-11): 12 x ResidualAttentionBlock(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (ls_1): Identity()
            (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): Sequential(
              (c_fc): Linear(in_features=768, out_features=3072, bias=True)
              (gelu): GELU(approximate='none')
              (c_proj): Linear(in_features=3072, out_features=768, bias=True)
            )
            (ls_2): Identity()
          )
      

In [32]:
def evaluate_clip_finetuned(probe, loader, class_names, device='cuda'):
    """Evaluate the linear-probe model and print a classification report.

    Args:
        probe: Module mapping a batch of images to class logits.
        loader: Iterable yielding ``(images, labels)`` batches.
        class_names: Display names for the report, in label order.
        device: Device the images are moved to before the forward pass.

    Prints a scikit-learn classification report; returns nothing.
    """
    probe.eval()
    # Pin the label set so the report rows always line up with class_names,
    # even if a class never appears in the targets or the predictions.
    class_ids = list(range(len(class_names)))
    y_true, y_pred = [], []
    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Fine-tuned CLIP"):
            images = images.to(device)
            logits = probe(images)
            y_pred.extend(logits.argmax(dim=1).cpu().tolist())
            y_true.extend(labels.tolist())
    print("\n=== FINE-TUNED (Linear Head) ===")
    print(classification_report(y_true, y_pred, labels=class_ids,
                                target_names=class_names, digits=4))

# Score the trained probe on the held-out split.
evaluate_clip_finetuned(probe, test_loader, class_names, device=device)

Fine-tuned CLIP: 100%|██████████| 18/18 [00:01<00:00,  9.28it/s]


=== FINE-TUNED (Linear Head) ===
              precision    recall  f1-score   support

        bird     0.9173    0.8472    0.8809       288
     no_bird     0.8576    0.9233    0.8893       287

    accuracy                         0.8852       575
   macro avg     0.8874    0.8853    0.8851       575
weighted avg     0.8875    0.8852    0.8851       575






In [33]:
from huggingface_hub import HfApi, HfFolder, upload_folder, create_repo  

# Authenticate with the Hugging Face Hub. login() asks for an access token
# interactively (the recorded output shows the notebook widget), so the token
# itself never has to be written into the notebook.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
# Target model repository on the Hugging Face Hub.
repo_id = "ravisri/clip-bird-detector"
api = HfApi()
# exist_ok=True keeps this cell re-runnable once the repository already exists.
api.create_repo(repo_id, exist_ok=True)

RepoUrl('https://huggingface.co/ravisri/clip-bird-detector', endpoint='https://huggingface.co', repo_type='model', repo_id='ravisri/clip-bird-detector')

In [37]:
from huggingface_hub import HfApi  

checkpoint_name = "clip_bird_probe.pt"

# Push the local checkpoint to the model repository under the same filename.
api = HfApi()
api.upload_file(
    repo_id=repo_id,
    repo_type="model",
    path_or_fileobj=checkpoint_name,   # local file
    path_in_repo=checkpoint_name,      # name of the file inside the repo
)

clip_bird_probe.pt:   0%|          | 0.00/605M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ravisri/clip-bird-detector/commit/e9b164b6ca2ee4de36a879b099ca3b3b6eaf2a48', commit_message='Upload clip_bird_probe.pt with huggingface_hub', commit_description='', oid='e9b164b6ca2ee4de36a879b099ca3b3b6eaf2a48', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ravisri/clip-bird-detector', endpoint='https://huggingface.co', repo_type='model', repo_id='ravisri/clip-bird-detector'), pr_revision=None, pr_num=None)