# Multi-Label Classification with DINOv2 and Kaggle Dataset
This notebook demonstrates how to use the DINOv2 model for multi-label classification using a Kaggle dataset.

In [1]:
# Add your Kaggle API configuration code here:
# For example:
# !mkdir -p ~/.kaggle
# !echo '{"username":"your_kaggle_username","key":"your_kaggle_api_key"}' > ~/.kaggle/kaggle.json
# !chmod 600 ~/.kaggle/kaggle.json
from google.colab import files
import os
import shutil

print("Please upload your kaggle.json file.")
uploaded = files.upload()

os.makedirs("/root/.kaggle", exist_ok=True)

if "kaggle.json" in uploaded:
    shutil.move("kaggle.json", "/root/.kaggle/kaggle.json")
    os.chmod("/root/.kaggle/kaggle.json", 600)
    print("Kaggle API key configured successfully.")
else:
    raise FileNotFoundError("kaggle.json not found. Please re-upload.")

Please upload your kaggle.json file.


Saving kaggle.json to kaggle.json
Kaggle API key configured successfully.


In [6]:
!pip install transformers
!pip install kagglehub
!pip install torch torchvision scikit-learn matplotlib pandas tqdm



In [2]:
import os
import kagglehub
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from PIL import Image
import numpy as np


In [3]:
import os
import pandas as pd

# Define the path to the dataset folder
path = "/kaggle/input/multi-label-image-classification-dataset"

# Debugging: List all files in the dataset directory
print("Dataset contents:")
for root, dirs, files in os.walk(path):
    print(f"Root: {root}")
    print(f"Directories: {dirs}")
    print(f"Files: {files}")

# Correct the path to the CSV file
csv_file = os.path.join(path, "multilabel_modified", "multilabel_classification(6)-reduced_modified.csv")

# Debugging: Verify if the corrected path exists
if not os.path.exists(csv_file):
    print(f"CSV file not found: {csv_file}")
    exit()

# Load the CSV file
print(f"Loading CSV file: {csv_file}")
data = pd.read_csv(csv_file)
print("CSV file loaded successfully!")

Dataset contents:
Root: /kaggle/input/multi-label-image-classification-dataset
Directories: ['multilabel_modified']
Files: []
Root: /kaggle/input/multi-label-image-classification-dataset/multilabel_modified
Directories: ['images']
Files: ['multilabel_classification.csv', 'multilabel_classification(2).csv', 'multilabel_classification(7).csv', 'multilabel_classification(6)-reduced_modified.csv']
Root: /kaggle/input/multi-label-image-classification-dataset/multilabel_modified/images
Directories: []
Files: ['image4781.jpg', 'image5500.jpg', 'image4723.jpg', 'image7761.jpg', 'image4637.jpg', 'image3202.jpg', 'image7545.jpg', 'image1153.jpg', 'image1465.jpg', 'image688.jpg', 'image233.jpg', 'image4332.jpg', 'image3561.jpg', 'image1166.jpg', 'image1975.jpg', 'image1263.jpg', 'image5906.jpg', 'image920.jpg', 'image4998.jpg', 'image5639.jpg', 'image1629.jpg', 'image1701.jpg', 'image5275.jpg', 'image695.jpg', 'image4929.jpg', 'image1273.jpg', 'image5894.jpg', 'image748.jpg', 'image807.jpg', 'ima

In [4]:
import os
import pandas as pd

# Define the dataset path and CSV file
dataset_path = "/kaggle/input/multi-label-image-classification-dataset/multilabel_modified"
csv_file = os.path.join(dataset_path, "multilabel_classification(6)-reduced_modified.csv")

# Load the CSV file
if not os.path.exists(csv_file):
    print(f"CSV file not found: {csv_file}")
    exit()

data = pd.read_csv(csv_file)  # Load the dataset
print("Dataset loaded successfully!")

# Rename columns for easier access
data.rename(columns={
    'Image_Name': 'filename',
    ' Classes(motorcycle, truck, boat, bus, cycle, , , , , , , sitar, ektara, flutes, tabla, harmonium)': 'classes'
}, inplace=True)

# Debugging: Print the updated column names
print("Updated columns in the dataset:", data.columns)

Dataset loaded successfully!
Updated columns in the dataset: Index(['filename', 'classes', 'motorcycle', 'truck', 'boat', 'bus', 'cycle',
       'sitar', 'ektara', 'flutes', 'tabla', 'harmonium'],
      dtype='object')


In [5]:
# Rename columns for easier access
data.rename(columns={
    'Image_Name': 'filename',
    ' Classes(motorcycle, truck, boat, bus, cycle, , , , , , , sitar, ektara, flutes, tabla, harmonium)': 'classes'
}, inplace=True)

# Debugging: Print the updated column names
print("Updated columns in the dataset:", data.columns)

# Drop specified columns (sitar, ektara, etc.)
columns_to_drop = ["sitar", "ektara", "flutes", "tabla", "harmonium"]
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors="ignore")
# Define the directory containing the images
image_dir = os.path.join(dataset_path, "images")
# Extract labels and image paths
labels = data.drop(columns=["filename"]).values
image_paths = data["filename"].apply(lambda x: os.path.join(image_dir, x))

# Split data into train and test sets
train_image_paths, test_image_paths, train_labels, test_labels = train_test_split(
    image_paths, labels, test_size=0.2, random_state=42
)

Updated columns in the dataset: Index(['filename', 'classes', 'motorcycle', 'truck', 'boat', 'bus', 'cycle',
       'sitar', 'ektara', 'flutes', 'tabla', 'harmonium'],
      dtype='object')


In [6]:
# Define a custom dataset
class MultiLabelDataset(Dataset):
    def __init__(self, image_paths, labels, preprocess):
        self.image_paths = image_paths
        self.labels = torch.tensor(labels, dtype=torch.float32)
        self.preprocess = preprocess

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths.iloc[idx]
        image = Image.open(image_path).convert("RGB")
        input_tensor = self.preprocess(image)
        return input_tensor, self.labels[idx]


In [7]:
# Load DINO model from PyTorch Hub
dino_model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
dino_model.eval()

# Define preprocessing pipeline
preprocess = transforms.Compose([
    transforms.Resize((224, 224), interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

Using cache found in /root/.cache/torch/hub/facebookresearch_dinov2_main


In [8]:
# Define a simple classifier on top of DINO features
class MultiLabelClassifier(nn.Module):
    def __init__(self, dino_model, num_labels):
        super(MultiLabelClassifier, self).__init__()
        self.dino_model = dino_model
        self.classifier = nn.Linear(768, num_labels)  # 768 is the hidden size of DINOv2 ViT-S/14 model

    def forward(self, x):
        with torch.no_grad():
            # Extract features using DINO model
            features = self.dino_model(x).squeeze(0)  # Remove batch dimension
        logits = self.classifier(features)
        return logits

In [9]:
# Initialize the classifier
num_labels = train_labels.shape[1]
model = MultiLabelClassifier(dino_model, num_labels)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=1e-4)  # Train only the classifier


In [12]:
# Rename columns for easier access
data.rename(columns={
    'Image_Name': 'filename',
    ' Classes(motorcycle, truck, boat, bus, cycle, , , , , , , sitar, ektara, flutes, tabla, harmonium)': 'classes'
}, inplace=True)

# Debugging: Print the updated column names
print("Updated columns in the dataset:", data.columns)

# Drop specified columns (sitar, ektara, etc.)
columns_to_drop = ["sitar", "ektara", "flutes", "tabla", "harmonium"]
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors="ignore")

# Define the directory containing the images
image_dir = os.path.join(dataset_path, "images")

# Extract labels and image paths
# Select only the numerical label columns, excluding 'filename' and 'classes'
label_columns = data.columns.difference(["filename", "classes"])
labels = data[label_columns].values

image_paths = data["filename"].apply(lambda x: os.path.join(image_dir, x))

# Split data into train and test sets
train_image_paths, test_image_paths, train_labels, test_labels = train_test_split(
    image_paths, labels, test_size=0.2, random_state=42
)

Updated columns in the dataset: Index(['filename', 'classes', 'motorcycle', 'truck', 'boat', 'bus', 'cycle'], dtype='object')


In [13]:
# Prepare data loaders
# Ensure labels are explicitly cast to float32 before passing to the Dataset
train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)

train_dataset = MultiLabelDataset(train_image_paths, train_labels, preprocess)
test_dataset = MultiLabelDataset(test_image_paths, test_labels, preprocess)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [19]:
class MultiLabelClassifier(nn.Module):
    def __init__(self, dino_model, num_labels, hidden_size=384):  # Set default hidden_size to 384
        super(MultiLabelClassifier, self).__init__()
        self.dino_model = dino_model
        self.classifier = nn.Linear(hidden_size, num_labels)  # Use the correct hidden size for the model

    def forward(self, x):
        with torch.no_grad():
            # Extract features using DINO model
            features = self.dino_model(x).squeeze(0)  # Remove batch dimension
        logits = self.classifier(features)
        return logits

# Update the instantiation of the model with the correct hidden size
num_labels = train_labels.shape[1]
hidden_size = 384  # DINOv2 ViT-S/14 outputs feature embeddings of size 384
model = MultiLabelClassifier(dino_model, num_labels, hidden_size)

In [22]:
# Get the total number of batches in the training DataLoader
total_train_batches = len(train_loader)

# Get the total number of batches in the test DataLoader
total_test_batches = len(test_loader)

print(f"Total training batches: {total_train_batches}")
print(f"Total testing batches: {total_test_batches}")

Total training batches: 398
Total testing batches: 100


In [23]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):  # Number of epochs
    print(f"Starting epoch {epoch + 1}")
    model.train()
    total_loss = 0
    for batch_idx, (pixel_values, targets) in enumerate(train_loader):
        print(f"Processing batch {batch_idx + 1}...")
        pixel_values, targets = pixel_values.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(pixel_values)
        print(f"Output shape: {outputs.shape}")
        loss = criterion(outputs, targets)
        print(f"Loss: {loss.item()}")
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


Starting epoch 1
Processing batch 1...
Output shape: torch.Size([16, 5])
Loss: 1.0866426229476929
Processing batch 2...
Output shape: torch.Size([16, 5])
Loss: 0.9417839050292969
Processing batch 3...
Output shape: torch.Size([16, 5])
Loss: 0.876913845539093
Processing batch 4...
Output shape: torch.Size([16, 5])
Loss: 0.8064042329788208
Processing batch 5...
Output shape: torch.Size([16, 5])
Loss: 0.8762982487678528
Processing batch 6...
Output shape: torch.Size([16, 5])
Loss: 0.8010123372077942
Processing batch 7...
Output shape: torch.Size([16, 5])
Loss: 0.7650856971740723
Processing batch 8...
Output shape: torch.Size([16, 5])
Loss: 0.8631837964057922
Processing batch 9...
Output shape: torch.Size([16, 5])
Loss: 0.7699586749076843
Processing batch 10...
Output shape: torch.Size([16, 5])
Loss: 0.8407158851623535
Processing batch 11...
Output shape: torch.Size([16, 5])
Loss: 0.8551861047744751
Processing batch 12...
Output shape: torch.Size([16, 5])
Loss: 0.9258837699890137
Processin

In [24]:
# Evaluation
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for pixel_values, targets in test_loader:
        pixel_values, targets = pixel_values.to(device), targets.to(device)
        outputs = model(pixel_values)
        preds = torch.sigmoid(outputs).cpu().numpy()
        all_preds.append(preds)
        all_targets.append(targets.cpu().numpy())

# Combine predictions and targets
all_preds = np.vstack(all_preds)
all_targets = np.vstack(all_targets)

# Generate classification report
thresholded_preds = (all_preds > 0.5).astype(int)
print(classification_report(all_targets, thresholded_preds))

              precision    recall  f1-score   support

           0       0.15      0.47      0.23       319
           1       0.11      0.65      0.19       114
           2       0.14      0.48      0.21       294
           3       0.24      0.59      0.34       262
           4       0.15      0.38      0.22       219

   micro avg       0.16      0.50      0.24      1208
   macro avg       0.16      0.52      0.24      1208
weighted avg       0.16      0.50      0.25      1208
 samples avg       0.15      0.36      0.20      1208



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
