In [2]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoImageProcessor, AutoModel


# ----------------------------
# Custom Dataset for Clothes_Dataset
# ----------------------------
class CustomDataset(Dataset):
    """Image dataset laid out as one sub-directory of ``root_dir`` per class.

    Class names are the sorted sub-directory names and the integer label of a
    class is its position in that sorted list. Only files directly inside a
    class directory are collected (no recursion), and only those whose name
    ends with one of ``IMAGE_EXTENSIONS`` (case-insensitive).
    """

    # Lower-case file-name suffixes that are treated as images.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    def __init__(self, root_dir, transform=None):
        """
        Args:
            root_dir (str): Directory with all class folders.
            transform (callable, optional): Transformations to be applied to an image.
        """
        self.root_dir = root_dir
        self.transform = transform

        # Sorted so that the class -> index mapping is stable between runs.
        self.classes = sorted(
            d for d in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, d))
        )
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}

        # (image path, label) pairs. os.listdir() returns entries in arbitrary
        # order, so file names are sorted as well: the sample order (and hence
        # any seeded split of this dataset) is then reproducible.
        self.samples = []
        for cls_name in self.classes:
            cls_folder = os.path.join(root_dir, cls_name)
            label = self.class_to_idx[cls_name]
            for fname in sorted(os.listdir(cls_folder)):
                if not fname.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                path = os.path.join(cls_folder, fname)
                # Skip directories that merely look like image files.
                if os.path.isfile(path):
                    self.samples.append((path, label))

    def __len__(self):
        """Return the number of collected (image, label) samples."""
        return len(self.samples)

    def __getitem__(self, index):
        """Load the image at ``index`` as RGB and return ``(image, label)``.

        The image is passed through ``transform`` when one was supplied.
        """
        img_path, label = self.samples[index]
        # The context manager closes the underlying file deterministically;
        # convert() returns an independent, fully loaded image.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label

# ----------------------------
# Evaluation Function: Compute Macro F1 Score
# ----------------------------
def evaluate_model(model, dataloader, device):
    """Return the macro-averaged F1 score of ``model`` over ``dataloader``.

    Args:
        model: Callable module; it is switched to eval mode and left there.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The value of ``f1_score(..., average='macro')`` computed over every
        batch of the dataloader.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            scores = model(batch_inputs.to(device))
            # Predicted class = index of the largest score along dim 1.
            batch_preds = torch.max(scores, 1).indices
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(batch_labels.cpu().numpy())
    return f1_score(expected, predicted, average='macro')

# ----------------------------
# Evaluation Function: Compute Average Loss
# ----------------------------
def evaluate_loss(model, dataloader, criterion, device):
    """Return the mean of the per-batch losses of ``model`` over ``dataloader``.

    Args:
        model: Callable module; it is switched to eval mode and left there.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            tensor.
        device: Device both inputs and labels are moved to.

    Returns:
        float: Sum of the batch losses divided by the number of batches, or
        ``0.0`` when the dataloader yields no batches.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            predictions = model(batch_inputs.to(device))
            batch_loss = criterion(predictions, batch_labels.to(device))
            batch_losses.append(batch_loss.item())
    # Guard against an empty dataloader instead of dividing by zero.
    if not batch_losses:
        return 0.0
    return sum(batch_losses) / len(batch_losses)

# ----------------------------
# Custom Model: DINO-v2 (Hugging Face) + Classification Layer
# ----------------------------
class DinoClassifier(nn.Module):
    """Frozen pre-trained DINO-v2 backbone followed by a linear classification head.

    Only ``self.classifier`` has trainable parameters; every backbone
    parameter has ``requires_grad`` set to ``False``.
    """

    def __init__(self, num_classes, hidden_dim=768):
        """
        Args:
            num_classes (int): Number of target classes.
            hidden_dim (int): Hidden dimension (should match DINO-v2 output, typically 768 for dinov2-base).
        """
        super().__init__()
        # Load the pre-trained DINO-v2 model from Hugging Face.
        self.feature_extractor = AutoModel.from_pretrained('facebook/dinov2-base')
        # Force the model to return a tuple rather than a dict.
        self.feature_extractor.config.return_dict = False
        # Freeze the DINO-v2 parameters.
        for param in self.feature_extractor.parameters():
            param.requires_grad = False
        # A frozen backbone should behave deterministically from the start.
        self.feature_extractor.eval()

        # Classification head: takes the [CLS] token output and produces logits.
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def train(self, mode=True):
        """Set train/eval mode while always keeping the frozen backbone in eval mode.

        ``nn.Module.train`` recurses into every sub-module, which would
        otherwise switch the frozen feature extractor into training mode and
        enable any stochastic layers it contains.

        Args:
            mode (bool): ``True`` for training mode, ``False`` for eval mode.

        Returns:
            DinoClassifier: ``self``, as ``nn.Module.train`` does.
        """
        super().train(mode)
        self.feature_extractor.eval()
        return self

    def forward(self, x):
        """Compute class logits for a batch of pre-processed images.

        Args:
            x (torch.Tensor): Pixel values, expected to be of shape (B, 3, H, W).

        Returns:
            torch.Tensor: Output of the linear head applied to the first token.
        """
        # Use the DINO-v2 feature extractor with no gradient updates.
        with torch.no_grad():
            # Note: the Hugging Face model expects the keyword 'pixel_values'
            features = self.feature_extractor(pixel_values=x)[0]
        # features: (B, seq_length, hidden_dim). For ViT-like models, the first token is the [CLS] token.
        cls_token = features[:, 0, :]  # Use the first token for classification.
        return self.classifier(cls_token)

# ----------------------------
# Main Training and Evaluation Loop
# ----------------------------
def main(
    root_dir='../../heterogeneous_data/lab1/archive/Clothes_Dataset',
    batch_size=16,
    num_epochs=3,
    lr=0.001,
    train_fraction=0.8,
    seed=None,
):
    """Train the linear head of ``DinoClassifier`` and report validation metrics.

    Prints the dataset summary, the macro F1 score on the validation split
    before training, the training/validation loss after every epoch, and the
    macro F1 score after training.

    Args:
        root_dir (str): Directory containing one sub-folder per class.
        batch_size (int): Batch size of both the training and validation loader.
        num_epochs (int): Number of passes over the training split.
        lr (float): Learning rate of the Adam optimizer.
        train_fraction (float): Fraction of the dataset used for training; the
            remainder is the validation split.
        seed (int, optional): When given, seeds the train/validation split so
            it is reproducible. It does not seed the training-loader shuffling
            or the classifier initialisation. ``None`` uses the global RNG.

    Raises:
        ValueError: If the resulting training split is empty.
    """
    # Use the Hugging Face image processor to handle resizing, cropping, and normalization.
    processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')

    def transform(image):
        # Apply the processor and drop the batch dimension it adds.
        return processor(image, return_tensors="pt")["pixel_values"].squeeze(0)

    # Create the dataset from the root folder.
    dataset = CustomDataset(root_dir=root_dir, transform=transform)
    num_classes = len(dataset.classes)
    print(f"Found {len(dataset)} images belonging to {num_classes} classes: {dataset.classes}")

    # Split dataset into training and validation sets.
    train_size = int(train_fraction * len(dataset))
    val_size = len(dataset) - train_size
    if train_size == 0:
        # Fail early with a clear message rather than inside the sampler or
        # on the division by len(train_loader) below.
        raise ValueError(
            f"Training split is empty ({len(dataset)} images found under {root_dir!r})"
        )
    split_sizes = [train_size, val_size]
    if seed is None:
        train_dataset, val_dataset = random_split(dataset, split_sizes)
    else:
        split_rng = torch.Generator().manual_seed(seed)
        train_dataset, val_dataset = random_split(dataset, split_sizes, generator=split_rng)

    # Create DataLoaders for training and validation.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Device configuration.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Instantiate our custom model.
    model = DinoClassifier(num_classes=num_classes, hidden_dim=768)
    model = model.to(device)

    # Define loss function and optimizer. Only parameters that require
    # gradients are handed to the optimizer, so the frozen backbone is
    # excluded explicitly.
    criterion = nn.CrossEntropyLoss()
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=lr)

    # ----- Evaluate initial model (without fine-tuning) on validation set -----
    initial_f1 = evaluate_model(model, val_loader, device)
    print(f"Initial F1 score on validation set: {initial_f1:.4f}")

    # ----- Fine-tuning -----
    for epoch in range(num_epochs):
        # The evaluation helpers leave the model in eval mode, so training
        # mode has to be re-enabled at the start of every epoch.
        model.train()
        total_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)

        # Compute validation loss at the end of the epoch.
        val_loss = evaluate_loss(model, val_loader, criterion, device)
        print(f"Epoch {epoch+1}/{num_epochs}: Training Loss = {avg_loss:.4f}, Validation Loss = {val_loss:.4f}")

    # ----- Evaluate the model after fine-tuning -----
    final_f1 = evaluate_model(model, val_loader, device)
    print(f"F1 score on validation set after fine-tuning: {final_f1:.4f}")

if __name__ == '__main__':
    try:
        main()
    finally:
        # Release cached CUDA memory even when main() raises; in a long-lived
        # notebook kernel the cache would otherwise stay allocated.
        torch.cuda.empty_cache()

Found 7500 images belonging to 15 classes: ['Blazer', 'Celana_Panjang', 'Celana_Pendek', 'Gaun', 'Hoodie', 'Jaket', 'Jaket_Denim', 'Jaket_Olahraga', 'Jeans', 'Kaos', 'Kemeja', 'Mantel', 'Polo', 'Rok', 'Sweter']
Initial F1 score on validation set: 0.0404
Epoch 1/3: Training Loss = 0.7578, Validation Loss = 0.5824
Epoch 2/3: Training Loss = 0.5108, Validation Loss = 0.5368
Epoch 3/3: Training Loss = 0.4289, Validation Loss = 0.5878
F1 score on validation set after fine-tuning: 0.7963
