## Housekeeping

In [1]:
!conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
#!pip install pytorch torchvision cudatoolkit
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install focal_loss_torch
!pip install kaggle

Collecting pytorch
  Using cached pytorch-1.0.2.tar.gz (689 bytes)
  Preparing metadata (setup.py) ... [?25ldone
[31mERROR: Could not find a version that satisfies the requirement cudatoolkit (from versions: none)[0m
[31mERROR: No matching distribution found for cudatoolkit[0m
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/y2/_3pq15pd08556v211gq9jslr0000gn/T/pip-req-build-2d4u13x7
  Running command git clone --filter=blob:none -q https://github.com/openai/CLIP.git /private/var/folders/y2/_3pq15pd08556v211gq9jslr0000gn/T/pip-req-build-2d4u13x7
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25ldone


In [2]:
import os
import re
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from PIL import Image
from torch.utils.data import Dataset, DataLoader


import clip
import torch
from torch import nn
from torch.optim import Adam
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.nn.functional as F
from torch.nn import BCELoss
from sklearn.metrics import f1_score

## Data Prep

In [3]:
DIR = '/Users/sugardady/Documents/USYD/2024S1/COMP5329/A2_due_17_may/COMP5329S1A2Dataset'
TRAIN_FILENAME = os.path.join(DIR, "train.csv")
TEST_FILENAME = os.path.join(DIR, "test.csv")
IMAGES_DIR = os.path.join(DIR, "data")
with open(TRAIN_FILENAME) as file:
    lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
    data_df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")

with open(TEST_FILENAME) as file:
    lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
    test_df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")


# Read and preprocess data
data_df['image_path'] = data_df['ImageID'].apply(lambda x: os.path.join(IMAGES_DIR, x))
test_df['image_path'] = test_df['ImageID'].apply(lambda x: os.path.join(IMAGES_DIR, x))
# Split data into training and validation sets
train_df, val_df = train_test_split(data_df, test_size=0.2, random_state=5329)
train_captions = train_df['Caption'].values
val_captions = val_df['Caption'].values
train_df.head()

Unnamed: 0,ImageID,Labels,Caption,image_path
5444,5444.jpg,7,A yellow train traveling down train tracks in ...,/Users/sugardady/Documents/USYD/2024S1/COMP532...
16727,16727.jpg,1 3 4,A man in yellow vest riding motorcycle in street.,/Users/sugardady/Documents/USYD/2024S1/COMP532...
26146,26146.jpg,1,A woman with nice breast holding an upside dow...,/Users/sugardady/Documents/USYD/2024S1/COMP532...
2183,2183.jpg,1,A woman is skiing down a snowy slope with moun...,/Users/sugardady/Documents/USYD/2024S1/COMP532...
29199,29199.jpg,11,A fire hydrant on someone's lawn in the grass ...,/Users/sugardady/Documents/USYD/2024S1/COMP532...


In [4]:
# Set the device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model
model, preprocess = clip.load("ViT-B/32", device=device)


In [5]:
def preprocess_images(image_paths):
    images = []
    for path in image_paths:
        image = Image.open(path).convert("RGB")  # Ensure images are in RGB
        image = preprocess(image)  # Apply CLIP's preprocessing
        images.append(image.unsqueeze(0))  # Add batch dimension
    return torch.cat(images).to(device)  # Concatenate all images into a single tensor

def tokenize_captions(captions):
    return clip.tokenize(captions).to(device)


In [6]:

# Extract image paths and captions from the DataFrame
image_paths = train_df['image_path'].tolist()
captions = train_df['Caption'].tolist()

# Preprocess images and tokenize captions
processed_images = preprocess_images(image_paths)
tokenized_captions = tokenize_captions(captions)


In [7]:
# Assume 'data_df' is your DataFrame and it has a 'Labels' column where labels are space-separated
train_df['Labels'] = train_df['Labels'].apply(lambda x: x.split())
val_df['Labels'] = val_df['Labels'].apply(lambda x: x.split())
# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the labels to one-hot encoding
train_labels_one_hot = mlb.fit_transform(train_df['Labels'])
val_labels_one_hot = mlb.fit_transform(val_df['Labels'])
# mlb.classes_ will give you the array of all labels that the binarizer has seen
print("Labels:", mlb.classes_)

# Example output
print(val_labels_one_hot)



Labels: ['1' '10' '11' '13' '14' '15' '16' '17' '18' '19' '2' '3' '4' '5' '6' '7'
 '8' '9']
[[1 0 0 ... 0 0 0]
 [1 1 0 ... 0 1 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]


In [6]:
class CustomDataset(Dataset):
    def __init__(self, image_paths, captions, labels, transform, tokenize):
        self.image_paths = image_paths
        self.captions = captions
        self.labels = labels
        self.transform = transform
        self.tokenize = tokenize

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Image processing
        image = Image.open(self.image_paths[idx]).convert('RGB')
        image = self.transform(image)

        # Caption processing
        caption = self.tokenize([self.captions[idx]])

        # Labels
        label = torch.tensor(self.labels[idx], dtype=torch.float32)

        return image, caption.squeeze(0), label
# Note: These should be functions that operate on a single item and return a tensor
transform = preprocess
tokenize = clip.tokenize

# Create dataset and dataloader
train_ds = CustomDataset(
    image_paths=train_df['image_path'].tolist(),
    captions=train_df['Caption'].tolist(),
    labels=train_labels_one_hot,
    transform=transform,
    tokenize=tokenize
)
train_dataloader = DataLoader(train_ds, batch_size=64, shuffle=True)

val_ds = CustomDataset(

    image_paths=val_df['image_path'].tolist(),
    captions=val_df['Caption'].tolist(),
    labels=val_labels_one_hot,
    transform=transform,
    tokenize=tokenize
)
val_dataloader = DataLoader(val_ds, batch_size=64, shuffle=True)

NameError: name 'train_labels_one_hot' is not defined

In [9]:
# Quick check to see if everything is loading correctly
for images, captions, labels in train_dataloader:
    print('Images:', images.shape)  # Expect: [batch_size, C, H, W]
    print('Captions:', captions.shape)  # Expect: [batch_size, L]
    print('Labels:', labels.shape)  # Expect: [batch_size, num_labels]
    break


Images: torch.Size([64, 3, 224, 224])
Captions: torch.Size([64, 77])
Labels: torch.Size([64, 18])


## Model Definition

In [7]:
class MultiLabelCLIPClassifier(nn.Module):
    def __init__(self, clip_model, num_labels):
        super(MultiLabelCLIPClassifier, self).__init__()
        self.clip_model = clip_model
        self.fc = nn.Linear(512, num_labels)  # 512 is an example feature size, adjust based on CLIP's output
        self.sigmoid = nn.Sigmoid()
        #self.float()


    def forward(self, images, captions):
        with torch.no_grad():
            image_features = self.clip_model.encode_image(images)
            text_features = self.clip_model.encode_text(captions)

        # Combining features: simple addition or concatenation can be used depending on the task
        features = (image_features + text_features) / 2
        output = self.fc(features)
        output = self.sigmoid(output)  # Sigmoid activation for multi-label classification
        return output

# Initialize the model
num_class = 18
clip_model, _ = clip.load("ViT-B/32", device=device)
model = MultiLabelCLIPClassifier(clip_model, num_class).to(device)  # 18 is the number of labels


In [8]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss


## Model Training

In [13]:
num_epochs = 30  # Set the number of epochs

### Model with Multi Label Focal Loss

In [14]:
class FocalLossMultiLabel(torch.nn.Module):
    def __init__(self, gamma=2.0, alpha=0.25):
        super(FocalLossMultiLabel, self).__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, logits, targets):
        bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        probs = torch.sigmoid(logits)
        p_t = targets * probs + (1 - targets) * (1 - probs)
        fl_loss = self.alpha * (1 - p_t) ** self.gamma * bce_loss
        return fl_loss.mean()

# Define the criterion with Focal Loss
criterion = FocalLossMultiLabel(gamma=0.7)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)
#scheduler = CosineAnnealingLR(optimizer, T_max=100)  # Adjust T_max based on your training epochs

# Initialize early stopping object
early_stopping = EarlyStopping(patience=5, verbose=True, path='best_model_focal_adamw.pt')
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, captions, labels in train_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)

        # Forward pass
        outputs = model(images, captions)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Learning rate scheduling step
    #scheduler.step()

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():  # Use torch.no_grad() during validation
        for images, captions, labels in val_dataloader:
            images, captions, labels = images.to(device), captions.to(device), labels.to(device)
            outputs = model(images, captions)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    # Calculate average losses
    train_loss = total_loss / len(train_dataloader)
    val_loss /= len(val_dataloader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Early stopping and model checkpointing
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break


Epoch [1/30], Train Loss: 0.1411, Val Loss: 0.1252
Validation loss decreased (inf --> 0.125155).  Saving model ...
Epoch [2/30], Train Loss: 0.1184, Val Loss: 0.1138
Validation loss decreased (0.125155 --> 0.113775).  Saving model ...
Epoch [3/30], Train Loss: 0.1115, Val Loss: 0.1096
Validation loss decreased (0.113775 --> 0.109649).  Saving model ...
Epoch [4/30], Train Loss: 0.1086, Val Loss: 0.1077
Validation loss decreased (0.109649 --> 0.107692).  Saving model ...
Epoch [5/30], Train Loss: 0.1071, Val Loss: 0.1066
Validation loss decreased (0.107692 --> 0.106596).  Saving model ...
Epoch [6/30], Train Loss: 0.1062, Val Loss: 0.1059
Validation loss decreased (0.106596 --> 0.105910).  Saving model ...
Epoch [7/30], Train Loss: 0.1057, Val Loss: 0.1055
Validation loss decreased (0.105910 --> 0.105452).  Saving model ...
Epoch [8/30], Train Loss: 0.1053, Val Loss: 0.1051
Validation loss decreased (0.105452 --> 0.105128).  Saving model ...
Epoch [9/30], Train Loss: 0.1050, Val Loss: 0

#### Evaluation

In [15]:
model.load_state_dict(torch.load('best_model_focal_adamw.pt'))
model.eval()
# Proceed with evaluation or application of the model
with torch.no_grad():
    correct_predictions = 0
    total_predictions = 0
    for images, captions, labels in val_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)
        outputs = model(images, captions)
        predicted = outputs > 0.5  # Thresholding at 0.5
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.numel()

    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy: {accuracy:.4f}')
    all_labels = []
all_predictions = []

with torch.no_grad():
    for images, captions, labels in val_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)
        outputs = model(images, captions)

        # Apply a threshold to obtain binary predictions
        predicted = (outputs > 0.5).type(torch.int)

        # Store predictions and labels to calculate F1 score later
        all_labels.append(labels.cpu().numpy())
        all_predictions.append(predicted.cpu().numpy())

# Convert lists to numpy arrays for compatibility with scikit-learn
all_labels = np.vstack(all_labels)
all_predictions = np.vstack(all_predictions)

# Calculate F1 scores
f1_macro = f1_score(all_labels, all_predictions, average='macro')
f1_micro = f1_score(all_labels, all_predictions, average='micro')

print(f'Validation F1 Macro: {f1_macro:.4f}')
print(f'Validation F1 Micro: {f1_micro:.4f}')


Validation Accuracy: 0.9606
Validation F1 Macro: 0.2854
Validation F1 Micro: 0.7028


### Model Training with Focal Loss and Adam Optimizer

In [16]:
# Define the criterion with Focal Loss
criterion = FocalLossMultiLabel(gamma=0.7)
optimizer = Adam(model.parameters(), lr=1e-4)
#scheduler = CosineAnnealingLR(optimizer, T_max=100)  # Adjust T_max based on your training epochs
# Initialize early stopping object
early_stopping = EarlyStopping(patience=5, verbose=True, path='best_model_focal_adam.pt')
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, captions, labels in train_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)

        # Forward pass
        outputs = model(images, captions)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Learning rate scheduling step
    #scheduler.step()

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():  # Use torch.no_grad() during validation
        for images, captions, labels in val_dataloader:
            images, captions, labels = images.to(device), captions.to(device), labels.to(device)
            outputs = model(images, captions)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    # Calculate average losses
    train_loss = total_loss / len(train_dataloader)
    val_loss /= len(val_dataloader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Early stopping and model checkpointing
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break


Epoch [1/30], Train Loss: 0.1034, Val Loss: 0.1033
Validation loss decreased (inf --> 0.103344).  Saving model ...
Epoch [2/30], Train Loss: 0.1033, Val Loss: 0.1033
Validation loss decreased (0.103344 --> 0.103333).  Saving model ...
Epoch [3/30], Train Loss: 0.1033, Val Loss: 0.1033
Validation loss decreased (0.103333 --> 0.103315).  Saving model ...
Epoch [4/30], Train Loss: 0.1033, Val Loss: 0.1033
Validation loss decreased (0.103315 --> 0.103295).  Saving model ...
Epoch [5/30], Train Loss: 0.1033, Val Loss: 0.1033
Validation loss decreased (0.103295 --> 0.103273).  Saving model ...
Epoch [6/30], Train Loss: 0.1032, Val Loss: 0.1032
Validation loss decreased (0.103273 --> 0.103181).  Saving model ...
Epoch [7/30], Train Loss: 0.1031, Val Loss: 0.1031
Validation loss decreased (0.103181 --> 0.103141).  Saving model ...
Epoch [8/30], Train Loss: 0.1031, Val Loss: 0.1031
Validation loss decreased (0.103141 --> 0.103125).  Saving model ...
Epoch [9/30], Train Loss: 0.1031, Val Loss: 0

#### Evaluatuin

In [17]:
model.load_state_dict(torch.load('best_model_focal_adam.pt'))
model.eval()
# Proceed with evaluation or application of the model
with torch.no_grad():
    correct_predictions = 0
    total_predictions = 0
    for images, captions, labels in val_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)
        outputs = model(images, captions)
        predicted = outputs > 0.5  # Thresholding at 0.5
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.numel()

    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy: {accuracy:.4f}')
    all_labels = []
all_predictions = []

with torch.no_grad():
    for images, captions, labels in val_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)
        outputs = model(images, captions)

        # Apply a threshold to obtain binary predictions
        predicted = (outputs > 0.5).type(torch.int)

        # Store predictions and labels to calculate F1 score later
        all_labels.append(labels.cpu().numpy())
        all_predictions.append(predicted.cpu().numpy())

# Convert lists to numpy arrays for compatibility with scikit-learn
all_labels = np.vstack(all_labels)
all_predictions = np.vstack(all_predictions)

# Calculate F1 scores
f1_macro = f1_score(all_labels, all_predictions, average='macro')
f1_micro = f1_score(all_labels, all_predictions, average='micro')

print(f'Validation F1 Macro: {f1_macro:.4f}')
print(f'Validation F1 Micro: {f1_micro:.4f}')


Validation Accuracy: 0.9671
Validation F1 Macro: 0.4817
Validation F1 Micro: 0.7651


### Model Training with Weighted BCE Loss

Weighted BCE loss allows you to assign different weights to positive and negative samples, helping to address the class imbalance. You can calculate weights based on the frequency of each class in your dataset.


In [18]:
# Set up the loss function and optimizer
criterion = BCELoss() #/focal loss
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)
#scheduler = CosineAnnealingLR(optimizer, T_max=100)  # Adjust T_max based on your training epochs

# Initialize early stopping object
early_stopping = EarlyStopping(patience=5, verbose=True, path='best_model_bce_adamw.pt')
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, captions, labels in train_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)

        # Forward pass
        outputs = model(images, captions)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Learning rate scheduling step
    #scheduler.step()

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():  # Use torch.no_grad() during validation
        for images, captions, labels in val_dataloader:
            images, captions, labels = images.to(device), captions.to(device), labels.to(device)
            outputs = model(images, captions)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    # Calculate average losses
    train_loss = total_loss / len(train_dataloader)
    val_loss /= len(val_dataloader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Early stopping and model checkpointing
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break



Epoch [1/30], Train Loss: 0.3241, Val Loss: 0.2945
Validation loss decreased (inf --> 0.294454).  Saving model ...
Epoch [2/30], Train Loss: 0.2873, Val Loss: 0.2621
Validation loss decreased (0.294454 --> 0.262103).  Saving model ...
Epoch [3/30], Train Loss: 0.2550, Val Loss: 0.2321
Validation loss decreased (0.262103 --> 0.232103).  Saving model ...
Epoch [4/30], Train Loss: 0.2246, Val Loss: 0.2040
Validation loss decreased (0.232103 --> 0.203976).  Saving model ...
Epoch [5/30], Train Loss: 0.1951, Val Loss: 0.1759
Validation loss decreased (0.203976 --> 0.175861).  Saving model ...
Epoch [6/30], Train Loss: 0.1663, Val Loss: 0.1490
Validation loss decreased (0.175861 --> 0.148990).  Saving model ...
Epoch [7/30], Train Loss: 0.1392, Val Loss: 0.1244
Validation loss decreased (0.148990 --> 0.124385).  Saving model ...
Epoch [8/30], Train Loss: 0.1158, Val Loss: 0.1049
Validation loss decreased (0.124385 --> 0.104867).  Saving model ...
Epoch [9/30], Train Loss: 0.0985, Val Loss: 0

#### Evaluation

In [19]:
model.load_state_dict(torch.load('best_model_bce_adamw.pt'))
model.eval()
# Proceed with evaluation or application of the model
with torch.no_grad():
    correct_predictions = 0
    total_predictions = 0
    for images, captions, labels in val_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)
        outputs = model(images, captions)
        predicted = outputs > 0.5  # Thresholding at 0.5
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.numel()

    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy: {accuracy:.4f}')
    all_labels = []
all_predictions = []

with torch.no_grad():
    for images, captions, labels in val_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)
        outputs = model(images, captions)

        # Apply a threshold to obtain binary predictions
        predicted = (outputs > 0.5).type(torch.int)

        # Store predictions and labels to calculate F1 score later
        all_labels.append(labels.cpu().numpy())
        all_predictions.append(predicted.cpu().numpy())

# Convert lists to numpy arrays for compatibility with scikit-learn
all_labels = np.vstack(all_labels)
all_predictions = np.vstack(all_predictions)

# Calculate F1 scores
f1_macro = f1_score(all_labels, all_predictions, average='macro')
f1_micro = f1_score(all_labels, all_predictions, average='micro')

print(f'Validation F1 Macro: {f1_macro:.4f}')
print(f'Validation F1 Micro: {f1_micro:.4f}')


Validation Accuracy: 0.9762
Validation F1 Macro: 0.7588
Validation F1 Micro: 0.8489


### Model Training with BCE loss and Adam Optimizer

In [21]:
# Set up the loss function and optimizer
criterion = BCELoss() #/focal loss
optimizer = Adam(model.parameters(), lr=1e-4)
#scheduler = CosineAnnealingLR(optimizer, T_max=100)  # Adjust T_max based on your training epochs

# Initialize early stopping object
early_stopping = EarlyStopping(patience=5, verbose=True, path='best_model_bce_adam.pt')
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, captions, labels in train_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)

        # Forward pass
        outputs = model(images, captions)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Learning rate scheduling step
    #scheduler.step()

    # Validation phase
    model.eval()
    val_loss = 0
    with torch.no_grad():  # Use torch.no_grad() during validation
        for images, captions, labels in val_dataloader:
            images, captions, labels = images.to(device), captions.to(device), labels.to(device)
            outputs = model(images, captions)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    # Calculate average losses
    train_loss = total_loss / len(train_dataloader)
    val_loss /= len(val_dataloader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    # Early stopping and model checkpointing
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break



Epoch [1/30], Train Loss: 0.0699, Val Loss: 0.0723
Validation loss decreased (inf --> 0.072288).  Saving model ...
Epoch [2/30], Train Loss: 0.0698, Val Loss: 0.0723
Validation loss decreased (0.072288 --> 0.072251).  Saving model ...
Epoch [3/30], Train Loss: 0.0696, Val Loss: 0.0721
Validation loss decreased (0.072251 --> 0.072112).  Saving model ...
Epoch [4/30], Train Loss: 0.0695, Val Loss: 0.0720
Validation loss decreased (0.072112 --> 0.072047).  Saving model ...
Epoch [5/30], Train Loss: 0.0693, Val Loss: 0.0720
Validation loss decreased (0.072047 --> 0.072000).  Saving model ...
Epoch [6/30], Train Loss: 0.0692, Val Loss: 0.0719
Validation loss decreased (0.072000 --> 0.071851).  Saving model ...
Epoch [7/30], Train Loss: 0.0690, Val Loss: 0.0718
Validation loss decreased (0.071851 --> 0.071791).  Saving model ...
Epoch [8/30], Train Loss: 0.0689, Val Loss: 0.0717
Validation loss decreased (0.071791 --> 0.071712).  Saving model ...
Epoch [9/30], Train Loss: 0.0688, Val Loss: 0

#### Evaluation

In [22]:
model.load_state_dict(torch.load('best_model_bce_adam.pt'))
model.eval()
# Proceed with evaluation or application of the model
with torch.no_grad():
    correct_predictions = 0
    total_predictions = 0
    for images, captions, labels in val_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)
        outputs = model(images, captions)
        predicted = outputs > 0.5  # Thresholding at 0.5
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.numel()

    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy: {accuracy:.4f}')
    all_labels = []
all_predictions = []

with torch.no_grad():
    for images, captions, labels in val_dataloader:
        images, captions, labels = images.to(device), captions.to(device), labels.to(device)
        outputs = model(images, captions)

        # Apply a threshold to obtain binary predictions
        predicted = (outputs > 0.5).type(torch.int)

        # Store predictions and labels to calculate F1 score later
        all_labels.append(labels.cpu().numpy())
        all_predictions.append(predicted.cpu().numpy())

# Convert lists to numpy arrays for compatibility with scikit-learn
all_labels = np.vstack(all_labels)
all_predictions = np.vstack(all_predictions)

# Calculate F1 scores
f1_macro = f1_score(all_labels, all_predictions, average='macro')
f1_micro = f1_score(all_labels, all_predictions, average='micro')

print(f'Validation F1 Macro: {f1_macro:.4f}')
print(f'Validation F1 Micro: {f1_micro:.4f}')


Validation Accuracy: 0.9766
Validation F1 Macro: 0.7657
Validation F1 Micro: 0.8517


## Test

In [9]:
# Extract image paths and captions from the DataFrame
test_image_paths = test_df['image_path'].tolist()
test_captions = test_df['Caption'].tolist()

# Preprocess images and tokenize captions
processed_test_images = preprocess_images(test_image_paths)
tokenized_test_captions = tokenize_captions(test_captions)

In [10]:
# Initialize the model
clip_model, _ = clip.load("ViT-B/32", device=device)
model = MultiLabelCLIPClassifier(clip_model, 18).to(device)  # 18 is the number of labels
# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()
# Load the best model
model.load_state_dict(torch.load('best_model_focal_adamw.pt'))
model.eval()  # Set the model to evaluation mode

MultiLabelCLIPClassifier(
  (clip_model): CLIP(
    (visual): VisionTransformer(
      (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (transformer): Transformer(
        (resblocks): Sequential(
          (0): ResidualAttentionBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
            )
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): Sequential(
              (c_fc): Linear(in_features=768, out_features=3072, bias=True)
              (gelu): QuickGELU()
              (c_proj): Linear(in_features=3072, out_features=768, bias=True)
            )
            (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          )
          (1): ResidualAttentionBlock(
            (attn): MultiheadAttention(
              (out_proj): NonDyn

In [11]:
# Make predictions on the test set
with torch.no_grad():
    test_outputs = model(processed_test_images, tokenized_test_captions)

# Convert output probabilities to binary predictions using a threshold
predictions = (test_outputs > 0.5).float()

# Convert predictions to binary labels
predictions_np = predictions.cpu().numpy()

# Use the inverse transform method of MultiLabelBinarizer to get original labels
predicted_labels = mlb.inverse_transform(predictions_np)


NotFittedError: This MultiLabelBinarizer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## Submission

In [None]:
# Prepare the submission DataFrame
submission_df = pd.DataFrame({
    'ImageID': test_df['ImageID'],
    'Labels': [' '.join(map(str, labels)) for labels in predicted_labels]
})

# Save the DataFrame to a CSV file without the index
submission_df.to_csv('submission.csv', index=False)


NameError: name 'predicted_labels' is not defined

In [None]:
!kaggle competitions submit -c multi-label-classification-competition-2024 -f submission.csv -m "490236424_480004000_49038816_new"