In [12]:
import os
import clip
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from itertools import permutations
from scipy.special import kl_div
import itertools
import numpy as np
import copy
import shutil

In [13]:
train_features_path = "/home/samyakr2/multilabel/ARK/coco_train_clip_features_rn50.pt"
train_labels_path = '/home/samyakr2/multilabel/ARK/coco_train_clip_labels_rn50.pt'

val_features_path = "/home/samyakr2/multilabel/ARK/coco_val_clip_features_rn50.pt"
val_labels_path = '/home/samyakr2/multilabel/ARK/coco_val_clip_labels_rn50.pt'

train_features = torch.load(train_features_path)
train_labels = torch.load(train_labels_path)
val_features = torch.load(val_features_path)
val_labels = torch.load(val_labels_path)


In [14]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(clip.available_models())
clip_model, preprocess = clip.load('RN50', device)
clip_model = clip_model#.float()
print(preprocess)

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7f04ce8cbf60>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)


In [15]:
class adapter(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(adapter, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, input_dim, bias=False),
            nn.ReLU(inplace=True)
        )
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(input_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = 0.2*x + 0.8*out
#         out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
#         out = self.softmax(out)
        return out

In [16]:
input_size = train_features[0].size(1)  
hidden_size = 128  # Define the size of the hidden layer
num_classes = len(train_labels[0][0])  # Assuming labels_batches is a list of lists of labels

# Initialize the model
model = adapter(input_size, hidden_size, num_classes).to(device)

# # Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with learning rate 0.001

# Training loop
best_model_state_dict = None
best_loss = float('inf')
num_epochs = 15
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for features_batch, labels_batch in zip(train_features, train_labels):
        # Flatten features batch
        features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

        # Convert labels to tensor
        labels_tensor = labels_batch.type(torch.float32)
        # Forward pass
        outputs = model(features_batch.to(device))

        # Compute loss
        loss = criterion(outputs, labels_tensor.to(device))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    # Print loss for the epoch
#     if epoch %100 == 0:
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_state_dict = model.state_dict()

# Save the best model
torch.save(best_model_state_dict, "/home/samyakr2/multilabel/ARK/best_coco_adapter.pth")

Epoch [1/15], Loss: 206.66398294456303
Epoch [2/15], Loss: 156.81515271402895
Epoch [3/15], Loss: 148.81492136605084
Epoch [4/15], Loss: 143.34774073865265
Epoch [5/15], Loss: 138.84297532215714
Epoch [6/15], Loss: 134.82613654900342
Epoch [7/15], Loss: 131.0647864714265
Epoch [8/15], Loss: 127.49264791980386
Epoch [9/15], Loss: 124.0575183276087
Epoch [10/15], Loss: 120.81760278902948
Epoch [11/15], Loss: 117.67343028727919
Epoch [12/15], Loss: 114.70382125675678
Epoch [13/15], Loss: 111.80963187944144
Epoch [14/15], Loss: 109.08913774508983
Epoch [15/15], Loss: 106.53205505479127


In [17]:
import numpy as np
from sklearn.metrics import average_precision_score

# Define a function for testing the model
def test_model(model, criterion, features_batches, labels_batches, device):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_labels = []
    all_outputs = []
    with torch.no_grad():  # Disable gradient computation
        for features_batch, labels_batch in zip(features_batches, labels_batches):
            # Move batch to device
            features_batch = features_batch.to(device)
            labels_tensor = labels_batch.type(torch.float32).to(device)
            
            # Flatten features batch
            features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

            # Forward pass
            outputs = model(features_batch)

            # Compute loss
            loss = criterion(outputs, labels_tensor)

            test_loss += loss.item()

            # Convert outputs and labels to numpy arrays
            outputs_np = outputs.cpu().detach().numpy()
            labels_np = labels_tensor.cpu().detach().numpy()

            all_outputs.append(outputs_np)
            all_labels.append(labels_np)

    # Concatenate outputs and labels
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Compute average precision score
    avg_precision = average_precision_score(all_labels, all_outputs, average='micro')

    # Average test loss
    avg_test_loss = test_loss / len(features_batches)
    print(f"Test Loss: {avg_test_loss}")
    print(f"Average Precision Score: {avg_precision}")


best_model_state_dict = torch.load("/home/samyakr2/multilabel/ARK/best_coco_adapter.pth")
model.load_state_dict(best_model_state_dict)
test_model(model, criterion, val_features, val_labels, device)


Test Loss: 0.05052581588458863
Average Precision Score: 0.8124294885654584


### CLIP_2FC

In [19]:
class clip_2fc(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(clip_2fc, self).__init__()
        
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=False)
        )
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim),
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

class clip_fc(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(clip_fc, self).__init__()
        
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(input_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
#         out = self.fc1(x)
#         out = self.relu(out)
        out = x
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [21]:
input_size = train_features[0].size(1)  
hidden_size = input_size//4  # Define the size of the hidden layer
num_classes = len(train_labels[0][0])  # Assuming labels_batches is a list of lists of labels


print(input_size)
# Initialize the model
# model = clip_2fc(input_size, hidden_size, num_classes).to(device)
model = clip_fc(input_size, hidden_size, num_classes).to(device)

# # Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with learning rate 0.001

# Training loop

best_model_state_dict = None
best_loss = float('inf')
num_epochs = 20
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for features_batch, labels_batch in zip(train_features, train_labels):
        # Flatten features batch
        features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

        # Convert labels to tensor
        labels_tensor = labels_batch.type(torch.float32).clone().detach()#torch.tensor(labels_batch, dtype=torch.float32)
        # Forward pass
        outputs = model(features_batch.to(device))

        # Compute loss
        loss = criterion(outputs, labels_tensor.to(device))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    # Print loss for the epoch
#     if epoch %100 == 0:
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_state_dict = model.state_dict()

# Save the best model
# torch.save(best_model_state_dict, "/home/samyakr2/multilabel/ARK/clip+2fc/best_clip_2fc.pth")

1024
Epoch [1/20], Loss: 314.1097912527621
Epoch [2/20], Loss: 176.5797239728272
Epoch [3/20], Loss: 164.6356262564659
Epoch [4/20], Loss: 159.67549636028707
Epoch [5/20], Loss: 156.81481060571969
Epoch [6/20], Loss: 154.8943969514221
Epoch [7/20], Loss: 153.4889848306775
Epoch [8/20], Loss: 152.40156694501638
Epoch [9/20], Loss: 151.5267536882311
Epoch [10/20], Loss: 150.80239959992468
Epoch [11/20], Loss: 150.189174618572
Epoch [12/20], Loss: 149.66080917231739
Epoch [13/20], Loss: 149.19900161586702
Epoch [14/20], Loss: 148.7905497495085
Epoch [15/20], Loss: 148.42566429637372
Epoch [16/20], Loss: 148.0969048589468
Epoch [17/20], Loss: 147.7985060941428
Epoch [18/20], Loss: 147.52592206560075
Epoch [19/20], Loss: 147.2755086068064
Epoch [20/20], Loss: 147.04431140236557


In [23]:
import numpy as np
from sklearn.metrics import average_precision_score

# Define a function for testing the model
def test_model(model, criterion, features_batches, labels_batches, device):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_labels = []
    all_outputs = []
    with torch.no_grad():  # Disable gradient computation
        for features_batch, labels_batch in zip(features_batches, labels_batches):
            # Move batch to device
            features_batch = features_batch.to(device)
            labels_tensor = labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32).to(device)

            # Flatten features batch
            features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

            # Forward pass
            outputs = model(features_batch)

            # Compute loss
            loss = criterion(outputs, labels_tensor)

            test_loss += loss.item()

            # Convert outputs and labels to numpy arrays
            outputs_np = outputs.cpu().detach().numpy()
            labels_np = labels_tensor.cpu().detach().numpy()

            all_outputs.append(outputs_np)
            all_labels.append(labels_np)

    # Concatenate outputs and labels
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Compute average precision score
    avg_precision = average_precision_score(all_labels, all_outputs, average='micro')

    # Average test loss
    avg_test_loss = test_loss / len(features_batches)
    print(f"Test Loss: {avg_test_loss}")
    print(f"Average Precision Score: {avg_precision}")


# best_model_state_dict = torch.load("/home/samyakr2/multilabel/ARK/clip+2fc/best_clip_2fc.pth")
model.load_state_dict(best_model_state_dict)
test_model(model, criterion, val_features, val_labels, device)


Test Loss: 0.04260067014130437
Average Precision Score: 0.8310624944687632


### With label similarity

In [4]:
# from pycocotools.coco import COCO

# phase = 'val'
# root_dir = '/home/samyakr2/multilabel/data/coco/{}2017/'.format(phase)
# dire = '/home/samyakr2/multilabel/data/coco'
# annotation_file = '{}/annotations/instances_{}.json'.format(dire,phase+'2017') ## for val
# coco=COCO(annotation_file)


# cats = coco.loadCats(coco.getCatIds())
# nms=[cat['name'] for cat in cats]
# # print('COCO categories: \n{}\n'.format(' '.join(nms)))
# coco_label_list = []
# for nm in nms:
#     coco_label_list.append(nm)

In [5]:
# device = "cuda" if torch.cuda.is_available() else "cpu"

# labels_coco = coco_label_list
# items_coco = ["A photo of a " + item for item in labels_coco]

# text = clip.tokenize(items_coco).to(device)
# text_features = clip_model.encode_text(text)
# text_features /= text_features.norm(dim=-1, keepdim=True)
# text_features_path = '/home/samyakr2/multilabel/ARK/coco_labels_features.pt'
# torch.save(text_features, text_features_path)


text_features_path = '/home/samyakr2/multilabel/ARK/coco_labels_features.pt'
text_features = torch.load(text_features_path).to(torch.float32)

In [8]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

class clip_2fc(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(clip_2fc, self).__init__()
        
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim,bias=False)
        )
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim),
        self.relu = nn.ReLU(inplace = True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
#         out = self.sigmoid(out)
        return out


class clip_fc(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(clip_fc, self).__init__()
        
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, output_dim,bias=False)
        )
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim),
        self.relu = nn.ReLU(inplace = True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
#         out = self.relu(out)
#         out = self.fc2(out)
#         out = self.sigmoid(out)
        return out

    
input_size = train_features[0].size(1)  
hidden_size = input_size // 4 #128  # Define the size of the hidden layer
num_classes = len(train_labels[0][0])  # Assuming labels_batches is a list of lists of labels

print(input_size, hidden_size, num_classes)
# in_s = text_features.shape[1]  
# hs = 200  # Define the size of the hidden layer
# nc = 100  

# print(in_s)
# Initialize the model
# model = clip_2fc(input_size, hidden_size, num_classes).to(device)

model = clip_fc(input_size, hidden_size, num_classes).to(device)
# model_text = projector(in_s, hs, nc).to(device)

# # Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel classification

params_to_optimize = list(model.parameters())
optimizer = torch.optim.Adam(params_to_optimize, lr=0.001)  # Adam optimizer with learning rate 0.001

# # Training loop

best_loss = float('inf')
num_epochs = 500

model.load_state_dict(torch.load("/home/samyakr2/multilabel/ARK/coco_weights/best_epoch_1999.pth"))

import torch.optim.lr_scheduler as lr_scheduler
scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)  # Adjust parameters as needed


for epoch in range(num_epochs):
    epoch_loss = 0.0
    
    for features_batch, labels_batch in zip(train_features, train_labels):
        # Flatten features batch
        features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

        # Convert labels to tensor
        labels_tensor = labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32)#
        # Forward pass
        outputs = model(features_batch.to(device))
        
        similarity_text = (text_features @ text_features.T)
        
        normalized_similarity_text = similarity_text
#         normalized_similarity_text = F.normalize(similarity_text, p=2, dim=1)  # Normalize along the second dimension (rows)
#         normalized_similarity_text = torch.clamp(normalized_similarity_text, min=0, max=1)  # Clamp values to be between 0 and 1
#         print('normalized_similarity_text', normalized_similarity_text)
#         break
        outputs_reshaped = outputs.unsqueeze(1)
    
        result = torch.sum(outputs_reshaped * normalized_similarity_text.unsqueeze(0), dim =2)
        
        pred = result  #F.normalize(result, p=2, dim=1)
        pred = torch.sigmoid(pred)
#         print(pred)
#         Compute loss
        loss = criterion(pred, labels_tensor)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        epoch_loss += loss.item()
#         print("=="*50)
    scheduler.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_state_dict = model.state_dict()
    if (epoch + 1) % 100 ==0:
        torch.save(best_model_state_dict, f"/home/samyakr2/multilabel/ARK/coco_weights/best_epoch_{epoch+2000}.pth")


1024 256 80
Epoch [1/500], Loss: 156.07333743199706
Epoch [2/500], Loss: 156.06257692538202
Epoch [3/500], Loss: 156.05998257920146
Epoch [4/500], Loss: 156.05725506693125
Epoch [5/500], Loss: 156.05452283471823
Epoch [6/500], Loss: 156.05177883431315
Epoch [7/500], Loss: 156.04904186353087
Epoch [8/500], Loss: 156.0462955981493
Epoch [9/500], Loss: 156.0435591004789
Epoch [10/500], Loss: 156.0408183503896
Epoch [11/500], Loss: 156.0380785483867
Epoch [12/500], Loss: 156.03534500300884
Epoch [13/500], Loss: 156.0326125267893
Epoch [14/500], Loss: 156.02988204173744
Epoch [15/500], Loss: 156.027152268216
Epoch [16/500], Loss: 156.02443394251168
Epoch [17/500], Loss: 156.021707797423
Epoch [18/500], Loss: 156.01898860000074
Epoch [19/500], Loss: 156.01627091132104
Epoch [20/500], Loss: 156.01355604082346
Epoch [21/500], Loss: 156.01084561273456
Epoch [22/500], Loss: 156.00813775509596
Epoch [23/500], Loss: 156.00542914308608
Epoch [24/500], Loss: 156.00272031128407
Epoch [25/500], Loss: 

KeyboardInterrupt: 

In [9]:
import numpy as np
from sklearn.metrics import average_precision_score

# Define a function for testing the model
def test_model(model,text_features,criterion, features_batches, labels_batches, device):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_labels = []
    all_outputs = []
    similarity_text = (text_features @ text_features.T)
    with torch.no_grad():  # Disable gradient computation
        for features_batch, labels_batch in zip(features_batches, labels_batches):
            # Move batch to device
            features_batch = features_batch.to(device).to(torch.float32)
            labels_tensor =  labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32).to(device)

            # Flatten features batch
            features_batch = features_batch.view(features_batch.size(0), -1)

            # Forward pass
            outputs = model(features_batch)
            normalized_similarity_text = similarity_text
#             normalized_similarity_text = F.normalize(similarity_text, p=2, dim=1)
# #             normalized_similarity_text = torch.clamp(normalized_similarity_text, min=0, max=1)  # Clamp values to be between 0 and 1

# #           
            outputs_reshaped = outputs.unsqueeze(1)
            result = torch.sum(outputs_reshaped * normalized_similarity_text.unsqueeze(0), dim =2)
            pred = result
#             pred = outputs
        
            loss = criterion(torch.sigmoid(pred), labels_tensor)

            test_loss += loss.item()

            # Convert outputs and labels to numpy arrays
            outputs_np = torch.sigmoid(pred).cpu().detach().numpy()
            labels_np = labels_tensor.cpu().detach().numpy()

            all_outputs.append(outputs_np)
            all_labels.append(labels_np)

    # Concatenate outputs and labels
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Compute average precision score
    avg_precision = average_precision_score(all_labels, all_outputs, average='micro')

    # Average test loss
    avg_test_loss = test_loss / len(features_batches)
#     print(f"Test Loss: {avg_test_loss}")
    print(f"Average Precision Score: {avg_precision}")



for i in range (2099,2605,100):
    best_model_state_dict = torch.load("/home/samyakr2/multilabel/ARK/coco_weights/best_epoch_{}.pth".format(i))
    model.load_state_dict(best_model_state_dict)
    test_model(model, text_features, criterion, val_features, val_labels, device)


Average Precision Score: 0.8265330812642958
Average Precision Score: 0.8265363667419814
Average Precision Score: 0.8265363418830116


FileNotFoundError: [Errno 2] No such file or directory: '/home/samyakr2/multilabel/ARK/coco_weights/best_epoch_2399.pth'