In [54]:
import os
import clip
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from itertools import permutations
from scipy.special import kl_div
import itertools
import numpy as np
import copy
import shutil

In [55]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(clip.available_models())
clip_model, preprocess = clip.load('RN50', device)
clip_model = clip_model#.float()
print(preprocess)

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7fed4b6bd9e0>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)


In [56]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np

class VOCDataset(Dataset):
    def __init__(self, image_names, label_names, image_dir, transforms=None):
        self.image_names = image_names
        self.label_names = label_names
        self.transforms = transforms
        self.image_dir = image_dir
#         self.label_dir = label_dir

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.image_names[idx])
        label_name = self.label_names[idx]

        image = Image.open(img_name).convert("RGB")
#         label = np.load(label_name)

        if self.transforms:
            image = self.transforms(image)

        return image, label_name

# Load filenames
train_image_names = np.load('/home/samyakr2/multilabel/data/pascal/formatted_train_images.npy')
train_label_names = np.load('/home/samyakr2/multilabel/data/pascal/formatted_train_labels.npy')
val_image_names = np.load('/home/samyakr2/multilabel/data/pascal/formatted_val_images.npy')
val_label_names = np.load('/home/samyakr2/multilabel/data/pascal/formatted_val_labels.npy')

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
])

batch_size = 16

path_to_images = '/home/samyakr2/multilabel/data/pascal/VOCdevkit/VOC2012/JPEGImages'
train_dataset = VOCDataset(image_names=train_image_names, label_names=train_label_names, 
                     image_dir=path_to_images, transforms=transform)

val_dataset = VOCDataset(image_names=val_image_names, label_names=val_label_names, 
                     image_dir=path_to_images, transforms=transform)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [57]:
def get_features(dataloader):
    all_features_batches = []
    all_labels_batches = []
    for images, labels in dataloader:
        features = clip_model.encode_image(images.to(device))
        all_features_batches.append(features.detach())
        all_labels_batches.append(labels)
    return all_features_batches, all_labels_batches

# train_features, train_labels = get_features(train_dataloader)

In [58]:
# val_features, val_labels = get_features(val_dataloader)

In [59]:
train_features_path = "/home/samyakr2/multilabel/ARK/pascal_train_clip_features_res.pt"
train_labels_path = '/home/samyakr2/multilabel/ARK/pascal_train_clip_labels_res.pt'

# torch.save(train_features, train_features_path)
# torch.save(train_labels, train_labels_path)



In [60]:
val_features_path = "/home/samyakr2/multilabel/ARK/pascal_val_clip_features_res.pt"
val_labels_path = '/home/samyakr2/multilabel/ARK/pascal_val_clip_labels_res.pt'

# torch.save(val_features, val_features_path)
# torch.save(val_labels, val_labels_path)

In [61]:
train_features = torch.load(train_features_path)
train_labels = torch.load(train_labels_path)
val_features = torch.load(val_features_path)
val_labels = torch.load(val_labels_path)

### SAMPLE RUN FOR ADAPTER

In [105]:
class adapter(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(adapter, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, input_dim, bias=False),
            nn.ReLU(inplace=True)
        )
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(input_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = 0.15*x + 0.85*out
#         out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
#         out = self.softmax(out)
        return out

In [106]:
input_size = train_features[0].size(1)  
hidden_size = 128  # Define the size of the hidden layer
num_classes = len(train_labels[0][0])  # Assuming labels_batches is a list of lists of labels




# Initialize the model
model = adapter(input_size, hidden_size, num_classes).to(device)

# # Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with learning rate 0.001

# Training loop

best_model_state_dict = None
best_loss = float('inf')
num_epochs = 25
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for features_batch, labels_batch in zip(train_features, train_labels):
        # Flatten features batch
        features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

        # Convert labels to tensor
        labels_tensor = labels_batch.type(torch.float32)
        # Forward pass
        outputs = model(features_batch.to(device))

        # Compute loss
        loss = criterion(outputs, labels_tensor.to(device))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    # Print loss for the epoch
#     if epoch %100 == 0:
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_state_dict = model.state_dict()

# Save the best model
torch.save(best_model_state_dict, "/home/samyakr2/multilabel/ARK/best_adapter.pth")

Epoch [1/25], Loss: 57.22605918161571
Epoch [2/25], Loss: 23.0267473179847
Epoch [3/25], Loss: 18.775454313494265
Epoch [4/25], Loss: 16.494120688177645
Epoch [5/25], Loss: 14.700544707942754
Epoch [6/25], Loss: 13.078270009718835
Epoch [7/25], Loss: 11.537449382711202
Epoch [8/25], Loss: 10.07292242324911
Epoch [9/25], Loss: 8.634749432792887
Epoch [10/25], Loss: 7.249338253168389
Epoch [11/25], Loss: 5.9778112104395404
Epoch [12/25], Loss: 4.794350068084896
Epoch [13/25], Loss: 3.7715130201540887
Epoch [14/25], Loss: 3.035509691311745
Epoch [15/25], Loss: 2.927876503119478
Epoch [16/25], Loss: 2.777086539758784
Epoch [17/25], Loss: 2.5220454066584352
Epoch [18/25], Loss: 2.2505690500238416
Epoch [19/25], Loss: 1.8087644804022602
Epoch [20/25], Loss: 1.375393045363694
Epoch [21/25], Loss: 1.001847508492574
Epoch [22/25], Loss: 0.9790723231467382
Epoch [23/25], Loss: 0.8029761141015115
Epoch [24/25], Loss: 0.953280106841703
Epoch [25/25], Loss: 1.345641831323519


In [107]:
import numpy as np
from sklearn.metrics import average_precision_score

# Define a function for testing the model
def test_model(model, criterion, features_batches, labels_batches, device):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_labels = []
    all_outputs = []
    with torch.no_grad():  # Disable gradient computation
        for features_batch, labels_batch in zip(features_batches, labels_batches):
            # Move batch to device
            features_batch = features_batch.to(device)
            labels_tensor = labels_batch.type(torch.float32).to(device)
            
            # Flatten features batch
            features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

            # Forward pass
            outputs = model(features_batch)

            # Compute loss
            loss = criterion(outputs, labels_tensor)

            test_loss += loss.item()

            # Convert outputs and labels to numpy arrays
            outputs_np = outputs.cpu().detach().numpy()
            labels_np = labels_tensor.cpu().detach().numpy()

            all_outputs.append(outputs_np)
            all_labels.append(labels_np)

    # Concatenate outputs and labels
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Compute average precision score
    avg_precision = average_precision_score(all_labels, all_outputs, average='micro')

    # Average test loss
    avg_test_loss = test_loss / len(features_batches)
    print(f"Test Loss: {avg_test_loss}")
    print(f"Average Precision Score: {avg_precision}")


best_model_state_dict = torch.load("/home/samyakr2/multilabel/ARK/best_adapter.pth")
model.load_state_dict(best_model_state_dict)
test_model(model, criterion, val_features, val_labels, device)


Test Loss: 0.1814886390635805
Average Precision Score: 0.9066961938494863


### Sample run for CLip_2fc

In [108]:
import os
import clip
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from itertools import permutations
from scipy.special import kl_div
import itertools
import numpy as np
import copy
import shutil

import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np

In [109]:
class clip_2fc(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(clip_2fc, self).__init__()
        
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=False)
        )
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim),
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [110]:
input_size = train_features[0].size(1)  
hidden_size = 128  # Define the size of the hidden layer
num_classes = len(train_labels[0][0])  # Assuming labels_batches is a list of lists of labels


seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)


print(input_size)
# Initialize the model
model = clip_2fc(input_size, hidden_size, num_classes).to(device)

# # Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with learning rate 0.001

# Training loop

best_model_state_dict = None
best_loss = float('inf')
num_epochs = 30
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for features_batch, labels_batch in zip(train_features, train_labels):
        # Flatten features batch
        features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

        # Convert labels to tensor
        labels_tensor = labels_batch.type(torch.float32).clone().detach()#torch.tensor(labels_batch, dtype=torch.float32)
        # Forward pass
        outputs = model(features_batch.to(device))

        # Compute loss
        loss = criterion(outputs, labels_tensor.to(device))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    # Print loss for the epoch
#     if epoch %100 == 0:
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_state_dict = model.state_dict()

# Save the best model
# torch.save(best_model_state_dict, "/home/samyakr2/multilabel/ARK/clip+2fc/best_clip_2fc.pth")

1024
Epoch [1/30], Loss: 72.24327901378274
Epoch [2/30], Loss: 28.514060135930777
Epoch [3/30], Loss: 21.761897983029485
Epoch [4/30], Loss: 19.047771512530744
Epoch [5/30], Loss: 17.462923507206142
Epoch [6/30], Loss: 16.343970615416765
Epoch [7/30], Loss: 15.45307993888855
Epoch [8/30], Loss: 14.692373333964497
Epoch [9/30], Loss: 14.01062468253076
Epoch [10/30], Loss: 13.38658216316253
Epoch [11/30], Loss: 12.801371071953326
Epoch [12/30], Loss: 12.248455919325352
Epoch [13/30], Loss: 11.725259666796774
Epoch [14/30], Loss: 11.22183607891202
Epoch [15/30], Loss: 10.741930334828794
Epoch [16/30], Loss: 10.282103229314089
Epoch [17/30], Loss: 9.839452604297549
Epoch [18/30], Loss: 9.41108566429466
Epoch [19/30], Loss: 8.995342708192766
Epoch [20/30], Loss: 8.595993102528155
Epoch [21/30], Loss: 8.200912269763649
Epoch [22/30], Loss: 7.829425361007452
Epoch [23/30], Loss: 7.458271794952452
Epoch [24/30], Loss: 7.108375128125772
Epoch [25/30], Loss: 6.767341563012451
Epoch [26/30], Loss

In [111]:
import numpy as np
from sklearn.metrics import average_precision_score

# Define a function for testing the model
def test_model(model, criterion, features_batches, labels_batches, device):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_labels = []
    all_outputs = []
    with torch.no_grad():  # Disable gradient computation
        for features_batch, labels_batch in zip(features_batches, labels_batches):
            # Move batch to device
            features_batch = features_batch.to(device)
            labels_tensor = labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32).to(device)

            # Flatten features batch
            features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

            # Forward pass
            outputs = model(features_batch)

            # Compute loss
            loss = criterion(outputs, labels_tensor)

            test_loss += loss.item()

            # Convert outputs and labels to numpy arrays
            outputs_np = outputs.cpu().detach().numpy()
            labels_np = labels_tensor.cpu().detach().numpy()

            all_outputs.append(outputs_np)
            all_labels.append(labels_np)

    # Concatenate outputs and labels
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Compute average precision score
    avg_precision = average_precision_score(all_labels, all_outputs, average='micro')

    # Average test loss
    avg_test_loss = test_loss / len(features_batches)
    print(f"Test Loss: {avg_test_loss}")
    print(f"Average Precision Score: {avg_precision}")


# best_model_state_dict = torch.load("/home/samyakr2/multilabel/ARK/clip+2fc/best_clip_2fc.pth")
# model.load_state_dict(best_model_state_dict)
test_model(model, criterion, val_features, val_labels, device)


Test Loss: 0.06536759815056
Average Precision Score: 0.9310676658148249


### SAMPLE RUN WITH NEW IDEA

In [69]:
device = "cuda" if torch.cuda.is_available() else "cpu"
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

labels_pascal = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
items_pascal = ["A photo of a " + item for item in labels_pascal]

text = clip.tokenize(items_pascal).to(device)
text_features = clip_model.encode_text(text)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_features_path = '/home/samyakr2/multilabel/ARK/pascal_labels_features.pt'
torch.save(text_features, text_features_path)


text_features_path = '/home/samyakr2/multilabel/ARK/pascal_labels_features.pt'
text_features = torch.load(text_features_path).to(torch.float32)

In [85]:
class projector(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(projector, self).__init__()
        
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim,bias=False)
        )
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim),
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
#         out = self.relu(out)
        return out

# class projector_adapter(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(projector_adapter, self).__init__()
        
#         self.fc1 = nn.Sequential(
#             nn.Linear(input_dim, hidden_dim, bias=False),
#             nn.ReLU(inplace=True),
#             nn.Linear(hidden_dim, input_dim, bias=False),
#             nn.ReLU(inplace=True)
#         )
        
# #         self.fc1 = nn.Linear(input_dim, hidden_dim),
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(input_dim, hidden_dim)
#         self.fc3 = nn.Linear(hidden_dim, output_dim)
#         self.softmax = nn.Softmax(dim=1)
#         self.dropout = nn.Dropout(0.3)
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         out = self.fc1(x)
#         out = 0.15*x + 0.85*out
#         out = self.fc2(out) 
# #         out = self.relu(out)
# #         out = self.fc3(out)
#         return out




class adapter(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(adapter, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, input_dim, bias=False),
            nn.ReLU(inplace=True)
        )
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(input_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = 0.15*x + 0.85*out
#         out = self.relu(out)
        out = self.fc2(out)
#         out = self.sigmoid(out)
#         out = self.softmax(out)
        return out


class clip_2fc(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(clip_2fc, self).__init__()
        
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim,bias=False)
        )
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim),
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
#         out = self.sigmoid(out)
        return out
    
    
input_size = train_features[0].size(1)  
hidden_size = input_size // 4 #128  # Define the size of the hidden layer
num_classes = len(train_labels[0][0])  # Assuming labels_batches is a list of lists of labels

in_s = text_features.shape[1]  
hs = 200  # Define the size of the hidden layer
nc = 100  

print(in_s)
# Initialize the model
model = clip_2fc(input_size, hidden_size, num_classes).to(device)
model_text = projector(in_s, hs, nc).to(device)

# # Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel classification

params_to_optimize = list(model.parameters()) + list(model_text.parameters())
optimizer = torch.optim.Adam(params_to_optimize, lr=0.001)  # Adam optimizer with learning rate 0.001

# # Training loop

best_loss = float('inf')
num_epochs = 40

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for features_batch, labels_batch in zip(train_features, train_labels):
        # Flatten features batch
        features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

        # Convert labels to tensor
        labels_tensor = labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32)#
        # Forward pass
        outputs = model(features_batch.to(device))
        
#         outputs_reshaped_normalized = F.normalize(outputs_reshaped, p=2, dim=1)
        
        
#         outputs_text = model_text(text_features.to(device))
        similarity_text = (outputs_text @ outputs_text.T)
#         print(similarity_text.unsqueeze(0).expand(outputs.shape[0], -1, -1).shape)
#         print(outputs_reshaped.shape)
        normalized_similarity_text = F.normalize(similarity_text, p=2, dim=1)  # Normalize along the second dimension (rows)
        normalized_similarity_text = torch.clamp(normalized_similarity_text, min=0, max=1)  # Clamp values to be between 0 and 1
        
        outputs_reshaped = outputs.unsqueeze(1)
        result = torch.sum(outputs_reshaped * normalized_similarity_text.unsqueeze(0), dim =2)
        pred = result
        pred = torch.sigmoid(pred)
#         Compute loss
        loss = criterion(pred, labels_tensor)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
#         print("=="*50)
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_state_dict = model.state_dict()
        best_text_model_dict = model_text.state_dict()
    
    # Save the model every 20 epochs
    if (epoch + 1) > 30:
        torch.save(best_model_state_dict, f"/home/samyakr2/multilabel/ARK/weights/best_epoch_{epoch+1}.pth")
        torch.save(best_text_model_dict, f"/home/samyakr2/multilabel/ARK/weights/best_text_epoch_{epoch+1}.pth")


1024
Epoch [1/40], Loss: 89.74820558726788
Epoch [2/40], Loss: 62.922473318874836
Epoch [3/40], Loss: 55.832789085805416
Epoch [4/40], Loss: 40.65219435840845
Epoch [5/40], Loss: 34.40826351568103
Epoch [6/40], Loss: 30.976004537194967
Epoch [7/40], Loss: 28.646626928821206
Epoch [8/40], Loss: 26.41601443104446
Epoch [9/40], Loss: 24.24040306173265
Epoch [10/40], Loss: 22.42756497859955
Epoch [11/40], Loss: 20.881624405272305
Epoch [12/40], Loss: 19.47545178886503
Epoch [13/40], Loss: 18.2890788000077
Epoch [14/40], Loss: 17.188557134009898
Epoch [15/40], Loss: 16.11521647684276
Epoch [16/40], Loss: 15.161091508809477
Epoch [17/40], Loss: 14.268621180672199
Epoch [18/40], Loss: 13.414743405766785
Epoch [19/40], Loss: 12.548974276520312
Epoch [20/40], Loss: 11.756639181636274
Epoch [21/40], Loss: 11.042209830135107
Epoch [22/40], Loss: 10.285180860664696
Epoch [23/40], Loss: 9.564039612188935
Epoch [24/40], Loss: 8.861705026356503
Epoch [25/40], Loss: 8.248630786547437
Epoch [26/40], Lo

In [89]:
import numpy as np
from sklearn.metrics import average_precision_score

# Define a function for testing the model
def test_model(model, model_text,text_features,criterion, features_batches, labels_batches, device):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_labels = []
    all_outputs = []
    with torch.no_grad():  # Disable gradient computation
        for features_batch, labels_batch in zip(features_batches, labels_batches):
            # Move batch to device
            features_batch = features_batch.to(device).to(torch.float32)
            labels_tensor =  labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32).to(device)

            # Flatten features batch
            features_batch = features_batch.view(features_batch.size(0), -1)

            # Forward pass
            outputs = model(features_batch)
            outputs_text = model_text(text_features.to(device))
#             similarity_text = (outputs_text @ outputs_text.T)
#             print(outputs)
            outputs_reshaped = outputs.unsqueeze(1)
            result = torch.sum(outputs_reshaped * normalized_similarity_text.unsqueeze(0), dim =2)
            pred = result
        


            loss = criterion(torch.sigmoid(pred), labels_tensor)

            test_loss += loss.item()

            # Convert outputs and labels to numpy arrays
            outputs_np = torch.sigmoid(pred).cpu().detach().numpy()
            labels_np = labels_tensor.cpu().detach().numpy()

            all_outputs.append(outputs_np)
            all_labels.append(labels_np)

    # Concatenate outputs and labels
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Compute average precision score
    avg_precision = average_precision_score(all_labels, all_outputs, average='micro')

    # Average test loss
    avg_test_loss = test_loss / len(features_batches)
#     print(f"Test Loss: {avg_test_loss}")
    print(f"Average Precision Score: {avg_precision}")

for i in range (31,41):
    best_model_state_dict = torch.load("/home/samyakr2/multilabel/ARK/weights/best_epoch_{}.pth".format(i))
    model.load_state_dict(best_model_state_dict)

    best_model_state_dict_text = torch.load("/home/samyakr2/multilabel/ARK/weights/best_text_epoch_{}.pth".format(i))
    model_text.load_state_dict(best_model_state_dict_text)

    test_model(model, model_text, text_features, criterion, val_features, val_labels, device)


Average Precision Score: 0.9085115426343914
Average Precision Score: 0.908814628972699
Average Precision Score: 0.9086622240384808
Average Precision Score: 0.9093521096926107
Average Precision Score: 0.908863135621381
Average Precision Score: 0.9082268415619115
Average Precision Score: 0.908274981611412
Average Precision Score: 0.9089263651844174
Average Precision Score: 0.9082790539761887
Average Precision Score: 0.9088957480361628


In [88]:
import numpy as np
from sklearn.metrics import average_precision_score

# Define a function for testing the model
def test_model(model, model_text,text_features,criterion, features_batches, labels_batches, device):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_labels = []
    all_outputs = []
    with torch.no_grad():  # Disable gradient computation
        for features_batch, labels_batch in zip(features_batches, labels_batches):
            # Move batch to device
            features_batch = features_batch.to(device).to(torch.float32)
            labels_tensor =  labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32).to(device)

            # Flatten features batch
            features_batch = features_batch.view(features_batch.size(0), -1)

            # Forward pass
            outputs = model(features_batch)
            outputs_text = model_text(text_features.to(device))
            similarity_text = (outputs_text @ outputs_text.T)
            
            normalized_similarity_text = F.normalize(similarity_text, p=2, dim=1)
            normalized_similarity_text = torch.clamp(normalized_similarity_text, min=0, max=1)  # Clamp values to be between 0 and 1

#           
            outputs_reshaped = outputs.unsqueeze(1)
            result = torch.sum(outputs_reshaped * normalized_similarity_text.unsqueeze(0), dim =2)
            pred = result
        
            loss = criterion(torch.sigmoid(pred), labels_tensor)

            test_loss += loss.item()

            # Convert outputs and labels to numpy arrays
            outputs_np = torch.sigmoid(pred).cpu().detach().numpy()
            labels_np = labels_tensor.cpu().detach().numpy()

            all_outputs.append(outputs_np)
            all_labels.append(labels_np)

    # Concatenate outputs and labels
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Compute average precision score
    avg_precision = average_precision_score(all_labels, all_outputs, average='micro')

    # Average test loss
    avg_test_loss = test_loss / len(features_batches)
#     print(f"Test Loss: {avg_test_loss}")
    print(f"Average Precision Score: {avg_precision}")

for i in range (31,41):
    best_model_state_dict = torch.load("/home/samyakr2/multilabel/ARK/weights/best_epoch_{}.pth".format(i))
    model.load_state_dict(best_model_state_dict)

    best_model_state_dict_text = torch.load("/home/samyakr2/multilabel/ARK/weights/best_text_epoch_{}.pth".format(i))
    model_text.load_state_dict(best_model_state_dict_text)

    test_model(model, model_text, text_features, criterion, val_features, val_labels, device)


Average Precision Score: 0.9087580143147297
Average Precision Score: 0.9099161784027876
Average Precision Score: 0.9072797200443597
Average Precision Score: 0.9094465998504123
Average Precision Score: 0.907855784623597
Average Precision Score: 0.9085619297995249
Average Precision Score: 0.9086105877375811
Average Precision Score: 0.909325637728984
Average Precision Score: 0.909588983646286
Average Precision Score: 0.9090291895770016


In [103]:
class projector(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(projector, self).__init__()
        
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim,bias=False)
        )
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim),
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
#         out = self.relu(out)
        return out

# class projector_adapter(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(projector_adapter, self).__init__()
        
#         self.fc1 = nn.Sequential(
#             nn.Linear(input_dim, hidden_dim, bias=False),
#             nn.ReLU(inplace=True),
#             nn.Linear(hidden_dim, input_dim, bias=False),
#             nn.ReLU(inplace=True)
#         )
        
# #         self.fc1 = nn.Linear(input_dim, hidden_dim),
#         self.relu = nn.ReLU()
#         self.fc2 = nn.Linear(input_dim, hidden_dim)
#         self.fc3 = nn.Linear(hidden_dim, output_dim)
#         self.softmax = nn.Softmax(dim=1)
#         self.dropout = nn.Dropout(0.3)
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         out = self.fc1(x)
#         out = 0.15*x + 0.85*out
#         out = self.fc2(out) 
# #         out = self.relu(out)
# #         out = self.fc3(out)
#         return out




class adapter(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(adapter, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, input_dim, bias=False),
            nn.ReLU(inplace=True)
        )
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(input_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = 0.15*x + 0.85*out
#         out = self.relu(out)
        out = self.fc2(out)
#         out = self.sigmoid(out)
#         out = self.softmax(out)
        return out


class clip_2fc(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(clip_2fc, self).__init__()
        
        self.fc1 = nn.Sequential(
            nn.Linear(input_dim, hidden_dim,bias=False)
        )
        
#         self.fc1 = nn.Linear(input_dim, hidden_dim),
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
#         out = self.sigmoid(out)
        return out
    
    
input_size = train_features[0].size(1)  
hidden_size = input_size // 4 #128  # Define the size of the hidden layer
num_classes = len(train_labels[0][0])  # Assuming labels_batches is a list of lists of labels

in_s = text_features.shape[1]  
hs = 200  # Define the size of the hidden layer
nc = 100  

print(in_s)
# Initialize the model
model = clip_2fc(input_size, hidden_size, num_classes).to(device)
# model_text = projector(in_s, hs, nc).to(device)

# # Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel classification

params_to_optimize = list(model.parameters())
optimizer = torch.optim.Adam(params_to_optimize, lr=0.001)  # Adam optimizer with learning rate 0.001

# # Training loop

best_loss = float('inf')
num_epochs = 500



for epoch in range(num_epochs):
    epoch_loss = 0.0
    for features_batch, labels_batch in zip(train_features, train_labels):
        # Flatten features batch
        features_batch = features_batch.view(features_batch.size(0), -1).to(torch.float32)

        # Convert labels to tensor
        labels_tensor = labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32)#
        # Forward pass
        outputs = model(features_batch.to(device))
        
        similarity_text = (text_features @ text_features.T)
        normalized_similarity_text = F.normalize(similarity_text, p=2, dim=1)  # Normalize along the second dimension (rows)
        normalized_similarity_text = torch.clamp(normalized_similarity_text, min=0, max=1)  # Clamp values to be between 0 and 1


        outputs_reshaped = outputs.unsqueeze(1)
        result = torch.sum(outputs_reshaped * normalized_similarity_text.unsqueeze(0), dim =2)
        pred = result
        pred = torch.sigmoid(pred)
#         Compute loss
        loss = criterion(pred, labels_tensor)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
#         print("=="*50)
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}")
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_state_dict = model.state_dict()
        best_text_model_dict = model_text.state_dict()
    
    # Save the model every 20 epochs
    if (epoch + 1) % 25 == 0:
        torch.save(best_model_state_dict, f"/home/samyakr2/multilabel/ARK/weights/best_epoch_{epoch+1}.pth")
        torch.save(best_text_model_dict, f"/home/samyakr2/multilabel/ARK/weights/best_text_epoch_{epoch+1}.pth")


1024
Epoch [1/500], Loss: 96.44356305897236
Epoch [2/500], Loss: 90.29537300765514
Epoch [3/500], Loss: 83.84662951529026
Epoch [4/500], Loss: 75.83372184634209
Epoch [5/500], Loss: 69.87644106149673
Epoch [6/500], Loss: 65.5928166359663
Epoch [7/500], Loss: 62.3098468631506
Epoch [8/500], Loss: 59.56852859258652
Epoch [9/500], Loss: 57.219581834971905
Epoch [10/500], Loss: 55.1781942024827
Epoch [11/500], Loss: 53.41724266856909
Epoch [12/500], Loss: 51.903966300189495
Epoch [13/500], Loss: 50.65931884944439
Epoch [14/500], Loss: 49.578438363969326
Epoch [15/500], Loss: 48.16227797418833
Epoch [16/500], Loss: 46.51636801660061
Epoch [17/500], Loss: 44.906714610755444
Epoch [18/500], Loss: 43.18814776092768
Epoch [19/500], Loss: 41.618738643825054
Epoch [20/500], Loss: 40.21063008159399
Epoch [21/500], Loss: 38.88230871409178
Epoch [22/500], Loss: 37.66335577890277
Epoch [23/500], Loss: 36.78212880715728
Epoch [24/500], Loss: 37.41269452869892
Epoch [25/500], Loss: 36.34268646687269
Ep

Epoch [201/500], Loss: 13.141182909719646
Epoch [202/500], Loss: 12.56497246120125
Epoch [203/500], Loss: 12.068079037126154
Epoch [204/500], Loss: 11.628813423682004
Epoch [205/500], Loss: 11.210640305653214
Epoch [206/500], Loss: 10.877250183839351
Epoch [207/500], Loss: 10.578811769839376
Epoch [208/500], Loss: 10.325852145440876
Epoch [209/500], Loss: 10.140036481432617
Epoch [210/500], Loss: 10.2832321068272
Epoch [211/500], Loss: 10.94728335691616
Epoch [212/500], Loss: 12.636299756355584
Epoch [213/500], Loss: 14.133107278496027
Epoch [214/500], Loss: 14.035703132394701
Epoch [215/500], Loss: 13.902478980831802
Epoch [216/500], Loss: 15.008661983069032
Epoch [217/500], Loss: 20.681186533533037
Epoch [218/500], Loss: 27.364548878744245
Epoch [219/500], Loss: 22.99447402730584
Epoch [220/500], Loss: 17.0984687237069
Epoch [221/500], Loss: 14.975271162576973
Epoch [222/500], Loss: 13.887362780049443
Epoch [223/500], Loss: 13.033658078871667
Epoch [224/500], Loss: 12.300059041939676

Epoch [400/500], Loss: 10.694876493653283
Epoch [401/500], Loss: 11.463050327729434
Epoch [402/500], Loss: 12.000728024402633
Epoch [403/500], Loss: 12.562993163242936
Epoch [404/500], Loss: 13.53737099817954
Epoch [405/500], Loss: 13.978801410412416
Epoch [406/500], Loss: 12.874284705845639
Epoch [407/500], Loss: 10.644078368786722
Epoch [408/500], Loss: 9.364924165885895
Epoch [409/500], Loss: 8.39986131945625
Epoch [410/500], Loss: 7.613598751835525
Epoch [411/500], Loss: 6.958346167812124
Epoch [412/500], Loss: 6.452461265027523
Epoch [413/500], Loss: 6.038589170901105
Epoch [414/500], Loss: 5.696699589723721
Epoch [415/500], Loss: 5.435079563176259
Epoch [416/500], Loss: 5.211739170132205
Epoch [417/500], Loss: 5.113548719091341
Epoch [418/500], Loss: 4.993936296901666
Epoch [419/500], Loss: 5.061198897659779
Epoch [420/500], Loss: 5.494357937714085
Epoch [421/500], Loss: 6.502426681807265
Epoch [422/500], Loss: 6.075238751247525
Epoch [423/500], Loss: 5.938120853621513
Epoch [424

In [104]:
import numpy as np
from sklearn.metrics import average_precision_score

# Define a function for testing the model
def test_model(model,text_features,criterion, features_batches, labels_batches, device):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_labels = []
    all_outputs = []
    with torch.no_grad():  # Disable gradient computation
        for features_batch, labels_batch in zip(features_batches, labels_batches):
            # Move batch to device
            features_batch = features_batch.to(device).to(torch.float32)
            labels_tensor =  labels_batch.type(torch.float32).to(device)#torch.tensor(labels_batch, dtype=torch.float32).to(device)

            # Flatten features batch
            features_batch = features_batch.view(features_batch.size(0), -1)

            # Forward pass
            outputs = model(features_batch)
            similarity_text = (text_features @ text_features.T)
            
            normalized_similarity_text = F.normalize(similarity_text, p=2, dim=1)
            normalized_similarity_text = torch.clamp(normalized_similarity_text, min=0, max=1)  # Clamp values to be between 0 and 1

#           
            outputs_reshaped = outputs.unsqueeze(1)
            result = torch.sum(outputs_reshaped * normalized_similarity_text.unsqueeze(0), dim =2)
            pred = result
        
            loss = criterion(torch.sigmoid(pred), labels_tensor)

            test_loss += loss.item()

            # Convert outputs and labels to numpy arrays
            outputs_np = torch.sigmoid(pred).cpu().detach().numpy()
            labels_np = labels_tensor.cpu().detach().numpy()

            all_outputs.append(outputs_np)
            all_labels.append(labels_np)

    # Concatenate outputs and labels
    all_outputs = np.concatenate(all_outputs)
    all_labels = np.concatenate(all_labels)

    # Compute average precision score
    avg_precision = average_precision_score(all_labels, all_outputs, average='micro')

    # Average test loss
    avg_test_loss = test_loss / len(features_batches)
#     print(f"Test Loss: {avg_test_loss}")
    print(f"Average Precision Score: {avg_precision}")

for i in range (25,501,25):
    best_model_state_dict = torch.load("/home/samyakr2/multilabel/ARK/weights/best_epoch_{}.pth".format(i))
    model.load_state_dict(best_model_state_dict)

#     best_model_state_dict_text = torch.load("/home/samyakr2/multilabel/ARK/weights/best_text_epoch_{}.pth".format(i))
#     model_text.load_state_dict(best_model_state_dict_text)

    test_model(model, text_features, criterion, val_features, val_labels, device)


Average Precision Score: 0.7924123209218925
Average Precision Score: 0.8484004351930238
Average Precision Score: 0.8957862637898228
Average Precision Score: 0.8988103143005933
Average Precision Score: 0.9082210976599276
Average Precision Score: 0.9101223944040837
Average Precision Score: 0.8940270599418854
Average Precision Score: 0.9104353404618091
Average Precision Score: 0.9046137144445521
Average Precision Score: 0.9092494974006811
Average Precision Score: 0.8793966233694841
Average Precision Score: 0.902264594753317
Average Precision Score: 0.9138198007007856
Average Precision Score: 0.9063380203989768
Average Precision Score: 0.9029659049333684
Average Precision Score: 0.8929673094797538
Average Precision Score: 0.9002565627850398
Average Precision Score: 0.8798827097409766
Average Precision Score: 0.8969564695294033
Average Precision Score: 0.9010584002249271
