In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
import torch
import clip
from PIL import Image

In [2]:
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("RN50", device=device)

In [4]:
image = preprocess(Image.open("yeom.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a human", "a beast"]).to(device)

In [5]:
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    print(image_features.shape, text_features.shape)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

torch.Size([1, 1024]) torch.Size([2, 1024])


In [6]:
base_path = '.'
weights_path = os.path.join(base_path, 'model_best_blurpool_78_528.pth.tar')
train_csv_file = os.path.join(base_path, 'imagenet_caption_train_with_labels.csv')
val_csv_file = os.path.join(base_path, 'imagenet_caption_val_with_labels.csv')
print(weights_path, train_csv_file, val_csv_file)

./model_best_blurpool_78_528.pth.tar ./imagenet_caption_train_with_labels.csv ./imagenet_caption_val_with_labels.csv


In [7]:
import wandb
wandb.init(project='lgd2024_clip')

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrichkang715[0m ([33mansungwook[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
import torchvision.transforms as transforms
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
image_size = 240

train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(image_size),
    transforms.RandomHorizontalFlip(),
    transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8),
    transforms.RandomGrayscale(p=0.2),
    transforms.ToTensor(),
    normalize,
    ])
val_transforms = transforms.Compose([
    transforms.Resize(image_size, interpolation=Image.BICUBIC),
    transforms.CenterCrop(image_size),
    transforms.ToTensor(),
    normalize,
])

In [9]:
import pandas as pd
train_df = pd.read_csv(train_csv_file)
val_df = pd.read_csv(val_csv_file)

In [10]:
from torch.utils.data import DataLoader, Dataset
class ImageTextDataset(Dataset):
    def __init__(self, df, transform=None):
        self.data_frame = df
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        img_name = self.data_frame.iloc[idx, 0]
        images = Image.open(img_name).convert('RGB')
        
        if self.transform:
            images = self.transform(images)
        
        texts = self.data_frame.iloc[idx, 1]
        labels = self.data_frame.iloc[idx, 2]

        return images, texts, labels

In [11]:
batch_size = 128
train_dataset = ImageTextDataset(train_df, train_transforms)
val_dataset = ImageTextDataset(val_df, val_transforms)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

In [12]:
import torch.optim as optim
import torch.nn as nn
epochs = 10

optimizer = optim.Adam(model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-6)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs, eta_min=1e-7)

In [13]:
def contrastive_loss(logits_per_image, logits_per_text):
    labels = torch.arange(logits_per_image.shape[0]).to(device)
    image_loss = nn.CrossEntropyLoss()(logits_per_image, labels)
    text_loss = nn.CrossEntropyLoss()(logits_per_text, labels)
    return (image_loss + text_loss) / 2

In [None]:
from tqdm import tqdm
def validate(model):
    model.eval()
    image_features_all = []
    text_features_all = []
    labels_all = []
    
    print("VALIDATION")
    with torch.no_grad():
        for images, texts, labels in tqdm(val_loader):
            images = images.to(device)
            texts = clip.tokenize(texts).to(device)
            labels = labels.to(device)
            
            image_features = model.encode_image(images)
            text_features = model.encode_text(texts)
            
            image_features_all.append(image_features)
            text_features_all.append(text_features)
            labels_all.append(labels)
            
        image_features_all = torch.cat(image_features_all)
        text_features_all = torch.cat(text_features_all)
        labels_all = torch.cat(labels_all)
        
        
        # normalized features
        image_features_all = image_features_all / image_features_all.norm(dim=1, keepdim=True)
        text_features_all = text_features_all / text_features_all.norm(dim=1, keepdim=True)

        # cosine similarity as logits
        logit_scale = model.logit_scale.exp()
        logits_per_image = logit_scale * image_features_all @ text_features_all.t()
        logits_per_text = logits_per_image.t()
        
        loss = contrastive_loss(logits_per_image, logits_per_text)
        print("Validation Loss: ", loss.item())
        
        top1_pred_indices = logits_per_image.argmax(dim=1)
        top1_pred_labels = labels_all[top1_pred_indices]
        correct = (top1_pred_labels == labels_all).sum().item()
        acc = correct / len(labels_all)
        print("Validation Accuracy: ", acc * 100)
        wandb.log({'top1_accuracy': acc * 100})
        
    torch.cuda.empty_cache()

In [None]:
def classification(model):
    model.eval()
    image_features_all = []
    text_features_all = []
    labels_all = []
    
    with torch.no_grad():
        

In [15]:
# validate(model)

In [None]:
print_freq = 5
for epoch in range(0, epochs):
    model.train()
    print(f"Training epoch {epoch}")
    for idx, (images, texts, labels) in enumerate(tqdm(train_loader, ncols=100)):
        images = images.to(device)
        texts = clip.tokenize(texts).to(device)
        labels = labels.to(device)

        image_features = model.encode_image(images)
        text_features = model.encode_text(texts)

        logits_per_image, logits_per_text = model(images, texts)
        loss = contrastive_loss(logits_per_image, logits_per_text)
        
        top1_pred_indices = logits_per_image.argmax(dim=1)
        top1_pred_labels = labels[top1_pred_indices]
        correct = (top1_pred_labels == labels).sum().item()
        top1_accuracy = correct / labels.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if idx % print_freq == 0:
            wandb.log({'train_top1_accuracy': top1_accuracy*100, 'train_loss': loss}, step = int((epoch*len(train_loader)*batch_size + idx*batch_size) / print_freq))

    validate(model)
    scheduler.step()

Training epoch 0


100%|██████████| 10009/10009 [5:32:34<00:00,  1.99s/it] 


VALIDATION


100%|██████████| 391/391 [09:09<00:00,  1.41s/it]


Validation Loss:  3.984375
Validation Accuracy:  49.106
Training epoch 1


100%|██████████| 10009/10009 [4:45:18<00:00,  1.71s/it] 


VALIDATION


100%|██████████| 391/391 [09:59<00:00,  1.53s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.66 GiB (GPU 0; 23.68 GiB total capacity; 9.17 GiB already allocated; 3.25 GiB free; 20.07 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF