In [7]:
import os

# 图像文件夹路径
image_folder = "./Images"

# 文本描述文件路径
caption_file = "./captions.txt"


# 加载图像文件名和对应的文字描述
image_paths = []
captions = []

with open(caption_file, "r") as file:
    lines = file.readlines()
    for line in lines:
        parts = line.strip().split(",")
        if len(parts) == 2:
            image_paths.append(os.path.join(image_folder, parts[0]))
            captions.append(parts[1])


In [8]:
from PIL import Image
import torchvision.transforms as transforms

# 创建图像预处理转换器
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 预处理并加载图像
images = [transform(Image.open(image_path)) for image_path in image_paths]


In [9]:
import os
import time
import os.path as osp

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision.datasets import CIFAR10
from torchvision import datasets
from torchvision import transforms
import torchvision

from PIL import Image, ImageFilter
import matplotlib.pyplot as plt
from PIL import Image
from clip import clip

In [10]:
# # random seed
# SEED = 1 
# NUM_CLASS = 10

# Training
BATCH_SIZE = 128
# NUM_EPOCHS = 30
# EVAL_INTERVAL=1
# SAVE_DIR = './log'

# # Optimizer
# LEARNING_RATE = 1e-1
# MOMENTUM = 0.9
# STEP=5
# GAMMA=0.5

# CLIP
VISUAL_BACKBONE = 'ViT-B/32' # RN50, ViT-B/32, ViT-B/16


In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [12]:
from sklearn.model_selection import train_test_split

# 划分数据集
train_images, test_images, train_captions, test_captions = train_test_split(images, captions, test_size=0.2, random_state=42)


In [13]:
# Load the model
model, preprocess = clip.load(name=VISUAL_BACKBONE, device=device, download_root='/shareddata/clip/')
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [14]:

def prompt_encode(prompt):
    """
    Args:
        prompt (str): the text prefix before the class

    Returns:
        text_inputs(torch.Tensor)

    """
    ##################### Write your answer here ##################
    text_inputs = clip.tokenize(prompt).to(device)
    ###############################################################
    
    return text_inputs
prompt = 'a photo of a dog' # you can try different prompt
text_inputs = prompt_encode(prompt)
text_inputs

tensor([[49406,   320,  1125,   539,   320,  1929, 49407,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], device='cuda:0',
       dtype=torch.int32)

In [15]:
def model_inference(model, image, text_inputs):
    """
    Perform model inference to calculate logits for zero-shot learning.

    Args:
        model (CLIPModel): The CLIP model for inference.
        image (torch.Tensor): The batch of images to encode.
        text_inputs (torch.Tensor): The batch of text inputs to encode.

    Returns:
        torch.Tensor: Logits representing the similarity between images and texts.
    """
    # Ensure the model is in evaluation mode
    model.eval()

    # Encode the image using the CLIP model
    with torch.no_grad():
        image_features = model.encode_image(image)

    # Normalize the image features
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)

    # Encode the text using the CLIP model's text encoder
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)

    # Normalize the text features
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Calculate the similarity (logits) between each image and text
    logits = image_features @ text_features.t()

    return logits


In [34]:
from torch.utils.data import Dataset
from PIL import Image
import torch

class Flickr8kDataset(Dataset):
    def __init__(self, images, captions, transform=None, text_transform=None):
        self.images = images
        self.captions = captions
        self.transform = transform
        self.text_transform = text_transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        caption = self.captions[idx]

        # Check if the image is already a tensor (skip transform if it is)
        if not isinstance(image, torch.Tensor):
            if self.transform:
                image = self.transform(Image.open(image))
            else:
                image = Image.open(image)

        # Convert caption to tensor using text_transform
        if self.text_transform:
            caption = self.text_transform(caption)

        return image, caption


In [35]:
from torchvision import transforms

# 创建图像预处理转换器
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = Flickr8kDataset(train_images, train_captions, transform=transform)
test_dataset = Flickr8kDataset(test_images, test_captions, transform=transform)


In [36]:
from torch.utils.data import DataLoader

# 设置批量大小
batch_size = 128

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [39]:
from tqdm import tqdm
import torch

# Zero-shot accuracy calculation
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    model.eval()

    for images, test1_captions in test_loader:  # Use test_captions here
        # Move images and test_captions to the same device as the model
        images = images.to(device)
        test1_captions = test1_captions.to(device)  # Rename captions to test_captions

        # Perform model inference to get logits
        logits = model_inference(model, images, test1_captions)
        # Predictions are the indices of the max logit values
        predictions = logits.argmax(dim=-1)

        # Update correct predictions and total predictions
        correct_predictions += (predictions == test1_captions).sum().item()  # Change labels to test_captions
        total_predictions += test_1captions.size(0)  # Change labels to test_captions

# Calculate accuracy
val_acc = correct_predictions / total_predictions

print(f"the zero-shot performance on {dataset_name} is {val_acc*100:.2f}%, visual encoder is {VISUAL_BACKBONE}.")


AttributeError: 'tuple' object has no attribute 'to'