## Using CLIP model to find the class of cifar-10 photos

In [1]:
import os
import time
import os.path as osp

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision.datasets import CIFAR10,MNIST
from torchvision import datasets
from torchvision import transforms
import torchvision

from PIL import Image, ImageFilter
import matplotlib.pyplot as plt
from PIL import Image
from clip import clip

In [2]:
# random seed
SEED = 1 
NUM_CLASS = 10

# Training
BATCH_SIZE = 128
NUM_EPOCHS = 30
EVAL_INTERVAL=1
SAVE_DIR = './log'

# Optimizer
LEARNING_RATE = 1e-1
MOMENTUM = 0.9
STEP=5
GAMMA=0.5

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
transform_cifar10_test = transforms.Compose([
    transforms.Resize(size=224),
    transforms.CenterCrop(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

test_set = torchvision.datasets.CIFAR10(root='/shareddata', train=False,
                                       download=True, transform=transform_cifar10_test)
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE,
                                         shuffle=False, num_workers=2)

class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
dataset_name = 'CIFAR10'

Files already downloaded and verified


In [5]:
VISUAL_BACKBONE = 'ViT-B/16'
# Load the model
model, preprocess = clip.load(name=VISUAL_BACKBONE, device=device, download_root='/shareddata/clip/')
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [6]:
prompt = [f"A photo of a {class_name}" for class_name in class_names]

text_inputs = torch.cat([clip.tokenize(f"{prompt} {c}") for c in class_names]).to(device)

# 对文本提示进行编码
with torch.no_grad():
    text_features = model.encode_text(text_inputs)

In [7]:
torch.cuda.empty_cache()

In [8]:
import torch
import clip
from PIL import Image

clip_model, preprocess = clip.load("ViT-B/16", device=device)

# 定义CIFAR-10的类别
cifar10_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

for i in range(5):
    # 打开本地的图像文件

    # 生成随机文件编号，范围从1到117
    random_number = random.randint(1, 117)

    file_number = str(random_number).zfill(4)

    # 生成随机文件路径
    image_path = f"/shareddata/dataset/caltech-101/101_ObjectCategories/car_side/image_{file_number}.jpg"
    image = Image.open(image_path)
    plt.imshow(image)
    plt.axis('off')  # 不显示坐标轴
    plt.show()
    
    # 使用CLIP提供的预处理方法处理图像
    image_preprocessed = preprocess(image).unsqueeze(0).to(device)

    
    # 使用CLIP模型进行预测
    with torch.no_grad():
        # 为每个类别生成一个文本输入
        text_inputs = torch.cat([clip.tokenize(f"a photo of a {classname}") for classname in cifar10_classes]).to(device)

        # 计算图像和文本输入的特征向量
        image_features = clip_model.encode_image(image_preprocessed)
        text_features = clip_model.encode_text(text_inputs)

        # 计算图像和每个文本输入之间的相似度
        logits_per_image, logits_per_text = clip_model(image_preprocessed, text_inputs)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    # 找出概率最高的类别
    predicted_class = cifar10_classes[probs.argmax()]

    print(f"The image{file_number} is most likely an: {predicted_class}")

RuntimeError: NVML_SUCCESS == r INTERNAL ASSERT FAILED at "/opt/pytorch/pytorch/c10/cuda/CUDACachingAllocator.cpp":1150, please report a bug to PyTorch. 

In [None]:
torch.cuda.empty_cache()