In [1]:
import os
import time
import os.path as osp

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision import datasets
from torchvision import transforms
import torchvision

from PIL import Image, ImageFilter
import matplotlib.pyplot as plt
from PIL import Image
from clip import clip

In [2]:
# random seed
SEED = 1 
NUM_CLASS = 10

# Training
BATCH_SIZE = 128
NUM_EPOCHS = 30
EVAL_INTERVAL=1
SAVE_DIR = './log'

# Optimizer
LEARNING_RATE = 1e-1
MOMENTUM = 0.9
STEP=5
GAMMA=0.5

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
import os

# Caltech-101 数据集的路径
dataset_path = '/shareddata/dataset/caltech-101/101_ObjectCategories'

# 遍历每个类别的文件夹
for folder_name in os.listdir(dataset_path):
    folder_path = os.path.join(dataset_path, folder_name)
    
    # 确保是一个文件夹
    if os.path.isdir(folder_path):
        jpg_files = [f for f in os.listdir(folder_path) if f.endswith('.jpg')]

        # 检查是否每个文件夹至少有一个 .jpg 文件
        if len(jpg_files) == 0:
            print(f"No .jpg files found in {folder_name}")
        else:
            print(f".jpg files found in {folder_name}: {len(jpg_files)} files")


.jpg files found in chair: 62 files
.jpg files found in mandolin: 43 files
.jpg files found in tick: 49 files
.jpg files found in lamp: 61 files
.jpg files found in kangaroo: 86 files
.jpg files found in gerenuk: 34 files
.jpg files found in gramophone: 51 files
.jpg files found in cougar_face: 69 files
.jpg files found in umbrella: 75 files
.jpg files found in euphonium: 64 files
.jpg files found in dolphin: 65 files
.jpg files found in ibis: 80 files
.jpg files found in dollar_bill: 52 files
.jpg files found in barrel: 47 files
.jpg files found in butterfly: 91 files
.jpg files found in mayfly: 40 files
.jpg files found in ceiling_fan: 47 files
.jpg files found in stegosaurus: 59 files
.jpg files found in windsor_chair: 56 files
.jpg files found in flamingo_head: 45 files
.jpg files found in pizza: 53 files
.jpg files found in cannon: 43 files
.jpg files found in soccer_ball: 64 files
.jpg files found in sea_horse: 57 files
.jpg files found in Motorbikes: 798 files
.jpg files found i

In [5]:
# Caltech-101 数据集的路径
caltech101_path = '/shareddata/dataset/caltech-101/101_ObjectCategories'

# 定义对 Caltech-101 图像的变换
transform_caltech101 = transforms.Compose([
    transforms.Resize(size=224),
    transforms.CenterCrop(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# 排除的文件夹列表
excluded_folders = ['BACKGROUND_Google', 'caltech101']  # 根据需要添加其他需要排除的文件夹

# 提取所有子文件夹名称作为类名，并排除特定的文件夹
class_names = [d for d in os.listdir(caltech101_path) if os.path.isdir(os.path.join(caltech101_path, d)) and d not in excluded_folders]
print("Class names in Caltech-101 dataset:", class_names)

# 加载 Caltech-101 数据集
caltech101_dataset = datasets.ImageFolder(root=caltech101_path, transform=transform_caltech101)
caltech101_dataloader = torch.utils.data.DataLoader(caltech101_dataset, batch_size=BATCH_SIZE, shuffle=True)

# 更新数据集名称
dataset_name = 'Caltech-101'

Class names in Caltech-101 dataset: ['chair', 'mandolin', 'tick', 'lamp', 'kangaroo', 'gerenuk', 'gramophone', 'cougar_face', 'umbrella', 'euphonium', 'dolphin', 'ibis', 'dollar_bill', 'barrel', 'butterfly', 'mayfly', 'ceiling_fan', 'stegosaurus', 'windsor_chair', 'flamingo_head', 'pizza', 'cannon', 'soccer_ball', 'sea_horse', 'Motorbikes', 'octopus', 'starfish', 'binocular', 'dragonfly', 'cougar_body', 'grand_piano', 'pagoda', 'pyramid', 'anchor', 'scorpion', 'chandelier', 'buddha', 'saxophone', 'crocodile', 'panda', 'dalmatian', 'yin_yang', 'minaret', 'garfield', 'stop_sign', 'metronome', 'crocodile_head', 'brontosaurus', 'sunflower', 'menorah', 'cup', 'Faces_easy', 'llama', 'hedgehog', 'car_side', 'wheelchair', 'inline_skate', 'schooner', 'pigeon', 'hawksbill', 'emu', 'wild_cat', 'headphone', 'laptop', 'ferry', 'platypus', 'wrench', 'airplanes', 'bass', 'ewer', 'brain', 'watch', 'water_lilly', 'rhino', 'trilobite', 'Faces', 'snoopy', 'helicopter', 'strawberry', 'cellphone', 'rooster

In [6]:
# 提取所有子文件夹名称作为类名
class_names = [d for d in os.listdir(caltech101_path) if os.path.isdir(os.path.join(caltech101_path, d))]
print("Class names in DTD dataset:", class_names)

Class names in DTD dataset: ['chair', 'mandolin', 'tick', 'lamp', 'kangaroo', 'gerenuk', 'gramophone', 'cougar_face', 'umbrella', 'euphonium', 'dolphin', 'ibis', 'dollar_bill', 'barrel', 'butterfly', 'mayfly', 'ceiling_fan', 'stegosaurus', 'windsor_chair', 'flamingo_head', 'pizza', 'cannon', 'soccer_ball', 'sea_horse', 'Motorbikes', 'octopus', 'starfish', 'binocular', 'dragonfly', 'cougar_body', 'grand_piano', 'pagoda', 'pyramid', 'anchor', 'scorpion', 'chandelier', 'buddha', 'saxophone', 'crocodile', 'panda', 'dalmatian', 'yin_yang', 'minaret', 'garfield', 'stop_sign', 'metronome', 'crocodile_head', 'brontosaurus', 'sunflower', 'menorah', 'cup', 'Faces_easy', 'llama', 'hedgehog', 'car_side', 'wheelchair', 'inline_skate', 'schooner', 'pigeon', 'hawksbill', 'emu', 'wild_cat', 'headphone', 'laptop', 'ferry', 'platypus', 'wrench', 'airplanes', 'bass', 'ewer', 'brain', 'watch', 'water_lilly', 'rhino', 'trilobite', 'Faces', 'snoopy', 'helicopter', 'strawberry', 'cellphone', 'rooster', 'cray

In [7]:
text_inputs = torch.cat([clip.tokenize(f"A photo of a {class_name}").to(device) for class_name in class_names], dim=0)

In [8]:
def model_inference(model, image):
    
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_inputs)

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    logit_scale = model.logit_scale.exp()

    logits = logit_scale * image_features @ text_features.t()

    return logits

In [9]:
torch.cuda.empty_cache()

In [10]:
VISUAL_BACKBONE = 'RN50'
# Load the model
model, preprocess = clip.load(name=VISUAL_BACKBONE, device=device, download_root='/shareddata/clip/')
model.to(device)

CLIP(
  (visual): ModifiedResNet(
    (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu1): ReLU(inplace=True)
    (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu2): ReLU(inplace=True)
    (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu3): ReLU(inplace=True)
    (avgpool): AvgPool2d(kernel_size=2, stride=2, padding=0)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
     

In [11]:
with torch.no_grad():
    model.eval()

    val_loss = 0.0
    val_corrects = 0

    for batch_idx, (image, target) in enumerate(caltech101_dataloader):

        image = image.to(device)
        target = target.to(device)

        # test model
        logits = model_inference(model, image)
        _, preds = torch.max(logits, 1)
        
        val_corrects += torch.sum(preds == target.data)

    val_acc = val_corrects.double() / len(caltech101_dataset)

    print(f"the zero-shot performance on {dataset_name} is {val_acc*100:.2f}%, visual encoder is {VISUAL_BACKBONE}.")
acc1 = val_acc

the zero-shot performance on Caltech-101 is 0.25%, visual encoder is RN50.


In [12]:
torch.cuda.empty_cache()

In [13]:
VISUAL_BACKBONE = 'ViT-B/32'
# Load the model
model, preprocess = clip.load(name=VISUAL_BACKBONE, device=device, download_root='/shareddata/clip/')
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [14]:
with torch.no_grad():
    model.eval()

    val_loss = 0.0
    val_corrects = 0

    for batch_idx, (image, target) in enumerate(caltech101_dataloader):

        image = image.to(device)
        target = target.to(device)

        # test model
        logits = model_inference(model, image)
        _, preds = torch.max(logits, 1)
        
        val_corrects += torch.sum(preds == target.data)

    val_acc = val_corrects.double() / len(caltech101_dataset)

    print(f"the zero-shot performance on {dataset_name} is {val_acc*100:.2f}%, visual encoder is {VISUAL_BACKBONE}.")
acc2 = val_acc

the zero-shot performance on Caltech-101 is 0.02%, visual encoder is ViT-B/32.


In [15]:
torch.cuda.empty_cache()

In [16]:
VISUAL_BACKBONE = 'ViT-B/16'
# Load the model
model, preprocess = clip.load(name=VISUAL_BACKBONE, device=device, download_root='/shareddata/clip/')
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [17]:
with torch.no_grad():
    model.eval()

    val_loss = 0.0
    val_corrects = 0

    for batch_idx, (image, target) in enumerate(caltech101_dataloader):

        image = image.to(device)
        target = target.to(device)

        # test model
        logits = model_inference(model, image)
        _, preds = torch.max(logits, 1)
        
        val_corrects += torch.sum(preds == target.data)

    val_acc = val_corrects.double() / len(caltech101_dataset)

    print(f"the zero-shot performance on {dataset_name} is {val_acc*100:.2f}%, visual encoder is {VISUAL_BACKBONE}.")
acc3 = val_acc

the zero-shot performance on Caltech-101 is 0.02%, visual encoder is ViT-B/16.


### Caltech101

In [18]:
print(f"RN50 : {acc1*100:.2f}%")
print(f"ViT-B/32 : {acc2*100:.2f}%")
print(f"ViT-B/16 : {acc3*100:.2f}%")

RN50 : 0.25%
ViT-B/32 : 0.02%
ViT-B/16 : 0.02%


In [19]:
torch.cuda.empty_cache()