# Exercise 03: CLIP zero-shot prediction
In this exercise, you will perform zero-shot prediction using CLIP.

### Basic Imports

In [None]:
#pip install ftfy

In [2]:
import os
import time
import os.path as osp

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision.datasets import CIFAR10
from torchvision import datasets
from torchvision import transforms
import torchvision

from PIL import Image, ImageFilter
import matplotlib.pyplot as plt
from PIL import Image
from clip import clip

### Hyperparameters

In [3]:
# Hyperparameters. The commented-out entries are disabled settings kept for
# reference; BATCH_SIZE is the only name this cell actually defines.
# # random seed
# SEED = 1
# NUM_CLASS = 10

# Training
# NOTE(review): the DTD DataLoaders in this notebook are built with a literal
# batch_size=32 rather than BATCH_SIZE -- confirm which value is intended.
BATCH_SIZE = 128
# NUM_EPOCHS = 30
# EVAL_INTERVAL=1
# SAVE_DIR = './log'

# # Optimizer
# LEARNING_RATE = 1e-1
# MOMENTUM = 0.9
# STEP=5
# GAMMA=0.5

# CLIP



### Device

In [4]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


### Dataset


In [5]:
from torchvision.datasets import ImageFolder
import tarfile
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


# Path to the DTD dataset image directory
dtd_data_path = "/data/lab/STA303-Assignment02/data/dtd/images"  # replace with the actual DTD dataset path


### Model

In [6]:

# Dataset preprocessing
class DTDDataset(Dataset):
    """Map-style dataset over parallel sequences of image paths and labels.

    Images are opened lazily, one per ``__getitem__`` call, and converted
    to RGB.

    Args:
        images: Sequence of image file paths.
        labels: Sequence of labels, one per entry of ``images``. Labels are
            returned unchanged.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned. ``None`` disables it.

    Raises:
        ValueError: If ``images`` and ``labels`` have different lengths.
    """

    def __init__(self, images, labels, transform=None):
        # Fail early: a mismatch would otherwise only surface as an
        # IndexError (or silently ignored labels) deep inside a DataLoader.
        if len(images) != len(labels):
            raise ValueError(
                f"images and labels must have the same length, "
                f"got {len(images)} and {len(labels)}"
            )
        self.images = images
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``."""
        image_path = self.images[idx]
        label = self.labels[idx]

        # convert() yields a fully loaded, independent image, so the
        # underlying file can be closed as soon as the block exits.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Compare against None explicitly: a callable that defines __len__
        # (e.g. an empty container of transforms) may be falsy.
        if self.transform is not None:
            image = self.transform(image)

        return image, label

def read_data(data_path):
    """Collect image paths and class-name labels from a class-per-folder tree.

    ``data_path`` is expected to contain one sub-directory per class; the
    sub-directory name is used as the label of every image inside it.

    Args:
        data_path: Root directory holding the class sub-directories.

    Returns:
        A tuple ``(images, labels)`` of two equally long lists: full image
        paths and the matching class names. Classes, and the files within
        each class, are in sorted order.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    images = []
    labels = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result (and any seeded split derived from it) reproducible.
    for class_name in sorted(os.listdir(data_path)):
        class_path = os.path.join(data_path, class_name)
        # Skip stray files at the top level (e.g. .DS_Store, README);
        # listing them as directories would raise NotADirectoryError.
        if not os.path.isdir(class_path):
            continue

        class_images = [
            os.path.join(class_path, filename)
            for filename in sorted(os.listdir(class_path))
            # Lower-case first so that '.JPG' / '.PNG' files are not dropped.
            if filename.lower().endswith(image_extensions)
        ]

        images.extend(class_images)
        labels.extend([class_name] * len(class_images))

    return images, labels

# Preprocessing applied to every image: resize to 224x224, convert to a
# tensor, then normalise each channel with the given mean / std.
dtd_image_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Gather every (image path, class name) pair found under the dataset root.
dtd_images, dtd_labels = read_data(dtd_data_path)
image_label_pairs = list(zip(dtd_images, dtd_labels))

# Hold out 20% of the pairs for testing; the fixed seed keeps the split stable.
train_data, test_data = train_test_split(
    image_label_pairs,
    test_size=0.2,
    random_state=1,
)

# Wrap each split in a dataset that loads and transforms images on demand.
train_dataset = DTDDataset(
    [path for path, _ in train_data],
    [name for _, name in train_data],
    transform=dtd_image_transform,
)
test_dataset = DTDDataset(
    [path for path, _ in test_data],
    [name for _, name in test_data],
    transform=dtd_image_transform,
)

# Batch the splits; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

class_names = [
    'knitted', 'crosshatched', 'scaly', 'polka-dotted', 'woven',
    'stratified', 'striped', 'wrinkled', 'cobwebbed', 'smeared',
    'veined', 'zigzagged', 'banded', 'fibrous', 'crystalline',
    'blotchy', 'freckled', 'grooved', 'spiralled', 'marbled',
    'frilly', 'interlaced', 'bubbly', 'porous', 'lacelike',
    'swirly', 'bumpy', 'potholed', 'honeycombed', 'stained',
    'meshed', 'sprinkled', 'pleated', 'gauzy', 'matted',
    'dotted', 'paisley', 'flecked', 'studded', 'braided',
    'waffled', 'chequered', 'grid', 'perforated', 'cracked',
    'pitted', 'lined',
]

dataset_name = 'DTD'

In [7]:


class BaselineModel(nn.Module):
    """Small convolutional baseline classifier.

    Two 3x3 convolutions (3 -> 16 -> 32 channels), each followed by ReLU and
    a 2x2 max-pool, feed a single linear layer. The linear layer expects
    32 * 56 * 56 features, i.e. a 56x56 map after the two poolings (which
    is what a 224x224 input produces).

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1
        )
        self.conv2 = nn.Conv2d(
            in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1
        )
        self.fc = nn.Linear(in_features=32 * 56 * 56, out_features=num_classes)

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        # Stage 1: conv -> ReLU -> halve the spatial size.
        stage1 = F.max_pool2d(F.relu(self.conv1(x)), 2)
        # Stage 2: same pattern with the second convolution.
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), 2)
        # Collapse everything but the batch dimension for the classifier.
        flat = stage2.view(stage2.size(0), -1)
        return self.fc(flat)


In [8]:
# Maps each class name to its integer index; indices follow the order in
# which the names are listed here (0 for the first, 46 for the last).
label_dict = {
    name: index
    for index, name in enumerate((
        'knitted', 'crosshatched', 'scaly', 'polka-dotted', 'woven',
        'stratified', 'striped', 'wrinkled', 'cobwebbed', 'smeared',
        'veined', 'zigzagged', 'banded', 'fibrous', 'crystalline',
        'blotchy', 'freckled', 'grooved', 'spiralled', 'marbled',
        'frilly', 'interlaced', 'bubbly', 'porous', 'lacelike',
        'swirly', 'bumpy', 'potholed', 'honeycombed', 'stained',
        'meshed', 'sprinkled', 'pleated', 'gauzy', 'matted',
        'dotted', 'paisley', 'flecked', 'studded', 'braided',
        'waffled', 'chequered', 'grid', 'perforated', 'cracked',
        'pitted', 'lined',
    ))
}
def train(model, dataloader, criterion, optimizer, device):
    """Run one training epoch and report its mean loss and accuracy.

    Args:
        model: Module to optimise; it is put into training mode.
        dataloader: Sized iterable of ``(images, labels)`` batches. ``labels``
            may be a tensor of integer class indices, or a sequence of
            class-name strings that are mapped through the module-level
            ``label_dict``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the images and labels are moved to.

    Returns:
        A tuple ``(train_loss, train_acc)``: the per-batch loss averaged over
        the number of batches, and the fraction of correctly classified
        samples. Both are ``0.0`` when ``dataloader`` yields no batches.

    Raises:
        KeyError: If a string label is not a key of ``label_dict``.
    """
    model.train()

    running_loss = 0.0
    correct = 0
    total = 0

    for batch in dataloader:
        images = batch[0].to(device)
        labels = batch[1]
        if torch.is_tensor(labels):
            # Already class indices: just place them on the right device.
            labels = labels.to(device=device, dtype=torch.long)
        else:
            # Class-name strings (as produced by DTDDataset): encode them.
            labels = torch.tensor(
                [label_dict[name] for name in labels], dtype=torch.long
            ).to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # detach() instead of the discouraged ``.data`` attribute.
        predicted = outputs.detach().argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Guard both divisions so an empty loader does not raise ZeroDivisionError.
    num_batches = len(dataloader)
    train_loss = running_loss / num_batches if num_batches else 0.0
    train_acc = correct / total if total else 0.0

    return train_loss, train_acc

def test(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Module to evaluate; it is put into evaluation mode.
        dataloader: Sized iterable of ``(images, labels)`` batches. ``labels``
            may be a tensor of integer class indices, or a sequence of
            class-name strings that are mapped through the module-level
            ``label_dict``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the images and labels are moved to.

    Returns:
        A tuple ``(test_loss, test_acc)``: the per-batch loss averaged over
        the number of batches, and the fraction of correctly classified
        samples. Both are ``0.0`` when ``dataloader`` yields no batches.

    Raises:
        KeyError: If a string label is not a key of ``label_dict``.
    """
    model.eval()

    running_loss = 0.0
    correct = 0
    total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            if torch.is_tensor(labels):
                # Already class indices: just place them on the right device.
                labels = labels.to(device=device, dtype=torch.long)
            else:
                # Class-name strings (as produced by DTDDataset): encode them.
                labels = torch.tensor(
                    [label_dict[name] for name in labels], dtype=torch.long
                ).to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item()
            # argmax replaces torch.max over the discouraged ``.data`` view.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard both divisions so an empty loader does not raise ZeroDivisionError.
    num_batches = len(dataloader)
    test_loss = running_loss / num_batches if num_batches else 0.0
    test_acc = correct / total if total else 0.0

    return test_loss, test_acc


In [9]:
# Define the model, the loss function and the optimizer
model = BaselineModel(num_classes=len(class_names)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training settings
NUM_EPOCHS = 10

# Train and evaluate, once per epoch
for epoch in range(NUM_EPOCHS):
    # One pass over the training loader
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    
    # Evaluate on the test loader
    test_loss, test_acc = test(model, test_loader, criterion, device)
    
    # Print this epoch's training and test metrics
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}:")
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")
    print("-" * 50)


Epoch 1/10:
Train Loss: 4.4094 | Train Acc: 0.0361
Test Loss: 3.8398 | Test Acc: 0.0479
--------------------------------------------------
Epoch 2/10:
Train Loss: 3.6644 | Train Acc: 0.0842
Test Loss: 3.6849 | Test Acc: 0.0913
--------------------------------------------------
Epoch 3/10:
Train Loss: 2.6925 | Train Acc: 0.3757
Test Loss: 3.7445 | Test Acc: 0.1410
--------------------------------------------------
Epoch 4/10:
Train Loss: 1.0444 | Train Acc: 0.7786
Test Loss: 5.2658 | Test Acc: 0.1365
--------------------------------------------------
Epoch 5/10:
Train Loss: 0.4939 | Train Acc: 0.8938
Test Loss: 6.3754 | Test Acc: 0.1108
--------------------------------------------------
Epoch 6/10:
Train Loss: 0.3174 | Train Acc: 0.9324
Test Loss: 6.6343 | Test Acc: 0.1223
--------------------------------------------------
Epoch 7/10:
Train Loss: 0.2076 | Train Acc: 0.9521
Test Loss: 7.0997 | Test Acc: 0.1073
--------------------------------------------------
Epoch 8/10:
Train Loss: 0.1