# Assignment 02: Applications of CLIP model
Design experiments to evaluate the test performance of the CLIP model on a new dataset chosen by
the student. The goal of this task is to explore potential real-world applications of CLIP, so
students are encouraged to apply CLIP to new scenarios (new datasets/settings/tasks/…).

### Basic Imports

In [1]:
#conda create -n clip python=3.9
#activate clip

In [2]:
#pip install torch==1.9.0
#pip install torchaudio==0.9.0
#pip install torchvision==0.10.0

In [3]:
#! pip install ftfy regex tqdm
#! pip install git+https://github.com/openai/CLIP.git

In [4]:
import os
import time
import os.path as osp

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

from torchvision.datasets import CIFAR10
from torchvision import datasets
from torchvision import transforms
import torchvision

from PIL import Image, ImageFilter
import matplotlib.pyplot as plt
from PIL import Image
from clip import clip

### Hyperparameters

In [5]:
# --- Data loading -----------------------------------------------------------
# Batch size handed to the DataLoader of the zero-shot evaluation set.
BATCH_SIZE = 128

# --- CLIP -------------------------------------------------------------------
# Backbone name passed to clip.load().
# Other values listed as options for this notebook: 'RN50', 'ViT-B/16'.
VISUAL_BACKBONE = 'ViT-B/32'

# --- Settings left disabled in this notebook --------------------------------
#   SEED = 1, NUM_CLASS = 10
#   NUM_EPOCHS = 30, EVAL_INTERVAL = 1, SAVE_DIR = './log'
#   LEARNING_RATE = 1e-1, MOMENTUM = 0.9, STEP = 5, GAMMA = 0.5


### Device

In [6]:
# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


### Dataset


In [7]:
"""
transform_cifar10_test = transforms.Compose([
    transforms.Resize(size=224),
    transforms.CenterCrop(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

test_set = torchvision.datasets.CIFAR10(root='/shareddata', train=False,
                                       download=True, transform=transform_cifar10_test)
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE,
                                         shuffle=False, num_workers=2)

class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
dataset_name = 'CIFAR10'
"""

"""
transform_dtd_test = transforms.Compose([
    transforms.Resize(size=224),
    transforms.CenterCrop(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),  # 使用ImageNet的均值和标准差
])

test_set = torchvision.datasets.DatasetFolder(root='/shareddata/dtd', loader=torchvision.datasets.folder.default_loader,
                                              extensions=('jpeg', 'jpg', 'png'), transform=transform_dtd_test)

test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE,
                                              shuffle=False, num_workers=2)
                                              
class_names = ['ImageNet', 'Omniglot', 'Aircraft', 'Brids', 'DTD', 'QuickDraw', 'Funji', 'VGGFlower', 'TrafficSigns', 'MSCOCO']
dataset_name = 'dtd'
"""

# Evaluation pipeline for the zero-shot test.
#
# The previous pipeline applied RandomResizedCrop / RandomHorizontalFlip /
# RandomRotation to the *test* images, so every run scored a different set of
# crops. Evaluation must be deterministic, hence Resize + CenterCrop only.
#
# CLIP expects inputs normalised with its own statistics (the ones used by the
# `preprocess` transform returned by clip.load()), not the ImageNet ones.
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)

transform_dtd_test = transforms.Compose([
    transforms.Resize(size=224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(CLIP_MEAN, CLIP_STD),
])
# Old name retained only so that any existing reference keeps working.
transform_train = transform_dtd_test

# NOTE(review): DatasetFolder uses the immediate sub-directories of `root` as
# classes. For the standard DTD archive layout that directory would be
# '<dtd>/images' — confirm that '/shareddata/dtd' directly contains the
# per-class folders.
test_set = torchvision.datasets.DatasetFolder(
    root='/shareddata/dtd',
    loader=torchvision.datasets.folder.default_loader,
    extensions=('jpeg', 'jpg', 'png'),
    transform=transform_dtd_test,
)

# Shuffling has no effect on accuracy; keep the order fixed for reproducibility.
test_dataloader = torch.utils.data.DataLoader(
    test_set,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)

# Prompt i must describe label i. DatasetFolder assigns label indices from the
# sorted folder names, so the class names are taken from the dataset itself
# instead of a hand-written list that does not correspond to those folders.
class_names = list(test_set.classes)
dataset_name = 'dtd'


### Model

In [8]:
# Fetch the CLIP checkpoint selected by VISUAL_BACKBONE (cached under
# /shareddata/clip/) together with the image transform that clip.load() returns.
model, preprocess = clip.load(
    VISUAL_BACKBONE,
    device=device,
    download_root='/shareddata/clip/',
)
# Left as the last expression of the cell so the notebook displays the module tree.
model.to(device)

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

### Task 1: Prompt Generation
---

Please define a function named ``prompt_encode`` to encode the text using the CLIP text encoder.


In [9]:
#import clip
#clip.available_models()

In [10]:
# Text placed in front of every class name; other prompts can be tried here,
# e.g. 'a photo of a'.
# NOTE(review): the active value looks like an intentionally meaningless prompt
# (a prompt-sensitivity baseline?) — confirm before reporting results.
prompt = "asdqwedqwdwda"

# One tokenized sentence "<prompt> <class name>" per class, concatenated along
# the first dimension and moved to the compute device.
text_inputs = torch.cat(
    [clip.tokenize(f"{prompt} {class_name}") for class_name in class_names],
    dim=0,
).to(device)

def prompt_encode(prompt):
    """Tokenize ``"<prompt> <class name>"`` for every entry of ``class_names``.

    Despite its name, this function only runs ``clip.tokenize``; it does not
    call the CLIP text encoder. It reads the module-level ``class_names`` and
    ``device``.

    Args:
        prompt: Text inserted in front of each class name.

    Returns:
        The concatenation of the per-class ``clip.tokenize`` outputs, moved to
        ``device``.
    """
    per_class_tokens = []
    for class_name in class_names:
        per_class_tokens.append(clip.tokenize(f"{prompt} {class_name}"))
    return torch.cat(per_class_tokens).to(device)


### Task 2: Zero-shot inference
---

Please define a function named ``model_inference``. The function is essential for training and evaluating machine learning models using batched data from dataloaders.

**To do**: 
1. Encode the image.
2. Encode the text.
3. Calculate the logits.

In [11]:

def model_inference(model, image, text_inputs):
    """Compute scaled cosine-similarity logits between images and texts.

    Args:
        model: Object exposing ``encode_image``, ``encode_text`` and a
            ``logit_scale`` tensor (as the loaded CLIP model does).
        image: Batch passed unchanged to ``model.encode_image``.
        text_inputs: Batch passed unchanged to ``model.encode_text``.

    Returns:
        ``exp(logit_scale) * normalized_image_features @ normalized_text_features.T``,
        i.e. one row per image and one column per text.
    """
    # The two encoder passes run without gradient tracking.
    with torch.no_grad():
        img_emb = model.encode_image(image)
        txt_emb = model.encode_text(text_inputs)

    # In-place L2 normalisation over the last (feature) dimension, so the
    # matrix product below yields cosine similarities.
    img_emb.div_(img_emb.norm(dim=-1, keepdim=True))
    txt_emb.div_(txt_emb.norm(dim=-1, keepdim=True))

    # logit_scale is stored in log space; exponentiate before scaling.
    scale = model.logit_scale.exp()
    scaled_img = scale * img_emb
    return scaled_img @ txt_emb.t()

### Task 3: Zero-shot accuracy calculation
---

In [None]:

# Bookkeeping containers; nothing in this cell appends to them.
testing_loss, testing_acc = [], []

# Zero-shot evaluation: no gradients are needed anywhere in this cell.
with torch.no_grad():
    model.eval()

    val_loss = 0.0
    val_corrects = 0

    for image, target in test_dataloader:
        image, target = image.to(device), target.to(device)

        # Predicted class = index of the text prompt with the largest logit.
        logits = model_inference(model, image, text_inputs)
        preds = torch.max(logits, 1)[1]

        val_corrects = val_corrects + (preds == target).sum()

    # Fraction of correctly classified images over the whole evaluation set.
    val_acc = val_corrects.double() / len(test_set)

    print(f"the zero-shot performance on {dataset_name} is {val_acc*100:.2f}%, visual encoder is {VISUAL_BACKBONE}.")




In [None]:
import torch
import torchvision
from torch.autograd import Variable
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import tarfile
from torchvision.datasets import ImageFolder
"""
train_dataset = datasets.MNIST(root = 'data/', train = True, 
                               transform = transforms.ToTensor(), download = True)
test_dataset = datasets.MNIST(root = 'data/', train = False, 
                               transform = transforms.ToTensor(), download = True)

train_loader = DataLoader(dataset = train_dataset, batch_size = 100, shuffle = True)
test_loader = DataLoader(dataset = test_dataset, batch_size= 100, shuffle = True)
""" 
"""
data_folder = './dtd-r1.0.1.tar.gz'
extract_folder = './extracted_data'

with tarfile.open(data_folder, 'r:gz') as tar:
    tar.extractall(path=extract_folder)
"""

# Root folder of the extracted DTD archive.
extracted_folder = '/data/lab/STA303-Assignment02/data/dtd'

# ImageFolder treats every sub-directory of this folder as one class.
# Both names point at the same directory; the actual train/test separation is
# done by the random split below.
train_folder = os.path.join(extracted_folder, 'images')
test_folder = os.path.join(extracted_folder, 'images')

# Bring every image to one fixed size so that samples can be batched.
transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((382, 540)),
    torchvision.transforms.ToTensor()
])

# Previously the training set and the test set were two ImageFolder objects
# over the *same* directory, so the reported "test" accuracy was measured on
# the training images. Load the folder once and hold out a disjoint 20% of it.
full_dataset = torchvision.datasets.ImageFolder(train_folder, transform=transform)
num_test = len(full_dataset) // 5
num_train = len(full_dataset) - num_test
train_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset,
    [num_train, num_test],
    generator=torch.Generator().manual_seed(0),  # fixed seed: same split every run
)

# Data loaders; only the training data needs to be shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=100, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=100, shuffle=False)




In [None]:
class Model(torch.nn.Module):
    """Small CNN classifier for 3-channel images of any spatial size.

    Architecture: two 3x3 convolutions with ReLU, a 2x2 max-pool, an adaptive
    average pool to a 14x14 grid, then two fully connected layers with dropout.

    Args:
        num_classes: Number of output logits. Defaults to 10, the value that
            was previously hard-coded.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = torch.nn.Sequential(
            torch.nn.Conv2d(3, 64, 3, 1, 1),
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 128, 3, 1, 1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, 2),
        )
        # The first linear layer expects exactly 14*14*128 features, which the
        # conv stack only produces for 28x28 inputs. Pooling to a fixed 14x14
        # grid makes the network accept other resolutions (it is the identity
        # when the feature map is already 14x14).
        self.pool = torch.nn.AdaptiveAvgPool2d((14, 14))
        self.dense = torch.nn.Sequential(
            torch.nn.Linear(14 * 14 * 128, 1024),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.5),
            torch.nn.Linear(1024, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = self.conv1(x)
        x = self.pool(x)
        # Flatten everything except the batch dimension. The former
        # view(-1, 14*14*128) could silently change the batch size (or raise)
        # when the feature map was not 14x14.
        x = x.flatten(1)
        return self.dense(x)


In [None]:
# Use the GPU when one is available; hard-coding 'cuda' fails on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Model()

# The classifier head must emit one logit per dataset class, otherwise
# CrossEntropyLoss receives out-of-range targets. The class list lives on the
# ImageFolder itself, or on `.dataset` when train_dataset is a Subset wrapper.
label_source = getattr(train_dataset, 'dataset', train_dataset)
num_classes = len(label_source.classes)
head = model.dense[-1]
if head.out_features != num_classes:
    model.dense[-1] = torch.nn.Linear(head.in_features, num_classes)

model = model.to(device)

# Loss and optimizer (Adam with its default settings).
cost = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [None]:
if __name__ == "__main__":
    epochs = 5
    # Mixed precision only makes sense on CUDA; disable it cleanly elsewhere.
    use_amp = device.type == 'cuda'
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)  # gradient scaler for AMP

    for epoch in range(epochs):
        # Make sure dropout is active even if an evaluation cell ran before.
        model.train()
        sum_loss = 0.0
        train_correct = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            # Forward pass under autocast to speed up training with mixed precision.
            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(inputs)
                loss = cost(outputs, labels)

            # Scale the loss so that small gradients do not underflow in fp16.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Running statistics as plain Python numbers (no autograd history).
            predicted = outputs.detach().argmax(dim=1)
            sum_loss += loss.item()
            train_correct += (predicted == labels).sum().item()

        print('[%d/%d] loss: %.03f' % (epoch + 1, epochs, sum_loss / len(train_loader)))
        print('        correct: %.03f%%' % (100 * train_correct / len(train_dataset)))


In [None]:
# Evaluate the trained CNN on the test loader.
model.eval()  # disables dropout
test_correct = 0

# No gradients are needed for evaluation; without no_grad() every batch would
# build and keep an autograd graph, wasting memory.
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        predicted = outputs.argmax(dim=1)
        test_correct += (predicted == labels).sum().item()

print("correct:%.3f%%" % (100 * test_correct / len(test_dataset)))