# Basic Import

In [1]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torchvision.models as models
import tqdm
import clip
import torch
import torch.nn as nn
import torch.optim as optim
import funct

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
EPOCH=20

# Dataset

In [3]:
transform=transforms.Compose([
    transforms.Resize(size=(224,224)),
    transforms.ToTensor()
])
# datasets for fine-tuning and evaluating resnet50
resnet_train = datasets.OxfordIIITPet(root='./data',split='trainval',download=True,transform=transform)
resnet_test=datasets.OxfordIIITPet(root='./data',split='test',download=True,transform=transform)
# dataset for evaluating Clip
clip_test=datasets.OxfordIIITPet(root='./data',split='test',download=True)

train_dataloader=torch.utils.data.DataLoader(resnet_train,batch_size=50,shuffle=True,num_workers=2)
test_dataloader=torch.utils.data.DataLoader(resnet_test,batch_size=20,shuffle=False,num_workers=2)

# Evaluating CLIP

In [4]:
VISUAL_BACKBONE='RN50x64'
names=resnet_train.classes

model, preprocess = clip.load(VISUAL_BACKBONE, device ,download_root='/shareddata/clip/')

text_inputs=torch.cat([clip.tokenize(f"a photo of the pet {c}") for c in names]).to(device)

In [5]:
accuracy=funct.clip_testing(model,preprocess,clip_test,device,text_inputs)

print(f"the accuracy of Clip on OxfordIIITPet dataset is {accuracy*100:.2f}%, visual encoder is {VISUAL_BACKBONE}")

100%|██████████| 3669/3669 [03:15<00:00, 18.78it/s]

the accuracy of Clip on OxfordIIITPet dataset is 93.32%, visual encoder is RN50x64





# Fine-tuning and Evaluating ResNet50

In [6]:
resnet50=models.resnet50(pretrained=True)
resnet50.fc=torch.nn.Linear(2048,37) ##add a fully connected layer to adjust the output dimension
resnet50=resnet50.to(device)

criterion=nn.CrossEntropyLoss()
optimizer = optim.SGD(resnet50.parameters(), lr=1e-2, momentum=0.9)



In [7]:
for i in range(EPOCH): # training resnet50 until the accuracy converges.
    funct.resnet_training(resnet50,criterion,optimizer,train_dataloader,device)
    if(i%4==0 or i+1==EPOCH):
        corrects=funct.resnet_testing(resnet50,test_dataloader,device)
        accuracy=corrects/len(resnet_test)
        print(f"the accuracy of ResNet on OxfordIIITPet dataset is {accuracy*100:.2f}%, the training epoch is {i+1}")

100%|██████████| 74/74 [00:18<00:00,  3.95it/s]
100%|██████████| 184/184 [00:09<00:00, 18.55it/s]


the accuracy of ResNet on OxfordIIITPet dataset is 75.33%, the training epoch is 1


100%|██████████| 74/74 [00:17<00:00,  4.13it/s]
100%|██████████| 74/74 [00:18<00:00,  4.02it/s]
100%|██████████| 74/74 [00:17<00:00,  4.14it/s]
100%|██████████| 74/74 [00:18<00:00,  3.99it/s]
100%|██████████| 184/184 [00:10<00:00, 17.81it/s]


the accuracy of ResNet on OxfordIIITPet dataset is 88.28%, the training epoch is 5


100%|██████████| 74/74 [00:18<00:00,  4.05it/s]
100%|██████████| 74/74 [00:18<00:00,  4.00it/s]
100%|██████████| 74/74 [00:18<00:00,  4.02it/s]
100%|██████████| 74/74 [00:18<00:00,  4.03it/s]
100%|██████████| 184/184 [00:10<00:00, 18.18it/s]


the accuracy of ResNet on OxfordIIITPet dataset is 90.52%, the training epoch is 9


100%|██████████| 74/74 [00:18<00:00,  4.11it/s]
100%|██████████| 74/74 [00:18<00:00,  4.04it/s]
100%|██████████| 74/74 [00:17<00:00,  4.12it/s]
100%|██████████| 74/74 [00:17<00:00,  4.17it/s]
100%|██████████| 184/184 [00:09<00:00, 19.46it/s]


the accuracy of ResNet on OxfordIIITPet dataset is 90.76%, the training epoch is 13


100%|██████████| 74/74 [00:17<00:00,  4.21it/s]
100%|██████████| 74/74 [00:17<00:00,  4.16it/s]
100%|██████████| 74/74 [00:17<00:00,  4.17it/s]
100%|██████████| 74/74 [00:18<00:00,  4.11it/s]
100%|██████████| 184/184 [00:08<00:00, 21.09it/s]


the accuracy of ResNet on OxfordIIITPet dataset is 90.92%, the training epoch is 17


100%|██████████| 74/74 [00:17<00:00,  4.17it/s]
100%|██████████| 74/74 [00:17<00:00,  4.20it/s]
100%|██████████| 74/74 [00:17<00:00,  4.11it/s]
100%|██████████| 184/184 [00:10<00:00, 17.86it/s]

the accuracy of ResNet on OxfordIIITPet dataset is 91.09%, the training epoch is 20



