## Import Library and load model

In [1]:
import numpy as np
import torch
import clip
from pkg_resources import packaging

print("Torch version:", torch.__version__)

#clip.available_models() # List the name of clip models

model, preprocess = clip.load("ViT-B/32")
model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)


Torch version: 1.13.0


# Image Preprocessing



In [None]:
# train dataset

from pycocotools.coco import COCO
from PIL import Image
from tqdm import tqdm
import skimage.io as io
import matplotlib.pyplot as plt
#import urllib.request

annFile = "annotations/captions_train2014.json"
coco = COCO(annFile)
imgIds = coco.getImgIds()
img = []
anns_lst = []
images= []

for i in tqdm(range(len(imgIds[0:50000]))):
  img.append(coco.loadImgs(imgIds)[i])
  ann_ids = coco.getAnnIds(imgIds=img[i]['id'], iscrowd=None)
  anns = coco.loadAnns(ann_ids)
  #urllib.request.urlretrieve(img[i]['coco_url'],"temp_file")
  image_file = 'train2014\COCO_train2014_' + str(anns[0]['image_id']).zfill(12) + '.jpg'
  I = Image.open(image_file)
  anns_lst.append(anns)
  images.append(preprocess(I))

In [3]:
# test datset

from pycocotools.coco import COCO
from PIL import Image
from tqdm import tqdm
import skimage.io as io
import matplotlib.pyplot as plt
#import urllib.request

annFile = "annotations/captions_train2014.json"
coco = COCO(annFile)
total_imgIds = coco.getImgIds()
imgIds = total_imgIds[60000:]
img = []
anns_lst = []
images= []

for i in tqdm(range(len(imgIds[0:2000]))):
  img.append(coco.loadImgs(imgIds)[i])
  ann_ids = coco.getAnnIds(imgIds=img[i]['id'], iscrowd=None)
  anns = coco.loadAnns(ann_ids)
  #urllib.request.urlretrieve(img[i]['coco_url'],"temp_file")
  image_file = 'train2014\COCO_train2014_' + str(anns[0]['image_id']).zfill(12) + '.jpg'
  I = Image.open(image_file)
  anns_lst.append(anns)
  images.append(preprocess(I))

loading annotations into memory...
Done (t=0.70s)
creating index...


  0%|          | 7/2000 [00:00<00:29, 67.81it/s]

index created!


100%|██████████| 2000/2000 [00:32<00:00, 62.23it/s]


## Building features for 50000 separately

In [None]:
chunk_size = 2000
images_list = [images[i:i+chunk_size] for i in range(0, len(images), chunk_size)]
text_list = [anns_lst[i:i+chunk_size] for i in range(0, len(anns_lst), chunk_size)]

for n in tqdm(range(25)):

    image_input = torch.tensor(np.stack(images_list[n])).cuda()
    text_tokens = clip.tokenize([text_list[n][i][0]['caption'] for i in range(chunk_size)]).cuda()

    with torch.no_grad():
        image_features = model.encode_image(torch.tensor(image_input)).float()
        text_features = model.encode_text(torch.tensor(text_tokens)).float()

    image_file = 'image_features_' + str(n+1) + '.pt'
    text_file = 'text_features_' + str(n+1) + '.pt'
    
    torch.save(image_features, image_file)
    torch.save(text_features, text_file)

    del image_input
    del text_tokens
    del image_features
    del text_features

    torch.cuda.empty_cache()

In [None]:
image_file_list = []
text_file_list = []

for i in range(25):
    image_feature_file = 'image_features_' + str(i+1) + '.pt'
    text_feature_file = 'text_features_' + str(i+1) + '.pt'

    image_tensor = torch.load(image_feature_file, map_location=lambda storage, loc: storage)
    text_tensor = torch.load(text_feature_file, map_location=lambda storage, loc: storage)

    image_file_list.append(image_tensor)
    text_file_list.append(text_tensor)

image_feature = torch.cat(tuple(image_file_list))
text_feature = torch.cat(tuple(text_file_list))

In [None]:
torch.save(image_feature, 'image_feature_50000.pt')
torch.save(text_feature, 'text_feature_50000.pt')

## Building features for single set

In [4]:
image_input = torch.tensor(np.stack(images)).cuda()
text_tokens = clip.tokenize([anns_lst[i][0]['caption'] for i in range(len(img))]).cuda()

In [5]:
with torch.no_grad():
    image_features = model.encode_image(torch.tensor(image_input)).float()
    text_features = model.encode_text(torch.tensor(text_tokens)).float()

torch.save(image_features, 'image_features_test2000.pt')
torch.save(text_features, 'text_features_test2000.pt')

  image_features = model.encode_image(torch.tensor(image_input)).float()
  text_features = model.encode_text(torch.tensor(text_tokens)).float()
