In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM; everything the notebook reads
# afterwards lives under this mount point.
drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [2]:
import os

# Project folder on the mounted Drive.
path = "/content/drive/MyDrive/smallcap"

# Make it the working directory so that the relative paths used by the
# later cells (src/, datastore/, data/) resolve inside the project folder.
os.chdir(path)


In [3]:
import torch
from transformers import ViTFeatureExtractor, AutoTokenizer, CLIPFeatureExtractor, AutoModel, AutoModelForCausalLM
from transformers.models.auto.configuration_auto import AutoConfig
from src.vision_encoder_decoder import SmallCap, SmallCapConfig
from src.gpt2 import ThisGPT2Config, ThisGPT2LMHeadModel
from src.utils import prep_strings, postprocess_preds


# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Load the CLIP image preprocessor that prepares pixel inputs for the captioner.
feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32")


# Load and configure the GPT-2 tokenizer.
# '!' is used as the padding token and '.' as the end-of-sequence token; the
# generation cell passes tokenizer.eos_token_id, so decoding stops at the
# first full stop.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = '!'
tokenizer.eos_token = '.'

# Register the project's custom config/model classes with the Auto* factories
# so that AutoModel.from_pretrained can resolve the "this_gpt2" and "smallcap"
# model types.
AutoConfig.register("this_gpt2", ThisGPT2Config)
AutoModel.register(ThisGPT2Config, ThisGPT2LMHeadModel)
AutoModelForCausalLM.register(ThisGPT2Config, ThisGPT2LMHeadModel)
AutoConfig.register("smallcap", SmallCapConfig)
AutoModel.register(SmallCapConfig, SmallCap)


# Load the captioning model from the project folder and move it to `device`.
model = AutoModel.from_pretrained("/content/drive/MyDrive/smallcap/")
model = model.to(device)

# Prompt template; a trailing space is appended after stripping whitespace.
# The `with` block closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open('src/template.txt') as template_file:
    template = template_file.read().strip() + ' '

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# Pinned to the commit this notebook was last run against so that re-runs are
# reproducible.
%pip install git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-4g_xz0b3
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-4g_xz0b3
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369497 sha256=aaa827c8e54a441e850e53342859c81049ac9fee12e9afcb92d925690343107a
  Stored in directory: /tmp/pip-ephem-wheel-cache-plfmn26u/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b112deb897d5b50f5ad9a37e4
Successfully built clip
Inst

In [6]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# Pinned to the version this notebook was last run against.
%pip install faiss-gpu==1.7.2


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [7]:
# precompute captions for retrieval
import json
import clip
import faiss


from src.retrieve_caps import *

# Captions of the retrieval datastore; rows returned by the FAISS index are
# used as positions into this list.
with open('datastore/flickr30k_index_captions.json') as captions_file:
    captions = json.load(captions_file)

# CLIP model (and its matching image preprocessor) used to embed query images.
retrieval_model, feature_extractor_retrieval = clip.load("RN50x64", device=device)

retrieval_index = faiss.read_index('datastore/flickr30k_index')
# Only move the index to GPU 0 when a GPU is actually in use; on a CPU-only
# runtime (device == "cpu") the index is searched on the CPU instead of
# crashing while creating GPU resources.
if device == "cuda":
    # `res` stays a module-level name so the GPU resources remain referenced
    # for as long as the GPU index is in use.
    res = faiss.StandardGpuResources()
    retrieval_index = faiss.index_cpu_to_gpu(res, 0, retrieval_index)

100%|█████████████████████████████████████| 1.26G/1.26G [00:51<00:00, 26.5MiB/s]


In [8]:
def retrieve_caps(image_embedding, index, k=4):
    """Return the ids of the ``k`` nearest datastore entries for each query.

    Args:
        image_embedding: Array-like of query vectors, either a single 1-D
            vector or a 2-D array with one row per query. It is not modified.
        index: Object exposing ``search(queries, k)`` and returning a
            ``(distances, ids)`` pair, e.g. a FAISS index.
        k: Number of neighbours to retrieve per query.

    Returns:
        The ids returned by ``index.search`` (one row per query); the
        distances are discarded.
    """
    # Imported here because the notebook never imports numpy explicitly; the
    # name `np` was previously only available by accident of a star import.
    import numpy as np

    # Always work on a private float32, C-contiguous copy: normalize_L2
    # operates in place and must not touch the caller's array.
    xq = np.array(image_embedding, dtype=np.float32, order="C")
    if xq.ndim == 1:
        # Promote a single vector to a one-row batch.
        xq = xq.reshape(1, -1)
    faiss.normalize_L2(xq)
    _, neighbor_ids = index.search(xq, k)
    return neighbor_ids

In [9]:
import os
from PIL import Image
import torch
import time

image_dir = 'data/'
# File extensions (compared case-insensitively) that are treated as images;
# every other entry of image_dir is skipped.
image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
# Number of retrieved captions placed in the decoder prompt.
num_retrieved_caps = 4

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the printed results stable between runs.
for image_file in sorted(os.listdir(image_dir)):
    if not image_file.lower().endswith(image_extensions):
        continue

    image_path = os.path.join(image_dir, image_file)
    print(f"Processing: {image_path}")

    # convert() returns a new image, so the file handle can be released as
    # soon as the pixels have been decoded.
    with Image.open(image_path) as opened_image:
        image = opened_image.convert("RGB")

    # Timing covers retrieval + generation, not image loading.
    start = time.time()

    # 1) Embed the image with the retrieval CLIP model.
    pixel_values_retrieval = feature_extractor_retrieval(image).to(device)
    with torch.no_grad():
        image_embedding = retrieval_model.encode_image(pixel_values_retrieval.unsqueeze(0)).cpu().numpy()

    # 2) Look up the nearest datastore captions for this (single) query.
    nns = retrieve_caps(image_embedding, retrieval_index, k=num_retrieved_caps)[0]
    # FAISS reports -1 for result slots it could not fill; skip those rather
    # than letting captions[-1] silently pick the last caption.
    caps = [captions[i] for i in nns if i >= 0]

    # 3) Build the decoder prompt from the template and retrieved captions.
    decoder_input_ids = prep_strings('', tokenizer, template=template, retrieved_caps=caps,
                                     k=num_retrieved_caps, is_test=True)

    # 4) Generate the caption with beam search, conditioned on image + prompt.
    pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
    with torch.no_grad():
        pred = model.generate(pixel_values.to(device),
                              decoder_input_ids=torch.tensor([decoder_input_ids]).to(device),
                              max_new_tokens=25, no_repeat_ngram_size=0, length_penalty=0,
                              min_length=1, num_beams=3, eos_token_id=tokenizer.eos_token_id)

    elapsed_time = time.time() - start
    print(f"Time taken: {elapsed_time} seconds")
    display(image)
    # Uncomment to inspect the prompt that was fed to the decoder:
    #print("\nPrompt:\n")
    #print(tokenizer.decode(decoder_input_ids))
    generated_caption = postprocess_preds(tokenizer.decode(pred[0]), tokenizer)
    print("\nGenerated caption:\n")
    print(generated_caption)
    print("\n" + "-"*50 + "\n")


Output hidden; open in https://colab.research.google.com to view.