# Setup Code


### Autoreload

First, run this cell load the [autoreload](https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html?highlight=autoreload) extension. This allows us to edit `.py` source files, and re-import them into the notebook for a seamless editing and debugging experience.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os

# TODO: Fill in the Google Drive path where you uploaded the assignment
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = "/content/drive/MyDrive/sem4/CIS583/Project"#"Teaching/DeepLearning/Homework/HW4-Solutions"
# GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = None
assert GOOGLE_DRIVE_PATH_AFTER_MYDRIVE is not None
GOOGLE_DRIVE_PATH = os.path.join('drive', 'MyDrive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH))

['cis583_project.ipynb', 'cis583_image captioning.ipnyb', 'Notes.docx', 'dataset', 'Image_data']


In [None]:
#Library Imports
import tarfile
import pickle
import sys
import numpy as np

# ***DATASET***

1. Pretraining Dataset (100k samples): This dataset is used for pretraining purposes and includes a large number of driving scenarios with ground-truth representations of the driving scenes. The data collected from this subset is used to optimize the model's weights, particularly focusing on the vector representation of the environment.

2. QA Labeling and Fine-tuning Dataset (10k samples): This subset is dedicated to QA labeling and fine-tuning of the model. It contains a set of driving scenarios similar to those in the pretraining dataset but with additional labels and annotations for fine-tuning the model's performance in question-answering tasks related to driving scenarios.
**bold text**

In [None]:


# Path to the dataset folder
dataset_folder = os.path.join(GOOGLE_DRIVE_PATH, 'dataset')

# extracting the test data file
test_data_path = os.path.join(dataset_folder, 'vqa_test_1k.tar.gz')
with tarfile.open(test_data_path, 'r:gz') as tar:
    tar.extractall(dataset_folder)

# extracting the train data file
train_data_path = os.path.join(dataset_folder, 'vqa_train_10k.tar.gz')
with tarfile.open(train_data_path, 'r:gz') as tar:
    tar.extractall(dataset_folder)


with open(os.path.join(dataset_folder, 'vqa_test_1k.pkl'), 'rb') as f:
    test_data = pickle.load(f)

with open(os.path.join(dataset_folder, 'vqa_train_10k.pkl'), 'rb') as f:
    train_data = pickle.load(f)

# Select only subset data
test_data_subset = test_data[:50]
train_data_subset = train_data[:500]



In [None]:
len(train_data_subset)

500

In [None]:
print(type(train_data_subset))

<class 'list'>


In [None]:
train_data_subset[:1]

[{'frame_num': 0,
  'observation': {'ego_vehicle_descriptor': tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 2.3250, 1.0050, 0.7800, 1.0000,
           0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.3679, 0.0000,
           0.0000, 0.0798, 0.0798, 0.0798, 0.1423, 0.1751, 1.0000, 1.0000, 0.8573,
           0.4652, 0.3694, 0.3694, 0.4039]),
   'liable_vehicles': None,
   'pedestrian_descriptors': tensor([[ 1.0000,  1.7488,  3.0088,  0.3324,  0.1082,  0.9687,  0.2483,  0.0000,
             0.0000],
           [ 1.0000,  1.5271,  4.8531, -0.6271,  0.1032, -1.0000, -0.0086,  0.0000,
             0.0000],
           [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
             0.0000],
           [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
             0.0000],
           [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
             0.0000],
           [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.000

IMAGE CAPTION

In [None]:
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load  image
def load_image(image_path):
  # Replace this with your preferred method to load the image (e.g., OpenCV, PIL)
  # This example uses placeholder code
  return np.array([128, 128, 3])  # Replace with actual image data

# Preprocess the image
def preprocess_image(image):
  image = image[:, :, ::-1].copy()  # BGR to RGB conversion (if necessary)
  image = processor(image=image, return_tensors="pt")
  return image

# Generate text descriptions
def describe_image(image_path):
  image = preprocess_image(load_image(image_path))
  with torch.no_grad():
    outputs = model(**image)
    logits_per_image, logits_per_text = outputs.logits_per_image, outputs.logits_per_text
  probs = logits_per_text.softmax(dim=-1)
  top_5_probs, top_5_labels = probs.topk(5, dim=-1)

  captions = []
  for i in range(top_5_probs.shape[0]):
    for j in range(top_5_probs.shape[1]):
      caption = processor.decode(top_5_labels[i, j].item(), skip_special_tokens=True)
      captions.append(f"{caption} - {top_5_probs[i, j].item():.3f}")
  return captions




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
import torch

from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


# Load image (using OpenCV)
def load_image(image_path):
  import cv2
  try:
    image = cv2.imread(image_path)
    if image is None:
      raise ValueError(f"Failed to read image from path: {image_path}")
    return image
  except Exception as e:
    print(f"Error loading image: {e}")
    return None


# Preprocess the image
def preprocess_image(image):
  if image is None:
    return None
  image = image[:, :, ::-1].copy()  # BGR to RGB conversion (if necessary)
  image = processor(image=image, return_tensors="pt")
  return image


# Generate text descriptions
def describe_image(image_path):
  image = load_image(image_path)
  if image is None:
    return None

  preprocessed_image = preprocess_image(image)  # Get preprocessed image
  if preprocessed_image is None:
    return None

  with torch.no_grad():
    outputs = model(**preprocessed_image)  # Pass preprocessed_image using **
    logits_per_image, logits_per_text = outputs.logits_per_image, outputs.logits_per_text
  probs = logits_per_text.softmax(dim=-1)
  top_5_probs, top_5_labels = probs.topk(5, dim=-1)

  captions = []
  for i in range(top_5_probs.shape[0]):
    for j in range(top_5_probs.shape[1]):
      caption = processor.decode(top_5_labels[i, j].item(), skip_special_tokens=True)
      captions.append(f"{caption} - {top_5_probs[i, j].item():.3f}")
  return captions

# Test
image_path = "/content/quebeccanada-driving-down-papineau-street-in-montreal-J7E17C.jpg"
descriptions = describe_image(image_path)

if descriptions is not None:
  print("Descriptions:")
  for desc in descriptions:
    print(desc)
else:
  print("Error: Could not generate descriptions for the image.")


Error loading image: Failed to read image from path: /content/quebeccanada-driving-down-papineau-street-in-montreal-J7E17C.jpg
Error: Could not generate descriptions for the image.


Q-A Module

YOLO v3 for object n feature detection





In [None]:
!git clone https://github.com/AlexeyAB/darknet

Cloning into 'darknet'...
remote: Enumerating objects: 15833, done.[K
remote: Total 15833 (delta 0), reused 0 (delta 0), pack-reused 15833[K
Receiving objects: 100% (15833/15833), 14.35 MiB | 20.55 MiB/s, done.
Resolving deltas: 100% (10670/10670), done.


In [2]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=b677a0ab12a15febea99f50d862db1d9e58d47d1f857992327ea1d97ff8afd82
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [3]:
import wget
import cv2
# Download YOLOv3 weights and configuration files
yolo_weights_url = "https://pjreddie.com/media/files/yolov3.weights" #"https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.weights?raw=true"
yolo_config_url = "https://github.com/pjreddie/darknet/blob/master/cfg/yolov3.cfg?raw=true"
yolo_names_url = "https://github.com/pjreddie/darknet/blob/master/data/coco.names?raw=true"

yolo_weights_path = "yolov3.weights"
yolo_config_path = "yolov3.cfg"
yolo_names_path = "coco.names"

wget.download(yolo_weights_url, yolo_weights_path)
wget.download(yolo_config_url, yolo_config_path)
wget.download(yolo_names_url, yolo_names_path)

# Load YOLO model and class names
net = cv2.dnn.readNet(yolo_weights_path, yolo_config_path)
with open(yolo_names_path, "r") as f:
    class_names = [line.strip() for line in f.readlines()]


In [4]:
def detect_objects(image):
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    layer_names = net.getLayerNames()

    # Convert numpy array to list
    unconnected_layers = net.getUnconnectedOutLayers().flatten().tolist()
    output_layers_indices = [layer_names[i - 1] for i in unconnected_layers]

    detected_objects = []
    outputs = net.forward(output_layers_indices)

    for output in outputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5:  # threshold for detection confidence
                box = detection[0:4] * np.array([image.shape[1], image.shape[0], image.shape[1], image.shape[0]])
                (centerX, centerY, width, height) = box.astype("int")
                x = int(centerX - (width / 2))
                y = int(centerY - (height / 2))
                detected_objects.append((class_names[class_id], confidence, (x, y, int(width), int(height))))

    return detected_objects



In [5]:
def objects_to_caption(detected_objects):
    captions = []
    for obj in detected_objects:
        class_name, confidence, (x, y, width, height) = obj
        caption = f"{class_name} with confidence {confidence:.2f} at position ({x}, {y}) with width {width} and height {height}"
        captions.append(caption)
    return captions

In [6]:
def generate_image_cap_clip(
    image,
    clip_model,
    model,
    tokenizer,
    preprocess,
    device,
    prefix_length,
    entry_length=100,
    temperature=0.9,
    top_p=0.7,
    beam_size=10,
    use_beam_search=False
):

    image_model, preprocess = clip.load("ViT-B/32", device=device)

    image = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        prefix = image_model.encode_image(image).to(device, dtype=torch.float32)
        prefix_embed = model.clip_project(prefix)

    token_features = prefix_embed.size(-1)
    expected_shape = (1, -1, token_features)
    prefix_embed = prefix_embed.view(expected_shape)

    if use_beam_search:
        generated_text_prefix = generate_beam(
            model,
            tokenizer,
            embed=prefix_embed,
            beam_size=beam_size,
            entry_length=entry_length,
            temperature=temperature
        )[0]
    else:
        generated_text_prefix = generate2(
            model,
            tokenizer,
            embed=prefix_embed,
            entry_length=entry_length,
            temperature=temperature,
            top_p=top_p
        )

    return generated_text_prefix

In [7]:
import torch.nn.functional as nnf

def generate_beam(model, tokenizer, beam_size: int = 10, prompt=None, embed=None,
                  entry_length=100, temperature=0.7, stop_token: str = '.'):

    model.eval()
    stop_token_index = tokenizer.encode(stop_token)[0]
    tokens = None
    scores = None
    device = next(model.parameters()).device
    seq_lengths = torch.ones(beam_size, device=device)
    is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
    with torch.no_grad():
        if embed is not None:
            generated = embed
        else:
            if tokens is None:
                tokens = torch.tensor(tokenizer.encode(prompt))
                tokens = tokens.unsqueeze(0).to(device)
                generated = model.gpt.transformer.wte(tokens)
        for i in range(entry_length):
            outputs = model.gpt_model(inputs_embeds=generated)
            logits = outputs.logits
            logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
            logits = logits.softmax(-1).log()
            if scores is None:
                scores, next_tokens = logits.topk(beam_size, -1)
                generated = generated.expand(beam_size, *generated.shape[1:])
                next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
                if tokens is None:
                    tokens = next_tokens
                else:
                    tokens = tokens.expand(beam_size, *tokens.shape[1:])
                    tokens = torch.cat((tokens, next_tokens), dim=1)
            else:
                logits[is_stopped] = -float(np.inf)
                logits[is_stopped, 0] = 0
                scores_sum = scores[:, None] + logits
                seq_lengths[~is_stopped] += 1
                scores_sum_average = scores_sum / seq_lengths[:, None]
                scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1)
                next_tokens_source = next_tokens // scores_sum.shape[1]
                seq_lengths = seq_lengths[next_tokens_source]
                next_tokens = next_tokens % scores_sum.shape[1]
                next_tokens = next_tokens.unsqueeze(1)
                tokens = tokens[next_tokens_source]
                tokens = torch.cat((tokens, next_tokens), dim=1)
                generated = generated[next_tokens_source]
                scores = scores_sum_average * seq_lengths
                is_stopped = is_stopped[next_tokens_source]
            next_token_embed = model.gpt.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1)
            generated = torch.cat((generated, next_token_embed), dim=1)
            is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
            if is_stopped.all():
                break
    scores = scores / seq_lengths
    output_list = tokens.cpu().numpy()
    output_texts = [tokenizer.decode(output[:int(length)]) for output, length in zip(output_list, seq_lengths)]
    order = scores.argsort(descending=True)
    output_texts = [output_texts[i] for i in order]
    return output_texts


def generate2(
        model,
        tokenizer,
        tokens=None,
        prompt=None,
        embed=None,
        entry_count=1,
        entry_length=100,  # maximum number of words
        top_p=0.8,
        temperature=0.7,
        stop_token: str = '.'
):
    model.eval()
    generated_num = 0
    generated_list = []
    stop_token_index = tokenizer.encode(stop_token)[0]
    filter_value = -float("Inf")
    device = next(model.parameters()).device

    with torch.no_grad():

        for entry_idx in range(entry_count):
            if embed is not None:
                generated = embed
            else:
                if tokens is None:
                    tokens = torch.tensor(tokenizer.encode(prompt))
                    tokens = tokens.unsqueeze(0).to(device)

                generated = model.gpt_model.transformer.wte(tokens)

            for i in range(entry_length):

                outputs = model.gpt_model(inputs_embeds=generated)
                logits = outputs.logits
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(nnf.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                                                    ..., :-1
                                                    ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value
                next_token = torch.argmax(logits, -1).unsqueeze(0)
                next_token_embed = model.gpt_model.transformer.wte(next_token)
                if tokens is None:
                    tokens = next_token
                else:
                    tokens = torch.cat((tokens, next_token), dim=1)
                generated = torch.cat((generated, next_token_embed), dim=1)
                if stop_token_index == next_token.item():
                    break

            output_list = list(tokens.squeeze().cpu().numpy())
            output_text = tokenizer.decode(output_list)
            generated_list.append(output_text)

    return generated_list[0]

In [8]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-yclu4jmi
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-yclu4jmi
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->clip==1.0)
  Using cached nvidi

In [9]:
!pip install imageio



In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

gpt2_model_name = 'gpt2'
# Load the pre-trained GPT-2 model and tokenizer
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_model_name)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
import torch
from torch import nn
from transformers import GPT2LMHeadModel

class ClipGPT2Model(nn.Module):
    def __init__(self, gpt_model_name='gpt2', clip_output_size=512, gpt_embedding_size=768):
        super().__init__()
        self.gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name)
        self.clip_project = nn.Linear(clip_output_size, gpt_embedding_size)

    def forward(self, clip_embeddings, gpt_input_ids=None, attention_mask=None, labels=None):
        # Transform CLIP's embeddings before passing to GPT-2
        transformed_embeddings = self.clip_project(clip_embeddings)

        # Use the transformed embeddings as inputs_embeds for GPT-2
        gpt_outputs = self.gpt_model(inputs_embeds=transformed_embeddings,
                                     attention_mask=attention_mask,
                                     labels=labels)

        return gpt_outputs

In [13]:
import torch
import torchvision.transforms as transforms
from PIL import Image
from google.colab import files
import clip
import imageio
import numpy as np
from transformers import BertForQuestionAnswering, BertTokenizer

image_model, preprocess = clip.load("ViT-B/32", device="cpu")
prefix_length = gpt2_tokenizer.model_max_length
clip_output_size = 512  # Example size, replace with actual size
gpt_embedding_size = 768  # Example size, replace with actual size
clip_gpt2_model = ClipGPT2Model(gpt_model_name='gpt2', clip_output_size=clip_output_size, gpt_embedding_size=gpt_embedding_size)

# Define your generate_image_cap_clip and objects_to_caption functions here

def qa_answering(image_model, tokenizer, qa_model):
    # Prompt for image upload
    uploaded = files.upload()

    if not uploaded:
        print("No image uploaded.")
        return

    # Process the uploaded image
    image_path = list(uploaded.keys())[0]
    image_bytes = imageio.imread(image_path)
    image_pil = Image.fromarray(image_bytes)  # Convert NumPy array to PIL image

    # Prompt for question
    question = input("Enter your question: ")

    # Generate captions and objects from the image
    caption = generate_image_cap_clip(
        image=image_pil,
        clip_model=image_model,
        model=clip_gpt2_model,  # You need to provide the appropriate model here
        tokenizer=tokenizer,
        preprocess=preprocess,  ## to edit
        device="cpu",  # You can specify the device here
        prefix_length=prefix_length,  # You need to provide the appropriate value here
        entry_length=100,
        temperature=0.9,
        top_p=0.7,
        beam_size=10,
        use_beam_search=False
    )

    image_cv = np.array(image_pil)[:, :, ::-1]
    detected_objects = detect_objects(image_cv)
    objects = objects_to_caption(detected_objects)

    # Combine the generated caption and objects with the question
    combined_input = f"Question: {question} Image Caption: {caption} Detected Objects: {objects}"

    # Use the tokenizer to prepare inputs for the QA model
    inputs = tokenizer.encode_plus(
        combined_input,
        add_special_tokens=True,
        return_tensors="pt",
        max_length=512,
        truncation=True
    )

    # Use the QA model to answer the question
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    qa_outputs = qa_model(input_ids, attention_mask=attention_mask)
    answer_start = torch.argmax(qa_outputs.start_logits)
    answer_end = torch.argmax(qa_outputs.end_logits) + 1
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][answer_start:answer_end]))

    print("Answer:", answer)

# Initialize BERT tokenizer and QA model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
qa_model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Example usage
# Replace ... with appropriate model and preprocessing functions
image_model = clip.load("ViT-B/32")#, device="cpu")
qa_answering(image_model, tokenizer, qa_model)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  image_bytes = imageio.imread(image_path)


Saving istockphoto-1371333584-2048x2048-transformed.jpeg to istockphoto-1371333584-2048x2048-transformed (1).jpeg
Enter your question: can i go left
Answer: [SEP]
