# GPT-4V(ision) Image Captioning on Dishes

In [None]:
from datetime import datetime
import pytz

# Print a timezone-aware timestamp (US Central) marking when the notebook run started.
now = datetime.now(tz=pytz.timezone('America/Chicago'))
print(now)

## Set up the environment

In [None]:
pip install -q git+https://github.com/huggingface/peft.git datasets

In [None]:
# Timestamp (US Central) recorded after the environment set-up step.
now = datetime.now(tz=pytz.timezone('America/Chicago'))
print(now)

## Load the image captioning dataset

In [None]:
from datasets import load_dataset

# Fetch the "test" split of the advancedcv/Food500Cap_test dataset.
dataset = load_dataset(path="advancedcv/Food500Cap_test", split="test")

In [None]:
# Dish categories (values of the dataset's "cat" field) to keep for captioning.
label_set = {
    "Aloo_gobi",
    "Baingan_bharta",
    "Chakli",
    "Sambar",
    "Vindaloo",
    "Bon_bon_chicken",
    "Chinese_chicken_salad",
    "Shanghai_fried_noodles",
    "Taro_dumpling",
    "Wonton_noodles",
    "Katsudon",
    "Soba",
    "Tonkotsu_ramen",
}

In [None]:
# Collect the row indices whose category is one of the selected dishes.
# Only the "cat" column is read: indexing a full row (dataset[i]) would also
# materialise that row's other columns, including the image, just to check
# a single label.
idx_list_test = [
    row_idx
    for row_idx, category in enumerate(dataset["cat"])
    if category in label_set
]

In [None]:
# Keep only the rows whose indices were collected in idx_list_test.
new_dataset = dataset.select(indices=idx_list_test)

In [None]:
# Display how many rows remain after filtering (shown as the cell's output value).
len(new_dataset)

In [None]:
# Timestamp (US Central) recorded after the dataset has been loaded and filtered.
now = datetime.now(tz=pytz.timezone('America/Chicago'))
print(now)

## Load the GPT-4V Model

In [None]:
!pip install --upgrade openai

In [None]:
import getpass
import os

# Take the OpenAI API key from the environment when it is already set;
# otherwise prompt for it without echoing. This keeps the secret out of the
# notebook source and its saved outputs.
api_key = os.environ.get('OPENAI_API_KEY') or getpass.getpass('OpenAI API key: ')
os.environ['OPENAI_API_KEY'] = api_key

In [None]:
from openai import OpenAI
# Create an OpenAI SDK client with default arguments.
# NOTE(review): the visible captioning cell posts to the REST endpoint with
# `requests` and never references `client` - confirm whether it is still needed.
client = OpenAI()

In [None]:
# Timestamp (US Central) recorded after the API client has been set up.
now = datetime.now(tz=pytz.timezone('America/Chicago'))
print(now)

## Captioning

In [None]:
import base64
import requests
from io import BytesIO
import numpy as np

def encode_image(image):
  """Return *image* serialised to bytes and base64-encoded as a UTF-8 string.

  The image is written to an in-memory buffer through its ``save`` method:
  as JPEG when ``image.mode`` is ``'RGB'``, as PNG for every other mode.
  """
  target_format = "JPEG" if image.mode == 'RGB' else "PNG"
  with BytesIO() as stream:
    image.save(stream, format=target_format)
    raw_bytes = stream.getvalue()
  return base64.b64encode(raw_bytes).decode('utf-8')

# HTTP headers sent with every request: a JSON body plus bearer-token
# authentication built from api_key.
bearer_token = f"Bearer {api_key}"
headers = {"Content-Type": "application/json", "Authorization": bearer_token}

In [None]:
# index: position in new_dataset where the captioning loop starts.
# round: number appended to the results file name ("..._part<round>.npy").
# NOTE(review): `round` shadows the builtin round(); the captioning cell reads
# it under this name, so any rename has to happen in both cells together.
index, round = 0, 1

In [None]:
# Caption every image from position `index` onward, one API request per image.
# `index` is advanced after each success, so after a failure the cell can be
# re-run (with a new `round`) to continue where it stopped. Whatever was
# collected in this run is printed and saved to file_name at the end.
generated_texts = []
file_name = "GPT4V_selected_results_part" + str(round) + ".npy"

for i in range(index, len(new_dataset)):
  # Getting the base64 string
  image = new_dataset[i]['image']
  base64_image = encode_image(image)
  # encode_image writes JPEG for RGB images and PNG for every other mode,
  # so the data URL must advertise the matching media type.
  mime_type = "image/jpeg" if image.mode == 'RGB' else "image/png"

  payload = {
      "model": "gpt-4-vision-preview",
      "messages": [
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": "What are the name and visible ingredients of the dish in the image? Answer in one sentence."
            },
            {
              "type": "image_url",
              "image_url": {
                "url": f"data:{mime_type};base64,{base64_image}"
              }
            }
          ]
        }
      ],
      "max_tokens": 60
  }

  try:
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=120,
    )
    body = response.json()
  except (requests.RequestException, ValueError) as err:
    # Network failure or a non-JSON response body: stop, but still fall
    # through to the save below so finished captions are not lost.
    print('request failed:', err)
    print(index, 'captions generated')
    break

  try:
    caption = body['choices'][0]['message']['content']
  except (KeyError, IndexError, TypeError):
    # The API answered without a completion (e.g. an error object); show it.
    print(body)
    print(index, 'captions generated')
    break

  generated_texts.append(caption)
  index += 1

print(generated_texts)
np.save(file_name, generated_texts)

In [None]:
# Timestamp (US Central) recorded once captioning has finished.
now = datetime.now(tz=pytz.timezone('America/Chicago'))
print(now)