Learning Objectives
- OpenAI GPT-4V 모델 이용해서 비정형 이미지 데이터로부터 정형 필드 파싱해보기

In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

# Remains None when OPENAI_API_KEY is not present in the environment.
openai_api_key = os.getenv('OPENAI_API_KEY')

In [2]:
from pathlib import Path

# Folder that holds the restaurant sample image used by the cells below.
input_image_path = Path("../data/restaurant_images")
# parents=True also creates a missing ../data directory; exist_ok=True makes
# re-running the cell a no-op and avoids the exists()/mkdir() race.
# A FileExistsError is still raised if the path exists but is not a directory.
input_image_path.mkdir(parents=True, exist_ok=True)

In [4]:
from PIL import Image
import matplotlib.pyplot as plt

# Data download: https://docs.google.com/uc?export=download&id=1GlqcNJhGGbwLKjJK1QJ_nyswCTQ2K2Fq
imageUrl = "../data/restaurant_images/fried_chicken.png"
# Open the sample image, then convert it to RGB mode.
image = Image.open(imageUrl)
image = image.convert("RGB")

# Optional inline preview:
# plt.figure(figsize=(16, 5))
# plt.imshow(image)

In [5]:
from pydantic import BaseModel

class Restaurant(BaseModel):
    # Description of the class the image is parsed into.
    """Data model for a restaurant."""

    # Fields to parse out of the image; every field is declared as a string.
    restaurant: str
    food: str
    discount: str
    price: str
    rating: str
    review: str

In [6]:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal
from llama_index.core import SimpleDirectoryReader

# Read the files under ../data/restaurant_images via SimpleDirectoryReader.
image_documents = SimpleDirectoryReader("../data/restaurant_images").load_data()

# When parsing user-uploaded files, the newest OpenAI models may run into
# their token limit, or may simply be too expensive.
# For low-volume parsing of unstructured data, gpt-4-vision is usable.
# NOTE(review): "gpt-4-vision-preview" is a preview model name; confirm it is
# still served by the API before relying on this cell.
openai_mm_llm = OpenAIMultiModal(
    model="gpt-4-vision-preview",
    max_new_tokens=1000,
    api_key=openai_api_key,
)

In [7]:
from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser

# Parsing instruction.
# Models before GPT-4 cannot be told to return an arbitrary output format,
# so pydantic is essential when building a production-level application.
# pydantic explainer: https://data-newbie.tistory.com/836
# NOTE: the four-space runs and the trailing space are part of the prompt text
# exactly as it was originally written (line continuations inside the literal).
prompt_template_str = (
    "    can you summarize what is in the image"
    "    and return the answer with json format "
)
# Build the completion program: prompt + loaded images go to the multimodal
# LLM, and the reply is parsed with a PydanticOutputParser for Restaurant.
openai_program = MultiModalLLMCompletionProgram.from_defaults(
    multi_modal_llm=openai_mm_llm,
    prompt_template_str=prompt_template_str,
    image_documents=image_documents,
    output_parser=PydanticOutputParser(Restaurant),
    verbose=True,  # prints the "> Raw output: ..." line seen in the cell output
)

In [8]:
# Run the program and keep the parsed result.
response = openai_program()
# Iterating the result yields (field_name, value) tuples, printed one per line
# (see the cell output below).
for res in response:
    print(res)

[1;3;38;2;90;149;237m> Raw output: ```json
{
  "restaurant": "Not Specified",
  "food": "8 Wings or Chicken Poppers",
  "discount": "Black Friday Offer",
  "price": "$8.73",
  "rating": "Not Specified",
  "review": "Not Specified"
}
```
('restaurant', 'Not Specified')
('food', '8 Wings or Chicken Poppers')
('discount', 'Black Friday Offer')
('price', '$8.73')
('rating', 'Not Specified')
('review', 'Not Specified')


In [10]:
# Amazon product: folder that holds the Amazon sample image used below.
input_image_path = Path("../data/amazon_images")
# parents=True also creates a missing ../data directory; exist_ok=True makes
# re-running the cell a no-op and avoids the exists()/mkdir() race.
# A FileExistsError is still raised if the path exists but is not a directory.
input_image_path.mkdir(parents=True, exist_ok=True)

In [12]:
# Image download: https://docs.google.com/uc?export=download&id=1p1Y1qAoM68eC4sAvvHaiJyPhdUZS0Gqb
imageUrl = "../data/amazon_images/amazon.png"
# Open the sample image, then convert it to RGB mode.
image = Image.open(imageUrl)
image = image.convert("RGB")

# Optional inline preview:
# plt.figure(figsize=(16, 5))
# plt.imshow(image)

In [13]:
class Product(BaseModel):
    # Class description.
    """Data model for an Amazon Product."""

    # Fields to parse out of the image; every field is declared as a string.
    title: str
    category: str
    discount: str
    inventory: str
    description: str

In [15]:
# Read the files under ../data/amazon_images via SimpleDirectoryReader.
amazon_image_documents = SimpleDirectoryReader("../data/amazon_images").load_data()

# Prompt sent with the images. The four-space runs and the trailing space are
# part of the text exactly as originally written (line continuations inside
# the literal), made explicit here.
prompt_template_str = (
    "    can you summarize what is in the image"
    "    and return the answer with json format "
)
# Build the completion program: prompt + Amazon images go to the multimodal
# LLM, and the reply is parsed with a PydanticOutputParser for Product.
openai_program_amazon = MultiModalLLMCompletionProgram.from_defaults(
    multi_modal_llm=openai_mm_llm,
    prompt_template_str=prompt_template_str,
    image_documents=amazon_image_documents,
    output_parser=PydanticOutputParser(Product),
    verbose=True,  # prints the "> Raw output: ..." line seen in the cell output
)

In [16]:
# Run the program and keep the parsed result.
response = openai_program_amazon()
# Iterating the result yields (field_name, value) tuples, printed one per line
# (see the cell output below).
for res in response:
    print(res)

[1;3;38;2;90;149;237m> Raw output: ```json
{
  "title": "Instant Vortex 5.7QT Air Fryer Oven Combo, The Makers of Instant Pot, Customizable Smart Cooking Programs, Digital Touchscreen, Nonstick and Dishwasher-Safe Basket, App with over 100 Recipes",
  "category": "Kitchen Appliances",
  "discount": "20% off, save $25.19 from $129.99",
  "inventory": "In stock",
  "description": "A versatile and convenient air fryer by Instant Pot with customizable smart cooking programs and a digital touchscreen interface. It features a nonstick and dishwasher-safe basket, and comes with an app that includes over 100 recipes. The 5.7QT capacity is suitable for preparing family-sized meals."
}
```
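('title', 'Instant Vortex 5.7QT Air Fryer Oven Combo, The Makers of Instant Pot, Customizable Smart Cooking Programs, Digital Touchscreen, Nonstick and Dishwasher-Safe Basket, App with over 100 Recipes')
('category', 'Kitchen Appliances')
('discount', '20% off, save $25.19 from $129.99')
('inventory', 'In stock')
('description', 'A versatile and convenient air fryer by Instant Pot with customizable smart cooking programs and a digital touchscreen interface. It features a nonstick and dishwasher-safe basket, and comes with an app that includes over 100 recipes. The 5.7QT capacity is suitable for preparing family-sized meals.')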

In [17]:
# Folder that holds the Instagram sample image used by the cells below.
input_image_path = Path("../data/instagram_images")
# parents=True also creates a missing ../data directory; exist_ok=True makes
# re-running the cell a no-op and avoids the exists()/mkdir() race.
# A FileExistsError is still raised if the path exists but is not a directory.
input_image_path.mkdir(parents=True, exist_ok=True)

In [19]:
from PIL import Image
import matplotlib.pyplot as plt

# Image download: https://docs.google.com/uc?export=download&id=12ZpBBFkYu-jzz1iz356U5kMikn4uN9ww
imageUrl = "../data/instagram_images/jordan.png"
# Open the sample image, then convert it to RGB mode.
image = Image.open(imageUrl)
image = image.convert("RGB")

# Optional inline preview:
# plt.figure(figsize=(16, 5))
# plt.imshow(image)

In [20]:
class InsAds(BaseModel):
    # Description of the class the image is parsed into.
    """Data model for an Instagram ad."""

    # Fields we want parsed out of the image; every field is declared as a string.
    account: str
    brand: str
    product: str
    price: str
    comments: str

In [21]:
# Read the files under ../data/instagram_images via SimpleDirectoryReader.
ins_image_documents = SimpleDirectoryReader("../data/instagram_images").load_data()

# Prompt sent with the images. The four-space runs and the trailing space are
# part of the text exactly as originally written (line continuations inside
# the literal), made explicit here.
prompt_template_str = (
    "    can you summarize what is in the image"
    "    and return the answer with json format "
)
# Build the completion program: prompt + Instagram images go to the multimodal
# LLM, and the reply is parsed with a PydanticOutputParser for InsAds.
openai_program_ins = MultiModalLLMCompletionProgram.from_defaults(
    multi_modal_llm=openai_mm_llm,
    prompt_template_str=prompt_template_str,
    image_documents=ins_image_documents,
    output_parser=PydanticOutputParser(InsAds),
    verbose=True,  # prints the "> Raw output: ..." line seen in the cell output
)


# Run the program and keep the parsed result.
response = openai_program_ins()
# Iterating the result yields (field_name, value) tuples, printed one per line
# (see the cell output below).
for res in response:
    print(res)

[1;3;38;2;90;149;237m> Raw output: ```json
{
  "account": "jordansdaily",
  "brand": "Air Jordan",
  "product": "Air Jordan 2 'Italy'",
  "price": "$175",
  "comments": "Liked by cemmck and others"
}
```
('account', 'jordansdaily')
('brand', 'Air Jordan')
('product', "Air Jordan 2 'Italy'")
('price', '$175')
('comments', 'Liked by cemmck and others')
