!pip install gradio vllm transformers triton PyPDF2 Pillow sentence_transformers numpy typing faiss-gpu spacy pymupdf4llm fitz frontend tools semchunk

In [None]:
import gradio as gr
import faiss
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
import os
import time
import semchunk
import pymupdf as fitz
import pymupdf4llm
from vllm import LLM, SamplingParams
from typing import List, Tuple, Dict, Optional
from PIL import Image
import hashlib
import logging

In [None]:
class LLaVAImageQAProcessor:
    """Answer free-form questions about an image with a LLaVA model run via vLLM.

    The model is loaded once in the constructor and reused for every call to
    ``process_image``.
    """

    def __init__(
        self,
        model: str = "llava-hf/llava-v1.6-mistral-7b-hf",
        *,
        dtype: str = "half",
        max_model_len: int = 8192,
        temperature: float = 0.7,
        max_tokens: int = 512,
    ):
        """Load the model and prepare the sampling configuration.

        Args:
            model: Model identifier handed to ``vllm.LLM``.
            dtype: Value forwarded as ``LLM(dtype=...)``.
            max_model_len: Value forwarded as ``LLM(max_model_len=...)``.
            temperature: Sampling temperature for generation.
            max_tokens: Upper bound on the number of generated tokens.
        """
        self.llm = LLM(model=model, dtype=dtype, max_model_len=max_model_len)
        self.sampling_params = SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )

    def get_prompt(self, question: str) -> str:
        """Build the instruction prompt that wraps ``question``.

        The prompt contains the ``<image>`` placeholder followed by the
        question, enclosed in ``[INST] ... [/INST]`` markers.

        Args:
            question: The user's question about the image.

        Returns:
            The full prompt string to send to the model.
        """
        # The instruction lines carry a 20-space indent and trailing spaces;
        # they are spelled out explicitly so the whitespace the model sees
        # does not silently depend on how this source file is indented.
        pad = " " * 20
        return (
            "[INST] \n"
            f"{pad}Explain me about this image precisely in bullet points. \n"
            f"{pad}Your response should be in complete sentences.\n"
            f"{pad}<image>\n{question} [/INST]"
        )

    def process_image(self, image_path: str, question: str) -> str:
        """Ask the model ``question`` about the image stored at ``image_path``.

        Args:
            image_path: Path of the image file to open.
            question: The question to ask about the image.

        Returns:
            The text of the first completion produced for the request.
        """
        prompt = self.get_prompt(question)

        # Keep the file open only while the model consumes the image so the
        # underlying file handle is released after every request.
        with Image.open(image_path) as image:
            inputs = {
                "prompt": prompt,
                "multi_modal_data": {"image": image},
            }
            outputs = self.llm.generate(inputs, sampling_params=self.sampling_params)

        return outputs[0].outputs[0].text

In [None]:
# Create the processor once; this loads the model.
img_processor = LLaVAImageQAProcessor()

# Path of the image that every question refers to.
image_path = "/content/image.jpg"

# Interactive loop: read a question, answer it, repeat until the user quits.
while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " still quit.
        query = input("\n질문을 입력하세요 (종료하려면 'exit' 입력): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input or an interrupt ends the session like typing "exit".
        query = "exit"

    if query.lower() == "exit":
        print("프로그램을 종료합니다.")
        break

    # Do not spend a model call on a blank line.
    if not query:
        continue

    # Run the image and the question through the model.
    result = img_processor.process_image(image_path, query)

    # Show the answer.
    print("\n답변:")
    print(result)