In [60]:
!pip install PyMuPDF
!pip install pillow
!pip install -q --upgrade google-generativeai langchain-google-genai
!pip install langchain faiss-gpu



In [61]:
# Mount Google Drive at /content/drive; the PDF path configured further down
# points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
import fitz
import os
import requests
import re
from tqdm import tqdm
from PIL import Image
from tqdm import tqdm
from pathlib import Path
from google.colab import userdata
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableParallel

# Set the path of the PDF to process.
pdf_path = "/content/drive/MyDrive/프로젝트/data/lp.pdf"

In [63]:
# Before running, register the API key in Colab's secrets (key icon) panel.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# Hand the key to the google.generativeai SDK.
genai.configure(api_key=GOOGLE_API_KEY)

# Expose the key through the environment as well, without overriding a value
# that is already set (presumably read by the LangChain Google wrappers, which
# are constructed below without an explicit key -- confirm).
os.environ.setdefault("GOOGLE_API_KEY", GOOGLE_API_KEY)

In [64]:
# Load the models.
# Vision model: produces a text description for each extracted image.
gemini_vision = genai.GenerativeModel('gemini-pro-vision')

# Chat model: answers the question at the end of the retrieval chain.
gemini_pro = ChatGoogleGenerativeAI(model="gemini-pro")

# Embedding model shared by both FAISS vector stores.
embed_doc = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    task_type="RETRIEVAL_DOCUMENT",
)

In [65]:
# Extract the images from the PDF and save them to a temporary folder.
def extract_images_from_pdf(pdf_path, output_folder):
    """Write every image referenced by each page of a PDF into ``output_folder``.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Directory that receives the image files. It is created
            if it does not exist yet.

    Returns:
        list[str]: Paths of the files written, in page order and then in
        per-page image order. Files are named ``page<P>_img<I>.png`` with
        1-based ``P`` and ``I``.
    """
    extracted_images = []

    # The context manager closes the document even if extraction fails
    # part-way, matching how the text extractor in this notebook opens PDFs.
    with fitz.open(pdf_path) as doc:
        # exist_ok replaces the separate exists() check.
        os.makedirs(output_folder, exist_ok=True)

        for page_number in tqdm(range(doc.page_count), desc="Extracting images"):
            page = doc[page_number]
            for img_index, image in enumerate(page.get_images(full=True), start=1):
                # The first field of each image entry is the xref that
                # extract_image() takes.
                xref = image[0]
                image_bytes = doc.extract_image(xref)["image"]

                # NOTE(review): the bytes are written exactly as returned and the
                # file is always named .png; if the embedded image uses another
                # encoding the extension will not match the content -- confirm
                # that downstream readers sniff the format from the bytes.
                image_filename = f"{output_folder}/page{page_number + 1}_img{img_index}.png"

                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)

                extracted_images.append(image_filename)

    return extracted_images

# Run the extraction; images are written to the relative folder "temporary".
output_folder = "temporary"
extracted_images = extract_images_from_pdf(pdf_path, output_folder)

def get_image_files(dir_path):
    """Return the ``*.png`` files located directly inside ``dir_path``.

    Args:
        dir_path: Directory to scan, as a string or ``Path``.

    Returns:
        list[Path]: One entry per matching file (subfolders are not searched).
    """
    return [png_file for png_file in Path(dir_path).glob("*.png")]

# Collect the saved .png paths. Note that `image_files` is reassigned in the
# image-description cell that follows.
image_files = get_image_files(output_folder)

Extracting images: 100%|██████████| 64/64 [00:00<00:00, 3597.17it/s]


In [66]:
# Generate a text description for every extracted image.

# Read from the folder the extraction step wrote to instead of a hard-coded
# absolute path, so the two cells cannot drift apart. Joining with "" keeps
# the trailing separator the previous literal had.
image_folder = os.path.join(os.path.abspath(output_folder), "")


def _page_order_key(filename):
    """Sort key comparing digit runs numerically, so page2 sorts before page10."""
    # re.split with a capturing group alternates text / digits / text / ...,
    # so odd positions are always digit runs.
    return [
        int(token) if position % 2 else token
        for position, token in enumerate(re.split(r'(\d+)', filename))
    ]


# os.listdir() returns entries in arbitrary order; sorting makes the joined
# description text reproducible and keeps it in page order.
image_files = sorted(
    (f for f in os.listdir(image_folder)
     if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))),
    key=_page_order_key,
)

responses = []

for image_file in tqdm(image_files, desc="Generating content"):
    image_path = os.path.join(image_folder, image_file)
    # Close the image file handle as soon as the model call has consumed it.
    with Image.open(image_path) as img:
        response = gemini_vision.generate_content(img)
    responses.append({"image_path": image_path, "response": response})

response_values = [result["response"] for result in responses]

# All descriptions joined into one string; this is the corpus embedded for
# the image-side retriever.
pdf_img = ' '.join(str(response.text) for response in response_values)

Generating content: 100%|██████████| 30/30 [04:31<00:00,  9.05s/it]


In [70]:
# Extract the text from the PDF.
def extract_and_clean_text_from_pdf(pdf_path):
    """Read the text of every page of a PDF and normalise it.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        str: The concatenated page text after three substitutions applied in
        order: characters other than word characters, whitespace and ``.,?!``
        are removed; standalone digit runs are removed; every whitespace run
        is collapsed to a single space.
    """
    with fitz.open(pdf_path) as pdf_document:
        raw_text = "".join(
            pdf_document[index].get_text()
            for index in range(pdf_document.page_count)
        )

    # (pattern, replacement) pairs; the order matters because whitespace is
    # collapsed only after the removals have left gaps behind.
    substitutions = (
        (r'[^\w\s.,?!]', ''),
        (r'\b\d+\b', ''),
        (r'\s+', ' '),
    )

    cleaned_text = raw_text
    for pattern, replacement in substitutions:
        cleaned_text = re.sub(pattern, replacement, cleaned_text)

    return cleaned_text

# Cleaned text of the whole PDF as a single string.
pdf_txt = extract_and_clean_text_from_pdf(pdf_path)

In [71]:
# Embed the PDF text (store 1) and the image descriptions (store 2).
# Each store is built from a one-element list, i.e. it indexes exactly one text.
vectorstore1, vectorstore2 = (
    FAISS.from_texts([corpus], embedding=embed_doc)
    for corpus in (pdf_txt, pdf_img)
)

# Retrievers over the text store and the image-description store respectively.
retriever1, retriever2 = vectorstore1.as_retriever(), vectorstore2.as_retriever()

In [73]:
# Question-answering prompt and chain.
query = "tell me how many images are there and description of them?"
prompt_str = """Answer the question step by step. and you can refer to the following context:
{context_a}
{context_b}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(prompt_str)


def _join_page_contents(docs):
    """Concatenate the ``page_content`` of retrieved documents into plain text."""
    return "\n\n".join(doc.page_content for doc in docs)


# A retriever returns a list of Document objects. Without a formatting step
# the prompt template would interpolate that list's Python repr, not the text.
retrieval = RunnableParallel(
    {
        "context_a": retriever1 | RunnableLambda(_join_page_contents),  # PDF text
        "context_b": retriever2 | RunnableLambda(_join_page_contents),  # image descriptions
        "question": RunnablePassthrough(),  # the raw query string
    }
)
output_parser = StrOutputParser()
chain = retrieval | prompt | gemini_pro | output_parser

# Last expression of the cell, so the answer string is displayed as output.
chain.invoke(query)

'There are 19 images in total.\n\n1. A picture of a lamb.\n2. An illustration of a snake eating a mouse.\n3. A drawing of a man sitting at a desk, smoking a pipe and reading a document.\n4. A picture of a king sitting on a small planet.\n5. A picture of Curt Jurgens.\n6. A flower, peg footprints, a desert, and a sun.\n7. A man drinking at a table, with two bottles and a glass.\n8. An illustration of a boa constrictor digesting an elephant.\n9. A picture of a Lockheed P-38 Lightning aircraft.\n10. An illustration of an old man with a magnifying glass, wearing a green suit and a purple turban, sitting at a table with an open book.\n11. A drawing of the little prince from the story "The Little Prince" by Antoine de Saint-Exupéry.\n12. A drawing of a man standing on a small planet wearing a suit and a hat.\n13. A picture of a Faraday cage.\n14. A drawing of the little prince and the snake.\n15. A picture of a rabbit in a cave, a tree on a hill, and some flowers on the ground.\n16. A pictur