# Извлечение данных из PDF-файла

In [None]:
# for linux
# !apt-get install poppler-utils tesseract-ocr libmagic-dev

# for mac
# !brew install poppler tesseract libmagic

In [None]:
#%pip install -Uq "unstructured[all-docs]" pillow lxml
#%pip install -Uq chromadb tiktoken
#%pip install -Uq langchain langchain-community langchain-openai langchain-groq
#%pip install -Uq python_dotenv

In [None]:
from unstructured.partition.pdf import partition_pdf

PATH_TO_PDF_FILE = "./photoshop_manual.pdf"

# Partition the PDF into title-based chunks, with table structure inferred
# and image payloads embedded as base64.
# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
# NOTE(review): get_tables() reads `image_base64` from Table elements; if table
# images are needed, "Table" may have to be added to
# `extract_image_block_types` — confirm against the unstructured docs.
chunks = partition_pdf(
    filename=PATH_TO_PDF_FILE,             # path constant defined above
    infer_table_structure=True,            # extract tables
    strategy="hi_res",                     # mandatory to infer tables
    extract_image_block_types=["Image"],
    extract_image_block_to_payload=True,   # if true, will extract base64 for API usage
    chunking_strategy="by_title",          # or 'basic'
    max_characters=10000,                  # defaults to 500
    combine_text_under_n_chars=2000,       # defaults to 0
    new_after_n_chars=6000,
)

In [None]:
def get_tables(chunks):
    """Collect table payloads from composite chunks.

    Only chunks whose type name contains "CompositeElement" are inspected.
    Every entry of such a chunk's ``metadata.orig_elements`` whose type name
    contains "Table" contributes one dict holding that element's
    ``image_base64`` and ``page_number`` metadata.

    Returns:
        A list of ``{"image_base64": ..., "page_number": ...}`` dicts in
        encounter order.
    """
    found = []
    for chunk in chunks:
        # Skip anything that is not a composite chunk.
        if "CompositeElement" not in str(type(chunk)):
            continue
        found.extend(
            {
                "image_base64": element.metadata.image_base64,
                "page_number": element.metadata.page_number,
            }
            for element in chunk.metadata.orig_elements
            if "Table" in str(type(element))
        )
    return found

# Gather the table payloads (base64 image + page number) from the parsed chunks.
tables = get_tables(chunks)

In [None]:
def get_images_base64(chunks):
    """Collect the base64 payloads of images found inside composite chunks.

    Only chunks whose type name contains "CompositeElement" are inspected.
    Every entry of such a chunk's ``metadata.orig_elements`` whose type name
    contains "Image" contributes its ``metadata.image_base64`` value.

    Returns:
        A list of ``image_base64`` values in encounter order.
    """
    payloads = []
    for chunk in chunks:
        # Skip anything that is not a composite chunk.
        if "CompositeElement" not in str(type(chunk)):
            continue
        payloads.extend(
            element.metadata.image_base64
            for element in chunk.metadata.orig_elements
            if "Image" in str(type(element))
        )
    return payloads

# Gather the base64 payload of every image element inside the parsed chunks.
images = get_images_base64(chunks)

In [None]:
import base64
from IPython.display import Image, display

def display_base64_image(base64_code):
    """Decode a base64 string and show the resulting image via ``display``."""
    rendered = Image(data=base64.b64decode(base64_code))
    display(rendered)

# Preview the first extracted image. Indexing an empty list would raise
# IndexError, so report the empty case instead.
if images:
    display_base64_image(images[0])
else:
    print("No images were extracted.")

In [None]:
import json
from pathlib import Path

# Persist the extracted data as JSON files under ./extracted_data.
# open(..., "w") does not create missing parent directories, so make sure
# the target directory exists first.
Path("./extracted_data").mkdir(parents=True, exist_ok=True)

with open("./extracted_data/extracted_images.json", "w", encoding="utf-8") as file:
    json.dump(images, file, ensure_ascii=False, indent=4)

# NOTE(review): `texts` is not defined in any cell visible here — confirm it
# is created before this cell runs.
with open("./extracted_data/extracted_texts.json", "w", encoding="utf-8") as file:
    json.dump(texts, file, ensure_ascii=False, indent=4)

# The tables file must receive `tables`, not a second copy of `texts`.
with open("./extracted_data/extracted_tables.json", "w", encoding="utf-8") as file:
    json.dump(tables, file, ensure_ascii=False, indent=4)

In [3]:
#import json

#with open("./extracted_data/extracted_images.json", "r") as file:
#    images = json.load(file)

#with open("./extracted_data/extracted_texts.json", "r") as file:
#    texts = json.load(file)

In [4]:
#import base64

# Save the images as PNG files for processing via GigaChat
#for i in range(len(images)):
#    image_data = base64.b64decode(images[i]["image_base64"])
#    with open(f"./extracted_data/source_images/image_{i}.png", "wb") as file:
#        file.write(image_data)

# Саммари по картинкам

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain_gigachat.chat_models import GigaChat

# Load environment variables from the nearest .env file
# (presumably the GigaChat credentials — confirm).
load_dotenv(find_dotenv())

def init_gigachat():
    """Create the GigaChat chat-model client used by the cells below.

    Returns:
        A ``GigaChat`` instance configured for the "GigaChat-Max" model with a
        near-zero temperature and a timeout of 100.
    """
    # NOTE(review): verify_ssl_certs=False looks like it turns off TLS
    # certificate verification — confirm this is acceptable outside local use.
    return GigaChat(model="GigaChat-Max", verify_ssl_certs=False, temperature=1e-15, timeout=100)

llm = init_gigachat()

In [None]:
from langchain_core.messages import HumanMessage

from prompts import IMAGE_SUMMARY_PROMPT

# Build the client again in this cell (presumably so the cell can be run
# without the previous one having been executed).
llm = init_gigachat()

def get_gigachat_image_summary(prompt, file_path, llm):
    """Upload an image file through ``llm`` and return the model's reply text.

    Args:
        prompt: Instruction text sent as the content of the human message.
        file_path: Path of the image file to upload.
        llm: Client object exposing ``upload_file`` and ``invoke``.

    Returns:
        The ``content`` attribute of the response returned by ``llm.invoke``.
    """
    # Use a context manager so the file handle is closed after the upload.
    with open(file_path, "rb") as image_file:
        uploaded = llm.upload_file(image_file)
    message = HumanMessage(
        # Send the caller's prompt rather than a module-level constant.
        content=prompt,
        # Uploaded file ids are passed under the "attachments" key.
        additional_kwargs={"attachments": [uploaded.id_]},
    )
    return llm.invoke([message]).content

In [None]:
# Accumulated image summaries and the index of the next image to process.
# They are initialised in their own cell, presumably so the loop cell can be
# re-run to resume without resetting progress.
imagies_summary = []
i = 0

In [None]:
# Summarise every extracted image, starting from the current value of `i`.
while i < len(images):
    # The path must be an f-string so that {i} is replaced by the index.
    image_path = f"./extracted_data/source_images/image_{i}.png"
    image_summary = get_gigachat_image_summary(IMAGE_SUMMARY_PROMPT, image_path, llm)
    imagies_summary.append({
        "image_summary": image_summary,
        "source": image_path,
        # NOTE(review): assumes each entry of `images` is a dict with a
        # "page_number" key (e.g. as reloaded from JSON) — confirm.
        "page_number": images[i]["page_number"],
    })
    if i % 10 == 0:
        print(f"{i}/{len(images)}")
    # Advance only after the summary is stored, so a failed request can be
    # retried by re-running the cell.
    i += 1

In [None]:
# Persist the collected image summaries; the list to write is
# `imagies_summary`, not `texts`.
with open("./extracted_data/imagies_summary.json", "w", encoding="utf-8") as file:
    json.dump(imagies_summary, file, ensure_ascii=False, indent=4)