1. Create the Weaviate client
2. Load the file through the client created above (this step may need to change for the LlamaIndex integration -> see the Weaviate docs)
3. Search - reranking
4. Parse the results

In [1]:
import weaviate
import os

# Connect to the Weaviate instance on localhost. The OpenAI key is passed as
# an extra header; os.getenv yields None when OPENAI_API_KEY is not set.
weaviate_client = weaviate.Client(
    url="http://localhost:8080",
    additional_headers={"X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")},
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [2]:
# Call the Unstructured API served from a local Docker container. Start the container with:
# docker run -p 8000:8000 -d --rm --name unstructured-api -e UNSTRUCTURED_PARALLEL_MODE_THREADS=5 downloads.unstructured.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared

# Client for the locally hosted Unstructured API.
# Per the author's note, no key is needed because the hosted SaaS API is not
# used, so an empty string is passed for api_key_auth.
document_handle_client = UnstructuredClient(
    api_key_auth="",
    server_url="http://localhost:8000",
)

In [3]:
filename = "dockerstest/pdf/sample_file.pdf"

# Read the whole PDF into memory, then wrap the raw bytes for the upload request.
with open(filename, mode="rb") as pdf_file:
    pdf_bytes = pdf_file.read()

files = shared.Files(file_name=filename, content=pdf_bytes)

In [4]:
# Partition request for the uploaded file.
# NOTE: these settings are only an example; the author recommends the hi_res
# strategy whenever split_pdf_page is enabled.
req = shared.PartitionParameters(
    files=files,
    strategy="hi_res",
    chunking_strategy="by_title",
    coordinates=True,
    split_pdf_page=True,
)

In [5]:
# Send the partition request to the Unstructured API.
try:
    resp = document_handle_client.general.partition(req)
except Exception as e:
    # Report the failure, then re-raise. The following cells read `resp`, so
    # carrying on would either hit a NameError or silently reuse a stale
    # `resp` left over from an earlier run of this notebook.
    print("Exception :", e)
    raise
else:
    # Success path kept outside the try body so only the API call is guarded.
    print("Handled results :", len(resp.elements))

INFO: Splitting PDF by page on client. Using 5 threads when calling API.
INFO: Set UNSTRUCTURED_CLIENT_SPLIT_CALL_THREADS env var if you want to change that.
Handled results : 23


In [6]:
from llama_index.core.schema import TextNode

def integrate_unstructured_metadata_with_llama_index(element):
    """Split an Unstructured element dict into its text and a flat metadata dict.

    Every top-level key except ``text`` is copied, then the nested
    ``metadata`` dict is merged on top (nested keys win on a name clash).
    A ``coordinates`` dict that carries ``points`` is expanded into the flat
    keys ``flat_points``, ``coordinate_system``, ``layout_width`` and
    ``layout_height`` and then removed. A ``coordinates`` value that is
    missing, ``None``, or has no points is left untouched.

    ``element`` itself is not modified.

    Args:
        element: Mapping with a ``text`` key and, optionally, a nested
            ``metadata`` mapping (which may be ``None``).

    Returns:
        A ``(text, metadata)`` tuple.

    Raises:
        KeyError: If ``element`` has no ``text`` key.
    """
    text = element['text']

    metadata = {key: value for key, value in element.items() if key != 'text'}
    # The nested metadata may be present but None; treat that as empty
    # instead of failing inside dict.update().
    metadata.update(metadata.pop('metadata', None) or {})

    coordinates = metadata.get('coordinates')
    # Only a dict can carry points; a None/absent value means "no coordinates".
    points = coordinates.get('points') if isinstance(coordinates, dict) else None

    if points is not None:
        # Flatten [[x, y], ...] into [x, y, x, y, ...].
        metadata['flat_points'] = [coord for point in points for coord in point]

        # .get() keeps the flat keys present (as None) when a field is absent
        # instead of raising KeyError half-way through the flattening.
        metadata['coordinate_system'] = coordinates.get('system')
        metadata['layout_width'] = coordinates.get('layout_width')
        metadata['layout_height'] = coordinates.get('layout_height')

        del metadata['coordinates']

    return text, metadata

# Turn every partitioned element into a LlamaIndex TextNode carrying the
# flattened metadata produced by the helper above.
nodes = [
    TextNode(text=node_text, metadata=node_metadata)
    for node_text, node_metadata in map(
        integrate_unstructured_metadata_with_llama_index, resp.elements
    )
]

print(nodes)

[TextNode(id_='c1fe9d05-3456-4558-9279-5f6210bb1e1d', embedding=None, metadata={'type': 'CompositeElement', 'element_id': '28b187b9436cfc48154d3b50f0a387ad', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 1, 'filename': 'sample_file.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Revision Date\n\n6/18/2009\n\nRevision\n\nChange Reason\n\nRevise specification. Change Vanadium from 0.075\n\n0.100\n\nChanged Jessica Merczak | Keith Paarfusser\n\nApproved\n\nN TN\n\nAMERICAN NTN BEARING MFG. CORP. ELGIN PLANT\n\n1/4\n\nBRG. NO.\n\nHUB BEARINGS\n\nSTANDARDIZATION OF PRODUCT QUALITY\n\nMACHINE\n\nN/A\n\nNO.\n\n06-M-5010.005', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), TextNode(id_='3774b148-abae-44e8-91fb-ef8c33dd2d6a', embedding=None, metadata={'type': 'CompositeElement', 'element_id': 'd853de28175bb6f04912d94de8f18882', 

In [7]:
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker
from llama_index.core import StorageContext, VectorStoreIndex

In [8]:
# Build a vector index over the nodes, stored in Weaviate under the given index name.
vector_store = WeaviateVectorStore(
    index_name="Sample_pdf_document_4",
    weaviate_client=weaviate_client,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [None]:
# Retriever configured with similarity_top_k=5 (at most 5 nodes per query).
retriever = index.as_retriever(similarity_top_k=5)
# Last expression of the cell, so the notebook displays the retrieved nodes.
retriever.retrieve("What is chemical composition?")

In [10]:
# Run the retrieval and keep the results in `response` for inspection.
response = retriever.retrieve("What is chemical composition?")

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [None]:
# Inspect the metadata of the result at index 3 (the 4th retrieved node).
# NOTE(review): the index is hard-coded; presumably this raises IndexError
# when fewer than 4 nodes come back — confirm before reusing on other data.
print(response[3].metadata)

In [None]:
from llama_index.llms.openai import OpenAI

# Configure the LLM; it is handed to the query engine via `llm=llm` below.
llm = OpenAI(model="gpt-3.5-turbo")

In [None]:
# Reranker post-processor using the BAAI/bge-reranker-v2-m3 model, top_n=5.
rerank = FlagEmbeddingReranker(model="BAAI/bge-reranker-v2-m3", top_n=5)
# Query engine: retrieve 5 nodes, rerank them, then answer with `llm`.
# NOTE(review): similarity_top_k equals the reranker's top_n, so reranking
# presumably can only reorder the 5 retrieved nodes, never filter them —
# consider retrieving more candidates than top_n.
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5, node_postprocessors=[rerank]) ## from this point on, verify the llm setting.
# response = query_engine.query("What is chemical composition?")

In [None]:
# print(response)

In [None]:
# print(response.get_formatted_sources(length=200))

In [None]:
# Run a query through the reranking query engine; the JSON format is
# requested only through the wording of the query text itself.
resp2 = query_engine.query("specific non-metallic inclusion with json format")

In [None]:
# Show the sources behind the answer.
# NOTE(review): length=200 presumably truncates each source text to 200 characters — confirm.
print(resp2.get_formatted_sources(length=200))

In [None]:
# Print the query engine's response (its string form).
print(resp2)