In [2]:
import json
import os

from langchain.schema import Document
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings

from plot_graph import weaviate_client


def process_pdfs(chunk, folder_path):
    """Load every ``.pdf`` file named in ``chunk`` from ``folder_path``.

    Args:
        chunk: Sized iterable of file names; names that do not end in
            ``.pdf`` are ignored.
        folder_path: Directory that the file names are joined onto.

    Returns:
        A flat list with the documents returned by ``PDFPlumberLoader.load()``
        for each PDF, in input order. A file that raises while loading is
        reported on stdout and skipped.
    """
    print(f'processing chunk: of len:{len(chunk)}')
    pdf_docs = []
    for file_name in chunk:
        if not file_name.endswith('.pdf'):
            continue
        try:
            # TODO: use tesseract to extract text from image-only PDFs.
            pdf_path = os.path.join(folder_path, file_name)
            docs = PDFPlumberLoader(pdf_path).load()
            if len(docs) < 1:
                print(f'warning: doc: {file_name} is empty')
            pdf_docs.extend(docs)
        except Exception as e:
            # Best effort: one broken PDF must not abort the whole chunk.
            print(f'erro: {e} ao processar pdf {file_name}')
    return pdf_docs


def save_docs_to_jsonl(array, file_path: str) -> None:
    """Write documents to ``file_path`` as JSON Lines, one document per line.

    Args:
        array: Iterable of objects exposing a ``json()`` method that returns
            the serialized document as a string.
        file_path: Destination path; an existing file is overwritten.

    The file is written as UTF-8 explicitly so that it round-trips with
    ``load_docs_from_jsonl``, which reads the file as UTF-8. Relying on the
    platform default encoding can fail or corrupt non-ASCII text.
    """
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')


def ensure_utf8(text):
    """Return ``text`` after a UTF-8 encode/decode round trip.

    Characters that cannot be encoded as UTF-8 are substituted by the
    ``errors='replace'`` handler during encoding, so the returned string is
    always encodable.
    """
    raw_bytes = text.encode('utf-8', errors='replace')
    return str(raw_bytes, 'utf-8')


def load_docs_from_jsonl(file_path):
    """Read a JSON Lines file and rebuild one ``Document`` per line.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON object per line.

    Returns:
        A list of ``Document`` objects in file order, each built by passing
        the decoded JSON object as keyword arguments.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    array = []

    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        # Stream the file line by line instead of materialising every line
        # with readlines(); the inputs here hold hundreds of thousands of rows.
        for line in jsonl_file:
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            # Normalise the line to UTF-8 so every character is valid.
            utf8_line = ensure_utf8(line)
            data = json.loads(utf8_line)
            array.append(Document(**data))

    return array


# # Load the documents and apply the conversion
# docs = load_docs_from_jsonl('../data_finish.jsonl')

# embeddings = HuggingFaceEmbeddings()
# text_splitter = SemanticChunker(embeddings)
# split_docs = text_splitter.split_documents(docs)
# 
# 
# save_docs_to_jsonl(split_docs, 'data_finish_semantic.jsonl')


In [3]:
# len(docs)

94130

In [3]:
# Load the split documents from the JSONL file in the working directory.
# Later cells read `split_docs`, so this cell must run before them.
split_docs = load_docs_from_jsonl('data_finish_semantic.jsonl')
# Last expression: display how many documents were loaded.
len(split_docs)

275852

In [7]:
import weaviate
from langchain_weaviate.vectorstores import WeaviateVectorStore

# NOTE(review): the recorded run of this cell failed with
# "module 'weaviate' has no attribute 'connect_to_local'" — presumably an
# older weaviate-client was installed at the time; confirm the environment.
weaviate_client = weaviate.connect_to_local()
try:
    embeddings = OllamaEmbeddings(model="llama3.1:8b", )
    # `docs` is only assigned in commented-out code, so it is undefined on a
    # fresh kernel; index `split_docs`, the collection this notebook loads.
    # NOTE(review): switch back if un-split documents were intended here.
    db = WeaviateVectorStore.from_documents(split_docs, embeddings, client=weaviate_client,
                                            index_name='protein_articles')
finally:
    # Release the connection even when embedding or upload fails.
    weaviate_client.close()


AttributeError: module 'weaviate' has no attribute 'connect_to_local'

In [10]:
import weaviate
from weaviate.collections.classes.config import DataType, Property, Configure

# Create the "ProteinCollectionSemantic" collection, vectorized through Ollama.
weaviate_client = weaviate.connect_to_local()
try:
    # Creating a collection that already exists raises, so guard the call to
    # keep this cell safe to re-run.
    if not weaviate_client.collections.exists("ProteinCollectionSemantic"):
        weaviate_client.collections.create(
            "ProteinCollectionSemantic",
            vectorizer_config=Configure.Vectorizer.text2vec_ollama(
                api_endpoint="http://ollama:11434",  # If using Docker, use this to contact your local Ollama instance
                model="mxbai-embed-large",  # The model to use, e.g. "nomic-embed-text"
            ),
            properties=[  # properties configuration is optional
                Property(name="title", data_type=DataType.TEXT, skip_vectorization=True),
                Property(name="body", data_type=DataType.TEXT),
                Property(name="page", data_type=DataType.INT, skip_vectorization=True),
                Property(name="doi", data_type=DataType.TEXT, skip_vectorization=True),
                Property(name="pk", data_type=DataType.TEXT, skip_vectorization=True),
                Property(name="proteins_structures", data_type=DataType.TEXT_ARRAY),
            ]
        )
finally:
    # Always release the connection, even if creation fails.
    weaviate_client.close()

In [11]:
import weaviate
from tqdm import tqdm

# Upload every document in `split_docs` into "ProteinCollectionSemantic".
weaviate_client = weaviate.connect_to_local()
try:
    collection = weaviate_client.collections.get("ProteinCollectionSemantic")

    with collection.batch.dynamic() as batch:
        # tqdm renders a progress bar over the documents.
        for src_obj in tqdm(split_docs, desc="Processando documentos", unit="doc"):
            try:
                weaviate_obj = {
                    "pk": src_obj.metadata.get("id", ''),
                    "doi": src_obj.metadata.get("doi", ''),
                    "title": src_obj.metadata.get("title", ''),
                    "body": src_obj.page_content,
                    "proteins_structures": src_obj.metadata.get("proteins_structures", []),
                }
                # "page" is declared as an INT property, so an empty-string
                # default does not match the schema. Send a real integer, or
                # leave the property unset when no usable page number exists.
                try:
                    weaviate_obj["page"] = int(src_obj.metadata.get("page"))
                except (TypeError, ValueError):
                    pass

                batch.add_object(
                    properties=weaviate_obj,
                )
            except Exception as err:
                print(f'Error: {err}, on id {src_obj.metadata.get("id", "")}')

    # Objects rejected during the batch import are collected by the client
    # rather than raised from add_object, so report them explicitly.
    failed_objects = collection.batch.failed_objects
    if failed_objects:
        print(f'{len(failed_objects)} objects failed to import; first error: {failed_objects[0].message}')
finally:
    # Release the connection even if the import is interrupted.
    weaviate_client.close()


Processando documentos: 100%|██████████| 275852/275852 [5:14:15<00:00, 14.63doc/s]  


In [None]:
from weaviate.collections.classes.grpc import MetadataQuery

# Run a hybrid query against the collection and print the top hits with the
# requested metadata fields.
weaviate_client = weaviate.connect_to_local()
try:
    collection = weaviate_client.collections.get("ProteinCollectionSemantic")

    response = collection.query.hybrid(
        query="what is mobile loop ?",
        return_metadata=MetadataQuery(score=True, explain_score=True, distance=True, is_consistent=True),
        limit=3,
        # include_vector=True,
    )

    for o in response.objects:
        # Only the first 100 characters of the body are shown.
        print(o.properties['body'][:100], '...')
        print(
            f'score: {o.metadata.score}, explain: {o.metadata.explain_score}, distance: {o.metadata.distance}, is_consistent: {o.metadata.is_consistent}')
        # print(o.vector["default"])
finally:
    # Close the connection even if the query or printing fails.
    weaviate_client.close()

In [None]:
# Print the first few stored objects as a sanity check of the import.
weaviate_client = weaviate.connect_to_local()
try:
    collection = weaviate_client.collections.get("ProteinCollectionSemantic")

    # enumerate replaces the hand-maintained counter; as before, four objects
    # are printed and iteration stops before a fifth is requested.
    for count, item in enumerate(collection.iterator(), start=1):
        print(item.uuid, item.properties)
        if count > 3:
            break
finally:
    # Close the connection even if iteration fails midway.
    weaviate_client.close()

In [None]:
from langchain_weaviate.vectorstores import WeaviateVectorStore

# Query the collection through the LangChain vector-store wrapper, embedding
# the query with the same model name the collection's vectorizer was
# configured with ("mxbai-embed-large").
weaviate_client = weaviate.connect_to_local()
try:
    db = WeaviateVectorStore(client=weaviate_client, index_name='ProteinCollectionSemantic',
                             embedding=OllamaEmbeddings(model="mxbai-embed-large", ), text_key='body')
    response = db.similarity_search("what is mobile loop ?", alpha=0.5, k=20)
finally:
    # Close the connection even if the search raises.
    weaviate_client.close()
# Last expression: number of documents returned.
len(response)

In [9]:
# WARNING: destructive — removes EVERY collection from the connected Weaviate
# instance, including the imported "ProteinCollectionSemantic" data. Do not
# include this cell in a "Run All" unless a full reset is intended.
weaviate_client.connect()
try:
    weaviate_client.collections.delete_all()
finally:
    # Close the connection even if the deletion fails.
    weaviate_client.close()

In [None]:
# Reconnect using the `weaviate_client` object left over from an earlier cell
# (this cell does not create one itself).
weaviate_client.connect()
try:
    collections = weaviate_client.collections.list_all()
finally:
    # Close the connection even if listing fails.
    weaviate_client.close()
# Last expression: display the value returned by list_all().
collections

In [None]:
# Build the retriever used by the RAG chain in the next cell. The client is
# not closed here: `retriever` is used again after this cell finishes.
weaviate_client = weaviate.connect_to_local()
query_embeddings = OllamaEmbeddings(model="mxbai-embed-large")
db = WeaviateVectorStore(
    client=weaviate_client,
    index_name='ProteinCollectionSemantic',
    embedding=query_embeddings,
    text_key='body',
)
retriever = db.as_retriever(search_kwargs={'k': 20})

# Smoke test: display how many documents come back for a sample question.
retrieved_docs = retriever.invoke('what is a mobile loop?')
len(retrieved_docs)


In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama


def format_docs(docs):
    """Render retrieved documents as plain text for the prompt's context slot.

    Each passage is prefixed with its source and page so the model can cite
    them, as the prompt requests. Without this step the retriever's list of
    ``Document`` objects is stringified into the prompt as-is.

    Args:
        docs: Iterable of objects with ``metadata`` (dict) and ``page_content``.

    Returns:
        One string with the passages separated by blank lines.
    """
    # Assumes the vector store exposes the collection's "title", "doi" and
    # "page" properties as metadata — TODO confirm; missing values fall back
    # to 'unknown'.
    return "\n\n".join(
        f"Source: {doc.metadata.get('title') or doc.metadata.get('doi') or 'unknown'} "
        f"(page {doc.metadata.get('page', 'unknown')})\n{doc.page_content}"
        for doc in docs
    )


prompt = """
    1. Use the following pieces of context to answer the question at the end.
            2. If you don't know the answer, just say "I don't know" but don't make up an answer on your own.
            3. Keep the answer crisp and limited to 3-4 sentences.
            
            Context: {context}
            
            Question: {question}
            
            After the answer, always say the source and page.
            Helpful Answer:
    """
llm = Ollama(model="llama3")
qa_chain_prompt = PromptTemplate.from_template(prompt)
# `retriever` comes from the previous cell and needs its Weaviate client to
# still be open when the chain is invoked.
rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | qa_chain_prompt
        | llm
        | StrOutputParser()
)

rag_chain.invoke('what is a mobile loop?')