In [1]:
import httpx

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_openai import ChatOpenAI

In [2]:
# Sentence-transformer embedding model shared by every vector store in this notebook.
embedding_function = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-m3",
)

In [4]:
import getpass

# Ask for the key interactively so it is never stored in the notebook source.
API_KEY = getpass.getpass("API Key: ")

In [5]:
# URL-prefix -> proxy URL mapping handed to the httpx client used by the LLM.
# NOTE(review): both proxy URLs are empty strings here — presumably redacted
# before sharing; fill them in (or drop the proxy) before running.
proxies = {
    'http://': '',
    'https://': '',
}

In [48]:
# Chat model used by every chain below; its HTTP traffic goes through a
# dedicated httpx client configured with `proxies`.
llm = ChatOpenAI(
    api_key=API_KEY,
    model='gpt-4o',
    http_client=httpx.Client(proxies=proxies),
)

In [5]:
from langchain_community.document_loaders import JSONLoader


def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the ``url`` and ``title`` fields of a JSON record into its metadata.

    Args:
        record: One raw record from the dataset.
        metadata: Metadata dict to extend; it is modified in place.

    Returns:
        The same ``metadata`` dict, now holding ``url`` and ``title``
        (``None`` for a field the record does not have).
    """
    for field in ("url", "title"):
        metadata[field] = record.get(field)
    return metadata


# One document per element of the top-level "data" array; the "description"
# field supplies the page content and `metadata_func` adds url/title.
loader = JSONLoader(
    file_path='./dataset_unique_epta.json',
    jq_schema='.data[]',
    content_key='description',
    text_content=False,
    metadata_func=metadata_func,
)
docs = loader.load()

# Split the loaded documents into non-overlapping chunks (size 500, measured
# with the tiktoken-based length function of the splitter).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
    chunk_overlap=0,
)
doc_splits = text_splitter.split_documents(docs)




In [7]:
# Embed the chunks and store them in the "bge-test" collection under ./chroma_db_test.
vectorstore = Chroma.from_documents(
    doc_splits,
    embedding=embedding_function,
    persist_directory='./chroma_db_test',
    collection_name='bge-test',
)

In [50]:
def _string_list_function(name: str, description: str, field: str) -> dict:
    """Build a function schema whose single required argument is a list of strings.

    Args:
        name: Function name the model is asked to call.
        description: Short description of the function.
        field: Name of the array-of-strings property in the arguments object.

    Returns:
        The schema dict with keys ``name``, ``description`` and ``parameters``.
    """
    return {
        "name": name,
        "description": description,
        "parameters": {
            "type": "object",
            "properties": {
                field: {
                    "type": "array",
                    "items": {"type": "string"},
                },
            },
            "required": [field],
        },
    }


# Function-calling schemas bound to the LLM by the chains below.
functions = [
    _string_list_function("atomic_topics", "Write atomic topics", "atomic_topics"),
    _string_list_function(
        "hypothetical_questions", "Generate hypothetical questions", "questions"
    ),
]

In [51]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

In [52]:
# Prompt (in Russian) for the hypothetical-question chain: the model is asked
# to act as someone who runs or wants to start a business and to produce
# 10-20 questions it might ask without knowing the text; `{doc}` is the only
# placeholder.
# NOTE(review): the text contains a typo ("бизнессом"); it is left untouched
# because editing the prompt changes what is sent to the model.
q_template = """Представь, что занимаешься бизнессом или хочешь начать заниматься им. 
Сгенерируй 10-20 вопросов, которые ты гипотетически мог задать, если бы не знал приведенный ниже текст:\n\n{doc}
Если текст большой вопросов может быть больше, если текст поменьше вопросов может быть и меньше."""

In [53]:
# Input text -> prompt -> LLM forced to call "hypothetical_questions" ->
# the value stored under the "questions" key of the function arguments.
q_chain = (
    RunnablePassthrough()
    | ChatPromptTemplate.from_template(q_template)
    | llm.bind(functions=functions, function_call={"name": "hypothetical_questions"})
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

In [54]:
# Prompt (in Russian) for the atomic-topics chain: the model is told to read
# the whole article, then extract and reorganise its text into separate
# per-topic articles; `{doc}` is the only placeholder.
# NOTE(review): the text contains typos ("частеей", "Извелеки"); they are
# left untouched because editing the prompt changes what is sent to the model.
atom_template = """Цель: Разделить длинную статью на несколько частеей отдельным темам для удобства чтения и возможности более детального анализа каждой статьи.

Инструкция:

1) Проанализируй статью: Прочитайте всю статью, чтобы получить общее представление о содержании и структуре.
2) Извелеки и реорганизуй текст: Извлеките текст, относящийся к каждой теме, и организуйте его в отдельные статьи.
\n\n{doc}"""

In [55]:
# Input text -> prompt -> LLM forced to call "atomic_topics" ->
# the value stored under the "atomic_topics" key of the function arguments.
atom_chain = (
    RunnablePassthrough()
    | ChatPromptTemplate.from_template(atom_template)
    | llm.bind(functions=functions, function_call={"name": "atomic_topics"})
    | JsonKeyOutputFunctionsParser(key_name="atomic_topics")
)

In [56]:
# Threshold (in estimated tokens) meant to be compared with calculate_tokens_num().
TOKEN_LIMIT = 1000


def calculate_tokens_num(_text: str) -> int:
    """Estimate the token count of ``_text`` as half its character count.

    Args:
        _text: Text to measure.

    Returns:
        ``len(_text)`` divided by two, rounded down.
    """
    return len(_text) // 2

In [57]:
def get_sub_docs(_doc):
    """Generate hypothetical-question sub-documents for one parent document.

    The article title (``_doc.metadata['title']``) and the page content are
    combined into one text, passed to ``q_chain``, and every item the chain
    returns becomes the page content of its own ``Document``.

    Args:
        _doc: Parent document; its ``metadata`` and ``page_content`` are read.

    Returns:
        A list with one ``Document`` per generated question. Each one carries
        its own shallow copy of the parent's metadata, so adding keys to a
        sub-document later does not write through to the parent document or
        to its sibling sub-documents.
    """
    # NOTE: a previous variant first split long texts (estimated size above
    # TOKEN_LIMIT) into atomic topics via atom_chain and asked for questions
    # per topic; that path is currently disabled and every text goes straight
    # to q_chain.
    content = f"Заголовок статьи: {_doc.metadata.get('title')}\nКонтент:\n{_doc.page_content}"
    total_questions = q_chain.invoke(content)

    # dict(...) gives every sub-document an independent metadata mapping
    # instead of sharing the parent's dict object.
    return [
        Document(page_content=_q, metadata=dict(_doc.metadata))
        for _q in total_questions
    ]

In [58]:
# Metadata key that links every child chunk to its parent document.
id_key = "doc_id"

# The vectorstore to use to index the child chunks.
# NOTE(review): this rebinds `vectorstore`, which an earlier cell assigned to
# the "bge-test" collection in the same persist directory.
vectorstore = Chroma(
    collection_name="hypo-questions",
    embedding_function=embedding_function,
    persist_directory='./chroma_db_test',
)

# The storage layer for the parent documents.
# NOTE(review): this store is in-memory while the vector store has a
# persist_directory — presumably parents must be re-added after a kernel
# restart; confirm.
store = InMemoryByteStore()

# The retriever (empty to start).
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)

# One fresh UUID per parent document, in the same order as `docs`.
doc_ids = [str(uuid.uuid4()) for _ in docs]

In [59]:
# Build the child documents: generated questions for every parent, each
# tagged with the parent's id under `id_key`.
sub_docs = []

for i, doc in enumerate(docs):
    print(i)  # progress indicator
    _id = doc_ids[i]
    _sub_docs = get_sub_docs(doc)
    for _doc in _sub_docs:
        _doc.metadata.update({id_key: _id})
    sub_docs += _sub_docs

In [None]:
# Index the generated questions in the vector store ...
retriever.vectorstore.add_documents(sub_docs)

# ... and store each parent document under its id in the docstore.
parent_pairs = list(zip(doc_ids, docs))
retriever.docstore.mset(parent_pairs)