In [5]:
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv().
# Kept as the cell's trailing expression so its return value is displayed.
dotenv_file = find_dotenv()
load_dotenv(dotenv_file)

True

In [6]:
import os
from pinecone import Pinecone, ServerlessSpec

# Read the Pinecone API key from the environment (None when the variable is unset)
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Client handle used by the later cells to list, create and open indexes
pc = Pinecone(api_key=pinecone_api_key)

  from tqdm.autonotebook import tqdm


In [7]:
import time

index_name = "gigachain-test-index-gpt-7"  # change if desired

# Names of the indexes currently reported by the Pinecone client
existing_indexes = [listed["name"] for listed in pc.list_indexes()]

# Create the index only when no index with this name is listed yet
index_is_missing = index_name not in existing_indexes
if index_is_missing:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # NOTE(review): presumably has to match the vector size produced by the
        # embedding model configured in a later cell - confirm before switching models
        dimension=1536,
        metric="cosine",
        spec=serverless_spec,
    )
    # Poll once per second until the freshly created index reports it is ready
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle to the (pre-existing or just created) index
index = pc.Index(index_name)

In [8]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Alternative embedding backend, currently disabled:
# from langchain_community.embeddings.gigachat import GigaChatEmbeddings
# embeddings = GigaChatEmbeddings(model="EmbeddingsGigaR")
embeddings = OpenAIEmbeddings()

# Vector store built from the Pinecone index handle and the embedding model above
vector_store = PineconeVectorStore(embedding=embeddings, index=index)

In [9]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader
import os
import tqdm
markdown_path = "mdx_docs/"

# One entry per loaded file: the list returned by loader.load().
# The (misspelled) name is kept as-is because a later cell reads it.
docuemnts = []
# Walk every directory under markdown_path; the progress bar advances once per directory
for dirpath, _subdirs, filenames in tqdm.tqdm(os.walk(markdown_path)):
    for filename in filenames:
        # Only markdown sources are loaded; everything else is skipped
        if not filename.endswith((".mdx", ".md")):
            continue
        loaded = UnstructuredMarkdownLoader(os.path.join(dirpath, filename)).load()
        for doc in loaded:
            # When the text starts with a "URL: ..." line, use that URL as the source
            header_line = doc.page_content.partition("\n")[0]
            if header_line.startswith("URL: "):
                doc.metadata['source'] = header_line[len("URL: "):]
        docuemnts.append(loaded)
print(f"Total documents: {len(docuemnts)}")

21it [00:02,  9.07it/s]

Total documents: 111





In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Flatten the per-file lists collected in `docuemnts` into one list of documents
docs_list = []
for per_file_docs in docuemnts:
    docs_list.extend(per_file_docs)

# Split the documents into chunks of up to 16000 characters with a 500 character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=500, chunk_size=16000)
doc_splits = text_splitter.split_documents(docs_list)
print(f"Documents splited. Count: {len(doc_splits)}")

Documents splited. Count: 116


# Обновим все ссылки на реальные URL

In [12]:
# Maps a local path prefix (file or directory under mdx_docs/) to the public URL
# that should be stored as the document source instead of the local path.
links = {
    "mdx_docs/main_faq.mdx": "https://giga.chat/help/articles/faq",
    "mdx_docs/faq/faq.mdx": "https://giga.chat/help/articles/faq",
    "mdx_docs/course/": "https://courses.sberuniversity.ru/llm-gigachat/",
    "mdx_docs/external/gigachat_readme.mdx": "https://github.com/ai-forever/gigachat",
    "mdx_docs/external/gigachain_readme.mdx": "https://github.com/ai-forever/gigachain",
    "mdx_docs/gigachat/prompts-hub/": "https://developers.sber.ru/docs/ru/gigachat/prompts-hub/overview",
    "mdx_docs/gigachat/api/": "https://developers.sber.ru/docs/ru/gigachat/api/reference/rest/gigachat-api",
    "mdx_docs/gigachat/individuals-quickstart.mdx": "https://giga.chat/help/articles/how-to-start-work-with-gigachat",
    "mdx_docs/gigachat_help_scrapper/b2b.mdx": "https://giga.chat/b2b",
    "mdx_docs/gigachat/models.mdx": "https://developers.sber.ru/docs/ru/gigachat/models",
    "mdx_docs/gigachat/certificates.mdx": "https://developers.sber.ru/docs/ru/gigachat/certificates",
    "mdx_docs/gigachat/prompt-design.mdx": "https://developers.sber.ru/docs/ru/gigachat/prompt-design",
    "mdx_docs/gigachat/about.mdx": "https://developers.sber.ru/docs/ru/gigachat/about",
    "mdx_docs/gigachat/limitations.mdx": "https://developers.sber.ru/docs/ru/gigachat/limitations",
    "mdx_docs/gigachat_help_scrapper/how_to_start.mdx": "https://giga.chat/help/articles/how-to-start-work-with-gigachat",
    "mdx_docs/gigachat_help_scrapper": "https://developers.sber.ru/portal/products/gigachat-api",
    "mdx_docs/gigachat/legal-quickstart.mdx": "https://developers.sber.ru/docs/ru/gigachat/legal-quickstart",
    "mdx_docs/gigachat/changelog/latest.mdx": "https://developers.sber.ru/docs/ru/gigachat/changelog/latest"
}


def _resolve_source_url(source, prefix_to_url):
    """Return the URL for the most specific prefix that `source` starts with.

    Args:
        source: Local source path stored in a document's metadata.
        prefix_to_url: Mapping of path prefix -> public URL.

    Returns:
        The URL mapped to the longest matching prefix, or None when no
        prefix matches.
    """
    matching_prefixes = [prefix for prefix in prefix_to_url if source.startswith(prefix)]
    if not matching_prefixes:
        return None
    # Longest prefix wins, so a file-level entry (e.g. ".../b2b.mdx") is not
    # overridden by the directory-level entry it also matches.
    return prefix_to_url[max(matching_prefixes, key=len)]


for doc in doc_splits:
    source = doc.metadata["source"]
    # Sources that are already absolute URLs need no remapping
    if source.startswith("https://"):
        continue
    url = _resolve_source_url(source, links)
    if url is None:
        print("!!! Not updated source link:", source)
    else:
        doc.metadata["source"] = url

In [13]:
from uuid import uuid4
# One freshly generated random id (as a string) per chunk
uuids = [str(uuid4()) for _chunk in doc_splits]

# Add the chunks to the vector store under those ids
vector_store.add_documents(ids=uuids, documents=doc_splits)

# Retriever view over the populated vector store
retriever = vector_store.as_retriever()
print("OK")

OK
