In [104]:
import arxiv
import os
import json

def download_from_arxiv(key_word, max_results=10, destination_path="assets/arxiv/", saved_list_path="assets/arxiv/papers.json"):
    """Download the most recently submitted arXiv papers matching a query.

    Papers whose arXiv ID (the last path segment of ``result.entry_id``) is
    already recorded in the JSON file at ``saved_list_path`` are skipped.
    The IDs of newly downloaded papers are appended to that list, which is
    written back to ``saved_list_path``.

    Args:
        key_word: Search query; converted to ``str`` before being sent.
        max_results: Maximum number of search results to request.
        destination_path: Directory the PDF files are saved into.
        saved_list_path: JSON file holding the list of downloaded arXiv IDs.

    Returns:
        None. Progress is reported with ``print``.
    """
    client = arxiv.Client()

    search = arxiv.Search(
        query=str(key_word),
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )

    # Create the output locations up front so a fresh checkout does not fail
    # when the first PDF or the ID list is written.
    if destination_path:
        os.makedirs(destination_path, exist_ok=True)
    saved_list_dir = os.path.dirname(saved_list_path)
    if saved_list_dir:
        os.makedirs(saved_list_dir, exist_ok=True)

    # Load the list of already downloaded arXiv IDs, if one was saved before.
    if os.path.exists(saved_list_path):
        with open(saved_list_path, "r", encoding="utf-8") as f:
            downloaded_arxiv_ids = json.load(f)
    else:
        downloaded_arxiv_ids = []
    # Set copy for O(1) membership checks; the list keeps the saved order.
    known_ids = set(downloaded_arxiv_ids)

    try:
        for result in client.results(search):
            arxiv_id = result.entry_id.split("/")[-1]

            # Skip papers that are already in the downloaded list.
            if arxiv_id in known_ids:
                print(f"Статья {result.entry_id} уже скачана и пропущена.")
                continue

            # Download the paper's PDF file.
            pdf_path = result.download_pdf(dirpath=destination_path)
            if pdf_path:
                print(f"Статья {result.entry_id} успешно скачана и сохранена в {pdf_path}")
                downloaded_arxiv_ids.append(arxiv_id)
                known_ids.add(arxiv_id)
    finally:
        # Persist the list even if a download or the search raised, so the
        # IDs fetched so far are not downloaded again on the next run.
        with open(saved_list_path, "w", encoding="utf-8") as f:
            json.dump(downloaded_arxiv_ids, f)

In [105]:
# Search topics; each one is sent to arXiv as a separate query.
key_words = [
    "NLP",
    "RAG",
    "ChatBot",
    "LLM",
    "Speech Recognition",
    "LangChain",
    "LLM Agents",
]

# Example usage: request up to 10 results per topic.
for key_word in key_words:
    download_from_arxiv(key_word, max_results=10)

Статья http://arxiv.org/abs/2404.15238v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.15104v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.14977v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.14963v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.14943v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.15382v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.14809v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.14740v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.14695v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.14631v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.15488v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.14043v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.13948v1 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.13892v2 уже скачана и пропущена.
Статья http://arxiv.org/abs/2404.13781v1 уже скачана и пропущена.
Статья htt

In [109]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores.faiss import FAISS
from langchain_core.documents import Document

In [110]:
from typing import Any, Coroutine, List


class HuggingFaceE5Embeddings(HuggingFaceEmbeddings):
    """HuggingFace embeddings that add ``"query: "`` / ``"passage: "`` prefixes.

    Queries are embedded as ``"query: <text>"`` and documents as
    ``"passage: <text>"``; everything else is delegated to the parent class.
    """

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string after prepending ``"query: "``."""
        return super().embed_query(f"query: {text}")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed document strings after prepending ``"passage: "`` to each."""
        return super().embed_documents([f"passage: {text}" for text in texts])

    async def aembed_query(self, text: str) -> List[float]:
        """Async variant of :meth:`embed_query`.

        Runs the synchronous method in a worker thread so the prefix is added
        exactly once. Prefixing here and then calling the parent's async
        method would add it twice whenever the parent implements its async
        method by calling ``self.embed_query``.
        """
        import asyncio

        return await asyncio.to_thread(self.embed_query, text)

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Async variant of :meth:`embed_documents`.

        Runs the synchronous method in a worker thread so the prefix is added
        exactly once (see :meth:`aembed_query`).
        """
        import asyncio

        return await asyncio.to_thread(self.embed_documents, texts)

In [111]:
# Embedding model used for the vector index: the prefix-adding wrapper
# around the "intfloat/multilingual-e5-base" checkpoint.
embedding = HuggingFaceE5Embeddings(
    model_name="intfloat/multilingual-e5-base",
)

In [112]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.vectorstores.faiss import FAISS
from langchain_community.embeddings.gigachat import GigaChatEmbeddings
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)

# embeddings = GigaChatEmbeddings(
#     credentials=credentials, scope="GIGACHAT_API_CORP", verify_ssl_certs=True
# )

# Splitter configuration: chunk size 10000 with an overlap of 3000.
# NOTE(review): chunks this large may exceed the embedding model's maximum
# input length and be truncated when embedded — confirm against the model.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=3000)

# Load every PDF found in the download directory and split it into chunks.
loader = PyPDFDirectoryLoader("assets/arxiv/")
documents = text_splitter.split_documents(loader.load())

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 66 0 (offset 0)
Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 80 0 (offset 0)
Ignoring wrong pointing object 85 0 (offset 0)
Ignoring wrong pointing object 232 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong p

In [113]:
# Build the FAISS vector store from the split chunks using the E5 embeddings.
faiss_db = FAISS.from_documents(
    documents,
    embedding=embedding,
)

In [114]:
import string

def tokenize(s: str) -> list[str]:
    """Split a sentence into lowercase words with ASCII punctuation removed.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted, and the result is split on any run of whitespace (spaces, tabs,
    newlines). Splitting on runs of whitespace keeps empty strings out of the
    token list and separates words that are divided only by a line break.

    Args:
        s: Text to tokenize.

    Returns:
        The list of tokens; empty for an empty or whitespace-only input.
    """
    without_punctuation = s.lower().translate(str.maketrans("", "", string.punctuation))
    return without_punctuation.split()

# Lexical retriever: BM25 over the same chunks, tokenized with `tokenize`, k=3.
bm25_retriever = BM25Retriever.from_documents(documents=documents, preprocess_func=tokenize, k=3)

# Dense retriever backed by the FAISS store, k=2.
embedding_retriever = faiss_db.as_retriever(search_kwargs={"k": 2})

# Hybrid retriever combining both; weights are 0.4 for the dense retriever
# and 0.6 for BM25 (same order as the `retrievers` list).
ensemble_retriever = EnsembleRetriever(
    retrievers=[embedding_retriever, bm25_retriever],
    weights=[0.4, 0.6],
)

In [199]:
from langchain.chains import RetrievalQA
from langchain.llms.gigachat import GigaChat
from pprint import pprint
from langchain.chains import LLMChain
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


# NOTE(review): `credentials` is not defined in any cell visible here; it must
# already exist in the kernel — confirm where it comes from and make sure it
# is not hardcoded.
llm = GigaChat(
    credentials=credentials,
    scope="GIGACHAT_API_CORP",
    verify_ssl_certs=False,  # NOTE(review): certificate verification is off — confirm this is intentional
    temperature=0.5,
    max_tokens=200,
)

# "stuff" chain: retrieved chunks are passed to the LLM together with the question.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=ensemble_retriever)

# Terms to expand; the list is interpolated into the prompt as its Python repr.
key_words = ['Natural Language Processing', 'Retrieval-Augmented Generation']
pprint(qa.invoke(f"Придумай 4 похожих термина {key_words}. Не повторяйся"))

{'query': "Придумай 4 похожих термина ['Natural Language Processing', "
          "'Retrieval-Augmented Generation']. Не повторяйся",
 'result': '1. Natural Language Processing (NLP)\n'
           '2. Machine Translation (MT)\n'
           '3. Information Retrieval (IR)\n'
           '4. Text Mining (TM)'}


In [None]:
# Ask the QA chain a free-form question and pretty-print the result.
# NOTE(review): "RAD" in the prompt may be a typo for "RAG" — confirm before changing it.
pprint(qa.invoke("Что такое RAD. Напиши пример кода"))

In [96]:
# Debug dump of the whole chain configuration.
# WARNING: the repr of `qa` includes the GigaChat `credentials` value in plain
# text — clear this cell's output before sharing or committing the notebook.
pprint(qa)

RetrievalQA(combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=GigaChat(credentials='<REDACTED>', scope='GIGACHAT_API_CORP', verify_ssl_certs=False, temperature=1.5, _client=<gigachat.GigaChat object at 0x7fb9119ed8d0>)), document_variable_name='context'), retriever=EnsembleRetriever(retrievers=[VectorStoreRetriever(tags=['FAISS', 'HuggingFaceE5Embeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7fb8bffe93c0>, search_kwargs={'k': 2}), BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7fb8db1c19f0>, k=3, preprocess_func=<function tokenize at 0x7fbab4c817e0