In [1]:
import nest_asyncio

# Patch asyncio so event loops can be nested. Presumably required because the
# notebook kernel already runs a loop while the loaders below drive their own
# async page fetches -- TODO confirm against the installed loader version.
nest_asyncio.apply()

In [20]:
from langchain_huggingface import HuggingFaceEmbeddings


def get_embeddings_model(
    model_name: str = "intfloat/multilingual-e5-small",
) -> "HuggingFaceEmbeddings":
    """Build the HuggingFace embedding model used by the vector store.

    Args:
        model_name (str): Hugging Face model identifier forwarded to
            ``HuggingFaceEmbeddings``. Defaults to the checkpoint this
            notebook was originally written against, so existing
            zero-argument calls behave exactly as before.

    Returns:
        HuggingFaceEmbeddings: A newly constructed embeddings object for
        ``model_name``.

    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [27]:
from langchain_community.vectorstores import Chroma

# Vector store over the "test_collection" collection, persisted under
# ../chroma_data and embedded with the model from get_embeddings_model().
db = Chroma(
    collection_name="test_collection",
    embedding_function=get_embeddings_model(),
    persist_directory="../chroma_data",
)

In [31]:
# Sanity check on the opened store: its size (presumably the number of
# entries currently held in the collection -- verify against Chroma's __len__).
len(db)

In [None]:
from urllib.parse import urljoin
from pathlib import Path
from langchain_community.document_loaders import SitemapLoader

# Local sitemap file, resolved to an absolute path from the working directory.
file_path = Path("./sitemap-help.xml").absolute()

# Load the sitemap entries selected by filter_urls with the stock SitemapLoader.
# continue_on_failure=True presumably lets the loader skip pages that fail to
# fetch instead of aborting the whole run -- confirm in the loader docs.
docs = SitemapLoader(
    file_path,
    filter_urls=["https://www.rustore.ru/help/sdk/payments/defold"],
    is_local=True,
    continue_on_failure=True,
    default_parser="lxml",
).load()

In [16]:
import logging
from langchain_community.document_loaders import SitemapLoader

# Logger used by SitemapLoaderWithChromium below. Its info-level messages only
# show up if logging has been configured to emit INFO (Python's default
# threshold is WARNING).
logger = logging.getLogger(__name__)

class SitemapLoaderWithChromium(SitemapLoader):
    """SitemapLoader variant that fetches each page through headless Chromium.

    Only ``_fetch`` is overridden: instead of a plain HTTP request, the page is
    opened with Playwright and the HTML of the resulting page is returned.
    """

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        A failed attempt is retried up to ``retries`` times in total, sleeping
        ``cooldown * backoff**i`` seconds after the i-th failure (i starting at 0).
        If every attempt fails, the last exception is re-raised instead of being
        returned as page text, so an error message can never end up as a
        document's content.

        NOTE(review): re-raising relies on the base loader catching exceptions
        from ``_fetch`` and applying ``continue_on_failure``; confirm against the
        installed langchain_community version.

        Args:
            url (str): The URL to scrape.
            retries (int): Total number of attempts; values below 1 are
                treated as a single attempt.
            cooldown (int): Base delay in seconds between attempts.
            backoff (float): Multiplier applied to the delay after each failure.

        Returns:
            str: The scraped HTML content.

        Raises:
            Exception: Whatever Playwright raised on the final failed attempt.

        """
        # Local imports keep this cell self-contained, like the original.
        import asyncio

        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        attempts = max(1, retries)
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                for attempt in range(1, attempts + 1):
                    try:
                        page = await browser.new_page()
                        await page.goto(url)
                        results = await page.content()  # Simply get the HTML content
                    except Exception as e:
                        if attempt == attempts:
                            # Out of attempts: surface the failure to the caller.
                            raise
                        delay = cooldown * backoff ** (attempt - 1)
                        logger.warning(
                            "Error fetching %s (attempt %d/%d): %s. Retrying in %.1fs...",
                            url,
                            attempt,
                            attempts,
                            e,
                            delay,
                        )
                        await asyncio.sleep(delay)
                    else:
                        logger.info("Content scraped")
                        return results
            finally:
                # Always release the browser, on success and on failure alike.
                await browser.close()
        # Unreachable: the loop above either returns or re-raises.
        raise ValueError("retry count exceeded")

In [17]:
from pathlib import Path

# Local sitemap file, resolved to an absolute path from the working directory.
file_path = Path("./sitemap-help.xml").absolute()

# Same load as with the stock loader, but pages are fetched via the
# Chromium-backed subclass and failures are not to be skipped.
# NOTE(review): filter_urls entries look like they are matched as
# patterns/prefixes rather than exact URLs -- the recorded run fetched 5 pages
# for this single entry; confirm that is the intended selection.
docs = SitemapLoaderWithChromium(
    file_path,
    filter_urls=["https://www.rustore.ru/help/sdk/payments/defold"],
    is_local=True,
    continue_on_failure=False,
    default_parser="lxml",
).load()


Fetching pages:   0%|          | 0/5 [00:00<?, ?it/s][A
Fetching pages:  20%|##        | 1/5 [00:01<00:06,  1.52s/it][A
Fetching pages:  60%|######    | 3/5 [00:03<00:01,  1.03it/s][A
Fetching pages: 100%|##########| 5/5 [00:04<00:00,  1.13it/s][A


In [18]:
# Display the loaded documents (page text plus the sitemap metadata of each).
docs

[Document(page_content='\n\n\nДокументация RuStore\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nПерейти к основному содержимомуДокументация пользователейДокументация разработчиковRuStore SDKRuStore APIСценарии использованияРусскийРусскийEnglishСписок доступных SDKПлатежи in-app и подпискиСписок зависимостейKotlin/JavaUnityGodotReact NativeFlutterUnreal EngineDefoldИстория обновлений5.1.15.0.15.0.0Push-уведомленияУниверсальные push-уведомленияПодключение отзывов и оценокОбновление приложенияКарты и геосервисыInstall ReferrerRuStore DeeplinksИстория изменений SDKTask APIСовместимость с остальными SDKПлатежи in-app и подпискиDefoldDefold 🟦 История обновлений🟦 5.1.1🟦 5.0.1🟦 5.0.0Предыдущая страница5.0.0Следующая страницаИстория обновлений\n©\xa0VK,\xa02024\n\nПользовательское соглашение\nКонфиденциальность\n\n\n\n\n', metadata={'source': 'https://www.rustore.ru/help/sdk/payments/defold', 'loc': 'https://www.rustore.ru/help/sdk/payments/defold', 'changefreq': 'weekly', 'priority': '0.5'}),
 Document(pa