In [1]:
import nest_asyncio

# Apply nest_asyncio's patch before any loader runs.
# NOTE(review): presumably needed so the sitemap loaders' async fetching can run
# inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

In [20]:
from langchain_huggingface import HuggingFaceEmbeddings


def get_embeddings_model(model_name: str = "intfloat/multilingual-e5-small"):
    """Build the HuggingFace embeddings object used by the vector store.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook originally hard-coded, so
            calling the function with no arguments behaves as before.

    Returns:
        A new ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [27]:
from langchain_community.vectorstores import Chroma

# Vector store handle: the "test_collection" collection persisted under
# ../chroma_data, embedding with the model returned by get_embeddings_model().
db = Chroma(
    collection_name="test_collection",
    persist_directory="../chroma_data",
    embedding_function=get_embeddings_model(),
)

In [31]:
# Sanity check: show the size reported by the vector store.
# NOTE(review): presumably the number of stored documents — confirm against Chroma.
len(db)

In [None]:
from urllib.parse import urljoin
from pathlib import Path
from langchain_community.document_loaders import SitemapLoader

# Absolute path of the sitemap file that sits next to this notebook's working directory.
file_path = Path("./sitemap-help.xml").absolute()

# Read the local sitemap with the stock SitemapLoader, limited by filter_urls,
# and keep going when an individual page fails (continue_on_failure=True).
docs = SitemapLoader(
    file_path,
    filter_urls=["https://www.rustore.ru/help/sdk/payments/defold"],
    is_local=True,
    continue_on_failure=True,
    default_parser="lxml",
).load()

In [16]:
import logging
from langchain_community.document_loaders import SitemapLoader

logger = logging.getLogger(__name__)

class SitemapLoaderWithChromium(SitemapLoader):
    """SitemapLoader variant that fetches every page with headless Chromium.

    Only ``_fetch`` is overridden: the page HTML is taken from a
    Playwright-driven browser (``page.content()``) instead of the parent
    class's own fetch implementation.
    """

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        A new headless Chromium instance is launched for each call and is always
        closed before returning, including when navigation fails or the task is
        cancelled.

        Args:
            url (str): The URL to scrape.
            retries (int): Accepted for signature compatibility with the parent
                ``_fetch``; not used by this implementation.
            cooldown (int): Accepted for signature compatibility; not used.
            backoff (float): Accepted for signature compatibility; not used.

        Returns:
            str: The HTML content of the page as reported by ``page.content()``.

        Raises:
            Exception: Any error raised by Playwright while launching the
                browser, opening the page, navigating, or reading the content
                is propagated instead of being returned as an ``"Error: ..."``
                string, so a failed fetch is never parsed as if it were page
                HTML.
                NOTE(review): the parent loader's fetch wrapper is expected to
                apply ``continue_on_failure`` to such exceptions — confirm
                against the installed langchain_community version.

        """
        # Imported lazily so the class can be defined without Playwright installed.
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url)
                html = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
                return html
            finally:
                # Runs on success, on error and on cancellation alike.
                await browser.close()

In [17]:
from pathlib import Path

# Absolute path of the sitemap file that sits next to this notebook's working directory.
file_path = Path("./sitemap-help.xml").absolute()

# Same sitemap and URL filter as the plain SitemapLoader run, but pages are
# fetched through SitemapLoaderWithChromium and failures are not skipped
# (continue_on_failure=False).
docs = SitemapLoaderWithChromium(
    file_path,
    filter_urls=["https://www.rustore.ru/help/sdk/payments/defold"],
    is_local=True,
    continue_on_failure=False,
    default_parser="lxml",
).load()


Fetching pages:   0%|          | 0/5 [00:00<?, ?it/s][A
Fetching pages:  20%|##        | 1/5 [00:01<00:06,  1.52s/it][A
Fetching pages:  60%|######    | 3/5 [00:03<00:01,  1.03it/s][A
Fetching pages: 100%|##########| 5/5 [00:04<00:00,  1.13it/s][A


In [18]:
# Display the loaded documents to inspect what the Chromium-based loader scraped.
docs

[Document(page_content='\n\n\n–î–æ–∫—É–º–µ–Ω—Ç–∞—Ü–∏—è RuStore\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n–ü–µ—Ä–µ–π—Ç–∏ –∫ –æ—Å–Ω–æ–≤–Ω–æ–º—É —Å–æ–¥–µ—Ä–∂–∏–º–æ–º—É–î–æ–∫—É–º–µ–Ω—Ç–∞—Ü–∏—è –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π–î–æ–∫—É–º–µ–Ω—Ç–∞—Ü–∏—è —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–æ–≤RuStore SDKRuStore API–°—Ü–µ–Ω–∞—Ä–∏–∏ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è–†—É—Å—Å–∫–∏–π–†—É—Å—Å–∫–∏–πEnglish–°–ø–∏—Å–æ–∫ –¥–æ—Å—Ç—É–ø–Ω—ã—Ö SDK–ü–ª–∞—Ç–µ–∂–∏ in-app –∏ –ø–æ–¥–ø–∏—Å–∫–∏–°–ø–∏—Å–æ–∫ –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–µ–πKotlin/JavaUnityGodotReact NativeFlutterUnreal EngineDefold–ò—Å—Ç–æ—Ä–∏—è –æ–±–Ω–æ–≤–ª–µ–Ω–∏–π5.1.15.0.15.0.0Push-—É–≤–µ–¥–æ–º–ª–µ–Ω–∏—è–£–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω—ã–µ push-—É–≤–µ–¥–æ–º–ª–µ–Ω–∏—è–ü–æ–¥–∫–ª—é—á–µ–Ω–∏–µ –æ—Ç–∑—ã–≤–æ–≤ –∏ –æ—Ü–µ–Ω–æ–∫–û–±–Ω–æ–≤–ª–µ–Ω–∏–µ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏—è–ö–∞—Ä—Ç—ã –∏ –≥–µ–æ—Å–µ—Ä–≤–∏—Å—ãInstall ReferrerRuStore Deeplinks–ò—Å—Ç–æ—Ä–∏—è –∏–∑–º–µ–Ω–µ–Ω–∏–π SDKTask API–°–æ–≤–º–µ—Å—Ç–∏–º–æ—Å—Ç—å —Å –æ—Å—Ç–∞–ª—å–Ω—ã–º–∏ SDK–ü–ª–∞—Ç–µ–∂–∏ in-app –∏ –ø–æ–¥–ø–∏—Å–∫–∏DefoldDefold üü¶ –ò—Å—Ç–æ—Ä–∏—è –æ–±–Ω