In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

In [None]:
# Restrict parsing to the main article container so navigation, sidebars and
# footers never reach the splitter. The class string was found by inspecting
# the page HTML and must be updated if the docs site changes its markup.
bs4_strainer = bs4.SoupStrainer(class_='theme-doc-markdown markdown')

loader = WebBaseLoader(
    web_paths=['https://python.langchain.com/docs/how_to/document_loader_web/'],
    bs_kwargs={'parse_only': bs4_strainer},  # only parse the strained content
)
docs = loader.load()

# Fail fast when the strainer matched nothing: an empty document would otherwise
# flow silently through splitting, embedding and retrieval.
empty_sources = [doc.metadata.get('source') for doc in docs if not doc.page_content.strip()]
if not docs or empty_sources:
    raise ValueError(
        f'No content extracted for {empty_sources or "any page"}; '
        'check the URL and the SoupStrainer class name.'
    )

# Show a short preview instead of dumping the full page text into the output.
print(f'{len(docs)} document(s), {sum(len(doc.page_content) for doc in docs)} characters in total')
print(docs[0].page_content[:500])




In [52]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded page(s) into overlapping chunks for embedding.
# add_start_index=True stores each chunk's offset in its metadata as 'start_index'.
splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=100,
    chunk_size=1300,
)
all_splits = splitter.split_documents(docs)

In [53]:
# Report the chunk count and preview only the first few chunks; echoing the
# whole list floods the notebook output with the full page text.
print(f'{len(all_splits)} chunks')
all_splits[:3]

[Document(metadata={'source': 'https://python.langchain.com/docs/how_to/document_loader_web/', 'start_index': 0}, page_content='How to load web pages\nThis guide covers how to load web pages into the LangChain Document format that we use downstream. Web pages contain text, images, and other multimedia elements, and are typically represented with HTML. They may include links to other pages or resources.\nLangChain integrates with a host of parsers that are appropriate for web pages. The right parser will depend on your needs. Below we demonstrate two possibilities:\n\nSimple and fast parsing, in which we recover one Document per web page with its content represented as a "flattened" string;\nAdvanced parsing, in which we recover multiple Document objects per page, allowing one to identify and traverse sections, links, tables, and other structures.'),
 Document(metadata={'source': 'https://python.langchain.com/docs/how_to/document_loader_web/', 'start_index': 727}, page_content='Setup\u2

In [54]:
from langchain_ollama import OllamaEmbeddings

# Embedding model used for both indexing and querying.
# NOTE(review): presumably 'all-minilm' must already be available to the Ollama server — confirm.
embedding = OllamaEmbeddings(
    model='all-minilm',
)


In [55]:
from langchain_chroma import Chroma

# Without a persist directory Chroma keeps the collection in process memory, and
# that store outlives this cell: re-running it would append the same chunks again
# and the retriever would start returning duplicates. Drop any previous copy of
# the collection first so the cell gives the same index on every run.
COLLECTION_NAME = 'document_loader_web'
Chroma(collection_name=COLLECTION_NAME, embedding_function=embedding).delete_collection()

# Embed every chunk and index it in a fresh collection.
vector_db = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding,
    collection_name=COLLECTION_NAME,
)

In [56]:
query = 'how to use web document loader in python'

# Similarity search over the vector store, keeping the 3 closest chunks.
retriever = vector_db.as_retriever(
    search_kwargs={'k': 3},
    search_type='similarity',
)
retrieved_docs = retriever.invoke(query)
retrieved_docs

[Document(id='116d60a1-082d-4caf-9994-272b58747a01', metadata={'source': 'https://python.langchain.com/docs/how_to/document_loader_web/', 'start_index': 7486}, page_content='As an example, below we load the content of the "Setup" sections for two web pages:\nfrom typing import Listfrom langchain_core.documents import Documentasync def _get_setup_docs_from_url(url: str) -> List[Document]:    loader = UnstructuredLoader(web_url=url)    setup_docs = []    parent_id = -1    async for doc in loader.alazy_load():        if doc.metadata["category"] == "Title" and doc.page_content.startswith("Setup"):            parent_id = doc.metadata["element_id"]        if doc.metadata.get("parent_id") == parent_id:            setup_docs.append(doc)    return setup_docspage_urls = [    "https://python.langchain.com/docs/how_to/chatbots_memory/",    "https://python.langchain.com/docs/how_to/chatbots_tools/",]setup_docs = []for url in page_urls:    page_setup_docs = await _get_setup_docs_from_url(url)    set

In [57]:
# Join the retrieved chunks with a blank line between them. An empty separator
# glues the last word of one chunk to the first word of the next, which blurs
# chunk boundaries in the prompt.
context = '\n\n'.join(doc.page_content for doc in retrieved_docs)

In [58]:
# Echo the assembled context string to inspect what was built from the retrieved chunks.
context

'As an example, below we load the content of the "Setup" sections for two web pages:\nfrom typing import Listfrom langchain_core.documents import Documentasync def _get_setup_docs_from_url(url: str) -> List[Document]:    loader = UnstructuredLoader(web_url=url)    setup_docs = []    parent_id = -1    async for doc in loader.alazy_load():        if doc.metadata["category"] == "Title" and doc.page_content.startswith("Setup"):            parent_id = doc.metadata["element_id"]        if doc.metadata.get("parent_id") == parent_id:            setup_docs.append(doc)    return setup_docspage_urls = [    "https://python.langchain.com/docs/how_to/chatbots_memory/",    "https://python.langchain.com/docs/how_to/chatbots_tools/",]setup_docs = []for url in page_urls:    page_setup_docs = await _get_setup_docs_from_url(url)    setup_docs.extend(page_setup_docs)API Reference:Document\nfrom collections import defaultdictsetup_text = defaultdict(str)for doc in setup_docs:    url = doc.metadata["url"]   

In [59]:
from langchain_ollama.llms import OllamaLLM

# Local LLM used to answer the question from the retrieved context.
llm = OllamaLLM(model='llama3.2:1b')

# Build the prompt from the question and the retrieved context. The text must
# end right after the context: any trailing character (such as a stray ')')
# is sent to the model as if it were part of the retrieved text.
prompt = f"""answer the question according to the context briefly: question:{query} context:{context}"""

response = llm.invoke(prompt)
response

'To use the WebDocumentLoader in Python context for "simple and fast" parsing:\n\n1. Install langchain-community and beautifulsoup4 if you haven\'t already:\n   ```bash\npip install -qU langchain-community beautifulsoup4\n```\n\n2. Import required libraries and adjust the `WebBaseLoader` to suit your needs.\n\n3. Load the web pages using the loader\'s `alazy_load()` method, specifying the URL of each page:\n\n```python\nimport bs4\n\nfrom langchain_community.document_loaders import WebBaseLoader\nfrom typing import List\n\n# Adjust the page URLs in \'page_urls\' list according to your needs\n\ndef _get_setup_docs_from_url(url: str) -> List[Document]:\n    loader = WebBaseLoader(web_paths=[url])\n    setup_docs = []\n    parent_id = -1\n    async for doc in loader.alazy_load():\n        if doc.metadata["category"] == "Title" and doc.page_content.startswith("Setup"):\n            parent_id = doc.metadata["element_id"]\n        if doc.metadata.get("parent_id") == parent_id:\n            s

In [None]:
from IPython.display import Markdown, display

# Render the model's answer as Markdown rather than showing the raw string
# with escaped newlines.
display(
    Markdown(response)
)

To use the WebDocumentLoader in Python context for "simple and fast" parsing:

1. Install langchain-community and beautifulsoup4 if you haven't already:
   ```bash
pip install -qU langchain-community beautifulsoup4
```

2. Import required libraries and adjust the `WebBaseLoader` to suit your needs.

3. Load the web pages using the loader's `alazy_load()` method, specifying the URL of each page:

```python
import bs4

from langchain_community.document_loaders import WebBaseLoader
from typing import List

# Adjust the page URLs in 'page_urls' list according to your needs

def _get_setup_docs_from_url(url: str) -> List[Document]:
    loader = WebBaseLoader(web_paths=[url])
    setup_docs = []
    parent_id = -1
    async for doc in loader.alazy_load():
        if doc.metadata["category"] == "Title" and doc.page_content.startswith("Setup"):
            parent_id = doc.metadata["element_id"]
        if doc.metadata.get("parent_id") == parent_id:
            setup_docs.append(doc)
    return setup_docs

page_urls = ["https://python.langchain.com/docs/how_to/chatbots_memory/"]

setup_docs = []
for url in page_urls:
    page_setup_docs = await _get_setup_docs_from_url(url)
    setup_docs.extend(page_setup_docs)

# Simple and fast text extraction
import bs4

docs = []  # Initialize an empty list to store the documents
async for doc in loader.alazy_load():
    docs.append(doc)  # Add each document from the loader's async iteration
    
assert len(docs) == 1  # Check if exactly one document is loaded
    
doc = docs[0]  # Access and print the first document
print(f"Page {doc.metadata['page_title']} ({doc.metadata['url']})")
print(doc.page_content[:500].strip())
```