In [1]:
import os

# Make the parent folder the working directory so the relative file paths used
# below resolve as if the notebook lived in the base folder.
# NOTE: the move is relative, so executing this cell again climbs one more
# level; run it exactly once per kernel session.
os.chdir(os.pardir)

In [2]:
# Folder that holds config.json (read further down) for the processed corpus.
PROCESSED_DIR = "./processed_data/langchain_docs_docs"
# Model name handed to the LLM client below (previous choice kept in the trailing comment).
# NOTE(review): config.json lists 'models/gemini-1.5-flash' as 'llm_model' — confirm the difference is intended.
MODEL = "models/gemini-2.0-flash"# "llama-3.3-70b-versatile"
# NOTE(review): RATE_LIMIT is not referenced in the cells visible here — confirm it is still needed.
RATE_LIMIT = 10  # LLM req/min, -1 if no limit
# COLLECTION = 'simple_md_nodes'

In [3]:
import os
import time
import logging
import math
import asyncio
from pathlib import Path
from dotenv import load_dotenv
from IPython.display import display, Markdown

In [4]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter, MarkdownNodeParser
from llama_index.core import Settings
from llama_index.core.extractors import TitleExtractor, SummaryExtractor
from llama_index.llms.groq import Groq
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode, MetadataMode
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from llama_index.llms.google_genai import GoogleGenAI


In [5]:
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [6]:
from vertexai.preview import tokenization
import chromadb
from tqdm import tqdm
import json
import tiktoken

In [7]:

# from utilities import get_logger, setup_llm_logs
from utilities import create_custom_logger, get_large_files, setup_llm_logs, GeminiTokenizer, HuggingfaceTokenizer


In [8]:
load_dotenv()

True

In [9]:
# directory to store processed data like vector store, docstore etc
# PROCESSED_DIR = (Path('processed_data') / Path(INPUT_DIR).parts[-1]).as_posix()
# Path(PROCESSED_DIR).mkdir(exist_ok=True)
# PROCESSED_DIR

In [10]:
# config.json sits directly inside PROCESSED_DIR. Despite its name, `config_dir`
# is the path of the JSON file itself, not of a directory.
config_dir = Path(PROCESSED_DIR) / 'config.json'
config = json.loads(config_dir.read_text())

In [11]:
config

{'llm_model_provider': 'gemini',
 'llm_model': 'models/gemini-1.5-flash',
 'rate_limit': 15,
 'input_dir': 'data/langchain/docs/docs/',
 'output_dir': 'processed_data/langchain_docs_docs',
 'file_types': ['.md', '.mdx'],
 'vector_store': 'chroma',
 'chromadb_path': 'processed_data/langchain_docs_docs/chromadb',
 'chroma_collection': 'contextual',
 'doctsore_path': 'processed_data/langchain_docs_docs/docstore.json',
 'embedding_provider': 'GeminiEmbedding',
 'embedding_model': 'models/text-embedding-004',
 'tokenizer_provider': 'gemini',
 'tokenizer_model_name': 'gemini-1.5-flash-002',
 'max_node_tokens': 2000,
 'metadata_extractors': ['CustomDocumentContextExtractor'],
 'datetime': '2025-02-21T14:37:17.151125+00:00',
 'run_1_time': '2025-02-21T17:23:40.691277+00:00',
 'run_1_nodes': 1567,
 'run_2_time': '2025-02-22T07:25:50.461930+00:00',
 'run_2_nodes': 0,
 'run_3_time': '2025-02-22T17:18:28.706884+00:00',
 'run_3_nodes': 1063}

In [12]:
# llm = Groq(model=MODEL, api_key=os.environ['GROQ_API_KEY'], max_retries=3,  # Number of retry attempts
#     retry_on_rate_limit=True)
# Gemini LLM client built from MODEL; the API key is read from the GEMINI_API_KEY
# environment variable (a missing variable raises KeyError here).
# NOTE(review): `retry_on_rate_limit` mirrors the commented-out Groq call above —
# confirm GoogleGenAI accepts and honours this keyword.
llm = GoogleGenAI(
    model=MODEL,
    api_key=os.environ['GEMINI_API_KEY'], 
    max_retries=3,  # Number of retry attempts
    retry_on_rate_limit=False
)


In [13]:
def get_embed_model(config):
    """Build the embedding model described by the ingestion config.

    Args:
        config: Mapping with the keys 'embedding_provider' and 'embedding_model'.

    Returns:
        A HuggingFaceEmbedding or GoogleGenAIEmbedding instance, depending on
        the provider name.

    Raises:
        NotImplementedError: If the provider name is not recognised.
        KeyError: If a required config key is missing, or GEMINI_API_KEY is not
            set in the environment for the Google provider.
    """
    provider = config['embedding_provider']
    if provider == 'HuggingFaceEmbedding':
        from llama_index.embeddings.huggingface import HuggingFaceEmbedding
        return HuggingFaceEmbedding(model_name=config['embedding_model'])
    # config.json written by the ingestion run stores the provider as
    # 'GeminiEmbedding'; accept it as an alias so that config loads instead of
    # falling through to NotImplementedError.
    if provider in ('GoogleGenAIEmbedding', 'GeminiEmbedding'):
        from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
        return GoogleGenAIEmbedding(model_name=config['embedding_model'], api_key=os.environ['GEMINI_API_KEY'])
    raise NotImplementedError(f"Embedding provider {provider} invalid or not implemented")

In [14]:
embed_model = get_embed_model(config)

In [15]:
def get_tokenizer(config):
    """Return the `encode` callable of the tokenizer named in the config.

    Args:
        config: Mapping with the keys 'tokenizer_provider' ('gemini',
            'huggingface' or 'tiktoken') and 'tokenizer_model_name'.

    Returns:
        The bound `encode` method of the selected tokenizer.

    Raises:
        NotImplementedError: If the provider is none of the supported values.
    """
    provider = config['tokenizer_provider']

    if provider == 'gemini':
        gemini_tokenizer = GeminiTokenizer(model=config['tokenizer_model_name'])
        return gemini_tokenizer.encode

    if provider == 'huggingface':
        hf_tokenizer = HuggingfaceTokenizer(model=config['tokenizer_model_name'])
        return hf_tokenizer.encode

    if provider == 'tiktoken':
        # For tiktoken the configured "model name" is used as the encoding name.
        encoding = tiktoken.get_encoding(encoding_name=config['tokenizer_model_name'])
        return encoding.encode

    raise NotImplementedError(f"{provider} invalid or not implemented")
    
# NOTE(review): constructed with default arguments, so neither get_tokenizer()
# nor config['tokenizer_provider'] / config['tokenizer_model_name'] is used
# here — confirm this is intended. Later cells call `tokenizer.encode`.
tokenizer = GeminiTokenizer()

In [16]:
# Register the LLM, embedding model and tokenizer on the global llama_index Settings.
Settings.llm = llm
# NOTE(review): 32000 is a hard-coded context window — confirm it suits MODEL.
Settings.context_window = 32000
Settings.embed_model = embed_model
# The encode callable is registered, not the tokenizer object itself.
Settings.tokenizer = tokenizer.encode

In [17]:
# logger = get_logger(logger_name='LLMEvents', logfile_path='logs/rag_query.log')
# setup_llm_logs(logger=logger, tokenizer=tokenizer, short_inputs=False, short_outputs=True)

In [18]:
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler, simple_llm_handler

# Two callbacks are attached globally: a token counter (verbose, so usage is
# printed to the console) and the simple LLM handler.
llm_debugger = simple_llm_handler.SimpleLLMHandler()
token_counter = TokenCountingHandler(tokenizer=tokenizer.encode, verbose=True)

handlers = [token_counter, llm_debugger]
Settings.callback_manager = CallbackManager(handlers)

## Index from vector store

In [19]:
config

{'llm_model_provider': 'gemini',
 'llm_model': 'models/gemini-1.5-flash',
 'rate_limit': 15,
 'input_dir': 'data/langchain/docs/docs/',
 'output_dir': 'processed_data/langchain_docs_docs',
 'file_types': ['.md', '.mdx'],
 'vector_store': 'chroma',
 'chromadb_path': 'processed_data/langchain_docs_docs/chromadb',
 'chroma_collection': 'contextual',
 'doctsore_path': 'processed_data/langchain_docs_docs/docstore.json',
 'embedding_provider': 'GeminiEmbedding',
 'embedding_model': 'models/text-embedding-004',
 'tokenizer_provider': 'gemini',
 'tokenizer_model_name': 'gemini-1.5-flash-002',
 'max_node_tokens': 2000,
 'metadata_extractors': ['CustomDocumentContextExtractor'],
 'datetime': '2025-02-21T14:37:17.151125+00:00',
 'run_1_time': '2025-02-21T17:23:40.691277+00:00',
 'run_1_nodes': 1567,
 'run_2_time': '2025-02-22T07:25:50.461930+00:00',
 'run_2_nodes': 0,
 'run_3_time': '2025-02-22T17:18:28.706884+00:00',
 'run_3_nodes': 1063}

In [20]:
# Remember whether the ChromaDB path named in the config exists on disk.
chromadb_path = config['chromadb_path']
chromadb_found = os.path.exists(chromadb_path)

In [21]:
import subprocess

In [24]:
if chromadb_found:
    print('loading Index...')
    COLLECTION = config['chroma_collection']

    # Serve the persisted database over HTTP; the process handle is kept so the
    # server can be stopped at the end of the notebook.
    process = subprocess.Popen(["chroma", "run", "--path", chromadb_path])

    # The server needs a moment before it accepts connections. Poll until it
    # answers instead of connecting immediately and racing against its startup.
    chroma_client = None
    last_error = None
    for _attempt in range(30):
        if process.poll() is not None:
            raise RuntimeError(f"chroma server exited early with code {process.returncode}")
        try:
            chroma_client = chromadb.HttpClient()
            chroma_client.heartbeat()
            break
        except Exception as exc:  # typically "connection refused" while the server boots
            chroma_client = None
            last_error = exc
            time.sleep(1)
    if chroma_client is None:
        # Do not leave an orphaned server behind when giving up.
        process.terminate()
        raise TimeoutError("chroma server did not become ready in time") from last_error

    print('collections', chroma_client.list_collections())

    # remote_db = chromadb.PersistentClient(path=chromadb_path)

    chroma_collection = chroma_client.get_collection(COLLECTION)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    vector_index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )
    print('index loaded')
else:
    print(f"ChromaDB path {chromadb_path} does not exist.")


loading Index...
collections ['contextual']
index loaded


## Index from docstore

In [25]:
# Check for a persisted docstore next to the other processed artefacts.
docstore_path = f"{PROCESSED_DIR}/docstore_only_emb.json"
doctsore_found = Path(docstore_path).exists()
if not doctsore_found:
    print('docstore does not exist')
else:
    print(f"Loading docstore from {docstore_path}")

docstore does not exist


In [26]:
if doctsore_found:
    # Read the persisted docstore and collect every node it holds.
    docstore = SimpleDocumentStore.from_persist_path(docstore_path)
    nodes = [*docstore.docs.values()]
    print(f"{len(nodes)} nodes")
    # Render the first ten nodes as a quick visual check.
    for node in nodes[:10]:
        display(Markdown(node.text))
        print('------')


In [27]:
# Fall back to an index built from the docstore nodes when no ChromaDB path
# exists. The `doctsore_found` flag is tested because `docstore` and `nodes`
# are only assigned when the docstore file was found; testing `docstore`
# directly raises NameError otherwise.
if not os.path.exists(chromadb_path) and doctsore_found:
    storage_context_doctore = StorageContext.from_defaults(docstore=docstore)
    # summary_index = SummaryIndex(nodes, storage_context=storage_context_doctore)
    vector_index = VectorStoreIndex(nodes=nodes, storage_context=storage_context_doctore)

In [28]:
# Stop early with a clear message when there is nothing to build an index from.
# `doctsore_found` is used because `docstore` is undefined when the docstore
# file is missing, which previously turned this check into a NameError.
if not os.path.exists(chromadb_path) and not doctsore_found:
    raise Exception('Neither Chromadb nor docstore exist. Please check the paths and try again.')

## Simple Query and Chat

In [29]:
# vector_index.storage_context.persist('mystore2')

In [30]:
# get nodes info
nodes_info = chroma_collection.get()

In [31]:
nodes_info.keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'data', 'uris', 'included'])

In [42]:
query = 'how to use gemini as the llm in langchain?'

In [43]:
retr = vector_index.as_retriever(similarity_top_k=10)

In [44]:
retr_nodes = retr.retrieve(query)

Embedding Token Usage: 13


In [45]:
# Show id, generated context and text of every retrieved hit.
for node in retr_nodes:
    hit = node.node
    print(hit.node_id)
    print(hit.metadata['context'])
    print(hit.text)
    print('-'*20)


integrations/providers/google.mdx-4864-5207
LLMs section:  Describes how to access Google AI Gemini models using the GoogleGenerativeAI class within the Langchain framework.

### Google Generative AI

Access GoogleAI `Gemini` models such as `gemini-pro` and `gemini-pro-vision` through the `GoogleGenerativeAI` class.

Install python package.

```bash
pip install langchain-google-genai
```

See a [usage example](/docs/integrations/llms/google_ai).

```python
from langchain_google_genai import GoogleGenerativeAI
```
--------------------
integrations/providers/google.mdx-4855-4862
Section outlining different Large Language Models (LLMs) available for integration with Langchain, including Google's Gemini and models accessible through Vertex AI.

## LLMs
--------------------
integrations/providers/google.mdx-2099-2416
This section describes how to access Google's Gemini chat models via the Vertex AI platform using the `langchain-google-vertexai` package.

### Vertex AI

Access chat models li

In [38]:
chat_engine = vector_index.as_chat_engine()

In [40]:
resp = chat_engine.chat(query)

LLM Prompt Token Usage: 508
LLM Completion Token Usage: 52
** Messages: **
system: You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

## Tools

You have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.

You have access to the following tools:
> Tool Name: query_engine_tool
Tool Description: Useful for running a natural language query
against a knowledge base and get back a natural language response.

Tool Args: {"properties": {"input": {"title": "Input", "type": "string"}}, "required": ["input"], "type": "object"}



## Output Format

Please answer in the same language as the question and use the following format:

```
Thought: The current language of the user is: (user's language). I need to use a tool to help me answer

In [41]:
print(resp.response)

To use Google's Gemini models with Langchain, you can access them through the `GoogleGenerativeAI` class. First, install the `langchain-google-genai` Python package. Then, you can import and use the `GoogleGenerativeAI` class in your code. A usage example is available in the documentation.


In [63]:
resp.source_nodes#[0].node.text

[NodeWithScore(node=TextNode(id_='additional_resources/tutorials.mdx-38-149', embedding=None, metadata={'file_path': 'additional_resources/tutorials.mdx', 'file_name': 'tutorials.mdx', 'file_size': 3498, 'creation_date': '2025-02-21', 'last_modified_date': '2025-02-21', 'header_path': '/3rd Party Tutorials/ Tutorials/', 'context': 'This is a YouTube playlist tutorial on LangChain v0.1 from LangChain.ai, listed under the "Tutorials" section of a document compiling various LangChain learning resources.\n'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='additional_resources/tutorials.mdx', node_type='4', metadata={'file_path': 'additional_resources/tutorials.mdx', 'file_name': 'tutorials.mdx', 'file_size': 3

In [69]:
def print_source_nodes(source_nodes):
    """Print one summary line per scored node.

    Each line shows the score, the source document id, the relative position
    of the chunk inside its file (start offset as a percentage of the file
    size) and the start/end character offsets.

    Missing values do not abort the listing: a score of None, a start offset
    of None, or a missing or zero 'file_size' metadata entry is shown as 'n/a'.

    Args:
        source_nodes: Iterable of objects exposing `.score` and `.node`, where
            `.node` provides `ref_doc_id`, `start_char_idx`, `end_char_idx`
            and a `metadata` mapping.
    """
    for source_node in source_nodes:
        node = source_node.node
        start, end = node.start_char_idx, node.end_char_idx
        file_size = node.metadata.get('file_size')
        score = 'n/a' if source_node.score is None else f"{source_node.score:.3f}"
        if start is not None and file_size:
            pos = f"{start * 100 / file_size:.0f}%"
        else:
            pos = 'n/a'
        print('score:', score, 'doc:', node.ref_doc_id, 'pos:', pos, 'start', start, 'end', end)


print_source_nodes(resp.source_nodes)

score: 0.660 doc: additional_resources/tutorials.mdx pos: 1% start 38 end 149
score: 0.652 doc: introduction.mdx pos: 87% start 5372 end 5649


In [70]:
chat_engine.chat_history

[ChatMessage(role=<MessageRole.USER: 'user'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='what is Langchain?')]),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='Langchain is a framework that seems to be used for building LLM applications. One resource mentions a YouTube playlist tutorial on LangChain v0.1. Another resource mentions a tool called LangGraph that integrates with LangChain and helps build stateful LLM applications.')])]

In [47]:

# Query Data from the persisted index
query_engine = vector_index.as_query_engine()

print('staring query')

# The prompt promises that 'q' quits, so honour it (and skip blank input)
# instead of sending it to the index as if it were a real question.
query = input('Enter query (press q to quit): ').strip()
if query and query.lower() != 'q':
    response = query_engine.query(query)
    print(response)
else:
    print('query skipped')


print('finished')
print('yo')
# logger.info('hey')

staring query
Embedding Token Usage: 1
LLM Prompt Token Usage: 239
LLM Completion Token Usage: 17
** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
[Excerpt from document]
file_path: integrations/text_embedding/index.mdx
header_path: /
context: Frontmatter metadata for the document.
Excerpt:
-----
---
sidebar_position: 0
sidebar_class_name: hidden
---
-----

[Excerpt from document]
file_path: introduction.mdx
header_path: /
context: Frontmatter metadata for the document's introduction.
Excerpt:
-----
---
sidebar_position: 0
sidebar_class_name: hidden
---
-----
---------------------
Given the context informat

## Advanced

In [72]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.chat_engine import ContextChatEngine

In [73]:
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=8)

In [80]:
# Synthesizer that combines the retrieved chunks in "tree_summarize" mode.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# Query engine wiring the custom retriever to that synthesizer.
query_engine = RetrieverQueryEngine(retriever=retriever, response_synthesizer=response_synthesizer)

# Run a sample question and show the answer.
response = query_engine.query("how to use google gemini as the llm?")
print(response)

Embedding Token Usage: 11
LLM Prompt Token Usage: 3174
LLM Completion Token Usage: 450
** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information from multiple sources is below.
---------------------
[Excerpt from document]
file_path: integrations/providers/google.mdx
header_path: /Google/LLMs/
context: LLMs section:  Describes how to access Google AI Gemini models using the GoogleGenerativeAI class within the Langchain framework.
Excerpt:
-----
### Google Generative AI

Access GoogleAI `Gemini` models such as `gemini-pro` and `gemini-pro-vision` through the `GoogleGenerativeAI` class.

Install python package.

```bash
pip install langchain-google-

In [76]:
sum([len(sn.node.text) for sn in response.source_nodes])

1143

In [77]:
len(tokenizer.encode('\n'.join([sn.node.text for sn in response.source_nodes]))) 

317

In [78]:
len(tokenizer.encode(response.get_formatted_sources()))

336

In [79]:
# One line per source node: score, document id, relative position and offsets.
for source_node in response.source_nodes:
    src = source_node.node
    pct = src.start_char_idx * 100 / src.metadata['file_size']
    print(f"score: {source_node.score:.3f} doc: {src.ref_doc_id} pos: {pct:.0f}% start {src.start_char_idx} end {src.end_char_idx}")

score: 0.679 doc: additional_resources/tutorials.mdx pos: 1% start 38 end 149
score: 0.666 doc: changes/changelog/core.mdx pos: 0% start 0 end 16
score: 0.661 doc: introduction.mdx pos: 87% start 5372 end 5649
score: 0.658 doc: changes/changelog/langchain.mdx pos: 0% start 13 end 21
score: 0.657 doc: additional_resources/tutorials.mdx pos: 8% start 278 end 383
score: 0.656 doc: concepts/chat_models.mdx pos: 0% start 0 end 13
score: 0.655 doc: concepts/why_langchain.mdx pos: 0% start 0 end 603
score: 0.654 doc: versions/v0_2/overview.mdx pos: 0% start 29 end 39


In [34]:
ret = retriever.retrieve("how to download a webpage as an image")

In [36]:
# One line per retrieved node: score, document id, relative position and offsets.
for source_node in ret:
    src = source_node.node
    pct = src.start_char_idx * 100 / src.metadata['file_size']
    print(f"score: {source_node.score:.3f} doc: {src.ref_doc_id} pos: {pct:.0f}% start {src.start_char_idx} end {src.end_char_idx}")

score: 0.653 doc: Core/Link_Media.md pos: 60% start 8880 end 10869
score: 0.647 doc: Advanced/Overview.md pos: 41% start 6926 end 8449
score: 0.644 doc: Core/Link_Media.md pos: 75% start 11231 end 11573
score: 0.617 doc: Core/Crawler_Result.md pos: 86% start 12569 end 12887
score: 0.613 doc: Core/Content_Selection.md pos: 75% start 13747 end 16261
score: 0.610 doc: Advanced/File_Downloading.md pos: 54% start 5020 end 5446
score: 0.603 doc: Core/Crawler_Result.md pos: 84% start 12253 end 12565
score: 0.602 doc: Core/Link_Media.md pos: 29% start 4250 end 5104


In [51]:
# Chat engine that answers from the context fetched by `retriever`.
chat_engine = ContextChatEngine.from_defaults(
    verbose=True,
    system_prompt="You are a helpful AI assistant. Use the provided context to answer questions.",
    retriever=retriever,
)

In [None]:
# Ask the context chat engine the question. The message has to be passed to
# chat(); calling it without arguments raises TypeError and ignores `query`.
query = "how to download a webpage as an image"
response = chat_engine.chat(query)
print(response)


You can download a webpage as an image (screenshot) using Crawl4AI by setting the `screenshot` parameter to `True` in your crawler configuration. The screenshot will be stored as a base64-encoded PNG string in the `result.screenshot` attribute of the crawl result.

Here's an example:

```python
import os, asyncio
from base64 import b64decode
from crawl4ai import AsyncWebCrawler, CacheMode
async def main():
  async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
      url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
      cache_mode=CacheMode.BYPASS,
      screenshot=True
    )
    if result.success:
      # Save screenshot
      if result.screenshot:
        with open("wikipedia_screenshot.png", "wb") as f:
          f.write(b64decode(result.screenshot))
      print("[OK] Screenshot captured.")
    else:
      print("[ERROR]", result.error_message)
if __name__ == "__main__":
  asyncio.run(main())
```

This code snippet demonstrates how to capture a

In [54]:
ref_nodes = response.source_nodes

In [56]:

# One line per reference node: score, document id, relative position and offsets.
for source_node in ref_nodes:
    src = source_node.node
    pct = src.start_char_idx * 100 / src.metadata['file_size']
    print(f"score: {source_node.score:.3f} doc: {src.ref_doc_id} pos: {pct:.0f}% start {src.start_char_idx} end {src.end_char_idx}")

score: 0.653 doc: Core/Link_Media.md pos: 60% start 8880 end 10869
score: 0.647 doc: Advanced/Overview.md pos: 41% start 6926 end 8449
score: 0.644 doc: Core/Link_Media.md pos: 75% start 11231 end 11573
score: 0.617 doc: Core/Crawler_Result.md pos: 86% start 12569 end 12887
score: 0.613 doc: Core/Content_Selection.md pos: 75% start 13747 end 16261
score: 0.610 doc: Advanced/File_Downloading.md pos: 54% start 5020 end 5446
score: 0.603 doc: Core/Crawler_Result.md pos: 84% start 12253 end 12565
score: 0.602 doc: Core/Link_Media.md pos: 29% start 4250 end 5104


## Hybrid Retriever

In [None]:
# hybrid:
# from llama_index.retrievers import BM25Retriever, VectorIndexRetriever

# bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)
# vector_retriever = VectorIndexRetriever(index)
# from llama_index.retrievers import BaseRetriever

# class HybridRetriever(BaseRetriever):
#     def __init__(self, vector_retriever, bm25_retriever):
#         self.vector_retriever = vector_retriever
#         self.bm25_retriever = bm25_retriever

#     def _retrieve(self, query, **kwargs):
#         bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
#         vector_nodes = self.vector_retriever.retrieve(query, **kwargs)
#         all_nodes = []
#         node_ids = set()
#         for n in bm25_nodes + vector_nodes:
#             if n.node.node_id not in node_ids:
#                 all_nodes.append(n)
#                 node_ids.add(n.node.node_id)
#         return all_nodes

# hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

## Rerank

In [None]:
from llama_index.postprocessor.cohere_rerank import CohereRerank

# Cohere reranker configured to keep two nodes; the API key is read from the
# COHERE_API_KEY environment variable.
cohere_rerank = CohereRerank(
    api_key=os.environ['COHERE_API_KEY'],
    model="rerank-v3.5",
    top_n=2,
)

# Rerank the chat response's source nodes against the original question.
sorted_nodes = cohere_rerank.postprocess_nodes(
    nodes=ref_nodes,
    query_str="how to download a webpage as an image",
)

In [65]:

# One line per reranked node: score, document id, relative position and offsets.
for source_node in sorted_nodes:
    src = source_node.node
    pct = src.start_char_idx * 100 / src.metadata['file_size']
    print(f"score: {source_node.score:.3f} doc: {src.ref_doc_id} pos: {pct:.0f}% start {src.start_char_idx} end {src.end_char_idx}")

score: 0.243 doc: Advanced/Overview.md pos: 41% start 6926 end 8449
score: 0.242 doc: Core/Link_Media.md pos: 75% start 11231 end 11573


In [45]:
response = chat_engine.chat("now print the crawl result")
print(response)


```python
import os, asyncio
from base64 import b64decode
from crawl4ai import AsyncWebCrawler, CacheMode
async def main():
  async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
      url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
      cache_mode=CacheMode.BYPASS,
      screenshot=True
    )
    if result.success:
      # Save screenshot
      if result.screenshot:
        with open("wikipedia_screenshot.png", "wb") as f:
          f.write(b64decode(result.screenshot))
      print("[OK] Screenshot captured.")
      print(result) # Print the crawl result
    else:
      print("[ERROR]", result.error_message)
if __name__ == "__main__":
  asyncio.run(main())
```

This code will now print the entire `CrawlResult` object to the console, allowing you to inspect all its attributes, including `url`, `status_code`, `html`, `markdown_v2`, `media`, `links`, `screenshot`, and more.



In [50]:

# One line per source node of the follow-up answer: score, document id,
# relative position and offsets.
for source_node in response.source_nodes:
    src = source_node.node
    pct = src.start_char_idx * 100 / src.metadata['file_size']
    print(f"score: {source_node.score:.3f} doc: {src.ref_doc_id} pos: {pct:.0f}% start {src.start_char_idx} end {src.end_char_idx}")

score: 0.778 doc: API_Reference/CrawlResult.md pos: 82% start 14476 end 15782
score: 0.774 doc: API_Reference/CrawlResult.md pos: 33% start 5861 end 6017
score: 0.756 doc: Core/Crawler_Result.md pos: 89% start 13055 end 13688
score: 0.750 doc: Core/Simple_Crawling.md pos: 76% start 6673 end 6975
score: 0.746 doc: API_Reference/CrawlResult.md pos: 36% start 6429 end 6633
score: 0.745 doc: API_Reference/CrawlResult.md pos: 34% start 6021 end 6237
score: 0.744 doc: Core/Crawler_Result.md pos: 42% start 6084 end 8277
score: 0.739 doc: Core/Crawler_Result.md pos: 84% start 12253 end 12565


In [46]:
chat_engine.chat_history 

[ChatMessage(role=<MessageRole.USER: 'user'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='how to download a webpage as an image')]),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, additional_kwargs={}, blocks=[TextBlock(block_type='text', text='You can download a webpage as an image (screenshot) using Crawl4AI by setting the `screenshot` parameter to `True` in your crawler configuration. The screenshot will be stored as a base64-encoded PNG string in the `result.screenshot` attribute of the crawl result.\n\nHere\'s an example:\n\n```python\nimport os, asyncio\nfrom base64 import b64decode\nfrom crawl4ai import AsyncWebCrawler, CacheMode\nasync def main():\n  async with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(\n      url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",\n      cache_mode=CacheMode.BYPASS,\n      screenshot=True\n    )\n    if result.success:\n      # Save screenshot\n      if result.screenshot:\n        w

In [67]:
# Retrieve ten candidates, pass them through the Cohere reranker and stream
# the answer as it is generated.
query_engine = vector_index.as_query_engine(similarity_top_k=10, node_postprocessors=[cohere_rerank], streaming=True)

streaming_response = query_engine.query(
    "define a pydantic class 'ProductData' and extract the list of Product from a webpage by getting the title and content. write code."
)
streaming_response.print_response_stream()

```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
# Define schema
class ProductData(BaseModel):
  title: str
  content: str
# Create strategy
strategy = LLMExtractionStrategy(
  provider="ollama/llama2",
  schema=ProductData.schema(),
  instruction="Extract article details"
)
# Use with crawler
result = await crawler.arun(
  url="https://example.com/article",
  extraction_strategy=strategy
)
# Access extracted data
data = json.loads(result.extracted_content)
```

In [None]:
# Stop the chromadb server process: ask it to exit, give it time to shut down
# cleanly, and force-kill it only if it has not exited after the timeout.
# Calling kill() right after terminate() would defeat the graceful shutdown.
_chroma_server = globals().get('process')  # not set when the ChromaDB path was not found
if _chroma_server is not None and _chroma_server.poll() is None:
    _chroma_server.terminate()
    try:
        _chroma_server.wait(timeout=10)
    except subprocess.TimeoutExpired:
        _chroma_server.kill()
        _chroma_server.wait()