In [2]:
pip install llama_index.embeddings.huggingface

Collecting llama_index.embeddings.huggingface
  Downloading llama_index_embeddings_huggingface-0.6.1-py3-none-any.whl.metadata (458 bytes)
Downloading llama_index_embeddings_huggingface-0.6.1-py3-none-any.whl (8.9 kB)
Installing collected packages: llama_index.embeddings.huggingface
Successfully installed llama_index.embeddings.huggingface-0.6.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import nest_asyncio
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import MarkdownNodeParser # Use this for local-only
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Jupyter already runs an event loop; patch asyncio so nested loops are allowed
# and the async calls made below do not fail with "Event loop" errors.
nest_asyncio.apply()

# Load environment variables from a local .env file
# (presumably where the LlamaParse API key lives — confirm).
load_dotenv()

# 1. Parser: ask LlamaParse for markdown output and verbose job logging.
parser = LlamaParse(verbose=True, result_type="markdown")

# 2. Reader: every .pdf found under ./Data is routed through the parser above,
# so the directory reader drives the parsing instead of a hand-written loop.
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(file_extractor=file_extractor, input_dir="./Data")

def build_and_save(persist_dir="./storage", embed_model_name="all-MiniLM-L6-v2"):
    """Parse the PDFs, embed the resulting nodes locally, and persist a vector index.

    Loads documents through the module-level ``reader``, splits them with
    ``MarkdownNodeParser``, embeds every node with a HuggingFace model, builds
    a ``VectorStoreIndex`` from the nodes and writes it to ``persist_dir``.

    Args:
        persist_dir: Directory the index storage is written to.
        embed_model_name: Model name passed to ``HuggingFaceEmbedding``.

    Raises:
        ValueError: If the reader yields no documents, or the documents
            produce no nodes. Without this check an empty index would be
            saved and reported as a success.
    """
    print("Extracting PDFs via LlamaParse...")
    documents = reader.load_data()
    # NOTE(review): LlamaParse appears to swallow parse errors by default and
    # return no documents — confirm. Either way, stop before saving nothing.
    if not documents:
        raise ValueError(
            "No documents were loaded; check the input folder and the LlamaParse output above."
        )

    # 3. Use MarkdownNodeParser (Does NOT require OpenAI/LLM)
    node_parser = MarkdownNodeParser()
    nodes = node_parser.get_nodes_from_documents(documents)
    if not nodes:
        raise ValueError("The parsed documents produced no nodes; nothing to index.")

    # 4. Embed locally. One batched call replaces the former one-call-per-node
    # loop; the embedded text (node.get_content()) is the same as before.
    embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
    embeddings = embed_model.get_text_embedding_batch(
        [node.get_content() for node in nodes],
        show_progress=True,
    )
    for node, embedding in zip(nodes, embeddings):
        node.embedding = embedding

    # 5. Save the index to disk. The nodes already carry embeddings here;
    # embed_model is still passed, as in the original call.
    index = VectorStoreIndex(nodes, embed_model=embed_model)
    index.storage_context.persist(persist_dir=persist_dir)
    print(f"✅ Success! Knowledge base saved to {persist_dir}")

build_and_save()

  from .autonotebook import tqdm as notebook_tqdm


Extracting PDFs via LlamaParse...


2026-01-10 20:15:43,825 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id 1534ca02-6804-4fa8-a543-1b277b4f0881


2026-01-10 20:15:45,024 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1534ca02-6804-4fa8-a543-1b277b4f0881 "HTTP/1.1 200 OK"
2026-01-10 20:15:47,160 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1534ca02-6804-4fa8-a543-1b277b4f0881 "HTTP/1.1 200 OK"
2026-01-10 20:15:50,293 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1534ca02-6804-4fa8-a543-1b277b4f0881 "HTTP/1.1 200 OK"
2026-01-10 20:15:54,433 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1534ca02-6804-4fa8-a543-1b277b4f0881 "HTTP/1.1 200 OK"
2026-01-10 20:15:54,689 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/1534ca02-6804-4fa8-a543-1b277b4f0881/result/markdown "HTTP/1.1 200 OK"
2026-01-10 20:15:54,757 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


✅ Success! Knowledge base saved to ./storage
