In [11]:
import os
import nest_asyncio
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import MarkdownNodeParser # Use this for local-only
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Notebook kernels already run an asyncio event loop; patching it here lets
# nested async calls run instead of failing with "Event loop" errors.
nest_asyncio.apply()

# Load variables from a local .env file into the process environment
# (presumably where the LlamaParse API key lives -- confirm against your .env).
load_dotenv()

# 1. Parser: LlamaParse set up to return Markdown, with verbose logging enabled.
parser = LlamaParse(verbose=True, result_type="markdown")

# 2. Reader: scans ./Data and hands every ".pdf" file to the parser above.
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(file_extractor=file_extractor, input_dir="./Data")

def build_and_save(
    persist_dir: str = "./storage",
    model_name: str = "all-MiniLM-L6-v2",
) -> None:
    """Parse the PDFs, embed the resulting chunks locally, and persist the index.

    Loads documents through the module-level ``reader``, splits them into
    nodes with ``MarkdownNodeParser``, attaches an embedding to every node
    using a local HuggingFace model, builds a ``VectorStoreIndex`` from the
    nodes, and writes it to ``persist_dir``.

    Args:
        persist_dir: Directory the index storage is written to.
        model_name: Name of the HuggingFace embedding model to load.

    Raises:
        ValueError: If the reader returns no documents, or the documents
            produce no nodes. Without this check an empty index would be
            persisted and reported as a success.
    """
    print("Extracting PDFs via LlamaParse...")
    documents = reader.load_data()
    if not documents:
        raise ValueError(
            "No documents were loaded; check the input directory and the "
            "parser output before building the index."
        )

    # 3. Use MarkdownNodeParser (does NOT require OpenAI/LLM)
    node_parser = MarkdownNodeParser()
    nodes = node_parser.get_nodes_from_documents(documents)
    if not nodes:
        raise ValueError("The loaded documents produced no nodes to index.")

    # 4. Embed locally. One batched call replaces the per-node loop; the text
    # embedded for each node is still node.get_content(), as before.
    embed_model = HuggingFaceEmbedding(model_name=model_name)
    texts = [node.get_content() for node in nodes]
    embeddings = embed_model.get_text_embedding_batch(texts)
    for node, embedding in zip(nodes, embeddings):
        node.embedding = embedding

    # 5. Save the index to disk
    index = VectorStoreIndex(nodes, embed_model=embed_model)
    index.storage_context.persist(persist_dir=persist_dir)
    print(f"✅ Success! Knowledge base saved to {persist_dir}")

# Run the pipeline immediately when this cell executes: parse, chunk, embed, persist.
build_and_save()

Extracting PDFs via LlamaParse...


2026-01-10 19:19:52,465 - INFO - HTTP Request: POST https://api.cloud.llamaindex.ai/api/parsing/upload "HTTP/1.1 200 OK"


Started parsing the file under job_id c8413804-3895-45aa-8ec0-55c6593dc2f1


2026-01-10 19:19:53,619 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/c8413804-3895-45aa-8ec0-55c6593dc2f1 "HTTP/1.1 200 OK"
2026-01-10 19:19:55,760 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/c8413804-3895-45aa-8ec0-55c6593dc2f1 "HTTP/1.1 200 OK"
2026-01-10 19:19:58,910 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/c8413804-3895-45aa-8ec0-55c6593dc2f1 "HTTP/1.1 200 OK"
2026-01-10 19:19:59,148 - INFO - HTTP Request: GET https://api.cloud.llamaindex.ai/api/parsing/job/c8413804-3895-45aa-8ec0-55c6593dc2f1/result/markdown "HTTP/1.1 200 OK"
2026-01-10 19:19:59,189 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


✅ Success! Knowledge base saved to ./storage
