In [1]:
cd "/Users/sylviathsu/Documents/COS243/LocalAILibrarian"

/Users/sylviathsu/Documents/COS243/LocalAILibrarian


In [2]:
!pip install -q \
  llama-index \
  EbookLib \
  html2text \
  gradio \
  llama-index-embeddings-huggingface \
  llama-index-llms-ollama

In [3]:
import os
import logging
import gradio as gr
from llama_index.core import (
    SimpleDirectoryReader, 
    VectorStoreIndex, 
    StorageContext, 
    load_index_from_storage,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.node_parser import SentenceSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def setup_logging():
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    return logger

logger = setup_logging()

# Step 2: Document Processing
def process_documents(doc_folder):
    reader = SimpleDirectoryReader(input_dir=doc_folder)
    documents = reader.load_data()
    splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
    nodes = splitter.get_nodes_from_documents(documents)
    return nodes

# Step 3: Embedding Generation
def generate_embeddings(nodes):
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    vector_index = VectorStoreIndex.from_documents(nodes, embed_model=embed_model)
    storage_context = vector_index.storage_context
    storage_context.persist(persist_dir="storage")
    return vector_index

# Step 4: Query Engine Configuration
def configure_query_engine():
    llm = Ollama(model="phi3.5:3.8b-mini-instruct-q4_K_M")
    storage_context = StorageContext.from_defaults(persist_dir="storage")
    index = load_index_from_storage(storage_context, llm=llm)
    query_engine = index.as_query_engine(
        similarity_top_k=5,
        timeout=30  # Increase timeout to handle slow processing
    )
    return query_engine

# Step 5: Gradio Interface
def create_gradio_interface(query_engine):
    def query_docs(query, history):
        response = query_engine.query(query)
        sources = "\n".join([node.node.metadata.get('file_name', 'Unknown Source') for node in response.source_nodes])
        history.append((query, response.response + f"\nSources:\n{sources}"))
        return "\n".join([f"Q: {q}\nA: {a}" for q, a in history]), history

    with gr.Blocks() as app:
        history = gr.State([])
        with gr.Row():
            gr.Markdown("## Local AI Librarian")
        with gr.Row():
            with gr.Column():
                query = gr.Textbox(label="Enter your query")
                submit_btn = gr.Button("Search")
            with gr.Column():
                output = gr.Textbox(label="Results", lines=20)
        submit_btn.click(query_docs, inputs=[query, history], outputs=[output, history])
    app.launch()

# Main Execution
if __name__ == "__main__":
    try:
        logger.info("Processing documents...")
        doc_folder = "./library"
        nodes = process_documents(doc_folder)

        logger.info("Generating embeddings and saving index...")
        vector_index = generate_embeddings(nodes)

        logger.info("Configuring query engine...")
        query_engine = configure_query_engine()

        logger.info("Launching Gradio interface...")
        create_gradio_interface(query_engine)

    except Exception as e:
        logger.error(f"An error occurred: {e}")

INFO:__main__:Processing documents...
INFO:__main__:Generating embeddings and saving index...
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
ERROR:__main__:An error occurred: 'TextNode' object has no attribute 'get_doc_id'
