In [1]:
import os
import time
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from langchain_classic.chains import create_retrieval_chain 
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from langchain_core.documents import Document
from uuid import uuid4
from langchain_cohere import CohereRerank
from langchain_classic.retrievers import ContextualCompressionRetriever
from langchain_classic.retrievers.multi_query import MultiQueryRetriever

  from .autonotebook import tqdm as notebook_tqdm


# Create a new RAG from scratch (use this if the vector DB has not been created yet)

In [None]:
#loading and chunk the pdf file
def load_and_chunk_pdfs(file_paths):
    """Read each PDF, split its text into overlapping chunks and wrap them as Documents.

    The text of every page of one file is concatenated (one space is appended
    after each page) and split as a whole, so a chunk may span a page
    boundary. Each chunk's ``page`` metadata is the 1-based number of the page
    on which the chunk starts.

    Args:
        file_paths: Iterable of paths to PDF files.

    Returns:
        list[Document]: One Document per chunk, across all readable files.
        A file that raises while being opened or read is reported with a
        printed message and skipped.
    """
    documents = []

    # The splitter settings are identical for every file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=400,
        separators=["\n\n", "\n", ".", "?", "!", ";"],
        is_separator_regex=False
    )

    # Process the PDFs one by one (there may be more than one).
    for path in file_paths:
        page_texts = []    # text of each non-empty page, each followed by " "
        page_starts = []   # (offset into the combined text, 1-based page number)
        text_length = 0
        try:
            reader = PdfReader(path)
            # Extract the text of every page and remember where each page begins.
            for page_number, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                if text:
                    page_starts.append((text_length, page_number))
                    page_texts.append(text + " ")
                    text_length += len(text) + 1
        except Exception as e:
            print(f"   can't read the file, {path}: {e}")
            continue

        single_file_text = "".join(page_texts)

        # Chunk the whole document.
        page_chunk = text_splitter.split_text(single_file_text)

        # Map each chunk back to the page it starts on. The original code used
        # the page-loop variable after the loop had finished, which tagged
        # every chunk with the last page of the file.
        # Assumes the splitter returns chunks that occur verbatim in the text
        # and in document order; if a chunk cannot be located, the page of the
        # previous chunk is reused.
        search_from = 0
        page_cursor = 0
        for chunk in page_chunk:
            position = single_file_text.find(chunk, search_from)
            if position != -1:
                while (page_cursor + 1 < len(page_starts)
                       and page_starts[page_cursor + 1][0] <= position):
                    page_cursor += 1
                # Overlapping chunks start after the previous chunk's start.
                search_from = position + 1

            # Wrap the chunk in a Document; metadata carries source information.
            doc = Document(
                page_content=chunk,
                metadata={
                    "Source": "The Almanack of Naval Ravikant",
                    "page": page_starts[page_cursor][1],
                    "type": "book"
                }
            )
            documents.append(doc)

        print(f"   -> {path} created {len(page_chunk)} Document objects„ÄÇ")

    print(f"Finished. Total {len(documents)} objects.")
    return documents

#init the pinecone and upload documents
def init_vector_store(documents):
    """Create the Pinecone index if it is missing and upload ``documents`` to it.

    Reads the module-level settings ``EMBEDDING_MODEL``, ``INDEX_NAME`` and
    ``DIMENSION`` and the ``PINECONE_API_KEY`` environment variable.

    Vector ids are derived deterministically from the index name, the chunk's
    position and its text. Re-running this function with the same documents
    therefore overwrites the existing vectors instead of adding a second copy
    of every chunk, which is what happened with random ids.

    Args:
        documents: List of Document objects to embed and upload.

    Returns:
        PineconeVectorStore bound to the index.
    """
    from uuid import NAMESPACE_URL, uuid5

    # Embedding model used for both upload and later queries.
    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

    # Pinecone client.
    pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

    # Create the index only when it does not exist yet.
    existing_indexes = [i.name for i in pc.list_indexes()]
    if INDEX_NAME not in existing_indexes:
        print(f"Creating Pinecone Index: {INDEX_NAME} (Serverless)...")
        pc.create_index(
            name=INDEX_NAME,
            dimension=DIMENSION,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
        # Poll until the new index reports ready.
        # NOTE(review): there is no timeout here; a failed creation would loop forever.
        while not pc.describe_index(INDEX_NAME).status.ready:
            time.sleep(1)
    index = pc.Index(INDEX_NAME)

    print("uploading embeddings.")
    vector_store = PineconeVectorStore(index=index, embedding=embeddings)

    # Stable id per chunk: same position + same text -> same id, so an upload
    # that is repeated replaces vectors rather than duplicating them.
    ids = [
        str(uuid5(NAMESPACE_URL, f"{INDEX_NAME}/{position}/{doc.page_content}"))
        for position, doc in enumerate(documents)
    ]
    # Nothing to embed when every PDF failed to load.
    if documents:
        vector_store.add_documents(documents=documents, ids=ids)
    print(f"Index {INDEX_NAME} is set.")

    return vector_store

#create the rag chain
def setup_rag_chain(vector_store):
    """Build the question-answering chain on top of ``vector_store``.

    Pipeline: similarity retriever (k=15) -> MultiQueryRetriever driven by the
    chat model -> Cohere rerank keeping the top 5 -> GPT-4o answering with the
    Naval persona prompt.

    Args:
        vector_store: Object exposing ``as_retriever`` (a PineconeVectorStore
            in this notebook).

    Returns:
        The chain returned by ``create_retrieval_chain``. The notebook invokes
        it with ``{"input": question}`` and reads ``response["answer"]``.
    """
    # Persona and answering rules; {context} receives the retrieved chunks.
    # NOTE(review): the stuff-documents chain below is created with its default
    # document formatting - confirm the "page" metadata actually reaches the
    # prompt, since rule 2 asks the model to cite it.
    persona_rules = "\n".join([
        "You are an AI embodiment of **Naval Ravikant** based on his Almanack.",
        "Your goal is to provide the best advice that Naval most probably will provide to the people who ask you question.",
        "Style: Direct, concise, rational, and first-principles thinking.",
        "Instructions:",
        "1. Answer strictly based on the provided book context.",
        "2. If applicable, mention the page number from the metadata (e.g., [Page 42]).",
        "3. Use bullet points for lists, as Naval often does.",
        "4. Answer in Traditional Chinese.",
        "Context: {context}",
    ])
    chat_prompt = ChatPromptTemplate.from_messages(
        [("system", persona_rules), ("human", "{input}")]
    )

    # Chat model shared by query expansion and final answering.
    chat_model = ChatOpenAI(model="gpt-4o", temperature=0.1)

    # Stage 1: plain similarity search, 15 nearest chunks per query.
    similarity_retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 15},
    )

    # Stage 2: expand the user query into several variants via the chat model.
    expanded_retriever = MultiQueryRetriever.from_llm(
        retriever=similarity_retriever, llm=chat_model
    )

    # Stage 3: Cohere reranker (English model), keep the 5 best chunks.
    reranker = CohereRerank(model="rerank-english-v3.0", top_n=5)
    reranking_retriever = ContextualCompressionRetriever(
        base_compressor=reranker,
        base_retriever=expanded_retriever,
    )

    # Stage 4: stuff the surviving chunks into the prompt and wire retrieval to it.
    answer_chain = create_stuff_documents_chain(chat_model, chat_prompt)
    return create_retrieval_chain(reranking_retriever, answer_chain)

# --- Main execution block ---
if __name__ == "__main__":
    # Load the .env file
    load_dotenv()
    # Settings (read as module-level globals by init_vector_store)
    INDEX_NAME = "the-almanack-of-naval-ravikant-rag"
    EMBEDDING_MODEL = "text-embedding-3-large"
    DIMENSION = 3072
    # Path of the PDF to ingest
    path = r"your path"
    pdf_files = [path]
    
    # 1. read and chunk PDFs
    documents = load_and_chunk_pdfs(pdf_files)
    
    # 2. prepare the vector db
    vector_store = init_vector_store(documents)
    
    # 3. create the Rag chain
    rag_chain = setup_rag_chain(vector_store)
    
    # 4. test
    print("\n" + "="*30)
    print("RAG Bot is ready (enter 'exit' to end the chat)")
    print("="*30)
    
    while True:
        # Strip so that "exit " (trailing space) still ends the chat.
        query = input("\n your query: ").strip()
        if query.lower() in ["exit", "quit"]:
            break
        # A blank line would otherwise trigger a full retrieval + LLM round trip.
        if not query:
            continue
            
        print("Thinking...")
        response = rag_chain.invoke({"input": query})
        
        print(f"\n Answer:\n{response['answer']}")
        
        # Optional: print the retrieved chunks used as context.
        # print("\nReference (Context):")
        # for i, doc in enumerate(response["context"]):
        #     print(f"--- Chunk {i+1} ---")
        #     print(doc.page_content[:100] + "...")

   -> /Users/shenglienlee/Documents/GitHub/quick-start-guide-to-llms?tab=readme-ov-file/notebooks/My_project/doc/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf created 451 Document objects„ÄÇ
Finished. Total 451 objects.
Creating Pinecone Index: the-almanack-of-naval-ravikant-rag (Serverless)...
uploading embeddings.
Index the-almanack-of-naval-ravikant-rag is set.

RAG Bot is ready (enter 'exit' to end the chat)
Thinking...

 Answer:
Ë¶ÅÊìÅÊúâÊõ¥Â•ΩÁöÑ‰∫∫ÁîüÔºåÂèØ‰ª•ËÄÉÊÖÆ‰ª•‰∏ãÂπæÈªûÔºö

- **ÈÅ∏ÊìáÂπ∏Á¶è**ÔºöÂπ∏Á¶èÊòØ‰∏ÄÁ®ÆÈÅ∏ÊìáÂíåÊäÄËÉΩ„ÄÇ‰Ω†ÂèØ‰ª•ÈÄöÈÅéÊîπËÆäÂÖßÂøÉ‰æÜÊèêÂçáÂπ∏Á¶èÊÑüÔºåËÄå‰∏çÊòØ‰æùË≥¥Â§ñÈÉ®Áí∞Â¢ÉÁöÑÊîπËÆä„ÄÇ[Page 134]
- **ÂüπÈ§äËâØÂ•ΩÁøíÊÖ£**ÔºöÁî®Â•ΩÁöÑÁøíÊÖ£Âèñ‰ª£Â£ûÁøíÊÖ£Ôºå‰∏¶ËàáÁ©çÊ•µÂêë‰∏äÁöÑ‰∫∫‰∫§ÂæÄ„ÄÇ‰Ω†ÁöÑÁøíÊÖ£Âíå‰Ω†Âë®ÂúçÁöÑ‰∫∫ÊúÉÂΩ±Èüø‰Ω†ÁöÑÈï∑ÊúüÂπ∏Á¶è„ÄÇ[Page 140]
- **ÂÖßÂøÉÁöÑÂπ≥Âíå**ÔºöËøΩÊ±ÇÂÖßÂøÉÁöÑÂπ≥ÂíåÔºåËÄå‰∏çÊòØË©¶ÂúñËß£Ê±∫ÊâÄÊúâÂ§ñÈÉ®ÂïèÈ°å„ÄÇÊîæ‰∏ãÂ∞çÂïèÈ°åÁöÑÂü∑ËëóÔºåÊâçËÉΩÁç≤ÂæóÂÖßÂøÉÁöÑÂπ≥Èùú„ÄÇ[Page 143]
- **Èôç‰ΩéË∫´‰ªΩË™çÂêå

## About multiquery Retriever
假設您設定 search_kwargs={"k": 20}，且 MultiQuery 生成了 3 個問題：  
生成問題： GPT-4o 產生 Q1, Q2, Q3。  
分別搜尋：Q1 -> 去 Pinecone 抓 20 筆。  
Q2 -> 去 Pinecone 抓 20 筆。  
Q3 -> 去 Pinecone 抓 20 筆。  
原始總數： 總共抓回 $20 + 20 + 20 = 60$ 筆資料。  
關鍵步驟 - 去重 (Deduplication)：  
LangChain 會自動執行 「取聯集 (Union)」 的動作。情境： 如果「第 42 頁的某一段」在 Q1 被抓到，在 Q2 也被抓到。  
結果： 它只會算 1 筆，不會重複。  
最終送給 Reranker 的數量：  
最少： 20 筆 (如果 3 個問題抓回來的完全一模一樣)。  
最多： 60 筆 (如果 3 個問題抓回來的完全不重疊)。平均經驗值： 通常會落在 35 ~ 50 筆 之間。

# Use the existing RAG index

In [None]:
def setup_rag_chain(vector_store):
    """Build the question-answering chain on top of an existing ``vector_store``.

    Same pipeline as the ``setup_rag_chain`` defined in the index-creation
    cell (redefined here so this cell can run without it): similarity
    retriever (k=15) -> MultiQueryRetriever -> Cohere rerank (top 5) -> GPT-4o
    with the Naval persona prompt.

    Args:
        vector_store: Object exposing ``as_retriever`` (a PineconeVectorStore
            in this notebook).

    Returns:
        The chain returned by ``create_retrieval_chain``. The notebook invokes
        it with ``{"input": question}`` and reads ``response["answer"]``.
    """
    # Persona and answering rules; {context} receives the retrieved chunks.
    # NOTE(review): the stuff-documents chain below is created with its default
    # document formatting - confirm the "page" metadata actually reaches the
    # prompt, since rule 2 asks the model to cite it.
    persona_rules = "\n".join([
        "You are an AI embodiment of Naval Ravikant based on his Almanack.",
        "Your goal is to provide the best advice that Naval most probably will provide to the people who ask you question.",
        "Style: Direct, concise, rational, and first-principles thinking.",
        "Instructions:",
        "1. Answer strictly based on the provided book context.",
        "2. If applicable, mention the page number from the metadata (e.g., [Page 42]).",
        "3. Use bullet points for lists, as Naval often does.",
        "4. Answer in Traditional Chinese.",
        "Context: {context}",
    ])
    chat_prompt = ChatPromptTemplate.from_messages(
        [("system", persona_rules), ("human", "{input}")]
    )

    # Chat model shared by query expansion and final answering.
    chat_model = ChatOpenAI(model="gpt-4o", temperature=0.1)

    # Stage 1: plain similarity search, 15 nearest chunks per query.
    similarity_retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 15},
    )

    # Stage 2: expand the user query into several variants via the chat model.
    expanded_retriever = MultiQueryRetriever.from_llm(
        retriever=similarity_retriever, llm=chat_model
    )

    # Stage 3: Cohere reranker (English model), keep the 5 best chunks.
    reranker = CohereRerank(model="rerank-english-v3.0", top_n=5)
    reranking_retriever = ContextualCompressionRetriever(
        base_compressor=reranker,
        base_retriever=expanded_retriever,
    )

    # Stage 4: stuff the surviving chunks into the prompt and wire retrieval to it.
    answer_chain = create_stuff_documents_chain(chat_model, chat_prompt)
    return create_retrieval_chain(reranking_retriever, answer_chain)


if __name__ == "__main__":
    # Load the .env file
    load_dotenv()

    # API keys read from the environment. Neither variable is used again in
    # this cell; they are kept as module-level names in case other cells read them.
    pinecone_key = os.environ.get('PINECONE_API_KEY')
    OpenAI_key=os.environ.get("OPENAI_API_KEY")

    # Settings for the already-populated index.
    INDEX_NAME = "the-almanack-of-naval-ravikant-rag"
    EMBEDDING_MODEL = "text-embedding-3-large"
    DIMENSION = 3072
    # PDF location; not read in this cell because the index already exists.
    path = r"/Users/shenglienlee/Documents/GitHub/quick-start-guide-to-llms?tab=readme-ov-file/notebooks/My_project/doc/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf"
    pdf_files = [path]
    
    # Use the constant instead of repeating the model name, so queries are
    # always embedded with the model named in EMBEDDING_MODEL.
    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
    # Pinecone client; not passed to anything below in this cell.
    pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
    # Attach to the existing index instead of re-uploading the documents.
    vector_store = PineconeVectorStore.from_existing_index(
        index_name=INDEX_NAME,
        embedding=embeddings
    )
    
    rag_chain = setup_rag_chain(vector_store)
    
    #test
    print("\n" + "="*30)
    print("RAG Bot is ready (enter 'exit' to end the chat)")
    print("="*30)
    
    while True:
        # Strip so that "exit " (trailing space) still ends the chat.
        query = input("\n your query: ").strip()
        if query.lower() in ["exit", "quit"]:
            break
        # A blank line would otherwise trigger a full retrieval + LLM round trip.
        if not query:
            continue
        print(f"User: {query}")
        print("Thinking...")
        response = rag_chain.invoke({"input": query})
        
        print(f"\n Answer:\n{response['answer']}")
        

        # Optional: print the retrieved chunks used as context.
        # print("\nReference (Context):")
        # for i, doc in enumerate(response["context"]):
        #     print(f"--- Chunk {i+1} ---")
        #     print(doc.page_content[:100] + "...") 


RAG Bot is ready (enter 'exit' to end the chat)
User: how to have a good lif
Thinking...

 Answer:
Ë¶ÅÈÅé‰∏äÂ•ΩÁîüÊ¥ªÔºåÂèØ‰ª•ËÄÉÊÖÆ‰ª•‰∏ãÂπæÈªûÔºö

- **Âª∫Á´ãËâØÂ•ΩÁøíÊÖ£**ÔºöÊàíÈÖí„ÄÅÂ∞ëÂêÉÁ≥ñ„ÄÅ‰∏çÊ≤âËø∑ÊñºÁ§æ‰∫§Â™íÈ´îÔºåÈÄô‰∫õÈÉΩËÉΩËÆì‰Ω†ÁöÑÊÉÖÁ∑íÊõ¥Á©©ÂÆö„ÄÇ[Page 147]
- **ÈÅ∏ÊìáÁ©çÊ•µÁöÑÁ§æ‰∫§Âúà**ÔºöËàáÁ©çÊ•µÂêë‰∏ä„ÄÅ‰ΩéÁ∂≠Ë≠∑ÁöÑÊúãÂèã‰∫§ÂæÄÔºåÈÄôÊ®£ÁöÑÈóú‰øÇËÉΩÊèêÂçá‰Ω†ÁöÑÂπ∏Á¶èÊÑü„ÄÇ[Page 147]
- **Â≠∏ÊúÉÊé•Âèó**ÔºöÂú®ÁîüÊ¥ª‰∏≠Ôºå‰Ω†Á∏ΩÊúâ‰∏âÂÄãÈÅ∏ÊìáÔºöÊîπËÆäÂÆÉ„ÄÅÊé•ÂèóÂÆÉÊàñÈõ¢ÈñãÂÆÉ„ÄÇÊé•ÂèóÁèæÂØ¶ËÉΩËÆì‰Ω†Êõ¥Âø´ÈÅ©Êáâ„ÄÇ[Page 152]
- **‰øùÊåÅÁï∂‰∏ãÁöÑÂ≠òÂú®ÊÑü**ÔºöÈÅøÂÖçÈÅéÂ§öÂú∞Ê¥ªÂú®ÈÅéÂéªÊàñÊú™‰æÜÔºåÈÄôÊ®£ËÉΩËÆì‰Ω†Êõ¥Âø´Ê®Ç„ÄÇ[Page 135]
- **ÈóúÊ≥®ÂÅ•Â∫∑**ÔºöÂÑ™ÂÖàËÄÉÊÖÆËá™Â∑±ÁöÑË∫´È´î„ÄÅÂøÉÁêÜÂíåÁ≤æÁ•ûÂÅ•Â∫∑ÔºåÈÄôÊòØÂÖ∂‰ªñ‰∏ÄÂàáÁöÑÂü∫Á§é„ÄÇ[Page 160]

ÈÄô‰∫õÂª∫Ë≠∞ËÉΩÂπ´Âä©‰Ω†Âú®ÁîüÊ¥ª‰∏≠ÊâæÂà∞Âπ≥Ë°°ÂíåÂπ∏Á¶è„ÄÇ


# Test

In [10]:
# Scratch cell: same loading/chunking steps as load_and_chunk_pdfs, but with a
# larger chunk size and a different separator list, to compare chunk counts.
path = r"/Users/shenglienlee/Documents/GitHub/quick-start-guide-to-llms?tab=readme-ov-file/notebooks/My_project/doc/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf"
file_paths = [path]
documents = [] 
for path in file_paths:
    single_file_text = ""
    # (offset into single_file_text, 1-based page number) for every page with text
    page_starts = []
    try:
        reader = PdfReader(path)
        #extract text from each page of pdf and combine them together
        for i, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:
                page_starts.append((len(single_file_text), i + 1))
                single_file_text  = single_file_text + text + " "
    except Exception as e:
        print(f"   ‚ö†Ô∏è ÁÑ°Ê≥ïËÆÄÂèñ {path}: {e}")
        continue
    #chunk the pdf
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2500, # set to 2500, roughly 2-3 paragraphs
        chunk_overlap=400, # 400 characters of overlap to keep meaning across paragraphs
        separators=["\n \n", "\n", "„ÄÇ", "ÔºÅ", "Ôºü", " ", ""],
        is_separator_regex=False 
    )
    
    page_chunk = text_splitter.split_text(single_file_text)
    
    # ----------------
    
    # --- 3. Wrap the chunks in Document objects ---
    # Map each chunk back to the page it starts on. Using `i + 1` here (as the
    # cell did before) tagged every chunk with the last page, because `i` is
    # left over from the finished page loop.
    # Assumes chunks occur verbatim in single_file_text and in document order;
    # a chunk that cannot be located reuses the previous chunk's page.
    search_from = 0
    page_cursor = 0
    for chunk in page_chunk:
        position = single_file_text.find(chunk, search_from)
        if position != -1:
            while (page_cursor + 1 < len(page_starts)
                   and page_starts[page_cursor + 1][0] <= position):
                page_cursor += 1
            search_from = position + 1
        # Build the Document object
        doc = Document(
            page_content=chunk,  # keep the content as clean text
            metadata={           # source information lives here
                "Source": "The Almanack of Naval Ravikant",
                "page": page_starts[page_cursor][1],
                "type": "book"
            }
        )
        documents.append(doc)

    print(f"   -> {path} Âª∫Á´ã‰∫Ü {len(page_chunk)} ÂÄã Document Áâ©‰ª∂„ÄÇ")
print(f"‚úÖ ÊâÄÊúâÊñá‰ª∂ËôïÁêÜÂÆåÁï¢ÔºåÂÖ± {len(documents)} ÂÄãÁâ©‰ª∂„ÄÇ")

   -> /Users/shenglienlee/Documents/GitHub/quick-start-guide-to-llms?tab=readme-ov-file/notebooks/My_project/doc/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf Âª∫Á´ã‰∫Ü 130 ÂÄã Document Áâ©‰ª∂„ÄÇ
‚úÖ ÊâÄÊúâÊñá‰ª∂ËôïÁêÜÂÆåÁï¢ÔºåÂÖ± 130 ÂÄãÁâ©‰ª∂„ÄÇ


In [8]:
# Sanity check: open the PDF at `path` and display the reader object.
# `path` is not defined in this cell; it must already exist in the kernel.
PdfReader(path)

<PyPDF2._reader.PdfReader at 0x10f67ea40>

In [9]:
# Print the extracted text of every page to inspect what PyPDF2 returns.
# NOTE(review): `reader` is not created in this cell - it relies on a variable
# left in the kernel by another cell - and this prints the whole document;
# consider limiting the number of pages shown.
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    print(text)

THE ALMANACK OF NAVAL RAVIKANT

GETTING RICH IS NOT JUST  
ABOUT LUCK; HAPPINESS IS  
NOT JUST A TRAIT WE ARE  
BORN WITH.
These aspirations may seem out of reach, but building wealth and be -
ing happy are skills we can learn.
So what are these skills, and how do we learn them? What are the 
principles that should guide our efforts? What does progress really 
look like?
Naval Ravikant is an entrepreneur, philosopher, and investor who 
has captivated the world with his principles for building wealth and 
creating long-term happiness. The Almanack of Naval Ravikant  is a  
collection of Naval‚Äôs wisdom and experience from the last ten years, 
shared as a curation of his most insightful interviews and poignant 
reflections. This isn‚Äôt a how-to book, or a step-by-step gimmick. In -
stead, through Naval‚Äôs own words, you will learn how to walk your own 
unique path toward a happier, wealthier life.
This book has been created as a public service. It is available for free 
download in pd