***ChatBot_pdf_EXP 03***

Submitted By - Prabhat Singh

In [1]:
# Install dependencies
!pip install openai faiss-cpu pydantic PyPDF2 tiktoken

import os
import openai
import faiss
import numpy as np
import PyPDF2
import tiktoken
from typing import List, Tuple, Dict
from pydantic import BaseModel
from dataclasses import dataclass
from IPython.display import display, Markdown
from google.colab import files

# === STEP 1: Setup API Key ===
# Read the key with getpass rather than input(): input() echoes what is typed
# into the cell output, so the secret ends up saved with the notebook.
from getpass import getpass

openai.api_key = getpass("Enter your OpenAI API key: ").strip()
if not openai.api_key:
    raise ValueError("An OpenAI API key is required to continue.")

# === STEP 2: Upload PDF ===
print("📂 Please upload the PDF file of 'The Hard Thing About Hard Things'")
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload result surfaces as a bare IndexError.
    raise ValueError("No file was uploaded; re-run the cell and select a PDF.")
# Use the first uploaded file name as the PDF path.
pdf_path = next(iter(uploaded))

# === STEP 3: Message Schema for Memory ===
class Message(BaseModel):
    """A single conversation turn kept in the chatbot's memory."""

    # Speaker label; this file creates messages with "user" and "assistant".
    role: str
    # Text of the turn.
    content: str

# === STEP 4: PDF Chunking ===
def load_pdf_chunks(pdf_path: str, max_tokens: int = 300) -> List[Dict]:
    """Split the text of a PDF into token-bounded chunks tagged by page.

    Each page is tokenized on its own with the ``cl100k_base`` encoding and
    cut into consecutive, non-overlapping windows of at most ``max_tokens``
    tokens, so a chunk never spans two pages.

    Args:
        pdf_path: Path of the PDF file to read.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of ``{"text": str, "page": int}`` dicts in reading order,
        where ``page`` is 1-based.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    # A non-positive window would never advance through the token list.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    reader = PyPDF2.PdfReader(pdf_path)
    tokenizer = tiktoken.get_encoding("cl100k_base")
    chunks: List[Dict] = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        # Skip pages with no text or only whitespace: they add nothing to
        # retrieval and would only produce junk chunks.
        if not text or not text.strip():
            continue
        tokens = tokenizer.encode(text)
        for start in range(0, len(tokens), max_tokens):
            chunk_text = tokenizer.decode(tokens[start:start + max_tokens])
            chunks.append({"text": chunk_text, "page": page_number})
    return chunks

# === STEP 5: Embedding Generator ===
class Embedder:
    """Thin wrapper that requests text embeddings from the OpenAI API."""

    def __init__(self, model: str = "text-embedding-3-small"):
        # Name of the embedding model sent with every request.
        self.model = model

    def get_embedding(self, text: str) -> List[float]:
        """Return the embedding vector for ``text``.

        The text is sent as a one-element batch, and the embedding of that
        single item is returned.
        """
        result = openai.embeddings.create(model=self.model, input=[text])
        first_item = result.data[0]
        return first_item.embedding

def get_memory_conditioned_embedding(messages: List[Message], embedder: Embedder) -> List[float]:
    """Embed the whole conversation as a single text.

    Every message is rendered as ``"<role>: <content>"``, the lines are
    joined with newlines, and the resulting transcript is passed to
    ``embedder.get_embedding``.
    """
    transcript = "\n".join(f"{turn.role}: {turn.content}" for turn in messages)
    return embedder.get_embedding(transcript)

# === STEP 6: FAISS Vector Store ===
@dataclass
class DocumentChunk:
    """A chunk of document text together with its page number and embedding."""

    # Text of the chunk.
    text: str
    # Page number recorded for the chunk.
    page: int
    # Embedding vector computed for `text`.
    embedding: List[float]

class LocalVectorStore:
    """In-memory vector store: a flat L2 FAISS index plus the chunks it holds.

    ``self.chunks[i]`` is the chunk whose embedding was the i-th vector added
    to ``self.index``, so labels returned by FAISS index straight into it.
    """

    def __init__(self, dim: int):
        """Create an empty store for embeddings of dimensionality ``dim``."""
        self.index = faiss.IndexFlatL2(dim)
        self.chunks: List["DocumentChunk"] = []

    def add(self, chunk: "DocumentChunk"):
        """Add ``chunk`` to the index and remember it for later lookup."""
        # Index first: if FAISS rejects the vector, `chunks` stays in sync.
        self.index.add(np.array([chunk.embedding], dtype=np.float32))
        self.chunks.append(chunk)

    def search(self, query_embedding: List[float], k: int = 5) -> List[Tuple[str, int]]:
        """Return up to ``k`` nearest chunks as ``(text, page)`` tuples.

        Results are in the order FAISS returns them. Fewer than ``k`` results
        are returned when the store holds fewer than ``k`` chunks, and an
        empty list when the store is empty or ``k`` is not positive.
        """
        # FAISS pads missing neighbours with label -1 when k exceeds the
        # number of stored vectors; -1 would index the *last* chunk. Clamp k
        # so padding is never requested.
        k = min(k, len(self.chunks))
        if k <= 0:
            return []
        query = np.array([query_embedding], dtype=np.float32)
        _, labels = self.index.search(query, k)
        # Keep the range check as a second guard against padded labels.
        return [
            (self.chunks[label].text, self.chunks[label].page)
            for label in labels[0]
            if 0 <= label < len(self.chunks)
        ]

# === STEP 7: Chatbot with Memory-Aware Embedding ===
class RAGChatbot:
    """Retrieval-augmented chatbot that keeps the conversation as memory.

    Each question is answered by embedding the conversation so far,
    retrieving the closest chunks from ``store`` and asking the chat model
    to answer from that context.
    """

    def __init__(self, embedder: Embedder, store: LocalVectorStore):
        """Keep the embedder and vector store and start with empty history."""
        self.embedder = embedder
        self.store = store
        self.messages: List[Message] = []

    def ask(self, user_input: str) -> str:
        """Answer ``user_input`` and record both turns in ``self.messages``.

        The history is only updated once the answer has been obtained, so an
        exception raised while embedding, searching or calling the chat model
        leaves ``self.messages`` exactly as it was and the question can be
        retried without being duplicated.

        Args:
            user_input: The user's question.

        Returns:
            The assistant's answer (an empty string if the model returned no
            text content).
        """
        user_message = Message(role="user", content=user_input)
        # Candidate history: used for this request, committed only on success.
        history = [*self.messages, user_message]

        query_embedding = get_memory_conditioned_embedding(history, self.embedder)
        retrieved = self.store.search(query_embedding, k=5)

        context = "\n\n".join(f"(Page {page}) {text}" for text, page in retrieved)
        conversation = "\n".join(f"{msg.role}: {msg.content}" for msg in history)

        prompt = f"""You are a helpful assistant discussing the book 'The Hard Thing About Hard Things' by Ben Horowitz.
Use the context below to answer the user's question. Always cite relevant quotes and page numbers.

Context:
{context}

Conversation so far:
{conversation}

assistant:"""

        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "system", "content": prompt}],
            temperature=0.2
        )
        # The SDK types `content` as optional; fall back to "" so the str
        # field of Message always receives a string.
        answer = response.choices[0].message.content or ""
        self.messages.extend([user_message, Message(role="assistant", content=answer)])
        return answer

# === STEP 8: Process PDF and Launch Chat ===
print("📄 Chunking PDF...")
raw_chunks = load_pdf_chunks(pdf_path)
if not raw_chunks:
    # With no chunks there is nothing to index or retrieve; fail here with a
    # clear message instead of later, on the first question.
    raise ValueError(f"No extractable text found in {pdf_path!r}; cannot build an index.")

print("🔎 Generating embeddings and building FAISS index...")
embedder = Embedder()
# Embed the chunks first and size the index from a real vector; this avoids
# a throwaway embedding request made only to learn the dimensionality.
embeddings = [embedder.get_embedding(chunk["text"]) for chunk in raw_chunks]
dim = len(embeddings[0])
store = LocalVectorStore(dim=dim)

for chunk, embedding in zip(raw_chunks, embeddings):
    store.add(DocumentChunk(text=chunk["text"], page=chunk["page"], embedding=embedding))

bot = RAGChatbot(embedder, store)

# === STEP 9: Chat Interface ===
# Read questions until the user types 'exit'/'quit' or ends the input stream.
print("\n✅ Ready! Ask questions about the book (type 'exit' to stop):")
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt ends the chat without a traceback.
        break
    if not user_input:
        # Do not spend API calls on a blank line.
        continue
    if user_input.lower() in {"exit", "quit"}:
        break
    try:
        answer = bot.ask(user_input)
    except openai.OpenAIError as err:
        # Report the failed request and keep the session alive for a retry.
        print(f"⚠️ Request failed: {err}")
        continue
    display(Markdown(f"**Assistant:** {answer}"))


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2, faiss-cpu
Successfully installed PyPDF2-3.0.1 faiss-cpu-1.11.0
Enter your OpenAI API key: [REDACTED: a secret key was captured in this saved output; revoke and rotate it]
📂 Please upload the PDF file of 'The Hard Thing About Hard Things'


Saving the_hard_thing_about_hard_things.pdf to the_hard_thing_about_hard_things.pdf
📄 Chunking PDF...
🔎 Generating embeddings and building FAISS index...

✅ Ready! Ask questions about the book (type 'exit' to stop):
You: what is pdf about


**Assistant:** The book "The Hard Thing About Hard Things" by Ben Horowitz is about the challenges and difficulties of running a business, particularly in the technology sector. It provides insights and advice on how to handle the tough decisions and situations that arise when leading a company. The book covers topics such as managing and training employees, making strategic decisions, and dealing with competition. It emphasizes the importance of good product management and effective communication within a company. For example, it discusses the qualities of good versus bad product managers and the significance of training programs (pages 96-97).

You: give me the all chapter names


**Assistant:** Here are the chapter names from "The Hard Thing About Hard Things" by Ben Horowitz:

1. From Communist to Venture Capitalist
2. “I Will Survive”
3. This Time with Feeling
4. When Things Fall Apart
   - The Struggle
   - CEOs Should Tell It Like It Is
   - The Right Way to Lay People Off
   - Preparing to Fire an Executive
   - Demoting a Loyal Friend
   - Lies That Losers Tell
   - Lead Bullets
   - Nobody Cares
5. Take Care of the People, the Products, and the Profits—in That Order
   - A Good Place to Work
   - Why Startups Should Train Their People
   - Is It Okay to Hire People from Your Friend’s Company?
   - Why It’s Hard to Bring Big Company Execs into Little Companies
   - Hiring Executives: If You’ve Never Done the Job, How Do You Hire Somebody Good?
   - When Employees Misinterpret Managers
   - Management Debt
   - Management Quality Assurance
6. Concerning the Going Concern
   - How to Minimize Politics in
7. How to Lead Even When You Don’t Know Where You Are Going
   - The Most Difficult CEO Skill
   - The Fine Line Between Fear and Courage
   - Ones and Twos
   - Follow the Leader
   - Peacetime CEO/Wartime CEO
   - Making Yourself a CEO
   - How to Evaluate CEOs
8. First Rule of Entrepreneurship: There Are No Rules
   - Solving the Accountability vs. Creativity Paradox
   - The Freaky Friday Management Technique
   - Staying Great
   - Should You Sell Your Company?
9. The End of the Beginning

Additionally, there is an appendix titled "Questions for Head of Enterprise Sales Force."

You: quit
