***ChatBot_pdf_EXP 02***

Submitted By - Prabhat Singh

In [None]:
# Install dependencies
!pip install openai faiss-cpu pydantic PyPDF2 tiktoken sentence-transformers

import os
import openai
import faiss
import numpy as np
import PyPDF2
import tiktoken
from typing import List, Tuple, Dict
from pydantic import BaseModel
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
from IPython.display import display, Markdown
from google.colab import files

# === STEP 1: Setup API Key ===
# Read the key with getpass instead of input() so the secret is not echoed
# into the (often shared) notebook output; strip whitespace left by pasting.
from getpass import getpass

openai.api_key = getpass("Enter your OpenAI API key: ").strip()

# === STEP 2: PDF Upload ===
print("📂 Please upload the PDF file of 'The Hard Thing About Hard Things'")
# The keys of the returned mapping are used below as the path of the PDF.
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload widget: fail with a clear message
    # rather than an opaque IndexError on the first-key lookup.
    raise RuntimeError("No PDF was uploaded; re-run this cell and choose a file.")
# Only the first uploaded file is used.
pdf_path = next(iter(uploaded))

# === STEP 3: PDF Loading & Chunking ===
def load_pdf_chunks_variable(pdf_path: str, max_tokens: int = 300) -> List[Dict]:
    """Split a PDF into per-page text chunks of at most ``max_tokens`` tokens.

    Each page is tokenized with the ``cl100k_base`` tiktoken encoding and cut
    into consecutive, non-overlapping windows. Chunks never span two pages, so
    every chunk can be attributed to exactly one page number.

    Args:
        pdf_path: Path of the PDF file to read.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of ``{"text": str, "page": int}`` dicts in reading order, where
        ``page`` is 1-based. Pages without extractable text are skipped.

    Raises:
        ValueError: If ``max_tokens`` is not positive (the window would never
            advance).
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    reader = PyPDF2.PdfReader(pdf_path)
    tokenizer = tiktoken.get_encoding("cl100k_base")
    chunks: List[Dict] = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        # Skip pages with no text at all or only whitespace: they would only
        # add meaningless vectors to the index.
        if not text or not text.strip():
            continue
        # disallowed_special=() makes literal strings such as "<|endoftext|>"
        # in the book text encode as ordinary text instead of raising.
        tokens = tokenizer.encode(text, disallowed_special=())
        for start in range(0, len(tokens), max_tokens):
            chunk_text = tokenizer.decode(tokens[start:start + max_tokens])
            chunks.append({"text": chunk_text, "page": page_number})
    return chunks

# === STEP 4: Local Embedding Generator using Sentence Transformers ===
class LocalEmbedder:
    """Wrap a SentenceTransformer model and expose embeddings as plain lists."""

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        # Build the sentence-transformers model identified by ``model_name``.
        self.model = SentenceTransformer(model_name)

    def get_embedding(self, text: str) -> List[float]:
        """Encode ``text`` with the wrapped model and return it via ``tolist()``."""
        vector = self.model.encode(text)
        return vector.tolist()

# === STEP 5: FAISS Vector Store ===
@dataclass
class DocumentChunk:
    """One chunk of PDF text together with its source page and embedding."""

    text: str  # chunk text, returned to the caller by LocalVectorStore.search
    page: int  # page number the chunk came from (the chunker stores i + 1, i.e. 1-based)
    embedding: List[float]  # vector added to the FAISS index in LocalVectorStore.add

class LocalVectorStore:
    """In-memory FAISS flat L2 index paired with the chunks it was built from.

    ``self.chunks[i]`` is the chunk whose embedding was the i-th vector added
    to ``self.index``; both are appended together in :meth:`add`, so the ids
    returned by FAISS can be used directly as positions in ``self.chunks``.
    """

    def __init__(self, dim: int):
        self.index = faiss.IndexFlatL2(dim)
        self.chunks: List[DocumentChunk] = []

    def add(self, chunk: DocumentChunk):
        """Append one chunk's embedding to the index and remember the chunk."""
        self.index.add(np.array([chunk.embedding], dtype=np.float32))
        self.chunks.append(chunk)

    def search(self, query_embedding: List[float], k: int = 5) -> List[Tuple[str, int]]:
        """Return the ``(text, page)`` pairs of up to ``k`` nearest chunks.

        Args:
            query_embedding: Query vector with the same dimension as the index.
            k: Maximum number of neighbours to return.

        Returns:
            Matches in the order FAISS reports them. Fewer than ``k`` items
            (possibly none) are returned when the store holds fewer chunks.
        """
        if k <= 0 or not self.chunks:
            return []
        query = np.array([query_embedding], dtype=np.float32)
        _, ids = self.index.search(query, k)
        # FAISS pads the id row with -1 when it finds fewer than k vectors.
        # Indexing self.chunks with -1 would silently return the last chunk,
        # so out-of-range ids are dropped instead.
        return [
            (self.chunks[i].text, self.chunks[i].page)
            for i in ids[0]
            if 0 <= i < len(self.chunks)
        ]

# === STEP 6: Chatbot with Memory ===
class Message(BaseModel):
    """A single conversation turn kept in RAGChatbot's history."""

    role: str  # speaker label; this file uses "user" and "assistant"
    content: str  # text of the turn

class RAGChatbot:
    """Retrieval-augmented chatbot with conversation memory.

    For every question the bot embeds the user input, retrieves the nearest
    chunks from the vector store, and sends them together with the full
    conversation history to the OpenAI chat completions API.

    Args:
        embedder: Object providing ``get_embedding(text)``.
        store: Object providing ``search(query_embedding, k)`` that returns
            ``(text, page)`` pairs.
        model: Chat model name passed to the API.
        top_k: Number of chunks retrieved per question.
        temperature: Sampling temperature passed to the API.
    """

    def __init__(
        self,
        embedder: LocalEmbedder,
        store: LocalVectorStore,
        model: str = "gpt-3.5-turbo",
        top_k: int = 5,
        temperature: float = 0.2,
    ):
        self.embedder = embedder
        self.store = store
        self.model = model
        self.top_k = top_k
        self.temperature = temperature
        # Alternating user/assistant turns, oldest first.
        self.messages: List[Message] = []

    def ask(self, user_input: str) -> str:
        """Answer ``user_input`` using retrieved context and prior turns.

        The question and the answer are appended to ``self.messages`` only
        when the request succeeds; on failure the history is left as it was
        before the call and the exception propagates to the caller.

        Returns:
            The assistant's reply (an empty string if the API returned no
            text content).
        """
        # Retrieve first: if embedding or search fails, history is untouched.
        query_embedding = self.embedder.get_embedding(user_input)
        retrieved = self.store.search(query_embedding, k=self.top_k)
        context = "\n\n".join([f"(Page {page}) {text}" for text, page in retrieved])

        self.messages.append(Message(role="user", content=user_input))
        conversation = "\n".join([f"{msg.role}: {msg.content}" for msg in self.messages])
        prompt = f"""You are a helpful assistant discussing the book 'The Hard Thing About Hard Things' by Ben Horowitz.
Use the context below to answer the user's question. Always cite relevant quotes and page numbers.

Context:
{context}

Conversation so far:
{conversation}

assistant:"""

        try:
            response = openai.chat.completions.create(
                model=self.model,
                messages=[{"role": "system", "content": prompt}],
                temperature=self.temperature,
            )
        except Exception:
            # Drop the unanswered question so a retry does not find it
            # duplicated in the conversation history.
            self.messages.pop()
            raise

        # content may be None (e.g. a refusal or filtered reply); Message
        # requires a str, so normalise to an empty string.
        answer = response.choices[0].message.content or ""
        self.messages.append(Message(role="assistant", content=answer))
        return answer

# === STEP 7: Run Everything ===
print("📄 Chunking PDF...")
raw_chunks = load_pdf_chunks_variable(pdf_path, max_tokens=300)
if not raw_chunks:
    # Without any chunks the index stays empty and the first question would
    # fail deep inside the retrieval step; stop here with a clear message.
    raise ValueError(
        f"No extractable text found in {pdf_path!r} "
        "(is it a scanned or image-only PDF?)"
    )

print("🔎 Embedding and indexing locally...")
embedder = LocalEmbedder()
# Embed a throwaway sentence once to learn the model's vector dimension,
# which the FAISS index needs at construction time.
dim = len(embedder.get_embedding("test sentence"))
store = LocalVectorStore(dim=dim)

# Embed every chunk and add it to the store together with its page number.
for chunk in raw_chunks:
    embedding = embedder.get_embedding(chunk["text"])
    store.add(DocumentChunk(text=chunk["text"], page=chunk["page"], embedding=embedding))

bot = RAGChatbot(embedder, store)

# === STEP 8: Chat Loop ===
print("\n✅ Ready! Ask questions about the book (type 'exit' to stop):")
while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Interrupting the prompt or closing the input stream ends the
        # session just like typing 'exit'.
        break
    question = user_input.strip()
    if question.lower() in {"exit", "quit"}:
        break
    if not question:
        # Do not spend an API call on an empty message.
        continue
    try:
        answer = bot.ask(user_input)
    except openai.OpenAIError as exc:
        # Keep the session alive on rate limits, network errors, bad keys...
        print(f"⚠️ OpenAI request failed: {exc}")
        continue
    display(Markdown(f"**Assistant:** {answer}"))


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-tran

Saving the_hard_thing_about_hard_things.pdf to the_hard_thing_about_hard_things.pdf
📄 Chunking PDF...
🔎 Embedding and indexing locally...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


✅ Ready! Ask questions about the book (type 'exit' to stop):


**Assistant:** "The Hard Thing About Hard Things" by Ben Horowitz is a book that delves into the challenges and struggles faced by entrepreneurs, CEOs, and business leaders. It emphasizes the fact that there is no set formula or recipe for dealing with complex and difficult situations in business. The author shares his own experiences and lessons learned from building companies, navigating tough decisions, and leading teams through adversity. The book provides insights and advice based on real-world experiences rather than offering a one-size-fits-all solution.

**Assistant:** Here is a list of the context provided from "The Hard Thing About Hard Things" by Ben Horowitz:

1. Dedication to the author's family and his commitment to donate all proceeds to help women in developing countries gain basic civil rights (Page 3).
2. Acknowledgments to individuals who supported and contributed to the book, including Michael Ovitz, Carlye Adler, Hollis Heimbouch, Binky Urban, Nasir Jones, Kanye West, the author's mother, father, and longtime business partner Marc Andreessen (Pages 234-235).
3. A note of gratitude to Bill Campbell for his mentorship and guidance during tough times (Page 235).
4. Information about the author's editor, Carlye Adler, and the publishing team at HarperCollins (Page 235).
5. Mention of exclusive information on the author available at www.AuthorTracker.com (Page 237).

These excerpts provide insight into the author's personal connections, support network, and the dedication behind the book.

You: quit
