## INSTALL REQUIRED PACKAGES FOR PDF PROCESSING

In [1]:
#  Install Required Packages for PDF Processing
!pip install llama-index
!pip install llama-index-embeddings-gemini
!pip install llama-index-vector-stores-pinecone
!pip install pinecone[grpc]>=3.0.0
!pip install pandas
!pip install python-dotenv
!pip install requests
!pip install PyPDF2
!pip install pdfplumber
!pip install pymupdf  # fitz for better PDF handling

print("✅ All packages installed including PDF processors!")

Collecting llama-index
  Downloading llama_index-0.12.45-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.12-py3-none-any.whl.metadata (439 bytes)
Collecting llama-index-cli<0.5,>=0.4.2 (from llama-index)
  Downloading llama_index_cli-0.4.3-py3-none-any.whl.metadata (1.4 kB)
Collecting llama-index-core<0.13,>=0.12.45 (from llama-index)
  Downloading llama_index_core-0.12.45-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.4,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.7.7-py3-none-any.whl.metadata (3.3 kB)
Collecting llama-index-llms-openai<0.5,>=0.4.0 (from llama-index)
  Downloading llama_index_llms_openai-0.4.7-py3-none-any.whl.metadata (3.0 kB)
Collecting llama

Collecting llama-index-embeddings-gemini
  Downloading llama_index_embeddings_gemini-0.3.2-py3-none-any.whl.metadata (907 bytes)
Downloading llama_index_embeddings_gemini-0.3.2-py3-none-any.whl (4.0 kB)
Installing collected packages: llama-index-embeddings-gemini
Successfully installed llama-index-embeddings-gemini-0.3.2
Collecting llama-index-vector-stores-pinecone
  Downloading llama_index_vector_stores_pinecone-0.6.0-py3-none-any.whl.metadata (424 bytes)
Collecting pinecone<8.0.0,>=7.0.0 (from llama-index-vector-stores-pinecone)
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone<8.0.0,>=7.0.0->llama-index-vector-stores-pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone<8.0.0,>=7.0.0->llama-index-vector-stores-pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Dow

In [2]:
import os
from pinecone import Pinecone, PodSpec
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
    Settings  # <-- Import Settings
)
from llama_index.vector_stores.pinecone import PineconeVectorStore

### Import Libraries and Setup for PDF Processing

In [3]:
import os
import pandas as pd
import requests
from pathlib import Path
from dotenv import load_dotenv
import PyPDF2
import pdfplumber
import fitz
# Load environment variables from a local .env file into os.environ.
# NOTE(review): the recorded output of this cell is `False`, which presumably
# means no .env file was found when it last ran — confirm the file's location.
load_dotenv()

False

In [4]:
# Credentials are read from the environment (filled by load_dotenv() or the
# shell) instead of being hardcoded in the notebook. Any key that was ever
# saved in this notebook must be treated as leaked: revoke it and issue a new one.
import os
from getpass import getpass


def _read_secret(name):
    """Return the secret called `name`.

    Looks the value up in os.environ first; if it is missing or blank, prompts
    for it without echoing the input. Raises RuntimeError when no value is
    supplied either way, so later cells never run with an empty key.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        value = getpass(f"Enter {name}: ").strip()
    if not value:
        raise RuntimeError(f"{name} is required but was not provided.")
    return value


PINECONE_API_KEY = _read_secret("PINECONE_API_KEY")
# Not a secret; keeps the previous value as the default when unset.
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT", "us-east-1-aws")
GOOGLE_API_KEY = _read_secret("GOOGLE_API_KEY")

### SET ENVIRONMENT VARIABLES

In [5]:
# Publish the Google API key as a process environment variable.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# Confirmation message for the reader of the notebook.
print(" API keys configured for PDF processing!")

 API keys configured for PDF processing!


## **LOAD THE PDF DOCUMENT**

In [6]:
# Path of the PDF to index, relative to the notebook's working directory.
pdf_path ="thebook.pdf"
try:
    # SimpleDirectoryReader returns a list of documents for the given file(s).
    documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
    print(f"Loaded {len(documents)} document(s).")
except Exception as e:
    print(f"Error loading the PDF file: {e}")
    print(f"Please check that the path is correct: {pdf_path}")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, which discards all state and hides the traceback.
    # Raising stops "Run All" at this cell and leaves `documents` undefined,
    # so later cells cannot silently run on missing data.
    raise

Loaded 234 document(s).


### CREATE AN INSTANCE OF THE PINECONE CLASS

In [7]:
# Build the Pinecone client from the configured API key; `pc` is reused by
# the cells that list, create and open indexes.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Print the client object as a quick check that construction succeeded.
print(pc)

<pinecone.pinecone.Pinecone object at 0x78a9b4ae0110>


**DEFINE PINECONE INDEX AND ENVIRONMENT**

In [8]:
# Pinecone index settings shared by the cells below.
embedding_dimension = 384  # vector size of the 'all-MiniLM-L6-v2' model
pinecone_environment = "us-east-1-aws"  # replace with your actual environment if it differs
index_name = "chatbot"

### CREATE PINECONE INDEX

In [9]:
# Make sure the Pinecone index exists: reuse it when present, otherwise create it.
existing_index_names = pc.list_indexes().names()

if index_name in existing_index_names:
    print(f"Pinecone index '{index_name}' already exists.")
else:
    print(f"Creating Pinecone index: {index_name}")
    # Pod-based index spec for the configured environment.
    pod_spec = PodSpec(environment=pinecone_environment)
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=pod_spec,
    )
    print("Index created successfully.")

Pinecone index 'chatbot' already exists.


## SET UP LLAMAINDEX COMPONENTS

In [10]:
# Wire LlamaIndex to Pinecone:
#   1. open a handle to the named Pinecone index,
#   2. wrap it in a LlamaIndex vector store,
#   3. build a storage context that uses that vector store.
pinecone_index = pc.Index(index_name)

vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
)

storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [11]:
import nest_asyncio

# Patch asyncio via nest_asyncio before the cells below run.
nest_asyncio.apply()

## **CREATE THE INDEX**

In [16]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Local Hugging Face embedding model (dimension 384, matching the Pinecone index).
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Build the index from the loaded documents. The embed model is passed
# explicitly so this step uses the local model, and the storage context
# directs the vectors to the Pinecone-backed vector store.
print("Creating index and storing embeddings in Pinecone... This may take a moment.")
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    storage_context=storage_context,
)
print("Finished indexing and storing.")

Creating index and storing embeddings in Pinecone... This may take a moment.


Upserted vectors:   0%|          | 0/243 [00:00<?, ?it/s]

Finished indexing and storing.


In [13]:
!pip install llama-index-embeddings-huggingface


Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.5.5-py3-none-any.whl.metadata (458 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3

In [14]:
!pip install llama-index-llms-gemini


Collecting llama-index-llms-gemini
  Downloading llama_index_llms_gemini-0.5.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pillow<11,>=10.2.0 (from llama-index-llms-gemini)
  Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Downloading llama_index_llms_gemini-0.5.0-py3-none-any.whl (9.5 kB)
Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pillow, llama-index-llms-gemini
  Attempting uninstall: pillow
    Found existing installation: pillow 11.2.1
    Uninstalling pillow-11.2.1:
      Successfully uninstalled pillow-11.2.1
Successfully installed llama-index-llms-gemini-0.5.0 pillow-10.4.0


## SET UP GEMINI LLM AND GLOBAL SETTINGS

In [17]:
#  Setup LlamaIndex Components
from llama_index.core import Settings, Document
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini

# Gemini chat model; passed explicitly to the chat engine further down.
llm = Gemini(model="models/gemini-2.5-flash", api_key=GOOGLE_API_KEY)

# Gemini embedding model.
# NOTE(review): the index above was built with the 384-dimension Hugging Face
# model, and this assignment rebinds `embed_model` to a different model.
# Confirm that making GeminiEmbedding the global default is intended.
embed_model = GeminiEmbedding(model_name="models/embedding-001", api_key=GOOGLE_API_KEY)

# Configure global settings
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 1000  # chunk size chosen for PDF text
Settings.chunk_overlap = 50  # overlap between consecutive chunks


  llm = Gemini(
  embed_model = GeminiEmbedding(


In [18]:
# Chat engine over the index in "context" mode, answering with the Gemini LLM.
chat_engine = index.as_chat_engine(
    llm=llm,
    chat_mode="context",
)

## **CHAT LOOP**

In [None]:
#  Chat Loop
# Reads questions from the user until they type "exit" or "quit", sending each
# one to the chat engine and printing the answer.
print("🤖 Chatbot is ready! Type 'exit' to quit.")
while True:
    try:
        # Strip whitespace so inputs such as "exit " or " quit" still end the session.
        query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or a kernel interrupt ends the chat without a traceback.
        print()
        break
    if not query:
        # Skip blank input rather than spending an LLM call on an empty question.
        continue
    if query.lower() in ("exit", "quit"):
        break
    response = chat_engine.chat(query)
    print("Bot:", response.response)

🤖 Chatbot is ready! Type 'exit' to quit.
You: what is classification
Bot: Based on the context provided, **classification** is the process of categorizing items or observations into predefined groups or classes.

Examples given in the text include:
*   **Binary classification:** Separating stars from diamonds (as shown in Figure 1.5).
*   Building a system to **classify new emails** (e.g., as spam or not spam).
*   **Cancer diagnosis:** Inferring whether a patient is healthy or not (generating a "yes/no" answer) based on histological data.

Essentially, it involves generating a specific answer (like "yes/no" or assigning to a category) given a set of observations.
