In [1]:
# ====================================================================
# PHASE 1: DATA INDEXING (Run This in Google Colab)
# ====================================================================

# Cell 1: Installation and Setup
# --------------------------------------------------------------------
# Install the necessary libraries (IPython/Colab shell magic; `-q` keeps pip quiet,
# and the trailing `--upgrade` asks pip for the newest release of each package).
# NOTE(review): the imports below use `google.generativeai`, which is presumably
#   shipped by the `google-generativeai` distribution rather than `google-genai`;
#   it is not listed here, so confirm the runtime already provides it.
# NOTE(review): `pinecone-client` appears to be the legacy name of the Pinecone
#   SDK (now published as `pinecone`) — verify against the Pinecone docs.
!pip install -q \
    google-genai \
    langchain \
    langchain-google-genai \
    pinecone-client \
    langchain-pinecone \
    pypdf \
    langchain-community \
    langchain-core \
    langchain-text-splitters --upgrade

import os
import uuid
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# FINAL FIX: Use the specific recommended configuration module path
from google.generativeai import configure, embed_content

# --- 2. Configuration ---
# SECURITY: API keys must never be stored in the notebook source. Any key that
# was previously pasted into this cell has been exposed to everyone with access
# to the file and should be revoked and rotated.
# Keys are read from the environment first; if a variable is unset the user is
# asked for it through a hidden prompt, so the value never lands in the file.
from getpass import getpass


def _load_secret(env_name):
    """Return the secret named ``env_name``.

    Args:
        env_name: Name of the environment variable holding the secret.

    Returns:
        The non-empty secret string, taken from ``os.environ`` when present,
        otherwise from a hidden interactive prompt.

    Raises:
        ValueError: If neither the environment nor the prompt yields a value.
    """
    secret = os.environ.get(env_name, "").strip()
    if not secret:
        secret = getpass(f"Enter {env_name}: ").strip()
    if not secret:
        raise ValueError(f"{env_name} is required but was not provided.")
    return secret


GEMINI_API_KEY = _load_secret("GEMINI_API_KEY")
PINECONE_API_KEY = _load_secret("PINECONE_API_KEY")

# Index configuration
INDEX_NAME = "uta-rag"
# Vector size passed to the index at creation time.
# NOTE(review): presumably the output size of the embedding model used by this
# notebook — confirm before changing the model.
DIMENSION = 768
CLOUD = "aws"
REGION = "us-east-1"

# Export the keys so libraries that read them from the environment can find them.
os.environ['GEMINI_API_KEY'] = GEMINI_API_KEY
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY

# Authenticate the native Gemini client used by `embed_content`.
configure(api_key=GEMINI_API_KEY)


# Cell 2: Index Creation
# --------------------------------------------------------------------
# --- A. Upload Document Path ---
PDF_PATH = "/content/PM_TEXTBOOK.pdf"
if not os.path.exists(PDF_PATH):
    # Carry the explanation in the exception itself so it appears in the traceback.
    raise FileNotFoundError(
        f"Error: Please upload your university PDF named '{PDF_PATH}' to Colab."
    )

# --- B. Initialize Pinecone and Create the Index (BYOV Configuration) ---
pc = Pinecone(api_key=PINECONE_API_KEY)

# LangChain embeddings wrapper, authenticated with the key passed explicitly.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", api_key=GEMINI_API_KEY)

if INDEX_NAME not in pc.list_indexes().names():
    print(f"Creating Bring Your Own Vectors (BYOV) index '{INDEX_NAME}'...")
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=CLOUD,
            region=REGION
        )
    )
    print("Index creation request sent. Waiting for index to be active...")

    # Actually wait: poll the index status until it reports ready, so that the
    # upserts later in the notebook do not hit an index that is still being
    # provisioned. Give up after five minutes instead of looping forever.
    ready_deadline = time.monotonic() + 300
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(
                f"Index '{INDEX_NAME}' was not ready after 300 seconds."
            )
        time.sleep(1)
    print(f"Index '{INDEX_NAME}' is ready.")
else:
    print(f"Index '{INDEX_NAME}' already exists. Skipping creation.")


# Cell 3: Load, Split, Embed, and Upload Data (The Working Solution)
# --------------------------------------------------------------------
# Embedding helper built on the native Google GenAI client (`embed_content`).
def get_native_embedding(
    text_content,
    *,
    model="models/text-embedding-004",
    task_type="RETRIEVAL_DOCUMENT",
):
    """Generate an embedding vector for ``text_content`` with the native Gemini client.

    Args:
        text_content: The text to embed.
        model: Embedding model identifier passed to ``embed_content``. Defaults
            to the model this notebook indexes with.
        task_type: Task type passed to ``embed_content``. The default is the
            document-side setting used while indexing; pass a different value
            (e.g. a query-side task type) when embedding search queries.

    Returns:
        The value stored under the ``'embedding'`` key of the client response.

    Raises:
        Whatever ``embed_content`` raises; errors are not handled here so the
        caller can decide whether to skip or abort.
    """
    result = embed_content(
        model=model,
        content=text_content,
        task_type=task_type,
    )
    return result['embedding']

# --- 1. Load the PDF ---
print(f"Loading and splitting document: {PDF_PATH}...")
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

# --- 2. Split into overlapping chunks ---
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150
)
docs = text_splitter.split_documents(documents)
print(f"Total chunks created: {len(docs)}")

# --- 3. Generate Embeddings and Upsert ---
index = pc.Index(INDEX_NAME)

print("Generating and uploading Gemini embeddings. This may take a few minutes...")

vectors_to_upsert = []
batch_size = 50
total_chunks = len(docs)
skipped_chunks = 0  # chunks whose embedding call failed and were left out

for i, doc in enumerate(docs):
    text_content = doc.page_content
    # Copy so the chunk's own metadata dict is not mutated when 'text' is added.
    metadata = doc.metadata.copy()

    # 3a. Call native Gemini client for the vector
    try:
        vector = get_native_embedding(text_content)
    except Exception as e:
        # Deliberate best-effort: one bad chunk must not abort the whole run.
        # The failure is reported here and counted for the final summary.
        print(f"Skipping chunk {i} due to embedding error: {e}")
        skipped_chunks += 1
        continue

    # 3b. Prepare data for upsert
    # Deterministic id derived from the document path and chunk position, so
    # re-running this cell overwrites the same records instead of inserting a
    # fresh duplicate of every chunk (which random ids would do).
    doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{PDF_PATH}#chunk-{i}"))
    # Store the chunk text alongside the vector so it can be read back from the index.
    metadata['text'] = text_content

    vectors_to_upsert.append((doc_id, vector, metadata))

    # 4. Upsert in Batches
    if len(vectors_to_upsert) >= batch_size:
        index.upsert(vectors=vectors_to_upsert, namespace="default")
        print(f"Upserted batch ending at chunk {i+1}/{total_chunks}. Waiting 1 second...")
        vectors_to_upsert = []
        # Short pause between batches to pace requests.
        time.sleep(1)

# Upsert any remaining vectors (the last, partially filled batch)
if vectors_to_upsert:
    index.upsert(vectors=vectors_to_upsert, namespace="default")
    print("Upserted final batch.")

if skipped_chunks:
    print(f"Warning: {skipped_chunks} of {total_chunks} chunks were skipped due to embedding errors.")

print("\n Indexing Complete! The university knowledge base is ready.")

Index 'uta-rag' already exists. Skipping creation.
Loading and splitting document: /content/PM_TEXTBOOK.pdf...
Total chunks created: 2356
Generating and uploading Gemini embeddings. This may take a few minutes...
Upserted batch ending at chunk 50/2356. Waiting 1 second...
Upserted batch ending at chunk 100/2356. Waiting 1 second...
Upserted batch ending at chunk 150/2356. Waiting 1 second...
Upserted batch ending at chunk 200/2356. Waiting 1 second...
Upserted batch ending at chunk 250/2356. Waiting 1 second...
Upserted batch ending at chunk 300/2356. Waiting 1 second...
Upserted batch ending at chunk 350/2356. Waiting 1 second...
Upserted batch ending at chunk 400/2356. Waiting 1 second...
Upserted batch ending at chunk 450/2356. Waiting 1 second...
Upserted batch ending at chunk 500/2356. Waiting 1 second...
Upserted batch ending at chunk 550/2356. Waiting 1 second...
Upserted batch ending at chunk 600/2356. Waiting 1 second...
Upserted batch ending at chunk 650/2356. Waiting 1 secon