In [1]:
# Cell 1: Configuration
AWS_REGION = "eu-north-1"
S3_BUCKET = "my-semantic-search-data"
SAGEMAKER_ENDPOINT = "jumpstart-dft-cohere-embed-multilin-20250717-233006"
CHROMA_PERSIST_DIR = "./chroma_db"
EMBED_DIM = 1024  # Cohere multilingual embedding size
CHUNK_SIZE = 400
CHUNK_OVERLAP = 50


In [2]:
import boto3
import os

# Initialize S3 client
s3 = boto3.client("s3", region_name=AWS_REGION)

# List of PDF keys to download
pdf_keys = [
    "General_Application_of_Machine_Learning_in_Software_Engineering (4).pdf",
    "CodeofConduct2022-23-accommodation.pdf"
]

# Local storage directory
local_dir = "/tmp/pdf_docs"
os.makedirs(local_dir, exist_ok=True)

# Download all PDFs from S3
local_pdf_paths = []

for key in pdf_keys:
    local_path = os.path.join(local_dir, os.path.basename(key))
    try:
        s3.download_file(S3_BUCKET, key, local_path)
        print(f"✅ Downloaded '{key}' to '{local_path}'")
        local_pdf_paths.append(local_path)
    except Exception as e:
        print(f"❌ Error downloading '{key}': {e}")


✅ Downloaded 'General_Application_of_Machine_Learning_in_Software_Engineering (4).pdf' to '/tmp/pdf_docs/General_Application_of_Machine_Learning_in_Software_Engineering (4).pdf'
✅ Downloaded 'CodeofConduct2022-23-accommodation.pdf' to '/tmp/pdf_docs/CodeofConduct2022-23-accommodation.pdf'


In [3]:
pip install boto3 langchain chromadb openai PyPDF2


Note: you may need to restart the kernel to use updated packages.


In [4]:
!pip install pypdf




In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.base import Embeddings
import json
import boto3
import math

# 1. Load all PDFs
all_documents = []
for pdf_path in local_pdf_paths:
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()
    all_documents.extend(pages)

print(f"✅ Loaded {len(all_documents)} pages from {len(local_pdf_paths)} PDFs")

# 2. Chunk pages into smaller documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunked_documents = text_splitter.split_documents(all_documents)
print(f"✅ Chunked into {len(chunked_documents)} text documents")

# 3. Custom Embedding class with batching for SageMaker Cohere endpoint
sm_runtime = boto3.client("sagemaker-runtime", region_name=AWS_REGION)

class SageMakerCohereEmbeddings(Embeddings):
    def embed_documents(self, texts):
        BATCH_SIZE = 48  # max supported by endpoint is 96, stay safe
        all_embeddings = []

        for i in range(0, len(texts), BATCH_SIZE):
            batch = texts[i:i + BATCH_SIZE]
            payload = {
                "texts": batch,
                "input_type": "search_document"
            }
            response = sm_runtime.invoke_endpoint(
                EndpointName=SAGEMAKER_ENDPOINT,
                ContentType="application/json",
                Body=json.dumps(payload)
            )
            result = json.loads(response["Body"].read().decode("utf-8"))
            all_embeddings.extend(result["embeddings"]["float"])

        return all_embeddings

    def embed_query(self, text):
        return self.embed_documents([text])[0]

embedding_model = SageMakerCohereEmbeddings()

# 4. Store in Chroma DB
vectordb = Chroma.from_documents(
    documents=chunked_documents,
    embedding=embedding_model,
    persist_directory=CHROMA_PERSIST_DIR
)

print(f"✅ Chroma vector store created with {vectordb._collection.count()} documents")


✅ Loaded 18 pages from 2 PDFs
✅ Chunked into 180 text documents
✅ Chroma vector store created with 540 documents


In [6]:
retriever = vectordb.as_retriever(search_kwargs={"k": 4})  # Or desired `k`


In [None]:
import os

#OpenAI API
os.environ["OPENAI_API_KEY"] = "sk--"  # Replace with your actual key


In [8]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# 1. Prompt Template
prompt_template = """You are an assistant for answering questions about machine learning documents.
Use the following context to answer the question at the end.

Context:
{context}

Question: {question}
Answer:"""

QA_PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# 2. Load OpenAI Chat Model
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.2
)

# 3. Build QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": QA_PROMPT}
)

print("✅ RetrievalQA chain initialized.")


✅ RetrievalQA chain initialized.


In [11]:
query = "What are the key findings of pdf code of conduct?"
result = qa_chain.invoke({"query": query})
print("🧠 Answer:")
print(result["result"])


🧠 Answer:
The key findings of the PDF code of conduct are that residents must observe the terms and conditions of the Licence Agreement and follow the instructions of the Manager, University staff, and other responsible individuals.


In [15]:
import time
from openai import OpenAIError  # Safe and current error class

max_retries = 5
query = "quantum computing"

for attempt in range(max_retries):
    try:
        result = qa_chain.invoke({"query": query})
        print("🧠 Answer:")
        print(result["result"])
        break
    except OpenAIError as e:
        wait_time = 2 ** attempt
        print(f"⚠️ OpenAI error: {e}. Retrying in {wait_time}s...")
        time.sleep(wait_time)
else:
    print("❌ Failed after multiple retries.")


🧠 Answer:
The document does not mention quantum computing.
