# Method 1

In [23]:
# SECURITY: OpenAI and Pinecone API keys were pasted here in plain text and have been removed.
# Treat those keys as compromised and rotate them; supply keys at runtime (getpass or
# environment variables) instead of storing them in the notebook.
# env = us-east-1

In [24]:
!pip install -q sentence-transformers pinecone PyMuPDF langchain ipywidgets PyPDF2


In [None]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter
from google.colab import files
import PyPDF2
import getpass
from tqdm import tqdm

#  Pinecone API key
# Ask for the Pinecone API key interactively (input is hidden), then build the client from it.
pinecone_api_key = getpass.getpass(prompt="Enter your Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# UPLOAD & EXTRACT PDF

# Open the Colab upload dialog; the result is keyed by uploaded file name.
uploaded = files.upload()
if not uploaded:
    # A cancelled dialog yields nothing to index; fail with a clear message
    # instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF.")

# Only the first uploaded file is processed.
file_path = next(iter(uploaded))

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at `file_path`, joined by newlines.

    A page whose `extract_text()` result is falsy contributes an empty string,
    so the number of joined segments always equals the number of pages.
    """
    page_texts = []
    with open(file_path, "rb") as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)

# Pull the text out of the uploaded PDF and report how many characters were recovered.
raw_text = extract_text_from_pdf(file_path)
extracted_char_count = len(raw_text)
print(f"✅ Extracted {extracted_char_count} characters.")

In [None]:
# ✂️ TEXT CHUNKING

# Splitter settings: chunk_size=500 and chunk_overlap=50, both measured with len().
splitter_options = {
    "chunk_size": 500,
    "chunk_overlap": 50,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# `chunks` is the list of text pieces that later cells embed and index.
chunks = text_splitter.split_text(raw_text)
print(f"✅ Split into {len(chunks)} chunks.")

In [None]:
# Load the sentence-transformers model used to embed both document chunks and questions.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# INIT/CREATE PINECONE INDEX
index_name = "rag-st-index"

# Create the serverless index only when no index of that name is listed yet.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # 'all-MiniLM-L6-v2' has 384 dim
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by the later upsert and query cells.
index = pc.Index(index_name)

In [None]:
#  EMBED AND UPLOAD TO PINECONE
# Embed the chunks 50 at a time and upsert each batch.
# Vector ids are "chunk-<position in chunks>"; the chunk text is stored as metadata.
for start in tqdm(range(0, len(chunks), 50)):
    batch = chunks[start:start + 50]
    batch_vectors = model.encode(batch).tolist()
    records = []
    for offset, (text, values) in enumerate(zip(batch, batch_vectors)):
        records.append({
            "id": f"chunk-{start + offset}",
            "values": values,
            "metadata": {"text": text},
        })
    index.upsert(vectors=records)

print("✅ All chunks embedded and uploaded to Pinecone.")

In [None]:
# ✅ CHAT FUNCTION: Semantic + Keyword Chat with Standard Output Style
from IPython.display import display
import ipywidgets as widgets
import re

history = []


def chat_with_document(_):
    """Answer the question typed in `input_box` using passages retrieved from Pinecone.

    The question is embedded with `model`, the 10 nearest vectors are fetched from
    `index`, and up to three passages are shown. Passages containing at least one
    word of the question (case-insensitive substring match) are preferred; when
    none qualify, the first retrieved passages are used instead.

    The exchange is appended to `history`, the transcript is written to
    `chat_output`, and `input_box` is cleared.

    Args:
        _: Ignored; present because the button click handler passes the widget.
    """
    # Strip once so the stored and embedded question has no stray whitespace.
    question = input_box.value.strip()
    if not question:
        return
    history.append(("🧑 You", question))

    q_embed = model.encode([question])[0].tolist()
    results = index.query(vector=q_embed, top_k=10, include_metadata=True)

    # Collect the stored text of each match up front, skipping vectors that carry
    # no text, so the keyword filter below can never hit a missing key.
    texts = []
    for match in results['matches']:
        metadata = match['metadata']
        if metadata and 'text' in metadata:
            texts.append(metadata['text'])

    keywords = set(re.findall(r"\w+", question.lower()))
    keyword_hits = [t for t in texts if any(k in t.lower() for k in keywords)]

    # Fall back to plain similarity order when no passage mentions a question word.
    top_contexts = (keyword_hits or texts)[:3]

    if top_contexts:
        context_str = "\n\n".join(top_contexts)
        response = f"Based on the document, here's what I found:\n\n{context_str}"
    else:
        # Nothing usable came back from the index.
        response = "I couldn't find any relevant passages in the document."
    history.append(("🤖 Bot", response))

    chat_output.value = "\n\n".join([f"{h[0]}: {h[1]}" for h in history])
    input_box.value = ""

# Chat UI: a question field, an "Ask" button, and a text area that shows the transcript.
input_box = widgets.Text(
    placeholder='Ask a question about the document...',
    layout=widgets.Layout(width='100%'),
)
send_button = widgets.Button(
    description="Ask",
    button_style='primary',
)
chat_output = widgets.Textarea(
    value='',
    layout=widgets.Layout(width='100%', height='300px'),
)

# Clicking the button runs the retrieval handler.
send_button.on_click(chat_with_document)

display(input_box, send_button, chat_output)


# Method 2

In [None]:
# SECURITY: OpenAI and Pinecone API keys were pasted here in plain text and have been removed.
# Treat those keys as compromised and rotate them; supply keys at runtime (getpass or
# environment variables) instead of storing them in the notebook.
# env = us-east-1

In [None]:
!pip install -q sentence-transformers pinecone PyMuPDF langchain ipywidgets PyPDF2 rank_bm25 python-pptx


In [None]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter
from google.colab import files
import PyPDF2
import getpass
from tqdm import tqdm

# 🔑 Pinecone API key
# Prompt for the Pinecone API key at runtime (input is hidden) and create the client with it.
pinecone_api_key = getpass.getpass(prompt="Enter your Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
from google.colab import files
import PyPDF2
from pptx import Presentation

# Upload file: the Colab dialog result is keyed by uploaded file name.
uploaded = files.upload()
if not uploaded:
    # A cancelled dialog yields nothing to index; fail with a clear message
    # instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF or PPTX.")

# Only the first uploaded file is processed.
file_path = next(iter(uploaded))

def extract_text_from_pdf(path):
    """Read the PDF at `path` and return its page texts joined with newlines.

    Pages whose `extract_text()` result is falsy are represented by an empty string.
    """
    with open(path, "rb") as stream:
        pdf_pages = PyPDF2.PdfReader(stream).pages
        extracted = [(page.extract_text() or "") for page in pdf_pages]
    return "\n".join(extracted)

def extract_text_from_ppt(path):
    """Return the text of every shape that has a `text` attribute, newline-joined.

    Slides are visited in order, and shapes within each slide in order; shapes
    without a `text` attribute are skipped.
    """
    deck = Presentation(path)
    return "\n".join(
        shape.text
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )

# Extract text based on file type
# Choose the extractor from the (case-insensitive) file extension.
lower_name = file_path.lower()
if lower_name.endswith(".pdf"):
    raw_text = extract_text_from_pdf(file_path)
elif lower_name.endswith(".pptx"):
    raw_text = extract_text_from_ppt(file_path)
elif lower_name.endswith(".ppt"):
    # python-pptx reads only the .pptx format; reject legacy .ppt up front with
    # an actionable message instead of failing inside the library.
    raise ValueError(
        "Legacy .ppt files are not supported. Please save the presentation as .pptx and upload it again."
    )
else:
    raise ValueError("Unsupported file format. Please upload a PDF or PPTX.")

print(f"✅ Extracted {len(raw_text)} characters from {file_path}")


In [None]:
# ✂️ TEXT CHUNKING

# Splitter configured with chunk_size=500 and chunk_overlap=50, lengths measured by len().
chunking_kwargs = dict(chunk_size=500, chunk_overlap=50, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(**chunking_kwargs)

# The resulting list of text pieces feeds both the embedding step and BM25.
chunks = text_splitter.split_text(raw_text)
print(f"✅ Split into {len(chunks)} chunks.")

In [None]:
# Load the sentence-transformers model that embeds document chunks and user questions.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# INIT/CREATE PINECONE INDEX
index_name = "rag-st-index"

# Create the index only if it is not already listed. An unconditional create_index
# fails when this cell is re-run or when the index was created earlier.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # 'all-MiniLM-L6-v2' has 384 dim
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Handle used by the later upsert and query cells.
index = pc.Index(index_name)

In [None]:
#  EMBED AND UPLOAD TO PINECONE
# Walk the chunks in batches of 50: embed each batch, then upsert it.
# Each vector is stored under id "chunk-<position in chunks>" with its text as metadata.
for batch_start in tqdm(range(0, len(chunks), 50)):
    batch_texts = chunks[batch_start:batch_start + 50]
    batch_embeddings = model.encode(batch_texts).tolist()
    payload = [
        {
            "id": f"chunk-{batch_start + k}",
            "values": batch_embeddings[k],
            "metadata": {"text": chunk_text},
        }
        for k, chunk_text in enumerate(batch_texts)
    ]
    index.upsert(vectors=payload)

print("✅ All chunks embedded and uploaded to Pinecone.")

In [None]:
# ✅ CHAT FUNCTION: Semantic + BM25 Reranking (Improved UI)
from rank_bm25 import BM25Okapi
from IPython.display import display, HTML, Javascript
import ipywidgets as widgets
import html

# BM25 operates on token lists: lowercase every chunk and split it on whitespace.
tokenized_chunks = [text.lower().split() for text in chunks]

# Keyword scorer over the same chunks (same order) that were uploaded to Pinecone.
bm25 = BM25Okapi(tokenized_chunks)

history_bm25 = []


def chat_with_bm25(_=None):
    """Answer the question in `input_box_bm25` using Pinecone retrieval reranked by BM25.

    The question is embedded with `model`, the 10 nearest vectors are fetched from
    `index`, and their texts are reordered by the BM25 score of the matching entry
    in `chunks`. The top three texts form the reply. Retrieved texts that are not
    in the current `chunks` are ranked last rather than raising, and matches with
    no stored text are skipped.

    The exchange is appended to `history_bm25`, the transcript is rendered as HTML
    chat bubbles into `chat_output_bm25`, and `input_box_bm25` is cleared.

    Args:
        _: Ignored; present because widget callbacks pass the triggering widget.
    """
    question = input_box_bm25.value.strip()
    if not question:
        return

    history_bm25.append(("🧑 You", question))

    # Semantic query to Pinecone
    q_embed = model.encode([question])[0].tolist()
    results = index.query(vector=q_embed, top_k=10, include_metadata=True)

    # BM25 scores for reranking (one score per entry of `chunks`)
    bm25_scores = bm25.get_scores(question.lower().split())

    # Text -> first position in `chunks`, built once per question so each match is
    # a dict lookup instead of a linear `chunks.index` scan.
    chunk_position = {}
    for position, chunk_text in enumerate(chunks):
        chunk_position.setdefault(chunk_text, position)

    def bm25_score(text):
        # Unknown text (e.g. a vector left in the index from another upload) has
        # no BM25 score; rank it below every known chunk.
        position = chunk_position.get(text)
        return float("-inf") if position is None else bm25_scores[position]

    # Gather the stored text of each match, skipping vectors without any.
    retrieved_texts = []
    for match in results["matches"]:
        metadata = match["metadata"]
        if metadata and "text" in metadata:
            retrieved_texts.append(metadata["text"])

    reranked = sorted(retrieved_texts, key=bm25_score, reverse=True)

    top_contexts = reranked[:3]
    context_str = "\n\n".join(top_contexts)
    response = f"📄 Based on the document, here's what I found:\n\n{context_str}"
    history_bm25.append(("🤖 Bot", response))

    # Format chat history: user bubbles on the right, bot bubbles on the left.
    text_color = '#ffffff'
    formatted_history = ""
    for sender, message in history_bm25:
        # Escape before inserting so document text cannot inject markup.
        escaped_msg = html.escape(message).replace('\n', '<br>')
        align = 'right' if sender == "🧑 You" else 'left'
        bg_color = '#2a9d8f' if sender == "🧑 You" else '#264653'
        bubble = f"""
        <div style='text-align: {align}; margin: 10px;'>
            <span style="
                display: inline-block;
                background-color: {bg_color};
                color: {text_color};
                padding: 10px;
                border-radius: 12px;
                max-width: 70%;
                font-family: sans-serif;
                text-align: left;
            "><b>{sender}:</b><br>{escaped_msg}</span>
        </div>
        """
        formatted_history += bubble

    chat_output_bm25.value = f"<div id='chatbox'>{formatted_history}</div>"

    # Auto scroll to bottom using JS
    display(Javascript("""
        var out = document.querySelectorAll('div.output_subarea');
        if (out.length) {
            out[out.length - 1].scrollIntoView({ behavior: 'smooth', block: 'end' });
        }
    """))

    input_box_bm25.value = ""

# Question field; pressing Enter submits the question.
input_box_bm25 = widgets.Text(
    placeholder='Ask a question using BM25 rerank...',
    layout=widgets.Layout(width='75%', padding='5px'),
)
input_box_bm25.on_submit(chat_with_bm25)

# Layout settings shared by the two small buttons; each button gets its own Layout
# instance built from them.
# NOTE(review): `border_radius` may not be a recognised Layout trait - confirm against
# the ipywidgets Layout documentation.
round_button_layout = dict(
    width='40px',
    height='40px',
    border_radius='20px',
    margin='4px 5px 0 5px',
)

# Send arrow button
send_button_bm25 = widgets.Button(
    icon='arrow-up',
    tooltip='Send',
    layout=widgets.Layout(**round_button_layout),
    button_style='',
)
send_button_bm25.on_click(chat_with_bm25)

# Mic button (visual only): no click handler is attached.
mic_button_bm25 = widgets.Button(
    icon='microphone',
    tooltip='Mic (not active)',
    layout=widgets.Layout(**round_button_layout),
    button_style='',
)

# Chat transcript area; the handler writes rendered HTML into `.value`.
# NOTE(review): `background_color` may not be a recognised Layout trait - confirm.
transcript_layout = widgets.Layout(
    width='100%',
    height='450px',
    border='1px solid #888',
    padding='10px',
    overflow_y='scroll',
    background_color='#1e1e1e',
)
chat_output_bm25 = widgets.HTML(value='', layout=transcript_layout)

# Input row: mic, send button, then the question field.
input_row_bm25 = widgets.HBox([mic_button_bm25, send_button_bm25, input_box_bm25])

# Display the UI
display(HTML("<h3>🔍 BM25-RAG Chatbot</h3>"))
display(chat_output_bm25, input_row_bm25)