<a href="https://colab.research.google.com/github/Testgitchhub/SITERAG-CHATBOT/blob/main/SITERAG_CHATBOT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install flask-ngrok flask flask-cors python-docx pdfplumber sentence-transformers faiss-cpu openai werkzeug==2.2.3

In [None]:
# In-memory registry of uploaded documents, keyed by doc_id (a string).
# Each value is a dict with the keys written by the upload handler:
#   'filename' -> original file name, 'chunks' -> list of text chunks,
#   'embs' -> embeddings returned by get_embeddings, 'index' -> FAISS index over them.
DOC_STORE = {}
# Simple text extractor
def extract_text_from_pdf(path_or_bytes):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        path_or_bytes: raw PDF content (``bytes``/``bytearray``), or anything
            else ``pdfplumber.open`` accepts (passed through unchanged).

    Returns:
        One string with the pages' text separated by ``"\\n"``; a page for
        which ``extract_text()`` returns a falsy value contributes ``''``.
    """
    # In-memory content is wrapped in a file-like object; other inputs go
    # to pdfplumber as they are.
    if isinstance(path_or_bytes, (bytes, bytearray)):
        source = io.BytesIO(path_or_bytes)
    else:
        source = path_or_bytes
    with pdfplumber.open(source) as pdf:
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    return "\n".join(page_texts)
def extract_text_from_docx(path_or_bytes):
    """Return the paragraph text of a .docx document, joined with newlines.

    Args:
        path_or_bytes: raw document content (``bytes``/``bytearray``), or
            anything else ``docx.Document`` accepts (passed through unchanged).

    Returns:
        The ``text`` of each entry in ``doc.paragraphs`` joined with ``"\\n"``.
    """
    is_raw = isinstance(path_or_bytes, (bytes, bytearray))
    source = io.BytesIO(path_or_bytes) if is_raw else path_or_bytes
    document = docx.Document(source)
    return "\n".join(para.text for para in document.paragraphs)
def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: the string to split; words are obtained with ``str.split()``.
        chunk_size: maximum number of words per chunk (must be positive).
        overlap: number of words shared by neighbouring chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings (words re-joined with single spaces); an
        empty list when *text* contains no words.

    Raises:
        ValueError: if ``chunk_size`` or ``overlap`` is out of range. A
            non-positive step would otherwise never advance through the text.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive')
    if not 0 <= overlap < chunk_size:
        raise ValueError('overlap must be >= 0 and smaller than chunk_size')

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window
        # would be fully contained in this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
def build_faiss_index(embs):
    """Create a flat L2 FAISS index holding every row of *embs*.

    Args:
        embs: 2-D array of embedding vectors; the vector dimension is read
            from ``embs.shape[1]``.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.
    """
    dimension = embs.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(embs)
    return flat_index
def get_embeddings(texts):
    """Encode *texts* with the module-level ``embed_model``.

    Args:
        texts: the strings to embed.

    Returns:
        Whatever ``embed_model.encode`` returns when asked for NumPy output
        with the progress bar disabled.
    """
    return embed_model.encode(
        texts,
        convert_to_numpy=True,
        show_progress_bar=False,
    )
def call_llm_system(prompt, max_tokens=300, temperature=0.1):
    """Send *prompt* to OpenAI chat completions (gpt-3.5-turbo) and return the reply.

    Works with both generations of the ``openai`` package: the 1.x client
    API (``openai.OpenAI``) and the legacy ``openai.ChatCompletion`` API
    that was removed in 1.0. Replace the model name if you want other models.

    Args:
        prompt: the user message (context plus question).
        max_tokens: upper bound on the number of generated tokens.
        temperature: sampling temperature passed to the API.

    Returns:
        The assistant's reply with surrounding whitespace stripped
        (an empty string if the API returned no content).

    Raises:
        ValueError: if ``openai.api_key`` has not been set.
    """
    if not openai.api_key:
        raise ValueError('OPENAI_API_KEY not set in Colab env')

    messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions based only on the provided context. If the answer is not in the context, say you don't know."},
        {"role": "user", "content": prompt},
    ]

    if hasattr(openai, 'OpenAI'):
        # openai>=1.0: module-level ChatCompletion no longer exists.
        client = openai.OpenAI(api_key=openai.api_key)
        res = client.chat.completions.create(
            model='gpt-3.5-turbo',
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = res.choices[0].message.content
    else:
        # Legacy openai<1.0 API.
        res = openai.ChatCompletion.create(
            model='gpt-3.5-turbo',
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = res['choices'][0]['message']['content']

    # content may be None in the 1.x response model; normalise before strip.
    return (content or '').strip()

In [None]:
from flask_ngrok import run_with_ngrok
# NOTE(review): these statements read like the body of the upload route handler, but the
# route decorator, the `def` line and the code that binds `f` and `filename` are not
# present in this cell (and the indentation was lost) — restore them before running.
data = f.read()
lower = filename.lower()
# Pick a text extractor from the (lower-cased) file extension.
if lower.endswith('.pdf'):
text = extract_text_from_pdf(data)
elif lower.endswith('.docx'):
text = extract_text_from_docx(data)
else:
# Any other extension is treated as UTF-8 text; if decoding fails the request is
# rejected with HTTP 400 and the message 'unsupported file type'.
try:
text = data.decode('utf-8')
except Exception:
return jsonify({'error':'unsupported file type'}), 400
# Split into 300-word chunks with a 30-word overlap, embed them, and index the embeddings.
chunks = chunk_text(text, chunk_size=300, overlap=30)
embs = get_embeddings(chunks)
index = build_faiss_index(embs)
# doc_id is the current number of stored documents plus one, as a string.
doc_id = str(len(DOC_STORE) + 1)
DOC_STORE[doc_id] = {
'filename': filename,
'chunks': chunks,
'embs': embs,
'index': index,
}
# The client needs the returned doc_id to call /query afterwards.
return jsonify({'doc_id': doc_id, 'n_chunks': len(chunks)})
@app.route('/query', methods=['POST'])
def query_doc():
    """Answer a question about a previously uploaded document.

    Expects a JSON object with:
        doc_id: id of the stored document (compared as a string, since
            DOC_STORE is keyed by string ids).
        question: the question to answer.
        top_k: optional number of chunks to retrieve (default 4, must be >= 1).

    Returns:
        200 with ``{'answer', 'retrieved_chunks'}`` on success;
        400 when the body is not a JSON object, a required field is missing,
        or ``top_k`` is not a positive integer;
        404 when ``doc_id`` is unknown;
        500 with the error text when the LLM call raises.
    """
    data = request.get_json(force=True)
    if not isinstance(data, dict):
        # A JSON array/scalar has no .get(); treat it as missing fields.
        return jsonify({'error':'doc_id and question required'}), 400

    raw_doc_id = data.get('doc_id')
    question = data.get('question')
    if not raw_doc_id or not question:
        return jsonify({'error':'doc_id and question required'}), 400

    try:
        top_k = int(data.get('top_k', 4))
    except (TypeError, ValueError):
        return jsonify({'error': 'top_k must be a positive integer'}), 400
    if top_k < 1:
        return jsonify({'error': 'top_k must be a positive integer'}), 400

    # Stored ids are strings; accept a numeric id from the client as well.
    doc_id = str(raw_doc_id)
    if doc_id not in DOC_STORE:
        return jsonify({'error':'doc_id not found'}), 404
    store = DOC_STORE[doc_id]

    # Never ask the index for more neighbours than there are chunks.
    k = min(top_k, len(store['chunks']))
    if k:
        q_emb = get_embeddings([question])
        _distances, neighbours = store['index'].search(q_emb, k)
        # FAISS pads missing results with -1; skip those slots.
        retrieved = [store['chunks'][int(i)] for i in neighbours[0] if i != -1]
    else:
        retrieved = []

    # Build prompt
    context = "\n\n---\n\n".join(retrieved)
    prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer concisely and cite the chunk if needed."
    try:
        answer = call_llm_system(prompt)
    except Exception as e:
        # Request boundary: report the failure to the client instead of crashing.
        return jsonify({'error': str(e)}), 500
    return jsonify({'answer': answer, 'retrieved_chunks': retrieved})
# Start Flask
# NOTE(review): `run_with_ngrok` is imported at the top of this cell, but no call to it is
# visible here; presumably `run_with_ngrok(app)` belongs in the part of the cell that is
# not shown — confirm, since the message below announces an ngrok tunnel.
print('Starting Flask server with ngrok...')
app.run()

In [None]:
import os

# Create the output folder for the Chrome extension (no error if it already exists).
os.makedirs("chrome_extension", exist_ok=True)

# Map of output path -> file content. Leading/trailing whitespace is stripped
# when the files are written, so each literal may start on its own line.
files = {
    "chrome_extension/manifest.json": """
{
  "manifest_version": 3,
  "name": "RAG Document Q&A",
  "version": "1.0",
  "description": "Ask questions about uploaded documents using RAG.",
  "permissions": ["activeTab", "storage"],
  "action": {
    "default_popup": "popup.html"
  }
}
""",

    "chrome_extension/popup.html": """
<!DOCTYPE html>
<html>
<head>
  <title>RAG Q&A</title>
  <link rel="stylesheet" href="styles.css" />
</head>
<body>
  <h2>Ask from Document</h2>
  <input type="file" id="fileInput" />
  <input type="text" id="questionInput" placeholder="Ask your question..." />
  <button id="askButton">Ask</button>
  <div id="answer"></div>
  <script src="popup.js"></script>
</body>
</html>
""",

    # The /query endpoint rejects requests without doc_id, so the script must
    # forward the doc_id returned by /upload and surface backend errors.
    "chrome_extension/popup.js": """
document.getElementById('askButton').addEventListener('click', async () => {
  const question = document.getElementById('questionInput').value;
  const file = document.getElementById('fileInput').files[0];
  const answerEl = document.getElementById('answer');

  if (!file || !question) {
    alert("Please upload a document and enter a question!");
    return;
  }

  const formData = new FormData();
  formData.append("file", file);

  const backendURL = "https://YOUR_NGROK_URL"; // Replace with your actual backend URL

  try {
    const uploadRes = await fetch(backendURL + "/upload", { method: "POST", body: formData });
    const uploadData = await uploadRes.json();
    if (!uploadRes.ok) {
      answerEl.textContent = uploadData.error || "Upload failed.";
      return;
    }

    const queryRes = await fetch(backendURL + "/query", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ doc_id: uploadData.doc_id, question })
    });

    const data = await queryRes.json();
    answerEl.textContent = queryRes.ok ? data.answer : (data.error || "Query failed.");
  } catch (err) {
    answerEl.textContent = "Request failed: " + err;
  }
});
""",

    "chrome_extension/styles.css": """
body {
  font-family: Arial, sans-serif;
  width: 300px;
  padding: 10px;
}
h2 {
  text-align: center;
}
input, button {
  width: 100%;
  margin-top: 5px;
  padding: 6px;
}
#answer {
  margin-top: 10px;
  padding: 8px;
  background: #f5f5f5;
  border-radius: 8px;
}
""",
}

# Write files with an explicit encoding so the result does not depend on the
# platform's default.
for path, content in files.items():
    with open(path, "w", encoding="utf-8") as f:
        f.write(content.strip())

print("✅ Chrome Extension files created successfully!")


In [None]:
!ls chrome_extension