# Hybrid RAG System - Run Streamlit App in Google Colab
This notebook demonstrates how to run your Hybrid RAG System Streamlit app in Google Colab with public URL access using Ngrok tunneling.

## Section 1: Install Streamlit and Dependencies
Install all required packages for the Hybrid RAG System.

In [None]:
# Install required packages (-q keeps pip's output short).
# streamlit serves the UI, pyngrok opens the public tunnel, faiss-cpu and
# sentence-transformers back the dense retriever, rank-bm25 backs the sparse
# retriever; requests, beautifulsoup4, numpy and transformers are imported by
# hybrid_rag_colab.py. torch is presumably the model backend - TODO confirm.
!pip install -q streamlit pyngrok requests beautifulsoup4 numpy faiss-cpu sentence-transformers rank-bm25 transformers torch

## Section 2: Set Up Ngrok Authentication
Get your Ngrok token from https://dashboard.ngrok.com/auth/your-authtoken and add it here.

In [None]:
# Set up Ngrok authentication token
import os

from pyngrok import ngrok

# Replace 'your_token_here' with your actual Ngrok token, or set the
# NGROK_AUTHTOKEN environment variable so the secret is not saved in the notebook.
NGROK_TOKEN_PLACEHOLDER = "your_token_here"
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN", NGROK_TOKEN_PLACEHOLDER)

# Fail fast with an actionable message instead of letting a later
# ngrok.connect() call fail with an authentication error.
if ngrok_auth_token == NGROK_TOKEN_PLACEHOLDER:
    raise ValueError(
        "Ngrok token not set: replace 'your_token_here' in this cell "
        "or set the NGROK_AUTHTOKEN environment variable."
    )

ngrok.set_auth_token(ngrok_auth_token)

# No tunnel is opened here on purpose: nothing is listening on port 8501 yet.
# The tunnel is created in Section 5, after the Streamlit server has started.

## Section 3: Create Modified hybrid_rag_system.py for Colab
Streamlit requires modifications to run in Colab. Here's the adapted version.

In [None]:
%%writefile hybrid_rag_colab.py
import json
import random
import requests
import time
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from transformers import pipeline
import streamlit as st
from bs4 import BeautifulSoup
import urllib.parse

# ---------------------------------------------------------------
# Step 1: Load Wikipedia URLs (demo: 4 fixed sample URLs; full system: 200 fixed + 300 randomly scraped)
# ---------------------------------------------------------------
def load_urls(use_fixed=True):
    """Return the Wikipedia URLs used by the demo RAG system.

    Args:
        use_fixed: Accepted for interface compatibility; this demo version
            ignores it and always returns the same sample list.

    Returns:
        A new list of four sample Wikipedia article URLs. A fresh list is
        built on every call, so callers may mutate the result freely.
    """
    # Use sample URLs for demo
    return [
        "https://en.wikipedia.org/wiki/Machine_learning",
        "https://en.wikipedia.org/wiki/Artificial_intelligence",
        "https://en.wikipedia.org/wiki/Deep_learning",
        "https://en.wikipedia.org/wiki/Natural_language_processing",
    ]

# ---------------------------------------------------------------
# Step 2: Extract and Chunk Text
# ---------------------------------------------------------------
def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: Text to split; words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by neighbouring chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings with words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a non-positive stride either raised an opaque
            ``range()`` error or silently produced no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already covered by this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks

# ---------------------------------------------------------------
# Step 3: Dense Vector Index (FAISS)
# ---------------------------------------------------------------
def build_dense_index(chunks, model_name="all-MiniLM-L6-v2"):
    """Embed *chunks* and store the vectors in a flat inner-product FAISS index.

    Args:
        chunks: Sequence of text chunks to embed.
        model_name: Name of the SentenceTransformer model to load.

    Returns:
        A ``(index, embeddings, model)`` tuple: the populated FAISS index,
        the embedding array that was added to it, and the loaded encoder.
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(chunks, convert_to_numpy=True)

    # The index dimensionality is taken from the second axis of the embeddings.
    flat_index = faiss.IndexFlatIP(vectors.shape[1])
    flat_index.add(vectors)

    return flat_index, vectors, encoder

def dense_retrieve(query, index, model, chunks, top_k=5):
    """Return the chunks most similar to *query* according to the dense index.

    Args:
        query: Query string to embed.
        index: Search index exposing ``search(embeddings, k)`` and returning
            ``(scores, ids)`` with one row per query.
        model: Encoder exposing ``encode(texts, convert_to_numpy=True)``.
        chunks: Chunk texts; position ``i`` must correspond to index id ``i``.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(chunk, score)`` tuples in the order the index returned
        them. Empty when ``chunks`` is empty or ``top_k`` is not positive.
    """
    k = min(top_k, len(chunks))
    if k <= 0:
        # Nothing to search for; also avoids asking the index for zero hits.
        return []

    q_emb = model.encode([query], convert_to_numpy=True)
    scores, ids = index.search(q_emb, k)

    # FAISS pads missing hits with id -1; indexing chunks[-1] would silently
    # return the last chunk, so ids outside the valid range are skipped.
    return [
        (chunks[idx], float(score))
        for score, idx in zip(scores[0], ids[0])
        if 0 <= idx < len(chunks)
    ]

# ---------------------------------------------------------------
# Step 4: Sparse Retrieval (BM25)
# ---------------------------------------------------------------
def _tokenize(text):
    """Lower-case *text* and split it into word tokens, dropping punctuation."""
    import re  # local import keeps this block self-contained

    return re.findall(r"\w+", text.lower())


def build_sparse_index(chunks):
    """Build a BM25 index over *chunks*.

    Chunks are tokenised with the same case-insensitive, punctuation-free
    tokenizer that ``sparse_retrieve`` applies to queries, so "Machine" and
    "data." in a chunk match the query terms "machine" and "data".

    Args:
        chunks: Sequence of chunk strings.

    Returns:
        A ``(bm25, tokenized)`` tuple: the BM25Okapi index and the list of
        token lists it was built from.
    """
    tokenized = [_tokenize(chunk) for chunk in chunks]
    bm25 = BM25Okapi(tokenized)
    return bm25, tokenized


def sparse_retrieve(query, bm25, chunks, top_k=5):
    """Return the chunks with the highest BM25 score for *query*.

    Args:
        query: Query string; tokenised the same way as the indexed chunks.
        bm25: Index exposing ``get_scores(tokens)`` with one score per chunk.
        chunks: Chunk texts in the order they were indexed.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(chunk, score)`` tuples sorted by descending score.
        Empty when ``chunks`` is empty or ``top_k`` is not positive.
    """
    k = min(top_k, len(chunks))
    if k <= 0:
        # A negative k would otherwise slice from the wrong end of the ranking.
        return []

    scores = np.asarray(bm25.get_scores(_tokenize(query)), dtype=float)
    ranked = np.argsort(scores)[::-1][:k]
    return [(chunks[i], float(scores[i])) for i in ranked]

# ---------------------------------------------------------------
# Step 5: Reciprocal Rank Fusion
# ---------------------------------------------------------------
def reciprocal_rank_fusion(dense_results, sparse_results, k=60, top_n=5):
    """Merge two ranked result lists with Reciprocal Rank Fusion.

    Every chunk receives ``1 / (k + rank + 1)`` for each list it appears in,
    where ``rank`` is its 0-based position in that list; the retrievers' own
    scores are ignored.

    Args:
        dense_results: Ranked ``(chunk, score)`` pairs from the dense retriever.
        sparse_results: Ranked ``(chunk, score)`` pairs from the sparse retriever.
        k: Smoothing constant added to every rank.
        top_n: Number of fused results to keep.

    Returns:
        Up to ``top_n`` ``(chunk, fused_score)`` tuples, highest score first.
    """
    fused_scores = {}
    for ranking in (dense_results, sparse_results):
        for position, (passage, _) in enumerate(ranking):
            fused_scores[passage] = fused_scores.get(passage, 0) + 1/(k+position+1)

    ordered = list(fused_scores.items())
    ordered.sort(key=lambda entry: entry[1], reverse=True)
    return ordered[:top_n]

# ---------------------------------------------------------------
# Step 6: Streamlit UI (Colab Compatible)
# ---------------------------------------------------------------
@st.cache_resource(show_spinner="Loading models and building indices...")
def _load_retrievers(chunks):
    """Build the dense and sparse retrievers once and reuse them across reruns.

    Streamlit re-executes the script on every widget interaction; without
    caching, the embedding model would be reloaded and both indices rebuilt
    for every query.

    Args:
        chunks: Tuple of chunk strings; Streamlit uses it as the cache key.

    Returns:
        A ``(dense_index, dense_model, bm25)`` tuple.
    """
    chunk_list = list(chunks)
    dense_index, _, dense_model = build_dense_index(chunk_list)
    bm25, _ = build_sparse_index(chunk_list)
    return dense_index, dense_model, bm25


def run_ui():
    """Render the Streamlit page: query box plus the top fused retrieval results.

    Chunks a small built-in demo corpus, obtains the (cached) dense and BM25
    retrievers, and for each submitted query shows the top three chunks after
    Reciprocal Rank Fusion together with the retrieval time.
    """
    st.set_page_config(page_title="Hybrid RAG System", layout="wide")
    st.title("🔍 Hybrid RAG System (Dense + BM25 + RRF)")

    # Demo corpus
    demo_corpus = [
        "Machine learning is a subset of artificial intelligence that focuses on learning from data.",
        "Deep learning uses neural networks with multiple layers to learn representations.",
        "Natural language processing enables computers to understand and generate human language.",
        "Information retrieval systems help find relevant documents from large collections.",
    ]

    chunks = [chunk for doc in demo_corpus for chunk in chunk_text(doc)]

    if not chunks:
        st.warning("No documents loaded. Using sample data.")
        chunks = demo_corpus

    # Build indices (cached, so this is only expensive on the first run)
    dense_index, dense_model, bm25 = _load_retrievers(tuple(chunks))

    # Query input
    query = st.text_input("📝 Enter your question:", placeholder="Ask me something...")

    # Nothing to do until the user has typed a non-blank question.
    if not query or not query.strip():
        return

    with st.spinner("Retrieving and processing..."):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start = time.perf_counter()

        # Retrieve results
        dense_results = dense_retrieve(query, dense_index, dense_model, chunks, top_k=3)
        sparse_results = sparse_retrieve(query, bm25, chunks, top_k=3)
        rrf_results = reciprocal_rank_fusion(dense_results, sparse_results, top_n=3)

        elapsed = time.perf_counter() - start

    # Display results
    st.subheader("📊 Top Retrieved Chunks")
    for i, (chunk, score) in enumerate(rrf_results, 1):
        with st.expander(f"Result {i} (Score: {score:.4f})"):
            st.write(chunk)

    st.success(f"✅ Response Time: {elapsed:.2f} seconds")

# Launch the UI only when this file is executed as a script; importing the
# module just defines the functions above without starting the app.
if __name__ == "__main__":
    run_ui()

## Section 4: Configure Colab for Streamlit Execution
Setup environment variables and necessary configurations for Colab.

In [None]:
import os
import subprocess
import threading

# Configure Streamlit for Colab. These variables are set in the notebook
# process, so a Streamlit server started from it as a subprocess inherits them.
os.environ['STREAMLIT_SERVER_HEADLESS'] = 'true'
os.environ['STREAMLIT_SERVER_PORT'] = '8501'
os.environ['STREAMLIT_SERVER_ENABLE_XSRF_PROTECTION'] = 'false'

# Create .streamlit config directory. os.makedirs replaces the `!mkdir -p`
# shell magic so the cell also works outside IPython.
streamlit_dir = os.path.expanduser('~/.streamlit')
os.makedirs(streamlit_dir, exist_ok=True)

# Create streamlit config file (overwrites any existing config.toml)
config_content = """
[server]
headless = true
enableXsrfProtection = false
port = 8501

[client]
showErrorDetails = true
"""

with open(os.path.join(streamlit_dir, 'config.toml'), 'w', encoding='utf-8') as f:
    f.write(config_content)

print("✅ Streamlit configured for Colab")

## Section 5: Run Streamlit with Ngrok Tunnel
Execute the Streamlit app and establish public URL access.

In [None]:
import socket
import subprocess
import time
from pyngrok import ngrok

STREAMLIT_PORT = 8501
STARTUP_TIMEOUT_S = 60
STREAMLIT_LOG = 'streamlit.log'


def _wait_for_server(proc, port, timeout):
    """Return True once *port* accepts connections, False on exit or timeout."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            # The server process already exited, so it will never come up.
            return False
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(1)
            if probe.connect_ex(('127.0.0.1', port)) == 0:
                return True
        time.sleep(0.5)
    return False


# Start Streamlit app in background. Output goes to a log file rather than to
# pipes that nobody reads: an undrained PIPE can fill up and block the server.
with open(STREAMLIT_LOG, 'w', encoding='utf-8') as log_file:
    process = subprocess.Popen(
        ['streamlit', 'run', 'hybrid_rag_colab.py', '--client.showErrorDetails=false'],
        stdout=log_file,
        stderr=subprocess.STDOUT,
    )

# Wait for Streamlit to start (poll the port instead of sleeping a fixed time)
print("⏳ Starting Streamlit app...")
if not _wait_for_server(process, STREAMLIT_PORT, STARTUP_TIMEOUT_S):
    print(f"❌ Streamlit did not start within {STARTUP_TIMEOUT_S} seconds.")
    print(f"Check {STREAMLIT_LOG} for details.")
else:
    # Get the public URL from Ngrok
    try:
        # Kill any existing tunnels. disconnect() needs the URL of the tunnel
        # to close, so each open tunnel is closed explicitly.
        for existing_tunnel in ngrok.get_tunnels():
            ngrok.disconnect(existing_tunnel.public_url)

        # Create new tunnel. Streamlit is served over HTTP, so an HTTP tunnel
        # is required to get a URL that opens in a browser.
        tunnel = ngrok.connect(STREAMLIT_PORT, "http")
        public_url = tunnel.public_url
        print(f"\n✅ Streamlit app is running!")
        print(f"🌐 Public URL: {public_url}")
        print(f"\n📱 Open this link in your browser to access the app:")
        print(f"   {public_url}")
    except Exception as e:
        print(f"❌ Error creating tunnel: {e}")
        print("Make sure you have set your Ngrok token in the cell above!")

## Section 6: Access the App via Public URL

### Steps to Access:
1. **Get Ngrok Token**: Go to https://dashboard.ngrok.com/auth/your-authtoken
2. **Copy Your Token**: Add it to the "Set Up Ngrok Authentication" cell above
3. **Run All Cells**: Execute the cells in order
4. **Access the App**: Click the link printed in the "Run Streamlit with Ngrok Tunnel" cell
5. **Interact**: Use the text input to ask questions to your Hybrid RAG System

### Key Modifications for Colab:
- ✅ Uses demo corpus instead of full URL scraping (faster)
- ✅ Configured Streamlit for headless execution
- ✅ Ngrok tunnel provides public access
- ✅ Reduced model sizes for faster loading
- ✅ Simplified UI for Colab environment