In [None]:
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class Document:
    """A whole source document before it is split into chunks.

    The loader functions in this notebook (``load_pdfs``, ``load_text_files``,
    ``load_wikipedia``) build instances with keyword arguments, and
    ``chunk_documents`` reads ``content``, ``source_id``, ``source_type`` and
    ``title`` from them.
    """
    # Unique identifier; the loaders assign ``str(uuid.uuid4())``.
    source_id: str
    source_type: str   # pdf | wikipedia | text
    # Display name: the file name for files, the page query for Wikipedia.
    title: str
    # Full text of the document.
    content: str
    # Loader-specific extras, e.g. {"path": ...} or {"url": ...}.
    metadata: Dict

@dataclass
class DocumentChunk:
    """One piece of a ``Document`` produced by ``chunk_documents``.

    NOTE: a dataclass with the same name and fields is declared again in later
    cells of this notebook; whichever cell ran last wins.
    """
    # Unique identifier of this chunk (``chunk_documents`` uses a UUID4 string).
    chunk_id: str
    # ``source_id`` of the document the chunk was cut from.
    source_id: str
    # Copied from the parent document.
    source_type: str
    # Copied from the parent document.
    title: str
    # Text of this chunk.
    content: str
    # ``chunk_documents`` stores {"chunk_index": <position within the document>}.
    metadata: Dict

@dataclass
class WebSearchResult:
    """A single web search hit.

    The consolidated ``tavily_search`` in the Streamlit cell fills ``snippet``
    from the result's ``content`` key.
    """
    # Result title.
    title: str
    # Text excerpt of the result.
    snippet: str
    # Link to the result.
    url: str

@dataclass
class AnswerSource:
    """A reference backing an answer, tagged by where it came from.

    NOTE(review): nothing in the visible notebook constructs this class.
    """
    source_type: str   # Doc | Web
    # Free-form pointer to the source; its format is not fixed anywhere in view.
    reference: str


In [None]:
!pip install -q langchain-community
from langchain_community.document_loaders import PyPDFLoader, TextLoader, WikipediaLoader
import uuid

# Document class is defined in cell bqB4HZ88SL9L and will be available after execution

def load_pdfs(pdf_paths):
    """Load each PDF into one ``Document`` holding the text of all its pages.

    Args:
        pdf_paths: Iterable of PDF paths. Each may be a ``str`` or an
            ``os.PathLike`` (assuming ``PyPDFLoader`` accepts it - TODO confirm
            for the installed langchain-community version).

    Returns:
        list[Document]: One document per path, in input order. ``content`` is
        the page texts joined with single spaces, ``title`` is the file name,
        and ``metadata`` is ``{"path": <the path as given>}``.
    """
    import os  # local import: this cell does not import os at the top

    docs = []
    for path in pdf_paths:
        pages = PyPDFLoader(path).load()
        full_text = " ".join(page.page_content for page in pages)

        docs.append(Document(
            source_id=str(uuid.uuid4()),
            source_type="pdf",
            # basename() copes with PathLike inputs and the platform's
            # separators, which a manual split("/") does not.
            title=os.path.basename(path),
            content=full_text,
            metadata={"path": path},
        ))
    return docs

def load_text_files(paths):
    """Load plain-text files, one ``Document`` per file.

    Args:
        paths: Iterable of file paths. Each may be a ``str`` or an
            ``os.PathLike`` (assuming ``TextLoader`` accepts it - TODO confirm
            for the installed langchain-community version).

    Returns:
        list[Document]: One document per path, in input order, with the file
        name as ``title`` and ``{"path": <the path as given>}`` as metadata.
    """
    import os  # local import: this cell does not import os at the top

    docs = []
    for path in paths:
        # Only the first loaded document is used, as in the original code.
        loaded = TextLoader(path).load()[0]

        docs.append(Document(
            source_id=str(uuid.uuid4()),
            source_type="text",
            # basename() copes with PathLike inputs and the platform's
            # separators, which a manual split("/") does not.
            title=os.path.basename(path),
            content=loaded.page_content,
            metadata={"path": path},
        ))
    return docs

def load_wikipedia(pages):
    """Fetch one Wikipedia article per query and wrap it as a ``Document``.

    Args:
        pages: Iterable of query strings passed to ``WikipediaLoader``.

    Returns:
        list[Document]: One document per query, in input order. ``title`` is
        the query string itself, and ``metadata["url"]`` is the loaded
        article's ``source`` metadata value (``None`` when that key is absent).

    Raises:
        IndexError: If the loader returns no article for a query. The type is
            the same one the bare ``[0]`` lookup used to raise; the message now
            names the failing query.
    """
    docs = []
    for page in pages:
        results = WikipediaLoader(query=page, load_max_docs=1).load()
        if not results:
            raise IndexError(f"No Wikipedia article found for query {page!r}")
        article = results[0]

        docs.append(Document(
            source_id=str(uuid.uuid4()),
            source_type="wikipedia",
            title=page,
            content=article.page_content,
            metadata={"url": article.metadata.get("source")},
        ))
    return docs

In [None]:
import re

def clean_text(text: str) -> str:
    """Normalize text to ASCII with single spaces between tokens.

    Runs of non-ASCII characters are replaced by a space, then every run of
    whitespace is collapsed to one space and the ends are stripped.

    Args:
        text: Raw text.

    Returns:
        The cleaned string; empty if nothing but whitespace/non-ASCII remains.
    """
    # Replace non-ASCII first: doing it after the whitespace collapse would
    # leave the newly inserted spaces un-collapsed ("a - b" style gaps).
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [None]:
!pip install -q langchain-text-splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dataclasses import dataclass
from typing import Dict, Any
import uuid

# NOTE: this re-declares the DocumentChunk dataclass from the first cell with
# the same field names and order; only the ``metadata`` annotation is narrower
# here. Running this cell rebinds the name ``DocumentChunk``.
@dataclass
class DocumentChunk:
    """One piece of a document, as built by ``chunk_documents`` below."""
    # Unique identifier of this chunk (a UUID4 string in ``chunk_documents``).
    chunk_id: str
    # ``source_id`` of the document the chunk was cut from.
    source_id: str
    # Copied from the parent document.
    source_type: str
    # Copied from the parent document.
    title: str
    # Text of this chunk.
    content: str
    # ``chunk_documents`` stores {"chunk_index": <position within the document>}.
    metadata: Dict[str, Any]

def chunk_documents(documents, chunk_size=None, chunk_overlap=None):
    """Split each document's content into ``DocumentChunk`` objects.

    Args:
        documents: Iterable of objects exposing ``content``, ``source_id``,
            ``source_type`` and ``title``.
        chunk_size: Maximum chunk size handed to the splitter. When ``None``,
            the module-level ``CHUNK_SIZE`` is used if it exists; otherwise the
            argument is omitted so the splitter's own default applies.
        chunk_overlap: Overlap handed to the splitter, resolved the same way
            via ``CHUNK_OVERLAP``.

    Returns:
        list[DocumentChunk]: Chunks of all documents in input order. Each gets
        a fresh UUID4 ``chunk_id`` and ``metadata={"chunk_index": i}`` where
        ``i`` is the chunk's position within its document.
    """
    # The original read CHUNK_SIZE / CHUNK_OVERLAP unconditionally, which is a
    # NameError when no configuration cell defined them. Look them up lazily.
    module_globals = globals()
    if chunk_size is None:
        chunk_size = module_globals.get("CHUNK_SIZE")
    if chunk_overlap is None:
        chunk_overlap = module_globals.get("CHUNK_OVERLAP")

    splitter_kwargs = {}
    if chunk_size is not None:
        splitter_kwargs["chunk_size"] = chunk_size
    if chunk_overlap is not None:
        splitter_kwargs["chunk_overlap"] = chunk_overlap
    splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)

    chunks = []
    for doc in documents:
        for index, piece in enumerate(splitter.split_text(doc.content)):
            chunks.append(DocumentChunk(
                chunk_id=str(uuid.uuid4()),
                source_id=doc.source_id,
                source_type=doc.source_type,
                title=doc.title,
                content=piece,
                metadata={"chunk_index": index},
            ))
    return chunks

In [None]:
!pip install -q langchain-openai langchain-core
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document as LCDocument
import os

def index_documents(chunks, index_path="faiss_index"):
    """Embed the chunks, build a FAISS store from them and save it to disk.

    Args:
        chunks: Iterable of chunk objects exposing ``content``, ``title``,
            ``source_type`` and a ``metadata`` mapping with ``"chunk_index"``.
        index_path: Location handed to ``save_local``.

    Returns:
        None. The only effect is the saved index.
    """
    embedder = OpenAIEmbeddings()

    wrapped = []
    for chunk in chunks:
        # Only these three metadata keys are carried into the vector store.
        wrapped.append(LCDocument(
            page_content=chunk.content,
            metadata={
                "title": chunk.title,
                "source_type": chunk.source_type,
                "chunk_index": chunk.metadata["chunk_index"],
            },
        ))

    store = FAISS.from_documents(wrapped, embedder)
    store.save_local(index_path)

def load_faiss_index(index_path="faiss_index"):
    """Load a previously saved FAISS store from ``index_path``.

    Args:
        index_path: Location that ``index_documents`` saved to.

    Returns:
        Whatever ``FAISS.load_local`` returns for that path.
    """
    # NOTE(security): allow_dangerous_deserialization=True is an explicit
    # opt-in to unsafe deserialization (LangChain documents the FAISS docstore
    # as pickle-based). Only load index directories this notebook created.
    load_options = {"allow_dangerous_deserialization": True}
    embedder = OpenAIEmbeddings()
    return FAISS.load_local(index_path, embedder, **load_options)

In [None]:
def semantic_search(db, query, k=5):
    """Return the ``k`` entries of ``db`` most similar to ``query``.

    Thin wrapper: delegates to ``db.similarity_search(query, k=k)`` and
    returns its result unchanged.
    """
    hits = db.similarity_search(query, k=k)
    return hits


In [None]:
def classify_query(query: str):
    """Route a query to ``"web"``, ``"hybrid"`` or ``"document"`` by keyword.

    Rules, checked in this order on the lower-cased query:
      * ``"web"`` if a word starts with latest / recent / current / today / news
      * ``"hybrid"`` if a word starts with compare / difference, or the
        stand-alone token ``vs`` appears
      * ``"document"`` otherwise

    Keywords are anchored to the start of a word (``vs`` to both ends) so
    that incidental substrings such as "con*current*" or "en*vs*" no longer
    trigger a route, while inflections like "recently" or "differences" still
    do.

    Args:
        query: The user's question.

    Returns:
        str: One of ``"web"``, ``"hybrid"``, ``"document"``.
    """
    import re  # local import keeps this cell runnable on its own

    text = query.lower()

    # "[^\W_]" is "a letter or digit"; the lookarounds therefore treat
    # punctuation, whitespace and underscores as word separators.
    word_start = r"(?<![^\W_])"
    word_end = r"(?![^\W_])"

    if re.search(word_start + r"(?:latest|recent|current|today|news)", text):
        return "web"

    if re.search(word_start + r"(?:compare|difference|vs" + word_end + r")", text):
        return "hybrid"

    return "document"


In [None]:
from langchain_community.tools.tavily_search import TavilySearchResults

def tavily_search(query, k=5):
    """Run a Tavily web search and return the tool's raw output.

    Args:
        query: Search string.
        k: Maximum number of results to request.

    Returns:
        Whatever ``TavilySearchResults.run`` returns for the query, unchanged.
        ``build_context`` in this notebook indexes each item with
        ``['content']``, i.e. it expects a list of dicts.
    """
    # The tool's result limit is called ``max_results``; the previous ``k=``
    # keyword is not a field of the tool, so the caller's k never took effect.
    tool = TavilySearchResults(max_results=k)
    return tool.run(query)

In [None]:
def build_context(doc_chunks, web_results, max_chars=4000):
    """Concatenate document and web evidence into one prompt context string.

    Args:
        doc_chunks: Iterable of objects with ``page_content`` and a
            ``metadata`` mapping holding ``"title"`` and ``"chunk_index"``.
        web_results: Iterable of dicts with a ``"content"`` key and,
            optionally, a ``"title"`` key.
        max_chars: Upper bound on the returned string's length. Defaults to
            the previously hard-coded 4000; pass ``None`` for no truncation.

    Returns:
        str: ``[Doc] ...`` sections followed by ``[Web] ...`` sections, cut
        to ``max_chars`` characters (the cut may fall mid-section).
    """
    sections = []

    for d in doc_chunks:
        sections.append(
            f"[Doc] {d.metadata['title']} (Chunk {d.metadata['chunk_index']}):\n{d.page_content}\n\n"
        )

    for w in web_results:
        # Not every search result is guaranteed to carry a title; use the same
        # fallback text as the consolidated tavily_search in the Streamlit cell.
        sections.append(f"[Web] {w.get('title', 'No Title')}:\n{w['content']}\n\n")

    return "".join(sections)[:max_chars]


In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
import os

def generate_answer(query, context):
    """Ask the chat model to answer ``query`` using only ``context``.

    Args:
        query: The user's question.
        context: Evidence text, e.g. the output of ``build_context``.

    Returns:
        The object returned by ``ChatOpenAI.invoke`` for the rendered prompt.
        NOTE(review): the Streamlit cell reads ``.content`` from it when
        present - confirm against the installed langchain-openai version.
    """
    llm = ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))

    prompt = PromptTemplate(
        input_variables=["query", "context"],
        template="""
Answer the question using ONLY the context.
Cite sources clearly.

Context:
{context}

Question:
{query}
"""
    )

    # Use .invoke() like the consolidated copy of this function in the
    # Streamlit cell; calling the chat model object directly with a plain
    # string relies on the deprecated __call__ path.
    return llm.invoke(prompt.format(query=query, context=context))

In [None]:
def summarize_documents(docs):
    """Request a summary of each document's first 1000 characters.

    Args:
        docs: Iterable of objects exposing ``page_content``.

    Returns:
        list: One ``ChatOpenAI.invoke`` result per document, in input order.
        These are the model's response objects, not plain strings.
    """
    llm = ChatOpenAI(temperature=0)

    # .invoke() accepts a plain prompt string; calling the chat model object
    # directly with a string relies on the deprecated __call__ path.
    return [llm.invoke(f"Summarize:\n{d.page_content[:1000]}") for d in docs]


In [None]:
!pip install -q streamlit langchain-community langchain-openai langchain-core
import streamlit as st
import os
from dataclasses import dataclass
from typing import Dict, Any, List
import uuid

# --- BEGIN: Consolidated Function Definitions and Imports ---

# From cell bqB4HZ88SL9L (Document and DocumentChunk dataclasses)
# Re-declared here so this cell is self-contained; same fields and order as the
# Document dataclass in the first cell of the notebook.
@dataclass
class Document:
    """A whole source document (identifier, type, title, text, extras)."""
    # Unique identifier of the document.
    source_id: str
    # Kind of source; the first cell documents the values pdf | wikipedia | text.
    source_type: str
    # Display name of the document.
    title: str
    # Full text of the document.
    content: str
    # Free-form extras attached by whoever built the document.
    metadata: Dict

# Re-declared here so this cell is self-contained; same fields and order as the
# DocumentChunk dataclasses in earlier cells.
@dataclass
class DocumentChunk:
    """One piece of a document plus the identifying fields of its parent."""
    # Unique identifier of this chunk.
    chunk_id: str
    # ``source_id`` of the document the chunk was cut from.
    source_id: str
    # Copied from the parent document.
    source_type: str
    # Copied from the parent document.
    title: str
    # Text of this chunk.
    content: str
    # Free-form extras; ``chunk_documents`` elsewhere stores {"chunk_index": i}.
    metadata: Dict

# Re-declared here so this cell is self-contained; same fields and order as the
# WebSearchResult dataclass in the first cell.
@dataclass
class WebSearchResult:
    """A single web search hit as returned by ``tavily_search`` in this cell."""
    # Result title ('No Title' when the raw result has none).
    title: str
    # The raw result's 'content' text ('No Content' when missing).
    snippet: str
    # Link to the result ('No URL' when missing).
    url: str

# From cell Bs9xWNyiTeX4 (classify_query)
def classify_query(query: str):
    """Route a query to ``"web"``, ``"hybrid"`` or ``"document"`` by keyword.

    Rules, checked in this order on the lower-cased query:
      * ``"web"`` if a word starts with latest / recent / current / today / news
      * ``"hybrid"`` if a word starts with compare / difference, or the
        stand-alone token ``vs`` appears
      * ``"document"`` otherwise

    Keywords are anchored to the start of a word (``vs`` to both ends) so
    that incidental substrings such as "con*current*" or "en*vs*" no longer
    trigger a route, while inflections like "recently" or "differences" still
    do.

    Args:
        query: The user's question.

    Returns:
        str: One of ``"web"``, ``"hybrid"``, ``"document"``.
    """
    import re  # local import: this cell does not import re at the top

    text = query.lower()

    # "[^\W_]" is "a letter or digit"; the lookarounds therefore treat
    # punctuation, whitespace and underscores as word separators.
    word_start = r"(?<![^\W_])"
    word_end = r"(?![^\W_])"

    if re.search(word_start + r"(?:latest|recent|current|today|news)", text):
        return "web"

    if re.search(word_start + r"(?:compare|difference|vs" + word_end + r")", text):
        return "hybrid"

    return "document"

# From cell S7V9QkdUSztO (load_faiss_index and dependencies)
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document as LCDocument

def load_faiss_index(index_path="faiss_index"):
    """Load the saved FAISS store, or report in the UI that it is missing.

    Args:
        index_path: Location of the saved index.

    Returns:
        The result of ``FAISS.load_local`` when ``index_path`` exists;
        otherwise ``None`` after an ``st.error`` message has been shown.
    """
    embedder = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

    if os.path.exists(index_path):
        # NOTE(security): allow_dangerous_deserialization=True is an explicit
        # opt-in to unsafe deserialization (LangChain documents the FAISS
        # docstore as pickle-based). Only load indexes this project created.
        return FAISS.load_local(index_path, embedder, allow_dangerous_deserialization=True)

    # Nothing on disk yet: tell the user instead of letting load_local fail.
    st.error(f"FAISS index not found at '{index_path}'. Please run the indexing steps first.")
    return None

# From cell oyfT5XoXThBZ (tavily_search)
from langchain_community.tools.tavily_search import TavilySearchResults

def tavily_search(query, k=5):
    """Run a Tavily web search and return the hits as ``WebSearchResult`` objects.

    Args:
        query: Search string.
        k: Maximum number of results to request.

    Returns:
        list[WebSearchResult]: One entry per result dict, with 'No Title' /
        'No Content' / 'No URL' standing in for missing keys. Empty when the
        tool's output cannot be interpreted as a list of dicts (for example an
        error message string); a ``RuntimeWarning`` is emitted in that case.
    """
    import ast
    import warnings

    # The tool's result limit is called ``max_results``; ``k=`` is not a field
    # of the tool, so the caller's k never took effect.
    tool = TavilySearchResults(max_results=k)
    results = tool.run(query)

    if isinstance(results, str):
        # SECURITY: this text is derived from a remote service, so it must not
        # go through eval(). literal_eval only accepts Python literals.
        try:
            results = ast.literal_eval(results)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            warnings.warn(
                f"Tavily search returned unparseable output: {results!r}",
                RuntimeWarning,
            )
            results = []

    if not isinstance(results, (list, tuple)):
        warnings.warn(
            f"Tavily search returned unexpected output: {results!r}",
            RuntimeWarning,
        )
        results = []

    parsed_results = []
    for res in results:
        if not isinstance(res, dict):
            continue  # skip anything that is not a result record
        parsed_results.append(WebSearchResult(
            title=res.get('title', 'No Title'),
            snippet=res.get('content', 'No Content'),
            url=res.get('url', 'No URL')
        ))
    return parsed_results

# From cell YhVG42z8Tj66 (build_context)
def build_context(doc_chunks, web_results, max_chars=4000):
    """Concatenate document and web evidence into one prompt context string.

    Args:
        doc_chunks: Iterable of objects with ``page_content`` and a
            ``metadata`` mapping holding ``"title"`` and ``"chunk_index"``.
        web_results: Iterable of objects with ``title`` and ``snippet``
            attributes (``WebSearchResult`` in this cell).
        max_chars: Upper bound on the returned string's length. Defaults to
            the previously hard-coded 4000; pass ``None`` for no truncation.

    Returns:
        str: ``[Doc] ...`` sections followed by ``[Web] ...`` sections, cut
        to ``max_chars`` characters (the cut may fall mid-section).
    """
    sections = [
        f"[Doc] {d.metadata['title']} (Chunk {d.metadata['chunk_index']}):\n{d.page_content}\n\n"
        for d in doc_chunks
    ]
    # Web hits are dataclass instances here, hence attribute access.
    sections.extend(f"[Web] {w.title}:\n{w.snippet}\n\n" for w in web_results)

    return "".join(sections)[:max_chars]

# From cell IeWxc_JBTu4F (generate_answer and dependencies)
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

def generate_answer(query, context):
    """Ask the chat model to answer ``query`` using only ``context``.

    Args:
        query: The user's question.
        context: Evidence text, e.g. the output of ``build_context``.

    Returns:
        The object returned by ``ChatOpenAI.invoke`` for the rendered prompt.
    """
    chat_model = ChatOpenAI(temperature=0, openai_api_key=os.getenv("OPENAI_API_KEY"))

    # Template text kept flush-left so the prompt has no stray indentation.
    template_text = """
Answer the question using ONLY the context.
Cite sources clearly.

Context:
{context}

Question:
{query}
"""
    prompt = PromptTemplate(input_variables=["query", "context"], template=template_text)

    rendered = prompt.format(query=query, context=context)
    return chat_model.invoke(rendered)

# --- END: Consolidated Function Definitions and Imports ---

# --- Streamlit page: route the query, gather evidence, answer, show sources ---
st.set_page_config(page_title="Hybrid RAG Engine")

# NOTE(review): the leading space in this title and in " Document Evidence"
# below looks like a dropped emoji - confirm the intended label.
st.sidebar.title(" Document Manager")
use_web = st.sidebar.checkbox("Enable Web Search", True)

query = st.text_input("Ask your question")

if query:
    route = classify_query(query)

    docs, web = [], []

    if route in ["document", "hybrid"]:
        # Load the index only for routes that read documents, so a pure web
        # query still works before any FAISS index has been built.
        db = load_faiss_index()
        if db is None:
            st.error("Cannot proceed without a FAISS index.")
            st.stop()  # Stop execution if index is not loaded
        docs = db.similarity_search(query, k=4)

    if route in ["web", "hybrid"] and use_web:
        # tavily_search returns a list of WebSearchResult objects
        web = tavily_search(query)

    context = build_context(docs, web)
    # generate_answer returns a message object; fall back to str() otherwise
    answer_message = generate_answer(query, context)
    answer_content = answer_message.content if hasattr(answer_message, 'content') else str(answer_message)

    # Escapes keep the icons intact regardless of how this file is re-encoded.
    route_icons = {
        "document": "\U0001F4C4",  # page facing up
        "web": "\U0001F310",       # globe with meridians
        "hybrid": "\U0001F4CA",    # bar chart
    }
    icon = route_icons.get(route, "\u2753")  # question mark for unknown routes
    st.markdown(f"### {icon} Answer")
    st.write(answer_content)

    with st.expander(" Document Evidence"):
        if docs:
            for d in docs:
                st.write(f"**Title:** {d.metadata.get('title', 'N/A')} (Chunk {d.metadata.get('chunk_index', 'N/A')})")
                st.write(d.page_content[:300] + "...")
        else:
            st.write("No document evidence found.")

    with st.expander("Web Evidence"):
        if web:
            for w in web:
                st.write(f"**Title:** [{w.title}]({w.url})")
                st.write(w.snippet[:300] + "...")
        else:
            st.write("No web evidence found.")

In [None]:
!pip install streamlit -q

In [None]:
!wget -q -O - ipv4.icanhazip.com

In [None]:
%%writefile app.py

# Minimal placeholder Streamlit app; the %%writefile magic at the top of this
# cell saves it as app.py. It only echoes the typed query and does not call any
# of the retrieval or answer-generation helpers defined elsewhere in this notebook.
import streamlit as st

st.set_page_config(page_title="Hybrid RAG", layout="wide")

st.title("Hybrid RAG Search Engine")
# Static banner used as a smoke test that the app is being served.
st.write("Streamlit is running successfully ")

query = st.text_input("Ask something")
if query:
    # Echo only - no search or LLM call happens in this file.
    st.write("You asked:", query)


In [None]:
!npx localtunnel --port 8501

[1G[0K‚†ô[1G[0K‚†π[1G[0K‚†∏[1G[0K‚†º[1G[0K‚†¥[1G[0K‚†¶[1G[0K‚†ß[1G[0K‚†á[1G[0K‚†è[1G[0K‚†ã[1G[0K‚†ô[1G[0K[1G[0JNeed to install the following packages:
localtunnel@2.0.2
Ok to proceed? (y) [20G

In [None]:
! streamlit run app.py & npx localtunnel --port 8501