In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1


In [None]:
!pip install openai langchain faiss-cpu sentence-transformers streamlit sympy wikipedia

Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.1-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=d93e2c51b563bf652f071783822b9df67398bed2de555205decea515e2326222
  Stored in directory: /root/.cache/pip/wheels/63/47/7c/

In [None]:
!pip install sentence-transformers



In [None]:
pip install wikipedia-api

Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15383 sha256=e1a03c477600ca7802e65db07b0629d5cb7262bd98a5100c03f103d4c1e43df1
  Stored in directory: /root/.cache/pip/wheels/33/3c/79/b36253689d838af4a0539782853ac3cc38a83a6591ad570dde
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1


In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=8e5b8569dd0c5290552afc4c21cc09813796211d5698ef41bc5753a20403503e
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
# ================================
# 1. INSTALL REQUIRED PACKAGES (RUN ONCE)
# ================================
# pip install wikipedia-api sentence-transformers faiss-cpu numpy transformers rouge-score torch

# ================================
# 2. IMPORTS
# ================================
import os
import numpy as np
import faiss
import wikipediaapi
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from rouge_score import rouge_scorer

# ================================
# 3. DOWNLOAD WIKIPEDIA DATA
# ================================
os.makedirs("data", exist_ok=True)

wiki = wikipediaapi.Wikipedia(
    language="en",
    user_agent="DocSearchBot/1.0 (contact@example.com)",
)

topics = ["Artificial Intelligence", "Machine Learning", "Deep Learning"]

# Look each topic up lazily and keep the full text of the pages that exist;
# topics without a page are skipped.
candidate_pages = (wiki.page(title) for title in topics)
documents = [pg.text for pg in candidate_pages if pg.exists()]

print("Downloaded documents:", len(documents))

# ================================
# 4. CHUNK TEXT
# ================================
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated); each
    piece is re-joined with single spaces. Empty or blank text gives ``[]``.
    """
    tokens = text.split()
    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

# Flatten the per-document chunk lists into one corpus-wide list.
chunks = [piece for doc in documents for piece in chunk_text(doc)]

print("Total chunks:", len(chunks))

# ================================
# 5. CREATE EMBEDDINGS
# ================================
# One embedding row per chunk, in the same order as `chunks`.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks, show_progress_bar=True)
print("Embeddings shape:", embeddings.shape)

# ================================
# 6. BUILD FAISS INDEX
# ================================
# Flat L2 index sized to the embedding width; row i of the index is chunks[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
chunk_vectors = np.array(embeddings)
index.add(chunk_vectors)
print("FAISS index size:", index.ntotal)

# ================================
# 7. SEARCH FUNCTION
# ================================
def search_documents(query, top_k=3):
    """Return the text of the chunks nearest to ``query`` in the FAISS index.

    Args:
        query: Free-text query; encoded with the module-level ``model``.
        top_k: Number of neighbours requested from the module-level ``index``.

    Returns:
        A list of chunk strings ordered as FAISS returned them. It can be
        shorter than ``top_k`` when the index holds fewer vectors.
    """
    query_vector = model.encode([query])
    _distances, indices = index.search(np.array(query_vector), top_k)
    # FAISS pads missing neighbours with -1. The lower bound matters: -1 would
    # otherwise pass an upper-bound-only check and silently return chunks[-1].
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]

# ================================
# 8. LOAD SUMMARIZATION MODEL
# ================================
# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
# device=-1 runs on CPU; set it to 0 for GPU.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

# ================================
# 9. SEARCH + SAFE SUMMARIZE FUNCTION
# ================================
def search_and_summarize(query, top_k=3, max_len=150, sub_chunk_size=400):
    """Retrieve the best-matching chunks for ``query`` and summarize them.

    Args:
        query: Free-text query passed to ``search_documents``.
        top_k: Number of chunks to retrieve.
        max_len: ``max_length`` handed to the summarizer for each window.
        sub_chunk_size: Window size, in words, used to re-split the retrieved
            text before summarizing.

    Returns:
        The per-window summaries joined with spaces, or the string
        "No relevant documents found." when retrieval returns nothing.
    """
    # 1) Get top documents
    docs = search_documents(query, top_k=top_k)
    if not docs:
        return "No relevant documents found."

    # 2) Combine text and split into sub-chunks to avoid tokenizer limits
    words = " ".join(docs).split()
    sub_chunks = [
        " ".join(words[start:start + sub_chunk_size])
        for start in range(0, len(words), sub_chunk_size)
    ]

    # Keep the historical floor of 60 whenever it is feasible; for a smaller
    # max_len fall back to half of it so min_length never exceeds max_length.
    min_len = 60 if max_len >= 60 else max(1, max_len // 2)

    # 3) Summarize each sub-chunk
    summaries = []
    for sub in sub_chunks:
        summary = summarizer(
            sub,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,  # word windows are only a proxy for token counts
        )
        summaries.append(summary[0]["summary_text"])

    # 4) Combine sub-summaries
    return " ".join(summaries)

# ================================
# 10. SIMPLE EVALUATION FUNCTIONS
# ================================
# Precision@K for search
def precision_at_k(relevant_indices, retrieved_indices, k=3):
    """Compute Precision@k: relevant items among the first ``k`` retrieved, over ``k``.

    Args:
        relevant_indices: Iterable of ids considered relevant.
        retrieved_indices: Ranked sequence of retrieved ids (best first).
        k: Cut-off rank; must be a positive integer.

    Returns:
        A float in [0, 1]. Duplicate ids in the top ``k`` count once.

    Raises:
        ValueError: If ``k`` is not positive. Previously ``k=0`` raised a bare
            ZeroDivisionError and a negative ``k`` produced a negative score.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    top_hits = set(retrieved_indices[:k])
    return len(top_hits & set(relevant_indices)) / k

# ROUGE score for summary
# Shared scorer configured for the 'rouge1' and 'rougeL' metrics with stemming
# enabled; evaluate_summary() below reuses this single instance.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
def evaluate_summary(reference_text, generated_summary):
    """Score ``generated_summary`` against ``reference_text`` with the shared scorer.

    Returns whatever ``scorer.score(reference_text, generated_summary)`` yields.
    """
    return scorer.score(reference_text, generated_summary)

# ================================
# 11. TEST SEARCH + SUMMARIZATION + EVALUATION
# ================================
query = "Explain Artificial Intelligence"

# SEARCH: preview the first 300 characters of every retrieved chunk.
results = search_documents(query, top_k=3)
print("\n--- SEARCH RESULTS ---")
for i, text in enumerate(results, start=1):
    print(f"\nResult {i}:\n{text[:300]}")

# SUMMARIZE
final_summary = search_and_summarize(query, top_k=3, max_len=150)
print("\n--- FINAL SUMMARY ---\n")
print(final_summary)

# EVALUATION EXAMPLE
# ROUGE: the very first chunk stands in as the reference text.
reference_text = chunks[0]
rouge_scores = evaluate_summary(reference_text, final_summary)
print("\n--- ROUGE Scores ---")
print(rouge_scores)

# Precision@3: map each returned chunk back to its position in `chunks`;
# chunk 0 is assumed to be the only relevant one.
relevant_indices = [0]
retrieved_indices = list(map(chunks.index, results))
prec = precision_at_k(relevant_indices, retrieved_indices, k=3)
print(f"\nPrecision@3: {prec}")




Downloaded documents: 3
Total chunks: 62


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Embeddings shape: (62, 384)
FAISS index size: 62


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu



--- SEARCH RESULTS ---

Result 1:
Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops and studies methods and so

Result 2:
would not be considered acceptable unless it provided "a full and satisfactory explanation for the decisions" it makes. In 2018, a self-driving car from Uber failed to detect a pedestrian, who was killed after a collision. Attempts to use machine learning in healthcare with the IBM Watson system fai

Result 3:
theory can be used to weigh the value of exploratory or experimental actions. The space of possible future actions and situations is typically intractably large, so the agents must take actions and evaluate situations while being uncertain of what the outcome will be. A Markov decision process has a

--- FINAL SUMMARY ---

Artificial intel

In [None]:
!pip install streamlit wikipedia-api sentence-transformers faiss-cpu transformers torch pyngrok

Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.5.0


In [None]:
# ================================
# Streamlit App: Document Search & Summarization
# ================================
# Install dependencies (run once)
# pip install streamlit wikipedia-api sentence-transformers faiss-cpu transformers torch

import streamlit as st
import os
import numpy as np
import wikipediaapi
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

# ================================
# 1. DOWNLOAD WIKIPEDIA ARTICLES
# ================================
@st.cache_data
def download_documents(topics):
    """Fetch the full text of each Wikipedia page named in ``topics``.

    Titles for which the page does not exist are skipped, so the returned
    list may be shorter than ``topics``.
    """
    client = wikipediaapi.Wikipedia(
        language="en",
        user_agent="DocSearchApp/1.0 (contact@example.com)",
    )
    pages = (client.page(title) for title in topics)
    return [pg.text for pg in pages if pg.exists()]

# Wikipedia page titles to index; download_documents() skips any title whose
# page does not exist, so `documents` can hold fewer entries than `topics`.
topics = ["Artificial Intelligence", "Machine Learning", "Deep Learning"]
documents = download_documents(topics)

# ================================
# 2. CHUNK TEXT
# ================================
def chunk_text(text, chunk_size=500):
    """Break ``text`` into space-joined pieces of at most ``chunk_size`` words."""
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

# Flatten the per-document chunk lists into one corpus-wide list.
chunks = [piece for doc in documents for piece in chunk_text(doc)]

# ================================
# 3. CREATE EMBEDDINGS
# ================================
@st.cache_resource
def create_embeddings(chunks):
    """Load the all-MiniLM-L6-v2 encoder and embed every chunk.

    Returns:
        ``(model, embeddings)``: the SentenceTransformer instance and the
        result of encoding ``chunks`` with it.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    return encoder, encoder.encode(chunks, show_progress_bar=True)

model, embeddings = create_embeddings(chunks)

# ================================
# 4. BUILD FAISS INDEX
# ================================
# Flat L2 index sized to the embedding width; row i of the index is chunks[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
chunk_vectors = np.array(embeddings)
index.add(chunk_vectors)

# ================================
# 5. SEARCH FUNCTION
# ================================
def search_documents(query, top_k=3):
    """Return the text of the chunks nearest to ``query`` in the FAISS index.

    Args:
        query: Free-text query; encoded with the module-level ``model``.
        top_k: Number of neighbours requested from the module-level ``index``.

    Returns:
        A list of chunk strings ordered as FAISS returned them. It can be
        shorter than ``top_k`` when the index holds fewer vectors.
    """
    query_vec = model.encode([query])
    _distances, indices = index.search(np.array(query_vec), top_k)
    results = []
    for idx in indices[0]:
        # FAISS pads missing neighbours with -1; without the lower bound that
        # value would index chunks[-1] and return an unrelated chunk.
        if 0 <= idx < len(chunks):
            results.append(chunks[idx])
    return results

# ================================
# 6. SUMMARIZATION MODEL
# ================================
@st.cache_resource
def load_summarizer():
    """Build the facebook/bart-large-cnn summarization pipeline on CPU (device=-1)."""
    summarization_pipeline = pipeline(
        "summarization", model="facebook/bart-large-cnn", device=-1
    )
    return summarization_pipeline

# load_summarizer is wrapped in @st.cache_resource, so the pipeline object is
# meant to be shared instead of being rebuilt on every script run.
summarizer = load_summarizer()

# ================================
# 7. SEARCH + SAFE SUMMARIZE FUNCTION
# ================================
def search_and_summarize(query, top_k=3, max_len=150, sub_chunk_size=400):
    """Retrieve the best-matching chunks for ``query`` and summarize them.

    Args:
        query: Free-text query passed to ``search_documents``.
        top_k: Number of chunks to retrieve.
        max_len: ``max_length`` handed to the summarizer for each window.
        sub_chunk_size: Window size, in words, used to re-split the retrieved
            text before summarizing.

    Returns:
        The per-window summaries joined with spaces, or the string
        "No relevant documents found." when retrieval returns nothing.
    """
    docs = search_documents(query, top_k=top_k)
    if not docs:
        return "No relevant documents found."

    # Re-split the retrieved text into word windows before summarizing.
    words = " ".join(docs).split()
    sub_chunks = [
        " ".join(words[start:start + sub_chunk_size])
        for start in range(0, len(words), sub_chunk_size)
    ]

    # The UI slider allows max_len down to 50, below the historical floor of
    # 60. Keep 60 when feasible, otherwise use half of max_len so min_length
    # never exceeds max_length.
    min_len = 60 if max_len >= 60 else max(1, max_len // 2)

    summaries = []
    for sub in sub_chunks:
        summary = summarizer(
            sub,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,  # word windows are only a proxy for token counts
        )
        summaries.append(summary[0]["summary_text"])

    return " ".join(summaries)

# ================================
# 8. STREAMLIT INTERFACE
# ================================
st.title("📄 Document Search & Summarization App")
st.write("Type your query to search across AI, ML, and Deep Learning documents.")

# Inputs: query text, how many chunks to retrieve, and the summary length cap.
query = st.text_input("Enter your query:", "What is Artificial Intelligence?")
top_k = st.slider("Number of top results to retrieve:", 1, 10, 3)
summary_length = st.slider("Maximum summary length:", 50, 300, 150)

# Nothing is computed until the button is pressed.
run_clicked = st.button("Search & Summarize")
if run_clicked:
    with st.spinner("Searching and summarizing..."):
        results = search_documents(query, top_k=top_k)
        final_summary = search_and_summarize(query, top_k=top_k, max_len=summary_length)

    # Show a 500-character preview of every hit, then the combined summary.
    st.subheader("🔍 Top Search Results")
    for i, r in enumerate(results, start=1):
        st.markdown(f"**Result {i}:** {r[:500]}...")

    st.subheader("📝 Summary")
    st.write(final_summary)


2025-12-16 13:10:02.806 No runtime found, using MemoryCacheStorageManager


In [None]:
%%writefile app.py
import streamlit as st
import os
import numpy as np
import wikipediaapi
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline

# ================================
# Download Wikipedia documents
# ================================
@st.cache_data
def download_documents(topics):
    """Fetch the full text of each Wikipedia page named in ``topics``.

    Titles for which the page does not exist are skipped, so the returned
    list may be shorter than ``topics``.
    """
    wiki = wikipediaapi.Wikipedia(
        language="en",
        # Includes contact info, matching the user agent the notebook's other
        # cells send.
        user_agent="DocSearchApp/1.0 (contact@example.com)"
    )
    documents = []
    for topic in topics:
        page = wiki.page(topic)
        if page.exists():
            documents.append(page.text)
    return documents

# Wikipedia page titles to index; download_documents() skips any title whose
# page does not exist, so `documents` can hold fewer entries than `topics`.
topics = ["Artificial Intelligence", "Machine Learning", "Deep Learning"]
documents = download_documents(topics)

# ================================
# Chunk documents
# ================================
def chunk_text(text, chunk_size=500):
    """Cut ``text`` into consecutive groups of at most ``chunk_size`` whitespace-separated words."""
    tokens = text.split()
    starts = range(0, len(tokens), chunk_size)
    return list(" ".join(tokens[s:s + chunk_size]) for s in starts)

# Flatten the per-document chunk lists into one corpus-wide list.
chunks = [piece for doc in documents for piece in chunk_text(doc)]

# ================================
# Embeddings
# ================================
@st.cache_resource
def create_embeddings(chunks):
    """Load the all-MiniLM-L6-v2 encoder and embed every chunk.

    Returns:
        ``(model, embeddings)``: the SentenceTransformer instance and the
        result of encoding ``chunks`` with it.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = encoder.encode(chunks, show_progress_bar=True)
    return encoder, vectors

model, embeddings = create_embeddings(chunks)

# ================================
# FAISS Index
# ================================
# Flat L2 index sized to the embedding width; row i of the index is chunks[i].
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
chunk_vectors = np.array(embeddings)
index.add(chunk_vectors)

# ================================
# Search function
# ================================
def search_documents(query, top_k=3):
    """Return the text of the chunks nearest to ``query`` in the FAISS index.

    Args:
        query: Free-text query; encoded with the module-level ``model``.
        top_k: Number of neighbours requested from the module-level ``index``.

    Returns:
        A list of chunk strings ordered as FAISS returned them. It can be
        shorter than ``top_k`` when the index holds fewer vectors.
    """
    query_vec = model.encode([query])
    _distances, indices = index.search(np.array(query_vec), top_k)
    # A -1 id means "no neighbour for this slot"; it must be rejected
    # explicitly because a negative index is still a valid Python list index.
    valid_ids = (idx for idx in indices[0] if 0 <= idx < len(chunks))
    return [chunks[idx] for idx in valid_ids]

# ================================
# Summarizer
# ================================
@st.cache_resource
def load_summarizer():
    """Build the facebook/bart-large-cnn summarization pipeline with device=-1."""
    summarization_pipeline = pipeline(
        "summarization", model="facebook/bart-large-cnn", device=-1
    )
    return summarization_pipeline

# load_summarizer is wrapped in @st.cache_resource, so the pipeline object is
# meant to be shared instead of being rebuilt on every script run.
summarizer = load_summarizer()

def search_and_summarize(query, top_k=3, max_len=150, sub_chunk_size=400):
    """Retrieve the best-matching chunks for ``query`` and summarize them.

    Args:
        query: Free-text query passed to ``search_documents``.
        top_k: Number of chunks to retrieve.
        max_len: ``max_length`` handed to the summarizer for each window.
        sub_chunk_size: Window size, in words, used to re-split the retrieved
            text before summarizing.

    Returns:
        The per-window summaries joined with spaces, or the string
        "No relevant documents found." when retrieval returns nothing.
    """
    docs = search_documents(query, top_k=top_k)
    if not docs:
        return "No relevant documents found."

    # Re-split the retrieved text into word windows before summarizing.
    words = " ".join(docs).split()
    window_starts = range(0, len(words), sub_chunk_size)
    sub_chunks = [" ".join(words[s:s + sub_chunk_size]) for s in window_starts]

    # The UI slider allows max_len down to 50, below the historical floor of
    # 60. Keep 60 when feasible, otherwise use half of max_len so min_length
    # never exceeds max_length.
    min_len = 60 if max_len >= 60 else max(1, max_len // 2)

    summaries = []
    for sub in sub_chunks:
        summary = summarizer(
            sub,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,  # word windows are only a proxy for token counts
        )
        summaries.append(summary[0]["summary_text"])

    return " ".join(summaries)

# ================================
# Streamlit UI
# ================================
st.title("📄 Document Search & Summarization App")
st.write("Type your query to search across AI, ML, and Deep Learning documents.")

# Inputs: query text, how many chunks to retrieve, and the summary length cap.
query = st.text_input("Enter your query:", "What is Artificial Intelligence?")
top_k = st.slider("Number of top results to retrieve:", 1, 10, 3)
summary_length = st.slider("Maximum summary length:", 50, 300, 150)

# Nothing is computed until the button is pressed.
run_clicked = st.button("Search & Summarize")
if run_clicked:
    with st.spinner("Searching and summarizing..."):
        results = search_documents(query, top_k=top_k)
        final_summary = search_and_summarize(query, top_k=top_k, max_len=summary_length)

    # Show a 500-character preview of every hit, then the combined summary.
    st.subheader("🔍 Top Search Results")
    for i, r in enumerate(results, start=1):
        st.markdown(f"**Result {i}:** {r[:500]}...")

    st.subheader("📝 Summary")
    st.write(final_summary)


Overwriting app.py


In [None]:
# 1. Install Streamlit
!pip install -q streamlit

# 2. Run Streamlit in background
# Serves /content/app.py on port 8501, bound to 0.0.0.0; stdout and stderr are
# redirected to /content/logs.txt and the trailing `&` backgrounds the process.
!streamlit run /content/app.py --server.port 8501 --server.address 0.0.0.0 &>/content/logs.txt &

# 3. Create SSH reverse tunnel using Serveo
# Forwards serveo.net's port 80 to the local Streamlit port 8501. This command
# is not backgrounded, so the cell keeps running while the tunnel is open.
# NOTE(review): StrictHostKeyChecking=no skips SSH host-key verification, and
# the forwarded URL exposes the app publicly; confirm both are acceptable.
!ssh -o StrictHostKeyChecking=no -R 80:localhost:8501 serveo.net


[32mForwarding HTTP traffic from https://240c95402454ee248395ba7b16566298.serveousercontent.com
[0m