##  Setup

In [2]:
!pip install wikipedia-api faiss-cpu sentence-transformers


Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15383 sha256=d5f25ada2f0476b651e731c6f42350c0a3435beb34f4820e4e2cba62175c2554
  Stored in directory: /root/.cache/pip/wheels/33/3c/79/b36253689d838af4a0539782853ac3cc38a83a6591ad570dde
Successfully built wikipedia-api
Installing collected packages: faiss-cpu, wikipedia-api
Successfully installed faiss-cpu-1.12.

## Mount Google Drive
to download LLama model locally

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import libraries

In [4]:
import wikipediaapi
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np


## Fetch Wikipedia Articles

In [5]:
import wikipediaapi

wiki = wikipediaapi.Wikipedia(
    user_agent="DocuQueryRAG/1.0",
    language="en"
)

topics = [
    "Artificial intelligence",
    "Machine learning",
    "Neural network",
    "Quantum computing",
    "Cybersecurity",
    "Remote sensing",
    "Oceanography",
    "Climate change",
    "Renewable energy",
    "Space exploration"
]

docs = {}
for topic in topics:
    page = wiki.page(topic)
    docs[topic] = page.text
    print(f"Fetched: {topic} ({len(page.text.split())} words)")


Fetched: Artificial intelligence (13450 words)
Fetched: Machine learning (8494 words)
Fetched: Neural network (614 words)
Fetched: Quantum computing (7163 words)
Fetched: Cybersecurity (14629 words)
Fetched: Remote sensing (4160 words)
Fetched: Oceanography (3772 words)
Fetched: Climate change (9642 words)
Fetched: Renewable energy (8397 words)
Fetched: Space exploration (6861 words)


## Chunk the documents

In [6]:
def chunk_text(text, chunk_size=300, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i+chunk_size])
        if chunk:
            chunks.append(chunk)
    return chunks

corpus = []
metadata = []

# store counts per document
chunk_counts = {}

for topic, text in docs.items():
    chunks = chunk_text(text)
    corpus.extend(chunks)
    metadata.extend([topic] * len(chunks))
    chunk_counts[topic] = len(chunks)   # save count for this doc
    print(f"{topic}: {len(chunks)} chunks")

print(f"\nTotal chunks across all docs: {len(corpus)}")


Artificial intelligence: 54 chunks
Machine learning: 34 chunks
Neural network: 3 chunks
Quantum computing: 29 chunks
Cybersecurity: 59 chunks
Remote sensing: 17 chunks
Oceanography: 16 chunks
Climate change: 39 chunks
Renewable energy: 34 chunks
Space exploration: 28 chunks

Total chunks across all docs: 313


## Build embeddings + FAISS index

In [7]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(corpus, convert_to_numpy=True)

# Build FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"Index size: {index.ntotal}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Index size: 313


## Define retrieval function
This function first retrieves the top candidate chunks from the FAISS index based on semantic similarity to the query.
Then, a Cross-Encoder model re-ranks these candidates by computing similarity scores between each query–chunk pair, ensuring that only the most relevant chunks are passed to the LLM for answer generation.

In [8]:
from sentence_transformers import CrossEncoder
# Load reranker
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def retrieve_and_rerank(query, top_k=5):
    # Step 1: get top_k results from FAISS (but fetch more to rerank)
    query_emb = embedder.encode([query])   # use "embedder", not embedding_model
    D, I = index.search(query_emb, top_k*3)  # get 3x more candidates

    candidates = [(corpus[idx], metadata[idx]) for idx in I[0]]

    # Step 2: rerank using CrossEncoder
    pairs = [(query, doc) for doc, _ in candidates]
    scores = reranker.predict(pairs)

    # Step 3: sort by score
    reranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)

    # Step 4: return top_k reranked chunks
    return [r[0] for r in reranked[:top_k]]


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

## -- Add an LLM for generation --

using local model like LLaMA from HuggingFace



## Set model cache path to Drive

In [9]:
import shutil, os

# Paths: Hugging Face + Torch default caches
hf_default = "/root/.cache/huggingface"
torch_default = "/root/.cache/torch"
custom_cache = "/content/drive/MyDrive/ML_projects/HF_models_cache"

for path in [hf_default, torch_default, custom_cache]:
    if os.path.exists(path):
        shutil.rmtree(path)
        print(f"🗑️ Deleted cache at: {path}")
    else:
        print(f"⚠️ No cache at: {path}")

# Recreate your custom cache folder
os.makedirs(custom_cache, exist_ok=True)
os.environ["TRANSFORMERS_CACHE"] = custom_cache
os.environ["HF_HOME"] = custom_cache
print("📂 Fresh empty cache created at:", custom_cache)


🗑️ Deleted cache at: /root/.cache/huggingface
⚠️ No cache at: /root/.cache/torch
🗑️ Deleted cache at: /content/drive/MyDrive/ML_projects/HF_models_cache
📂 Fresh empty cache created at: /content/drive/MyDrive/ML_projects/HF_models_cache


## Log into Hugging Face (needed for LLaMA)

LLaMA is gated → we need to accept license on Hugging Face hub and log in with a token.

In [11]:
!pip install huggingface_hub
from huggingface_hub import login

login(token="your-HF-token")  # get this from https://huggingface.co/settings/tokens




## Load LLaMA 3 model

Now we can load LLaMA model directly from Drive

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

model_name = "meta-llama/Meta-Llama-3-8B"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

llama_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    do_sample=False
)



tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


## Define RAG function
This function provides the retrieved context, query, and an instruction prompt to the LLaMA model, which then generates a concise and context-aware answer.

In [13]:
def answer_with_context_llama(query, top_k=3):
    # Step 1: retrieve relevant chunks
    results = retrieve_and_rerank(query, top_k=3)
    context = "\n\n".join([r[0] for r in results])


    # Step 2: build prompt

    prompt = f"""
You are a LLM assistant.
Use the context below to answer the question.

Context:
{context}
Question: {query}

Answer directly and concisely:
"""

    # Step 3: run through LLaMA
    output = llama_pipe(prompt)[0]["generated_text"]

    # Step 4: clean answer (remove the prompt prefix)
    answer = output.split("Answer directly and concisely:")[-1].strip()
    return answer


## Compare baseline LLM vs. RAG

In [14]:
import textwrap
query = "What are the applications of remote sensing?"

print("=== Baseline (LLM only - LLaMA) ===")
baseline = llama_pipe(query, max_new_tokens=200)[0]["generated_text"]
print(textwrap.fill(baseline, width=100))

#############
print("\n=== RAG ( Retrieval + LLM – LLaMA) ===")
rag_answer = answer_with_context_llama(query, top_k=3)
print(textwrap.fill(rag_answer, width=100))   # wrap lines to 100 chars






=== Baseline (LLM only - LLaMA) ===
What are the applications of remote sensing? Remote sensing is used to monitor the environment,
study the Earth’s surface, and map the Earth’s surface. It is also used to monitor the Earth’s
atmosphere, study the Earth’s climate, and map the Earth’s surface. What are the applications of
remote sensing? Remote sensing is a technique used to collect information about an object or area
without physically touching it. It is used in a variety of applications, including environmental
monitoring, agriculture, and disaster response. What are the applications of remote sensing? Remote
sensing is a technique used to collect information about an object or area without physically
touching it. It is used in a variety of applications, including environmental monitoring,
agriculture, and disaster response. What are the applications of remote sensing? Remote sensing is a
technique used to collect information about an object or area without physically touching it. It