In [1]:
# 📦 Step 1: Install Dependencies
!pip install -q faiss-cpu sentence-transformers transformers PyMuPDF

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# 📂 Step 2: Upload PDFs
from google.colab import files
uploaded = files.upload()

Saving 1706.03762v7.pdf to 1706.03762v7.pdf
Saving 2005.11401v4.pdf to 2005.11401v4.pdf
Saving 2005.14165v4.pdf to 2005.14165v4.pdf


In [3]:
# 📥 Step 3: Load and Extract Text from PDFs
import fitz  # PyMuPDF

def load_pdfs(uploaded_files):
    texts = []
    for fname in uploaded_files:
        doc = fitz.open(fname)
        for page_num, page in enumerate(doc):
            texts.append({
                "page": page_num + 1,
                "text": page.get_text()
            })
    return texts

pdf_texts = load_pdfs(uploaded.keys())
print(f"Loaded {len(pdf_texts)} pages.")

Loaded 109 pages.


In [4]:
# 🧱 Step 4: Chunk the Text
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

def chunk_texts(texts):
    chunks = []
    for entry in texts:
        words = entry["text"].split()
        for i in range(0, len(words), CHUNK_SIZE - CHUNK_OVERLAP):
            chunk = " ".join(words[i:i + CHUNK_SIZE])
            if chunk:
                chunks.append({"page": entry["page"], "chunk": chunk})
    return chunks

chunks = chunk_texts(pdf_texts)
print(f"Generated {len(chunks)} chunks.")

Generated 179 chunks.


In [5]:
# 📐 Step 5: Embed the Chunks
from sentence_transformers import SentenceTransformer
import numpy as np

embedder = SentenceTransformer('all-MiniLM-L6-v2')
texts = [c["chunk"] for c in chunks]
embeddings = embedder.encode(texts, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
# 🔎 Step 6: Build FAISS Index
import faiss

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [16]:
# 💬 Step 7: Answer Questions using Retrieved Context
from transformers import pipeline

qa_generator = pipeline("text2text-generation", model="google/flan-t5-base")

def retrieve(query, k=3):
    q_embedding = embedder.encode([query]).astype("float32")
    D, I = index.search(q_embedding, k)
    return [chunks[i] for i in I[0]]

def generate_answer(query):
    top_chunks = retrieve(query)
    context = "\n".join([f"[Page {c['page']}] {c['chunk']}" for c in top_chunks])
    prompt = f"Answer the question based on the context below.\nContext: {context}\nQuestion: {query}"
    result = qa_generator(prompt, max_length=1024)[0]['generated_text']
    return result, top_chunks

Device set to use cpu


In [17]:
# 🧪 Step 8: Ask a Question
question = {"Explain about Positional Encoding"
}
answer, sources = generate_answer(question)

print("✅ Answer:\n", answer)
print("\n📚 Sources:")
for src in sources:
    print(f"- Page {src['page']}")

Token indices sequence length is longer than the specified maximum sequence length for this model (1541 > 512). Running this sequence through the model will result in indexing errors
Both `max_new_tokens` (=256) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


✅ Answer:
 Positional Encoding Since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed. There are many choices of positional encodings, learned and fixed [9]. In this work, we use sine and cosine functions of different frequencies: PE(pos,2i) = sin(pos/100002i/dmodel) PE(pos,2i+1) = cos(pos/100002i/dmodel) where pos is the position and i is the dimension. That is, each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression from 2 to 10000  2. We chose this function because we hypothe

📚 Sources:
- Page 6
- Page 3
- Page 2
