In [None]:
!pip -q install sentence-transformers pypdf scikit-learn tqdm


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/329.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m225.3/329.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.1/329.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, re
import numpy as np
from tqdm import tqdm
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity



In [None]:
# Directory whose *.pdf files are indexed; passed to build_index() below.
PDF_DIR = "papers"

In [None]:
# Model identifier handed to SentenceTransformer() inside build_index().
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Maximum characters per chunk; build_index() passes it to extract_pdf_chunks()
# as `max_chars` (overriding that function's own default of 900).
MAX_CHARS = 350
# Characters shared between consecutive chunks of the same page; passed to
# extract_pdf_chunks() as `overlap` (overriding its default of 120).
OVERLAP = 80
# NOTE(review): not referenced by the cells visible here -- the example
# queries pass top_k=5 explicitly. Wire it in or remove it.
TOP_K = 10
# Number of chunk texts sent to model.encode() per call in build_index().
BATCH_SIZE = 64

In [None]:
def clean_text(s: str) -> str:
    """Flatten extracted page text into one whitespace-normalized line.

    A hyphen immediately followed by a newline is dropped together with the
    newline, so a word that was split across two lines is joined back up.
    Every remaining newline becomes a space, each run of whitespace collapses
    to a single space, and leading/trailing whitespace is stripped.
    """
    joined = s.replace("-\n", "").replace("\n", " ")
    return re.sub(r"\s+", " ", joined).strip()

def extract_pdf_chunks(pdf_path: str, max_chars=900, overlap=120):
    """Split the text of each page of a PDF into overlapping character windows.

    Each page's extracted text is normalized with ``clean_text`` and then cut
    into windows of at most ``max_chars`` characters, where every window after
    the first starts ``overlap`` characters before the end of the previous
    one. Windows never span two pages, and pages that yield no text are
    skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        max_chars: Maximum length of a chunk in characters; must be positive.
        overlap: Characters shared by consecutive chunks of the same page;
            must satisfy ``0 <= overlap < max_chars``.

    Returns:
        A list of dicts with keys ``chunk_id``, ``pdf_file``, ``page`` and
        ``text``. ``page`` is 1-based. ``chunk_id`` has the form
        ``"<file name>::p<page>::c<n>"`` where ``n`` counts chunks across the
        whole document, not per page.

    Raises:
        ValueError: If ``max_chars`` or ``overlap`` is out of range.
    """
    # Validate before touching the file: with overlap >= max_chars (or a
    # non-positive max_chars) the window start below would never advance and
    # the loop would spin forever on any page longer than one window.
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap < max_chars:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_chars, "
            f"got overlap={overlap}, max_chars={max_chars}"
        )

    reader = PdfReader(pdf_path)
    pdf_file = os.path.basename(pdf_path)
    chunks = []

    for page_num, page in enumerate(reader.pages, start=1):
        # extract_text() may give nothing for a page; treat that as empty text.
        text = clean_text(page.extract_text() or "")
        if not text.strip():
            continue

        start = 0
        while start < len(text):
            end = min(start + max_chars, len(text))
            chunk_text = text[start:end].strip()
            if chunk_text:
                chunks.append({
                    "chunk_id": f"{pdf_file}::p{page_num}::c{len(chunks)}",
                    "pdf_file": pdf_file,
                    "page": page_num,
                    "text": chunk_text
                })
            if end == len(text):
                break
            # overlap < max_chars guarantees this is strictly past `start`.
            start = end - overlap

    return chunks

In [None]:
def build_index(pdf_dir: str):
    """Chunk every PDF in a directory and embed the chunks.

    Files in ``pdf_dir`` whose name ends with ".pdf" (case-insensitive) are
    processed in sorted path order. Each is split with ``extract_pdf_chunks``
    using the module-level ``MAX_CHARS`` and ``OVERLAP``, and the chunk texts
    are encoded ``BATCH_SIZE`` at a time with the ``MODEL_NAME`` model.
    Progress is reported on standard output and through a tqdm bar.

    Args:
        pdf_dir: Directory to scan (not recursive).

    Returns:
        A tuple ``(all_chunks, embeddings, model)``: the list of chunk dicts,
        the per-batch ``model.encode`` outputs stacked into one float32 array,
        and the loaded SentenceTransformer model.

    Raises:
        FileNotFoundError: If the directory contains no PDF files.
        ValueError: If no text could be extracted from any of the PDFs.
    """
    pdf_files = sorted(
        os.path.join(pdf_dir, name)
        for name in os.listdir(pdf_dir)
        if name.lower().endswith(".pdf")
    )
    if len(pdf_files) == 0:
        raise FileNotFoundError(f"No PDF files found in: {pdf_dir}")

    print("Found PDFs:")
    for path in pdf_files:
        print(" -", os.path.basename(path))

    # Stage 1: gather the chunks of all documents into one flat list.
    all_chunks = [
        chunk
        for path in pdf_files
        for chunk in extract_pdf_chunks(path, MAX_CHARS, OVERLAP)
    ]
    if not all_chunks:
        raise ValueError("No text extracted. If these are scanned PDFs, you may need OCR.")

    print(f"\nTotal chunks: {len(all_chunks)}")

    # Stage 2: embed the chunk texts batch by batch. Embeddings are
    # normalized at encode time, so cosine similarity equals the dot product.
    model = SentenceTransformer(MODEL_NAME)
    texts = [chunk["text"] for chunk in all_chunks]
    batch_starts = range(0, len(texts), BATCH_SIZE)
    embs = [
        model.encode(texts[lo:lo + BATCH_SIZE], normalize_embeddings=True)
        for lo in tqdm(batch_starts, desc="Embedding chunks")
    ]

    embeddings = np.vstack(embs).astype(np.float32)
    return all_chunks, embeddings, model

In [None]:
def search(query: str, chunks, embeddings, model, top_k=5):
    """Return the chunks most similar to a query, best match first.

    The query is encoded with ``model`` and scored against every row of
    ``embeddings`` by cosine similarity.

    Args:
        query: Free-text query.
        chunks: Chunk dicts as returned by ``build_index``; ``chunks[j]`` must
            describe row ``j`` of ``embeddings``.
        embeddings: Array of chunk embeddings.
        model: Object whose ``encode`` method produces the query embedding.
        top_k: Maximum number of hits to return. Values <= 0 yield an empty
            list.

    Returns:
        A list of at most ``top_k`` dicts with keys ``rank`` (1-based),
        ``score``, ``pdf_file``, ``page``, ``chunk_id`` and ``preview`` (the
        first 300 characters of the chunk text, followed by "..." when the
        text was truncated).
    """
    # A negative top_k would turn the slice below into "everything except the
    # last |top_k| hits"; treat any non-positive request as "no results".
    if top_k <= 0:
        return []

    q_emb = model.encode([query], normalize_embeddings=True).astype(np.float32)
    scores = cosine_similarity(q_emb, embeddings)[0]
    # Negate so that argsort's ascending order puts the highest score first.
    idx = np.argsort(-scores)[:top_k]

    results = []
    for rank, j in enumerate(idx, start=1):
        c = chunks[j]
        text = c["text"]
        results.append({
            "rank": rank,
            "score": float(scores[j]),
            "pdf_file": c["pdf_file"],
            "page": c["page"],
            "chunk_id": c["chunk_id"],
            "preview": text[:300] + ("..." if len(text) > 300 else "")
        })
    return results

In [9]:
# Build the index once; `chunks`, `embeddings` and `model` are reused by the
# query cells that follow.
chunks, embeddings, model = build_index(PDF_DIR)

print(
    "\nIndex ready. Try a query like:",
    'results = search("your query", chunks, embeddings, model, top_k=5)',
    sep="\n",
)

Found PDFs:
 - 2020.03.05.20031088v1.full.pdf
 - 2020.03.09.20032896v1.full.pdf
 - 2020.03.11.20034215v1.full (1).pdf
 - 2020.03.11.20034215v1.full.pdf
 - 2020.03.11.20034546v1.full.pdf
 - 2020.03.14.20035659v1.full.pdf
 - 2020.03.28.20045997v2.full.pdf
 - 2020.03.28.20046045v1.full.pdf
 - 556951v5.full.pdf
 - Chest_CT_for_early_detection_and_management_of_cor.pdf
 - Effects of age-targeted sequestration for COVID-19.pdf
 - Evaluation_of_Group_Testing_for_SARS-CoV-2_RNA.pdf
 - Far-UVC_light_A_new_tool_to_control_the_spread_of_.pdf
 - High_Prevalence_of_Strongyloidiasis_in_Spain_A_Hos.pdf
 - Why_estimating_population-based_case_fatality_rate.pdf
 - gene201456.pdf
 - main.pdf
 - publichealth-2020-2-e19464.pdf

Total chunks: 3265


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding chunks: 100%|██████████| 52/52 [03:58<00:00,  4.59s/it]


Index ready. Try a query like:
results = search("your query", chunks, embeddings, model, top_k=5)





In [10]:
# Example query: print rank, score, source location and a text preview for
# each of the five best-matching chunks.
results = search("How to protect from COVID", chunks, embeddings, model, top_k=5)
for r in results:
    print(
        f"\n#{r['rank']}  score={r['score']:.4f}",
        f"Source: {r['pdf_file']}  page={r['page']}  id={r['chunk_id']}",
        r["preview"],
        sep="\n",
    )


#1  score=0.5998
Source: 2020.03.05.20031088v1.full.pdf  page=3  id=2020.03.05.20031088v1.full.pdf::p3::c28
h resources and individual liberty, it is vital to assess under what conditions quarantine can effectively control COVID-19, and among these under what conditions it is substantially more effective than less restrictive approaches such as active monitoring, particularly given uncertainty in essentia...

#2  score=0.5836
Source: 2020.03.11.20034546v1.full.pdf  page=6  id=2020.03.11.20034546v1.full.pdf::p6::c28
nd to beat high risk of developing COVID-19, even when infection prevention measures were in place, including usage of personal protective equipment (PPE: eye protection/face shield, respiratory protection, isolation gowns, and gloves), hand hygiene, and patient placement in negative-pressure isolat...

#3  score=0.5644
Source: 2020.03.05.20031088v1.full.pdf  page=23  id=2020.03.05.20031088v1.full.pdf::p23::c182
varied. Added value of this study As COVID-19 continues to spr

In [11]:
# Second example query, reported in the same format: rank, score, source
# location and a text preview for each of the five best-matching chunks.
results = search("Do masks prevent the spread of COVID?", chunks, embeddings, model, top_k=5)
for r in results:
    print(
        f"\n#{r['rank']}  score={r['score']:.4f}",
        f"Source: {r['pdf_file']}  page={r['page']}  id={r['chunk_id']}",
        r["preview"],
        sep="\n",
    )


#1  score=0.7811
Source: main.pdf  page=5  id=main.pdf::p5::c68
direct evidence exists to support the argument for the public wearing masks in the Covid-19 pandemic’’. 8 Howard et al. (2020) also review available medical evidence and conclude that ‘‘mask wearing reduces the transmissibility per contact by reducing transmission of infected droplets in both labora...

#2  score=0.7574
Source: main.pdf  page=6  id=main.pdf::p6::c93
masks to reduce the emission of droplets. Chu et al. (2020) conduct a meta-analysis of observational studies on transmission of the viruses that cause COVID-19 and related diseases and find the effectiveness of mask use for reducing transmission. 10 Whether wearing masks creates a false sense of sec...

#3  score=0.7442
Source: main.pdf  page=6  id=main.pdf::p6::c76
ce masks in preventing viral respiratory infections in non-hospital and non-household settings, finding that face masks decreased infections across all five studies they reviewed. 10 Given the lack