# Enviroment Setup

In [1]:
!pip install arxiv pymupdf sentence-transformers transformers accelerate


Collecting arxiv
  Downloading arxiv-2.3.1-py3-none-any.whl.metadata (5.2 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.3.1-py3-none-any.whl (11 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel 

# Download NLP Papers from arXiv (Controlled Corpus)




In [2]:
import arxiv, os, re
from tqdm import tqdm

BASE_DIR = "arxiv_pdfs"
NLP_DIR = os.path.join(BASE_DIR, "nlp_related")
os.makedirs(NLP_DIR, exist_ok=True)

def safe_filename(s):
    return re.sub(r"[^a-zA-Z0-9_-]+", "_", s)[:80]

def download_pdf(result, out_dir):
    filename = safe_filename(result.entry_id.split("/")[-1]) + ".pdf"
    try:
        result.download_pdf(dirpath=out_dir, filename=filename)
        return os.path.join(out_dir, filename)
    except:
        return None

def fetch_nlp_pdfs(limit=20):
    results = arxiv.Search(
        query="cs.CL",
        max_results=limit * 3,
        sort_by=arxiv.SortCriterion.Relevance
    )

    papers = []

    for r in tqdm(results.results(), desc="Downloading NLP PDFs"):
        if len(r.summary) < 50:
            continue

        path = download_pdf(r, NLP_DIR)
        if not path:
            continue

        papers.append({
            "paper_id": r.entry_id.split("/")[-1],
            "title": r.title,
            "abstract": r.summary,
            "pdf_path": path,
            "source": "arxiv",
            "category": "cs.CL"
        })

        if len(papers) >= limit:
            break

    return papers
pdf_paths = fetch_nlp_pdfs(20)
len(pdf_paths)



  for r in tqdm(results.results(), desc="Downloading NLP PDFs"):
Downloading NLP PDFs: 19it [00:04,  3.93it/s]


20

# Load SciBERT Classifier (Gate)

In [3]:
!pip install -q gdown

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import zipfile
import gdown
import os

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Handle Google Drive Download
# Extract the ID from your URL: 1-uf26t8kUTh60O16q8eVGdaeD9oOHlzY
drive_id = "1-uf26t8kUTh60O16q8eVGdaeD9oOHlzY"
zip_output = "scibert_model.zip"
extract_path = "./scibert_nlp_classifier"

# Download using gdown (handles large file warnings automatically)
if not os.path.exists(zip_output):
    url = f'https://drive.google.com/uc?id={drive_id}'
    gdown.download(url, zip_output, quiet=False)

# 2. Extract the ZIP
if not os.path.exists(extract_path):
    with zipfile.ZipFile(zip_output, "r") as z:
        z.extractall(extract_path)
    print(f"Model extracted to {extract_path}")

# 3. Load Model and Tokenizer
# Note: Ensure the files (config.json, pytorch_model.bin) are directly in extract_path
tokenizer_scibert = AutoTokenizer.from_pretrained(extract_path)
model_scibert = AutoModelForSequenceClassification.from_pretrained(extract_path).to(DEVICE)
model_scibert.eval()

label_map = {0: "not_nlp", 1: "nlp_related"}

print(f"Model successfully loaded on {DEVICE}")

Downloading...
From (original): https://drive.google.com/uc?id=1-uf26t8kUTh60O16q8eVGdaeD9oOHlzY
From (redirected): https://drive.google.com/uc?id=1-uf26t8kUTh60O16q8eVGdaeD9oOHlzY&confirm=t&uuid=9f1e6b48-f65d-4417-bc14-545754eef922
To: /content/scibert_model.zip
100%|██████████| 409M/409M [00:02<00:00, 144MB/s]


Model extracted to ./scibert_nlp_classifier
Model successfully loaded on cuda


# PDF Text Extraction (No NLP Preprocessing)

In [5]:
import fitz  # PyMuPDF

def extract_pdf_text(path):
    doc = fitz.open(path)
    pages = []
    for page in doc:
        txt = page.get_text()
        if txt:
            pages.append(txt)
    return "\n".join(pages)


def classify_with_scibert(text):
    inputs = tokenizer_scibert(
        text,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        logits = model_scibert(**inputs).logits

    return label_map[int(torch.argmax(logits))]


# Section-aware chunking

In [6]:
def extract_structured_sections(text):
    """
    Extract scientifically meaningful sections from a paper.
    This avoids naive keyword filtering.
    """
    sections = []
    current = []
    header = None

    for line in text.split("\n"):
        if re.match(r"^\s*(abstract|introduction|background|model|method|formalism|discussion)", line.lower()):
            if current:
                sections.append(("other", "\n".join(current)))
                current = []
            header = line.strip()
        current.append(line)

    if current:
        sections.append((header if header else "other", "\n".join(current)))

    return sections


# Apply SciBERT Gate

In [10]:
accepted_docs = {}


for doc in tqdm(pdf_paths, desc="SciBERT filtering"):

    file_path = doc['pdf_path']


    text = extract_pdf_text(file_path)

    if len(text) < 500:
        continue

    label = classify_with_scibert(text)

    if label == "nlp_related":

        title = doc['title']



        accepted_docs[title] = text
len(accepted_docs)

SciBERT filtering: 100%|██████████| 20/20 [00:06<00:00,  2.92it/s]


20

# Chunking

# Semantic Chunking

In [11]:
import nltk
def chunk_text(text, max_words=200):
    sentences = nltk.sent_tokenize(text)
    chunks, current = [], []

    for s in sentences:
        current.append(s)
        if len(" ".join(current).split()) >= max_words:
            chunks.append(" ".join(current))
            current = []

    if current:
        chunks.append(" ".join(current))

    return chunks


# Build SBERT Index (MPNet)

In [15]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

retriever = SentenceTransformer("all-mpnet-base-v2")

chunks = []
meta = []

for title, text in accepted_docs.items():
    for i, ch in enumerate(chunk_text(text)):
        chunks.append(ch)
        meta.append({"title": title, "chunk_id": i})

chunk_embeddings = retriever.encode(chunks, convert_to_tensor=True)


# Diversified Retrieval

In [17]:
def retrieve(query, top_k=3, max_per_doc=1):
    q_emb = retriever.encode(query, convert_to_tensor=True)
    scores = cos_sim(q_emb, chunk_embeddings)[0]
    ranked = torch.argsort(scores, descending=True)

    selected, per_doc = [], {}

    for idx in ranked:
        idx = int(idx)
        title = meta[idx]["title"]
        if per_doc.get(title, 0) >= max_per_doc:
            continue

        selected.append({
            "text": chunks[idx],
            "title": title,
            "chunk_id": meta[idx]["chunk_id"],
            "score": float(scores[idx])
        })

        per_doc[title] = per_doc.get(title, 0) + 1
        if len(selected) >= top_k:
            break

    return selected


# Utilities - croppping the text before it enters the LLM

In [18]:


def truncate_text(text, tokenizer, max_tokens=120):
    """
    Truncate text to a fixed number of tokens
    to avoid GPU/CPU memory explosion during generation.
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    tokens = tokens[:max_tokens]
    return tokenizer.decode(tokens)


# Evidence-based RAG

In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM

llm_name = "mistralai/Mistral-7B-Instruct-v0.2"
tok = AutoTokenizer.from_pretrained(llm_name)
llm = AutoModelForCausalLM.from_pretrained(
    llm_name,
    device_map="auto",
    torch_dtype=torch.float16
)




tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



In [20]:
def build_prompt(question, evidences):
    ctx = ""
    for i, e in enumerate(evidences, 1):
        ctx += (
            f"[{i}] {e['title']}:\n"
            f"{truncate_text(e['text'], tok, max_tokens=120)}\n\n"
        )

    return f"""
You are an academic assistant.
Answer the question strictly using the provided evidence.
Do not use prior knowledge.
also You are an expert in compositional distributional semantics.

Answer the question using ONLY the provided evidence.
Focus on:
- how sentence meaning is computed
- the role of linear maps and inner products
- why tensor dimensionality does NOT grow with sentence length


Evidence:
{ctx}

Question:
{question}

Answer concisely and cite sources like [1], [2].
"""


# Final RAG answer

In [21]:
from transformers import TextIteratorStreamer
import threading

def rag_answer(question):
    evidences = retrieve(question, top_k=3, max_per_doc=1)
    prompt = build_prompt(question, evidences)

    inputs = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to(llm.device)

    streamer = TextIteratorStreamer(tok, skip_special_tokens=True)

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=150,
        do_sample=False,
        streamer=streamer
    )

    thread = threading.Thread(target=llm.generate, kwargs=generation_kwargs)
    thread.start()

    print("\nANSWER (streaming):\n")
    output = ""
    for token in streamer:
        print(token, end="", flush=True)
        output += token

    return output, evidences


# Test the RAG

In [22]:
question = """
How does the proposed compositional distributional model represent sentence meaning,
and why does it avoid the dimensionality explosion problem of tensor-based approaches?
"""


In [23]:
answer, evidences = rag_answer(question)

print("\nEVIDENCE USED:\n")
for i, e in enumerate(evidences, 1):
    print(f"[{i}] {e['title']} | chunk {e['chunk_id']} | score={e['score']:.4f}")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



ANSWER (streaming):


You are an academic assistant.
Answer the question strictly using the provided evidence.
Do not use prior knowledge.
also You are an expert in compositional distributional semantics.

Answer the question using ONLY the provided evidence.
Focus on:
- how sentence meaning is computed
- the role of linear maps and inner products
- why tensor dimensionality does NOT grow with sentence length


Evidence:
[1] Concrete Sentence Spaces for Compositional Distributional Models of Meaning:
arXiv:1101.0309v1  [cs.CL]  31 Dec 2010
Concrete Sentence Spaces for Compositional Distributional
Models of Meaning
Edward Grefenstette∗, Mehrnoosh Sadrzadeh∗, Stephen Clark†, Bob Coecke∗, Stephen Pulman∗
∗Oxford University Computing Laboratory, †University of Cambridge Computer Laboratory
firstname.lastname@comlab.ox.ac.uk, step

[2] Experimental Support for a Categorical Compositional Distributional Model of Meaning:
We
provide a general algorithm for building (or indeed
learning) these