## Install Dependencies

In [1]:
!pip install -q arxiv requests tqdm pandas fuzzywuzzy python-Levenshtein

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


## Import Libraries and Configure Paths

In [2]:
import os
import re
import hashlib
import random
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import requests
import arxiv
from fuzzywuzzy import fuzz

OUT_DIR = "doculizer_dataset"
TEXT_DIR = os.path.join(OUT_DIR, "texts")
CSV_PATH = os.path.join(OUT_DIR, "dataset.csv")
Path(TEXT_DIR).mkdir(parents=True, exist_ok=True)

TARGET_NLP = 1000
TARGET_NON_NLP = 1000
FUZZY_THRESHOLD = 90

SUBCATEGORIES = {
    "machine_translation": ["machine translation", "wmt", "bleu"],
    "question_answering": ["question answering", "qa", "squad"],
    "summarization": ["summarization", "summary", "rouge"],
    "ner": ["named entity recognition", "ner"],
    "sentiment": ["sentiment", "opinion mining"],
    "dialogue": ["dialogue", "conversation", "chatbot"],
    "llm": ["language model", "transformer", "llm", "gpt"],
}


## Helper Functions

In [3]:
def clean_text(t):
    if not t:
        return ''
    t = t.replace('\n', ' ').replace('\r', ' ')
    t = re.sub(r'[\x00-\x1f\x7f]', ' ', t)
    return re.sub(r'\s+', ' ', t).strip()

def fingerprint(t):
    return hashlib.sha1(t.encode('utf-8', 'ignore')).hexdigest()

def detect_subcategory(t):
    t = t.lower()
    for sub, kws in SUBCATEGORIES.items():
        for kw in kws:
            if kw in t:
                return sub
    return 'other_nlp'

def save_text(pid, cat, text):
    name = re.sub(r'[^a-zA-Z0-9_-]', '_', pid)[:80]
    path = f'{TEXT_DIR}/{name}_{cat}.txt'
    open(path, 'w', encoding='utf8').write(text)
    return path


## Fetch NLP Papers (arXiv)

In [4]:
def fetch_arxiv_nlp(max_results=3000):
    out = []
    search = arxiv.Search(
        query='cs.CL',
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )
    for r in tqdm(list(search.results()), desc='arXiv NLP'):
        abs_ = clean_text(r.summary)
        if len(abs_) < 50:
            continue
        out.append({
            'paper_id': r.entry_id.split('/')[-1],
            'title': clean_text(r.title),
            'abstract': abs_,
            'label': 'nlp_related',
            'source': 'arxiv_cs.CL'
        })
    return out

arxiv_nlp = fetch_arxiv_nlp()
len(arxiv_nlp)


  for r in tqdm(list(search.results()), desc='arXiv NLP'):
arXiv NLP: 100%|██████████| 3000/3000 [00:00<00:00, 26132.14it/s]


2998

## Fetch Non-NLP Papers (arXiv)

In [11]:
def fetch_arxiv_non_nlp(max_results=3000):
    # Use smaller categories to avoid API overload
    queries = [
        'physics.gen-ph', 'physics.optics',
        'math.AG', 'math.AP',
        'cs.CV', 'cs.LG', 'cs.RO', 'stat.ML'
    ]

    out = []

    for q in queries:
        print(f"Fetching from {q} ...")

        search = arxiv.Search(
            query=f"cat:{q}",
            max_results=500,   # small safe batch
            sort_by=arxiv.SortCriterion.Relevance
        )

        try:
            results = list(search.results())
        except Exception as e:
            print("Error fetching", q, " → ", str(e))
            continue

        for r in tqdm(results, desc=f"Non-NLP {q}"):
            abs_ = clean_text(r.summary)
            if len(abs_) < 50:
                continue

            low = abs_.lower()
            if any(kw in low for kw in ['nlp','natural language','translation','linguistic']):
                continue

            out.append({
                'paper_id': r.entry_id.split('/')[-1],
                'title': clean_text(r.title),
                'abstract': abs_,
                'label': 'not_nlp',
                'source': q
            })

            # stop when reaching target
            if len(out) >= max_results:
                return out

    return out

arxiv_non_nlp = fetch_arxiv_non_nlp()
len(arxiv_non_nlp)


Fetching from physics.gen-ph ...


  results = list(search.results())
Non-NLP physics.gen-ph: 100%|██████████| 500/500 [00:00<00:00, 17420.23it/s]


Fetching from physics.optics ...


Non-NLP physics.optics: 100%|██████████| 500/500 [00:00<00:00, 15737.53it/s]


Fetching from math.AG ...


Non-NLP math.AG: 100%|██████████| 500/500 [00:00<00:00, 21651.37it/s]


Fetching from math.AP ...


Non-NLP math.AP: 100%|██████████| 500/500 [00:00<00:00, 22781.26it/s]


Fetching from cs.CV ...


Non-NLP cs.CV: 100%|██████████| 500/500 [00:00<00:00, 15846.22it/s]


Fetching from cs.LG ...


Non-NLP cs.LG: 100%|██████████| 500/500 [00:00<00:00, 15886.31it/s]


Fetching from cs.RO ...


Non-NLP cs.RO:  16%|█▌        | 80/500 [00:00<00:00, 16424.90it/s]


3000

## Merge Raw Papers

In [12]:
raw = arxiv_nlp + arxiv_non_nlp
len(raw)

5998

## Deduplicate Papers

In [13]:
def dedupe(papers, threshold=90):
    out = []
    seen = set()
    for p in tqdm(papers, desc='Dedup'):
        t = (p['title'] + ' ' + p['abstract']).strip()
        fp = fingerprint(t)
        if fp in seen:
            continue
        dup = False
        for u in out:
            score = fuzz.token_sort_ratio(
                t, (u['title'] + ' ' + u['abstract']).strip()
            )
            if score >= threshold:
                dup = True
                break
        if not dup:
            seen.add(fp)
            out.append(p)
    return out

unique = dedupe(raw)
len(unique)


Dedup: 100%|██████████| 5998/5998 [35:36<00:00,  2.81it/s]


5568

## Balance Dataset to 1200 NLP / 1200 Non-NLP

In [14]:
nlp = [p for p in unique if p['label'] == 'nlp_related'][:1200]
non = [p for p in unique if p['label'] == 'not_nlp'][:1200]

dataset = nlp + non
random.shuffle(dataset)

len(dataset)


2400

## Save Text Files and CSV

In [15]:
rows = []
for p in tqdm(dataset, desc='Saving'):
    full = p['title'] + '\n\n' + p['abstract']
    cat = detect_subcategory(full) if p['label'] == 'nlp_related' else 'non_nlp'
    path = save_text(p['paper_id'], cat, full)
    rows.append({
        'paper_id': p['paper_id'],
        'title': p['title'],
        'abstract': p['abstract'],
        'filepath': path,
        'label': p['label'],
        'subcategory': cat,
        'source': p['source']
    })

df = pd.DataFrame(rows)
df.to_csv(CSV_PATH, index=False)
df.head()


Saving: 100%|██████████| 2400/2400 [00:00<00:00, 15724.36it/s]


Unnamed: 0,paper_id,title,abstract,filepath,label,subcategory,source
0,1901.03154v2,Broadband photoacoustic spectroscopy of $^{14}...,We report a photoacoustic spectroscopy setup w...,doculizer_dataset/texts/1901_03154v2_non_nlp.txt,not_nlp,non_nlp,physics.optics
1,1810.02218v4,Duality theories for p-primary etale cohomolog...,This paper is Part III of the series of work b...,doculizer_dataset/texts/1810_02218v4_non_nlp.txt,not_nlp,non_nlp,math.AG
2,1808.06728v2,Spontaneously Broken Particle-Hole Symmetry in...,We consider particle-hole symmetric photonic g...,doculizer_dataset/texts/1808_06728v2_non_nlp.txt,not_nlp,non_nlp,physics.optics
3,1809.05402v1,"Quarks, Hadrons, and Emergent Spacetime",It is argued that important information on the...,doculizer_dataset/texts/1809_05402v1_non_nlp.txt,not_nlp,non_nlp,physics.gen-ph
4,9502017v2,Deterministic Consistency Checking of LP Const...,We provide a constraint based computational mo...,doculizer_dataset/texts/9502017v2_other_nlp.txt,nlp_related,other_nlp,arxiv_cs.CL


## Dataset Statistics

In [16]:
print(df['label'].value_counts())
print(df['subcategory'].value_counts())


label
not_nlp        1200
nlp_related    1200
Name: count, dtype: int64
subcategory
non_nlp                1200
other_nlp               602
ner                     266
llm                     115
machine_translation      90
summarization            37
sentiment                34
dialogue                 29
question_answering       27
Name: count, dtype: int64


## Zip Folder for Download

In [17]:
import shutil

zip_path = shutil.make_archive('doculizer_dataset', 'zip', OUT_DIR)
zip_path


'/content/doculizer_dataset.zip'