In [None]:
import json
import pathlib
from itertools import chain
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt

from arxiv2text import arxiv_to_text
from arxiv.taxonomy import definitions

# Working directories, all relative to the directory the notebook runs from.
path_root = pathlib.Path(".")
path_dataset = path_root / "dataset"
path_database = path_root / "database"
for directory in (path_dataset, path_database):
    directory.mkdir(exist_ok=True)

# Keep only the active arXiv categories whose archive prefix is "cs".
CATEGORIES = {
    cat: meta
    for cat, meta in definitions.CATEGORIES_ACTIVE.items()
    if cat.partition(".")[0] == "cs"
}

# Two-way lookup between a category identifier and its position in CATEGORIES.
idx2cat = dict(enumerate(CATEGORIES))
cat2idx = {cat: idx for idx, cat in idx2cat.items()}

## Build Database

First, we need to discard all metadata that is unrelated to Computer Science papers. This step is taken to restrict the size of the dataset, making it more manageable. By doing so, we also limit the number of categories in the classification task, enabling it to comfortably fit into the LLM prompt without muddling the context. If your prompt contains numerous categories, you can refer to [this notebook](https://colab.research.google.com/drive/1CpsOiLiLYKeGrhmq579_FmtGsD5uZ3Qe); dspy-arxiv heavily relies on the techniques discussed in this notebook.

In [None]:
# Copy every metadata record that carries at least one CS category from the raw
# dump (path_root / "arxiv.json") into the working database
# (path_database / "arxiv.json"), one JSON document per line.

# Built once: the set of CS category identifiers does not change while scanning.
cs_categories = set(CATEGORIES)

# The input is opened first so that a missing raw dump raises before the
# existing database file is truncated. JSON text is UTF-8 by specification, so
# the encoding is pinned instead of depending on the platform locale.
with open(path_root / "arxiv.json", encoding="utf-8") as infile:
    with open(path_database / "arxiv.json", "w", encoding="utf-8") as outfile:
        for line in infile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            paper = json.loads(line)
            # "categories" is a single space-separated string of identifiers.
            if not cs_categories.isdisjoint(paper["categories"].split(" ")):
                outfile.write(line)

## Build Dataset

In the following, we will perform a brief data analysis to better understand the data and determine which papers to select. The goal is to choose papers that belong to a single category and also include a few papers that have multiple categories.

In [None]:
def reader(path_file):
    """Lazily read a JSON-lines metadata file, keeping only id and categories.

    Args:
        path_file: Path of a file holding one JSON document per line. Each
            document must have an "id" key and a "categories" key whose value
            is a space-separated string.

    Yields:
        dict: ``{"id": <paper id>, "categories": <list of category strings>}``
        for every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a document lacks "id" or "categories".
    """
    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(path_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines (such as a trailing one) carry no record.
                continue
            paper = json.loads(line)
            yield {
                "id": paper["id"],
                "categories": paper["categories"].split(" "),
            }


# One row per paper, indexed by arXiv id, with the list of its categories.
df = pd.DataFrame(reader(path_database / "arxiv.json")).set_index("id")
df.head()

In [None]:
# Seed every CS category with a zero count so that each one appears in the
# chart (in CATEGORIES order) without adding a spurious +1 to the real counts.
occurences = Counter({cat: 0 for cat in CATEGORIES})
occurences.update(
    s for s in chain.from_iterable(df.categories) if s.startswith("cs")
)
# Fails if the data contains a "cs*" category that is not in CATEGORIES.
assert set(occurences.keys()) == set(CATEGORIES.keys())

fig, ax = plt.subplots(figsize=(10, 10))
labels = list(occurences.keys())
counts = list(occurences.values())
ax.barh(labels, counts, height=0.8)
# The vertical axis lists category identifiers, not numbers.
ax.set_ylabel("Category")
ax.set_xlabel("Frequency")
ax.set_xscale("log")
ax.set_title("Number of papers")
plt.show()

In [None]:
# Series mapping paper id -> its only category, for papers with exactly one.
is_single = df.categories.apply(len) == 1
df_single_cat = df[is_single].categories.apply(lambda cats: cats[0])

# Seed every CS category with a zero count so that each one appears in the
# chart (in CATEGORIES order) without adding a spurious +1 to the real counts.
occurences = Counter({cat: 0 for cat in CATEGORIES})
occurences.update(s for s in df_single_cat if s.startswith("cs"))
# Fails if the data contains a "cs*" category that is not in CATEGORIES.
assert set(occurences.keys()) == set(CATEGORIES.keys())
# The category cs.IT always comes with math.IT, so it has no
# single-category papers and its count here is zero.

fig, ax = plt.subplots(figsize=(10, 10))
labels = list(occurences.keys())
counts = list(occurences.values())
ax.barh(labels, counts, height=0.8)
# The vertical axis lists category identifiers, not numbers.
ax.set_ylabel("Category")
ax.set_xlabel("Frequency")
ax.set_xscale("log")
ax.set_title("Number of paper with single category")
plt.show()

We create three splits:

- `trainset` used for "training" the pipeline
- `valset` used to evaluate the performance during training
- `testset` used to evaluate the performance after training

In [None]:
# The three splits, each a set of arXiv paper ids.
trainset, valset, testset = set(), set(), set()
split_sets = (trainset, valset, testset)

# Single category: draw three papers per category, one for each split.
for cat in CATEGORIES:
    if cat == "cs.IT":
        # The category cs.IT always comes with math.IT, so the exact pair
        # ["cs.IT", "math.IT"] is used in place of a single-category paper.
        is_it_pair = df.categories.apply(lambda cats: cats == ["cs.IT", "math.IT"])
        candidates = df[is_it_pair]
    else:
        candidates = df_single_cat[df_single_cat == cat]
    picked = candidates.sample(n=3, random_state=1).index
    # picked[0] -> trainset, picked[1] -> valset, picked[2] -> testset
    for split, paper_id in zip(split_sets, picked):
        split.add(paper_id)

# Multiple categories: top each split up to 50 papers using papers that list
# more than two categories.
# Original author's note: random_state 1 samples PDFs that are not withdrawn.
n_multi = 50 - len(CATEGORIES)
has_many_cats = df.categories.apply(len) > 2
multi_ids = df[has_many_cats].sample(n=3 * n_multi, random_state=1).index
trainset.update(multi_ids[:n_multi])
valset.update(multi_ids[n_multi : n_multi * 2])
testset.update(multi_ids[n_multi * 2 :])

# Every selected paper id, regardless of split.
dataset = set().union(trainset, valset, testset)

# Sanity checks: the splits share no paper and all have the same size.
assert trainset.isdisjoint(valset)
assert trainset.isdisjoint(testset)
assert valset.isdisjoint(testset)
assert len(trainset) == len(valset) == len(testset)

Now, we want to download the PDFs of the selected papers (50 per split) and extract the full text body. 
For this task, we utilize [arxiv2text](https://github.com/dsdanielpark/arxiv2text).

> In the simple example of the pipeline proposed in *features.ipynb*, we only utilize the title and abstract of the paper, without using the full text body. However, we have decided to include the full PDF text in the dataset to facilitate future experimentation with more complex pipelines that can process documents in chunks.

In [None]:
# Download the PDF text of every selected paper and store the metadata plus
# text as one JSON file per paper, under a directory named after its split.

# Output directory name -> ids belonging to that split.
split_dirs = {"trainset": trainset, "valset": valset, "testset": testset}
for split_name in split_dirs:
    (path_dataset / split_name).mkdir(exist_ok=True)

# Ids not yet seen in the database; once empty the scan can stop instead of
# reading the rest of the file.
remaining = set(dataset)

i = 0
# JSON text is UTF-8 by specification; do not rely on the locale default.
with open(path_database / "arxiv.json", encoding="utf-8") as file:
    for line in file:
        if not remaining:
            break
        paper = json.loads(line.strip())
        paper_id = paper["id"]
        if paper_id not in remaining:
            continue
        remaining.discard(paper_id)

        # Exactly one split matches when the splits are disjoint.
        split_name = next(
            name for name, ids in split_dirs.items() if paper_id in ids
        )

        i += 1
        # Old-style ids contain "/", which cannot be part of a file name.
        path_paper = path_dataset / split_name / f'{paper_id.replace("/", "-")}.json'
        url = f"https://arxiv.org/pdf/{paper_id}.pdf"
        print(f"[{i:>3}/{len(dataset)}] Processing {path_paper.stem}")
        # Already-downloaded papers are skipped, so the cell can be re-run.
        if not path_paper.exists():
            paper["text"] = arxiv_to_text(url)
            # Serialise fully before touching the file so that a failure does
            # not leave a partial file that the exists() check would skip.
            path_paper.write_text(json.dumps(paper, indent=4), encoding="utf-8")