In [20]:
from omegaconf import OmegaConf

config = {
    'dataset': 'saier/unarxive_citrec',
    'n_train': 10_000,
    'n_valid': 1_000,
    'n_test': 1_000,
    'max_chars_len': 512,
    'min_chars_len': 128,
    'save_dir': '../../data/raw/unarxive_citrec/'
}
config = OmegaConf.create(config)

In [21]:
from datasets import load_dataset
from tqdm import tqdm

# Load the dataset in streaming mode
dataset = load_dataset(config.dataset, split='train', streaming=True)

def take_n_samples(n: int, split: str, batch_size: int = 250) -> list:
    dataset = load_dataset(config.dataset, split=split, streaming=True)
    samples = []
    bar = tqdm(total=n)
    while len(samples) < n:
        new_samples = dataset.take(batch_size)
        new_samples = list(filter(lambda x: config.min_chars_len <= len(x['text']) <= config.max_chars_len, new_samples))
        samples.extend(new_samples)
        bar.update(len(new_samples))

    return samples

train_samples = take_n_samples(config.n_train, split='train')
valid_samples = take_n_samples(config.n_valid, split='validation')
test_samples = take_n_samples(config.n_test, split='test')

TypeError: take_n_samples() got an unexpected keyword argument 'split'

In [None]:
def extract_texts(samples):
    return [sample['text'] for sample in samples]

train_texts = extract_texts(train_samples)
valid_texts = extract_texts(valid_samples)
test_texts = extract_texts(test_samples)

In [None]:
print(valid_texts[0])
print(test_texts[0])

There is a strand of literature on continuous-action games on networks in which each player takes an action represented by a real value \(x\ge 0\)  [1]}, [2]}. Typically, player \(i\)  maximizes the following quadratic utility function
\(u_i(x_i;{\bf {x}}_{-i}) = \alpha x_i - \frac{1}{2}x_i^2 +\gamma \sum _{j\ne i} \mathcal {A}_{ij}x_ix_j,\) 

There is a strand of literature on continuous-action games on networks in which each player takes an action represented by a real value \(x\ge 0\)  [1]}, [2]}. Typically, player \(i\)  maximizes the following quadratic utility function
\(u_i(x_i;{\bf {x}}_{-i}) = \alpha x_i - \frac{1}{2}x_i^2 +\gamma \sum _{j\ne i} \mathcal {A}_{ij}x_ix_j,\) 



In [None]:
import numpy as np

train_lens = np.array([len(text) for text in train_texts])

# 0.25 quantile, 0.5 quantile, 0.75 quantile
print("0.25, 0.5, 0.75 quantile:", np.quantile(train_lens, [0.25, 0.5, 0.75]))
print("Max len:", np.max(train_lens))
print("Min len:", np.min(train_lens))
print("Example:", train_texts[np.random.randint(0, len(train_texts))])

0.25, 0.5, 0.75 quantile: [289.   382.   458.25]
Max len: 508
Min len: 145
Example: Theorem B (Equivalent version of Beurling's Theorem, [1]}). 
A closed subspace of \(H^{2}\)  is shift-invariant iff it is invariant under multiplication by every bounded analytic function in \(H^{\infty }\) .




In [None]:
import os
import json

for texts, split_name in [
    (train_texts, 'train'),
    (valid_texts, 'valid'),
    (test_texts, 'test')
]:
    path = os.path.join(config.save_dir, split_name + '.json')
    with open(path, 'w') as f:
        json.dump(texts, f, indent=4, ensure_ascii=False)