# Creating new percentage datasets

In [None]:
# Load the `dotenv` IPython extension and run its `%dotenv` magic.
# NOTE(review): presumably this reads a local .env file so that
# HUGGINGFACE_API_KEY is available to os.getenv() below — confirm.
%load_ext dotenv
%dotenv

In [None]:
import huggingface_hub
import os
from datasets import load_dataset, DatasetDict, concatenate_datasets

In [None]:
# Authenticate with the Hugging Face Hub. The token is read from the
# HUGGINGFACE_API_KEY environment variable; if logging in with it fails,
# fall back to a plain `login()` call with no token.
hf_token = os.getenv("HUGGINGFACE_API_KEY")
try:
    huggingface_hub.login(token=hf_token)
except Exception:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell instead of
    # silently triggering a second login attempt.
    huggingface_hub.login()

## Get base datasets

In [None]:
# Base corpora from the Hub. Judging by the repo names, the first holds papers
# whose primary category is hep-th and the second those whose primary category
# is hep-ph or gr-qc.
ds_hep_th = load_dataset(path="LLMsForHepth/hep-th_primary")
ds_hep_ph_gr_qc = load_dataset(path="LLMsForHepth/hep-ph_gr-qc_primary")

In [None]:
# Row count of every hep-th split, keyed by split name.
sizes = {split: dataset.num_rows for split, dataset in ds_hep_th.items()}
# 15% of each split's row count; int() truncates the fractional part.
sizes_15 = {split: int(0.15 * n_rows) for split, n_rows in sizes.items()}

## Get 70% and 85% of the hep-th dataset

In [None]:
# Keep the leading rows of each hep-th split: everything except one 15% chunk
# (the "85%" set) and everything except two 15% chunks (the "70%" set).
ds_hep_85 = DatasetDict({
    split: ds_hep_th[split].select(range(n_rows - sizes_15[split]))
    for split, n_rows in sizes.items()
})
ds_hep_70 = DatasetDict({
    split: ds_hep_th[split].select(range(n_rows - 2 * sizes_15[split]))
    for split, n_rows in sizes.items()
})

## Get 15% each of the hep-ph and gr-qc categories

In [None]:
# Split the combined dataset by category: keep only rows whose 'categories'
# string begins with 'hep-ph' or with 'gr-qc', respectively.
ds_hep_ph = ds_hep_ph_gr_qc.filter(lambda row: row['categories'].startswith('hep-ph'))
ds_gr_qc = ds_hep_ph_gr_qc.filter(lambda row: row['categories'].startswith('gr-qc'))

In [None]:
# Take the first sizes_15[split] rows of each filtered split, so every slice
# has as many rows as 15% of the matching hep-th split.
# NOTE(review): assumes each filtered split has at least that many rows.
ds_hep_ph_15 = DatasetDict({
    split: ds_hep_ph[split].select(range(n_take))
    for split, n_take in sizes_15.items()
})
ds_gr_qc_15 = DatasetDict({
    split: ds_gr_qc[split].select(range(n_take))
    for split, n_take in sizes_15.items()
})

## Concatenate datasets

### Create hep_th_85_gr_qc_15

In [None]:
names = ['train', 'test', 'validation']

# For each split, append the 15% gr-qc slice to the 85% hep-th slice.
ds_hep_th_85_gr_qc_15 = DatasetDict({
    name: concatenate_datasets([ds_hep_85[name], ds_gr_qc_15[name]])
    for name in names
})

# Shuffle with a fixed seed so the two sources are interleaved reproducibly,
# then flatten the indices mapping that shuffle() leaves behind.
ds_hep_th_85_gr_qc_15 = ds_hep_th_85_gr_qc_15.shuffle(seed=42).flatten_indices()

In [None]:
# Sanity check: each split of the mixed dataset must have exactly as many rows
# as the same split of the original hep-th dataset.
for split, reference in ds_hep_th.items():
    assert reference.num_rows == ds_hep_th_85_gr_qc_15[split].num_rows

In [None]:
# Publish the 85% hep-th / 15% gr-qc mix under this Hub repo id.
ds_hep_th_85_gr_qc_15.push_to_hub('LLMsForHepth/hep-th_85_gr-qc_15')

### Create hep_th_85_hep_ph_15

In [None]:
names = ['train', 'test', 'validation']

# For each split, append the 15% hep-ph slice to the 85% hep-th slice.
ds_hep_th_85_hep_ph_15 = DatasetDict({
    name: concatenate_datasets([ds_hep_85[name], ds_hep_ph_15[name]])
    for name in names
})

# Shuffle with a fixed seed so the two sources are interleaved reproducibly,
# then flatten the indices mapping that shuffle() leaves behind.
ds_hep_th_85_hep_ph_15 = ds_hep_th_85_hep_ph_15.shuffle(seed=42).flatten_indices()

In [None]:
# Sanity check: each split of the mixed dataset must have exactly as many rows
# as the same split of the original hep-th dataset.
for split, reference in ds_hep_th.items():
    assert reference.num_rows == ds_hep_th_85_hep_ph_15[split].num_rows

In [None]:
# Publish the 85% hep-th / 15% hep-ph mix under this Hub repo id.
ds_hep_th_85_hep_ph_15.push_to_hub('LLMsForHepth/hep-th_85_hep-ph_15')

### Create hep_th_70_gr_qc_15_hep_ph_15

In [None]:
names = ['train', 'test', 'validation']

# For each split, append both 15% slices (gr-qc, then hep-ph) to the 70%
# hep-th slice.
ds_hep_th_70_gr_qc_15_hep_ph_15 = DatasetDict({
    name: concatenate_datasets([ds_hep_70[name], ds_gr_qc_15[name], ds_hep_ph_15[name]])
    for name in names
})

# Shuffle with a fixed seed so the three sources are interleaved reproducibly,
# then flatten the indices mapping that shuffle() leaves behind.
ds_hep_th_70_gr_qc_15_hep_ph_15 = (
    ds_hep_th_70_gr_qc_15_hep_ph_15.shuffle(seed=42).flatten_indices()
)

In [None]:
# Sanity check: each split of the mixed dataset must have exactly as many rows
# as the same split of the original hep-th dataset.
for split, reference in ds_hep_th.items():
    assert reference.num_rows == ds_hep_th_70_gr_qc_15_hep_ph_15[split].num_rows

In [None]:
# Publish the 70% hep-th / 15% gr-qc / 15% hep-ph mix under this Hub repo id.
ds_hep_th_70_gr_qc_15_hep_ph_15.push_to_hub('LLMsForHepth/hep-th_70_gr-qc_15_hep-ph_15')

## Logout

In [None]:
# End the Hub session that was opened by the login cell at the top.
huggingface_hub.logout()