# Create new combined datasets

In [None]:
import os

import pandas as pd
import huggingface_hub
from datasets import load_dataset, DatasetDict, concatenate_datasets

%load_ext dotenv
%dotenv

In [None]:
# Authenticate against the Hugging Face Hub.
# The token is read from the environment (populated by the dotenv magics
# in the import cell); if logging in with it fails, fall back to the
# no-argument login call.
hf_token = os.getenv("HUGGINGFACE_API_KEY")
try:
    huggingface_hub.login(token=hf_token)
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit are not swallowed and turned into a second prompt.
    huggingface_hub.login()

## Get basic datasets

In [None]:
# Hub repo IDs of the five source datasets. The names ds_1..ds_5 are
# rebound to the loaded datasets in the next cell.
ds_1, ds_2, ds_3, ds_4, ds_5 = (
    "LLMsForHepth/hep-th_primary",
    "LLMsForHepth/gr-qc_primary",
    "LLMsForHepth/hep-ph_primary",
    "LLMsForHepth/q-bio_primary",
    "LLMsForHepth/cs_primary_200k",
)

In [None]:
# Swap each repo ID for the dataset it names.
# All five loads complete before any name is rebound, so a failed download
# leaves ds_1..ds_5 as repo-ID strings and this cell can simply be re-run.
# NOTE: after a successful run the names hold datasets, so the repo-ID cell
# above must be re-executed before running this cell again.
ds_1, ds_2, ds_3, ds_4, ds_5 = [
    load_dataset(repo_id) for repo_id in (ds_1, ds_2, ds_3, ds_4, ds_5)
]

## Create hep-th + hep-ph and hep-th + gr-qc datasets

### Concatenate datasets

In [None]:
# Split names are taken from hep-th; gr-qc and hep-ph are indexed with the
# same names, so they must provide every one of these splits.
names = ds_1.keys()

# hep-th followed by gr-qc, split by split
ds_hep_th_gr_qc = DatasetDict(
    {name: concatenate_datasets([ds_1[name], ds_2[name]]) for name in names}
)

# hep-th followed by hep-ph, split by split
ds_hep_th_hep_ph = DatasetDict(
    {name: concatenate_datasets([ds_1[name], ds_3[name]]) for name in names}
)

In [None]:
# Sanity check: show the row counts of the hep-th + gr-qc dataset
ds_hep_th_gr_qc.num_rows

In [None]:
# Sanity check: show the row counts of the hep-th + hep-ph dataset
ds_hep_th_hep_ph.num_rows

### Reshuffle the data in each split

In [None]:
# Shuffle with a fixed seed so the order is reproducible, then flatten the
# indices left behind by the shuffle.
ds_hep_th_gr_qc = ds_hep_th_gr_qc.shuffle(seed=42).flatten_indices()

In [None]:
# Shuffle with a fixed seed so the order is reproducible, then flatten the
# indices left behind by the shuffle.
ds_hep_th_hep_ph = ds_hep_th_hep_ph.shuffle(seed=42).flatten_indices()

### Push to Huggingface

In [None]:
# Make sure the public dataset repo exists, then upload exactly once.
# `exist_ok=True` turns repo creation into a no-op when the repo is already
# there, so no catch-all `except:` is needed. A genuine upload failure
# (auth, network, ...) now surfaces directly instead of triggering a
# create-and-retry that could mask it.
huggingface_hub.create_repo(repo_id='LLMsForHepth/hep-th_gr-qc_primary',
                            repo_type="dataset",
                            private=False,
                            exist_ok=True)
ds_hep_th_gr_qc.push_to_hub('LLMsForHepth/hep-th_gr-qc_primary')

In [None]:
# Make sure the public dataset repo exists, then upload exactly once.
# `exist_ok=True` turns repo creation into a no-op when the repo is already
# there, so no catch-all `except:` is needed. A genuine upload failure
# (auth, network, ...) now surfaces directly instead of triggering a
# create-and-retry that could mask it.
huggingface_hub.create_repo(repo_id='LLMsForHepth/hep-th_hep-ph_primary',
                            repo_type="dataset",
                            private=False,
                            exist_ok=True)
ds_hep_th_hep_ph.push_to_hub('LLMsForHepth/hep-th_hep-ph_primary')

## Create a gr-qc + hep-ph combination with the same size as hep-th

In [None]:
# Total number of rows, summed over all splits, for each source dataset
size_hep_th = sum(part.num_rows for part in ds_1.values())
size_gr_qc = sum(part.num_rows for part in ds_2.values())
size_hep_ph = sum(part.num_rows for part in ds_3.values())

In [None]:
# Ratio of the hep-th row count to the combined gr-qc + hep-ph row count,
# i.e. the fraction of gr-qc + hep-ph to keep to end up at hep-th's size.
prop = size_hep_th / (size_gr_qc + size_hep_ph)
prop

In [None]:
# gr-qc: keep the fraction `prop` of every split (rounded down), drawn
# from a seeded shuffle so the sample is reproducible.
gr_qc_samp = DatasetDict()
for split, gr_qc_split in ds_2.items():
    n_gr_qc = int(prop * gr_qc_split.num_rows)
    gr_qc_samp[split] = gr_qc_split.shuffle(seed=42).select(range(n_gr_qc))

# hep-ph: take however many rows are still missing to reach the size of
# the matching hep-th split.
hep_ph_samp = DatasetDict()
for split, hep_ph_split in ds_3.items():
    n_hep_ph = ds_1[split].num_rows - gr_qc_samp[split].num_rows
    hep_ph_samp[split] = hep_ph_split.shuffle(seed=42).select(range(n_hep_ph))

In [None]:
# Print the combined sample size per split; each should equal the size of
# the corresponding hep-th split.
for split in ds_2:
    combined_rows = gr_qc_samp[split].num_rows + hep_ph_samp[split].num_rows
    print(f'Split {split} has size {combined_rows}')

In [None]:
# Join the gr-qc and hep-ph samples split by split (gr-qc rows first)
ds_gr_qc_hep_ph_small = DatasetDict(
    {
        split: concatenate_datasets([gr_qc_part, hep_ph_samp[split]])
        for split, gr_qc_part in gr_qc_samp.items()
    }
)

In [None]:
# Mix the gr-qc and hep-ph rows with a seeded shuffle, then flatten the
# indices left behind by the shuffle.
ds_gr_qc_hep_ph_small = ds_gr_qc_hep_ph_small.shuffle(seed=42).flatten_indices()

In [None]:
# Push to the Hugging Face Hub.
# Make sure the public dataset repo exists, then upload exactly once.
# `exist_ok=True` turns repo creation into a no-op when the repo is already
# there, so no catch-all `except:` is needed. A genuine upload failure
# (auth, network, ...) now surfaces directly instead of triggering a
# create-and-retry that could mask it.
huggingface_hub.create_repo(repo_id='LLMsForHepth/gr-qc_hep-ph_small',
                            repo_type="dataset",
                            private=False,
                            exist_ok=True)
ds_gr_qc_hep_ph_small.push_to_hub('LLMsForHepth/gr-qc_hep-ph_small')

## Create hep-th + q-bio + cs dataset

In [None]:
# The new dataset should end up the same size as hep-th + gr-qc + hep-ph.
# hep-th itself is included unchanged later on, so what q-bio + cs have to
# supply per split is the gr-qc + hep-ph row count computed here.
wanted_sizes = {
    split: ds_2[split].num_rows + ds_3[split].num_rows
    for split in ds_1
}

In [None]:
# Rows to draw from the cs dataset per split: the target size minus what
# q-bio already contributes.
cs_needed = {
    split: target - ds_4[split].num_rows
    for split, target in wanted_sizes.items()
}
cs_needed

In [None]:
# Draw the required number of cs rows for each split, from a seeded
# shuffle so the sample is reproducible.
# The loop is driven by `cs_needed` (keyed by the hep-th splits) rather
# than by the cs dataset's own splits: those are the splits the combined
# dataset is built from, and an extra split in cs no longer causes a
# KeyError on the `cs_needed` lookup.
cs_samp = DatasetDict()
for split, n_rows in cs_needed.items():
    cs_samp[split] = ds_5[split].shuffle(seed=42).select(range(n_rows))

In [None]:
# Stack hep-th, q-bio and the cs sample split by split, in that order
ds_hep_th_qbio_cs = DatasetDict(
    {
        split: concatenate_datasets([hep_th_part, ds_4[split], cs_samp[split]])
        for split, hep_th_part in ds_1.items()
    }
)

In [None]:
# Mix the three sources with a seeded shuffle, then flatten the indices
# left behind by the shuffle.
ds_hep_th_qbio_cs = ds_hep_th_qbio_cs.shuffle(seed=42).flatten_indices()

In [None]:
# Push to the Hugging Face Hub.
# Make sure the public dataset repo exists, then upload exactly once.
# `exist_ok=True` turns repo creation into a no-op when the repo is already
# there, so no catch-all `except:` is needed. A genuine upload failure
# (auth, network, ...) now surfaces directly instead of triggering a
# create-and-retry that could mask it.
huggingface_hub.create_repo(repo_id='LLMsForHepth/hep-th_qbio_cs',
                            repo_type="dataset",
                            private=False,
                            exist_ok=True)
ds_hep_th_qbio_cs.push_to_hub('LLMsForHepth/hep-th_qbio_cs')

## Logout

In [None]:
# All uploads are done: end the Hugging Face session opened at the top of
# the notebook.
huggingface_hub.logout()