In [1]:
from datasets import load_dataset, load_from_disk, interleave_datasets
import tqdm
import random
import os

# Load the three source datasets from local disk, in a fixed order.
dataset1, dataset2, dataset3 = (
    load_from_disk(saved_path)
    for saved_path in (
        "../data/open_platypus_justified_2",
        "../data/arxiv-physics-instruct-tune-30k-formatted",
        "../data/theoremqa_justified_formatted_2",
    )
)

# dataset2 is indexed by split name; pull out its 'train' split for later cells.
dataset2_train = dataset2["train"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the first record of dataset1 to inspect its fields.
dataset1[0]

{'question': 'A board game spinner is divided into three parts labeled $A$, $B$  and $C$. The probability of the spinner landing on $A$ is $\\frac{1}{3}$ and the probability of the spinner landing on $B$ is $\\frac{5}{12}$.  What is the probability of the spinner landing on $C$? Express your answer as a common fraction.',
 'choices': None,
 'correct_choice': None,
 'topic': 'math',
 'answer': 'To find the probability of the spinner landing on $C$, I need to subtract the probabilities of the spinner landing on $A$ and $B$ from $1$, since the sum of the probabilities of all possible outcomes is $1$. I can write this as an equation: $P(C) = 1 - P(A) - P(B)$. I know that $P(A) = \\frac{1}{3}$ and $P(B) = \\frac{5}{12}$, so I can plug those values into the equation and simplify. I get: $P(C) = 1 - \\frac{1}{3} - \\frac{5}{12} = \\frac{12}{12} - \\frac{4}{12} - \\frac{5}{12} = \\frac{3}{12}$. I can reduce this fraction by dividing the numerator and denominator by $3$, and I get: $P(C) = \\fr

In [3]:
# Display the first record of dataset2's 'train' split to inspect its fields.
dataset2_train[0]

{'question': 'Which integral prime represents the least weight?',
 'answer': 'The least weight of an integral prime is 2. This is because 2 is the smallest prime number and also happens to be the only even prime number.',
 'choices': None,
 'correct_choice': None,
 'topic': 'physics'}

In [4]:
# Display the first record of dataset3 to inspect its fields.
dataset3[0]

{'question': 'How many ways are there to divide a set of 8 elements into 5 non-empty ordered subsets?',
 'answer': 'First, we need to choose the elements that will be in each subset. Since the subsets are ordered, we need to consider the order in which elements are chosen. We can do this by using a combination formula.\n\nTo divide the 8 elements into 5 subsets, we need to choose 4 dividers from the 7 spaces between the elements (since we need 5 subsets, and there are 8 elements). This can be done in 7 choose 4 ways, which is equal to 7! / (4! * 3!) = 35 ways.\n\nNext, we need to assign the elements to each subset. The first subset can have any number of elements from 1 to 4, the second subset can have any number of elements from 1 to 3, and so on. This can be done by considering the number of ways to partition the remaining elements after placing the dividers.\n\nThus, the total number of ways to divide the set of 8 elements into 5 non-empty ordered subsets is 35 * 4! * 3! * 2! * 1! =

In [5]:
# Sources to interleave. The order matters: any per-dataset probabilities
# passed to interleave_datasets are matched to this list position by position.
datasets_list = [
    dataset1,
    dataset2_train,
    dataset3,
]

In [6]:
# Example sampling weights, one per entry of datasets_list.
# NOTE: these are NOT applied in this cell, because probabilities=None is passed
# below. Replace None with dataset_probabilities to sample with these weights.
dataset_probabilities = [0.5, 0.3, 0.2]

interleaved_dataset = interleave_datasets(
    datasets_list,
    probabilities=None,
    seed=42,  # fixed seed for reproducibility
    stopping_strategy="first_exhausted",  # alternative: "all_exhausted"
)

# Keep at most the first 30k rows; a shorter result is left unchanged.
interleaved_dataset = (
    interleaved_dataset.select(range(30_000))
    if len(interleaved_dataset) > 30_000
    else interleaved_dataset
)

# Persist the combined dataset so later cells can reload it from disk.
interleaved_dataset.save_to_disk("../data/interleaved_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 2241/2241 [00:00<00:00, 98849.86 examples/s] 


In [7]:
# Reload the interleaved dataset from its on-disk location and display the
# first record as a quick check of what was saved.
interleaved_dataset = load_from_disk("../data/interleaved_dataset")
interleaved_dataset[0]

{'question': 'A board game spinner is divided into three parts labeled $A$, $B$  and $C$. The probability of the spinner landing on $A$ is $\\frac{1}{3}$ and the probability of the spinner landing on $B$ is $\\frac{5}{12}$.  What is the probability of the spinner landing on $C$? Express your answer as a common fraction.',
 'choices': None,
 'correct_choice': None,
 'topic': 'math',
 'answer': 'To find the probability of the spinner landing on $C$, I need to subtract the probabilities of the spinner landing on $A$ and $B$ from $1$, since the sum of the probabilities of all possible outcomes is $1$. I can write this as an equation: $P(C) = 1 - P(A) - P(B)$. I know that $P(A) = \\frac{1}{3}$ and $P(B) = \\frac{5}{12}$, so I can plug those values into the equation and simplify. I get: $P(C) = 1 - \\frac{1}{3} - \\frac{5}{12} = \\frac{12}{12} - \\frac{4}{12} - \\frac{5}{12} = \\frac{3}{12}$. I can reduce this fraction by dividing the numerator and denominator by $3$, and I get: $P(C) = \\fr

### If we want to test different dataset probability splits

In [None]:
# Sampling mixes to compare. Each row holds one weight per entry of
# datasets_list, in that list's order.
dataset_probabilities = [
    [0.33, 0.33, 0.34],
    [0.50, 0.30, 0.20],
    [0.20, 0.50, 0.30],
    [0.10, 0.45, 0.45],
    [0.25, 0.25, 0.50],
]

# Directory prefix under which each variation gets its own sub-directory.
base_save_path = "../data/interleaved_variations/"

# Variations are numbered from 1 so the directories are variation_1, variation_2, ...
for variation_number, mix in enumerate(dataset_probabilities, start=1):
    # Build the interleaved dataset for this mix; the seed is fixed so that
    # each variation is reproducible.
    interleaved_dataset = interleave_datasets(
        datasets_list,
        probabilities=mix,
        seed=42,
        stopping_strategy="all_exhausted",
    )

    # Keep at most the first 30k rows; a shorter result is left unchanged.
    interleaved_dataset = (
        interleaved_dataset.select(range(30_000))
        if len(interleaved_dataset) > 30_000
        else interleaved_dataset
    )

    # Write this variation to its own directory.
    interleaved_dataset.save_to_disk(f"{base_save_path}variation_{variation_number}")