In this notebook we will curate a math dataset available on Hugging Face: https://huggingface.co/datasets/ajibawa-2023/Maths-College

In [None]:
# Save the dataset to a local file
import datasets as ds

# Fetch the source dataset "ajibawa-2023/Maths-College" via datasets.load_dataset.
data = ds.load_dataset("ajibawa-2023/Maths-College")
# Keep a local copy under ./MathDataset; the next code cell reads it back
# with load_from_disk.
data.save_to_disk("MathDataset")

Now we will load the dataset and make some small changes to the instructions

In [None]:
import datasets as ds
# Reload the raw copy that was written to ./MathDataset by the download cell.
dts = ds.load_from_disk("MathDataset")
def edit_instruction(example):
    """Split a row's combined instruction into a fixed instruction plus context.

    The boilerplate "suited for college students" prompt is removed from
    ``example['instruction']``; whatever remains (minus at most one leading
    newline) is stored under ``example['context']``, and
    ``example['instruction']`` is overwritten with the generic prompt.

    The row is modified in place and the same object is returned.
    """
    legacy_prompt = "Write an educational piece suited for college students related to the following text snippet:"
    generic_prompt = "Write an educational piece related to the following text snippet:"

    snippet = example['instruction'].replace(legacy_prompt, '')
    # Only a single leading newline is dropped; any further ones are kept.
    example['context'] = snippet[1:] if snippet.startswith('\n') else snippet
    example['instruction'] = generic_prompt
    return example
# Apply edit_instruction to every row, producing the instruction/context layout.
new_dts = dts.map(edit_instruction)
# Store the reformatted dataset locally under ./MathInstructDataset.
new_dts.save_to_disk("MathInstructDataset")

Sweet, now we have a dataset in the correct format

This edited dataset is now on huggingface [Math-instruct-dataset](https://huggingface.co/datasets/patrickjmcbride/math-instruct-dataset)

It has a 75/25 train/test split

In [None]:
# Load the published dataset "patrickjmcbride/math-instruct-dataset".
# This cell has no import of its own: `ds` must already be bound by an earlier cell.
data = ds.load_dataset("patrickjmcbride/math-instruct-dataset")
# Save it under ./MathInstructDataset, the same path the previous code cell wrote to.
data.save_to_disk("MathInstructDataset")

Now we're going to create a `text` column that is the question, context, and answer combined into one string

In [None]:
import datasets as ds

# Load the published dataset. The original chained assignment
# (`all_data = ds.data = ...`) also set a `data` attribute on the imported
# `datasets` module, which nothing in the notebook reads, so it is dropped.
all_data = ds.load_dataset("patrickjmcbride/math-instruct-dataset")

def generate_full_text(entry):
    """Render one dataset row as a single instruction-tuning prompt string.

    Args:
        entry: Mapping with the string fields ``"instruction"``,
            ``"context"`` and ``"output"``.

    Returns:
        The preamble followed by ``### Instruction:``, ``### Input:`` and
        ``### Response:`` sections, each filled from the matching field.
        The response is inserted verbatim; the unbalanced opening ``"`` that
        used to precede it has been removed.
    """
    return (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request.\n\n"
        f"### Instruction:\n{entry['instruction']}\n\n"
        f"### Input:\n{entry['context']}\n\n"
        f"### Response:\n{entry['output']}"
    )
# Give both splits a "text" column holding the fully rendered prompt.
for split_name in ("train", "test"):
    split_rows = all_data[split_name]
    all_data[split_name] = split_rows.map(lambda row: {"text": generate_full_text(row)})

# Persist the augmented splits locally.
all_data.save_to_disk("MathInstructDataset")


In [None]:
# Read the saved splits back and sanity-check their columns and sizes.
data = ds.load_from_disk("MathInstructDataset")
print(data.column_names)
for label, split_name in (("Test: ", "test"), ("Train: ", "train")):
    print(label, len(data[split_name]))

In [None]:

# Set the HF_TOKEN environment variable for the upload.
# NOTE(review): `api_token` looks like a placeholder; substitute a real token
# at run time and never commit one to the notebook.
%env HF_TOKEN=api_token
# Reload the locally saved dataset.
data = ds.load_from_disk("MathInstructDataset")
# push the dataset to the hub under the repo name "math-instruct-dataset"
data.push_to_hub("math-instruct-dataset")



Now we need to get an idea of how many tokens are in each row

In [None]:
# Load the dataset
from datasets import load_from_disk, concatenate_datasets

# Read the train/test splits back from local storage.
data = load_from_disk("MathInstructDataset")

# Merge both splits into one dataset: train rows first, then test rows.
train_data, test_data = data["train"], data["test"]
concatenated_data = concatenate_datasets([train_data, test_data])

# Total number of rows across both splits.
print(len(concatenated_data))


In [None]:
import concurrent.futures
from transformers import AutoTokenizer
from datasets import concatenate_datasets, load_from_disk

# Read the train/test splits back from local storage.
data = load_from_disk("MathInstructDataset")

# Token statistics should cover every row, so merge train and test
# (train rows first, then test rows).
train_data, test_data = data["train"], data["test"]
concatenated_data = concatenate_datasets([train_data, test_data])

# Tokenizer used to measure the length of each row's "text" field.
tokenizer = AutoTokenizer.from_pretrained("failspy/Meta-Llama-3-8B-Instruct-abliterated-v3")

def tokenize_example(example):
    """Return the number of input ids the notebook-level `tokenizer` produces for ``example["text"]``."""
    encoded = tokenizer(example["text"])
    token_ids = encoded["input_ids"]
    return len(token_ids)

# Tokenize every row on a thread pool. Executor.map yields results in input
# order, so results[i] is the token count of concatenated_data[i].
num_threads = 12  # Adjust the number of threads based on your CPU

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(executor.map(tokenize_example, concatenated_data))

# Print a summary instead of the whole list: one integer per row floods the
# cell output and bloats the saved notebook.
print("rows:", len(results))
if results:
    print(
        "min / mean / max tokens:",
        min(results),
        sum(results) / len(results),
        max(results),
    )

In [None]:
import matplotlib.pyplot as plt
# Histogram of per-row token counts, with a title and axis labels so the
# figure can be read on its own. plt.show() keeps the hist() return value
# from being echoed as cell output.
fig, ax = plt.subplots()
ax.hist(results, bins=10)
ax.set(title="Token count per example", xlabel="Tokens in text", ylabel="Number of examples")
plt.show()

In [None]:
# Rows shorter than 1024 tokens.
count_less_than_1024 = sum(1 for n_tokens in results if n_tokens < 1024)
# Rows with at least 1024 but fewer than 1536 tokens.
count = sum(1 for n_tokens in results if 1024 <= n_tokens < 1536)
total_rows = len(results)

print(count)
# Share of all rows that falls into each of the two ranges, in percent.
print("percent in range 1024-1536: ", count / total_rows * 100)
print("percent less than 1024:     ", count_less_than_1024 / total_rows * 100)

From this we can see that we can create 2 datasets: one containing rows shorter than 1024 tokens, and one containing rows of at least 1024 but fewer than 1536 tokens.



In [None]:
# Read the train/test splits back from local storage.
data = load_from_disk("MathInstructDataset")

# Merge both splits into one flat dataset: train rows first, then test rows.
train_data, test_data = data["train"], data["test"]
concatenated_data = concatenate_datasets([train_data, test_data])

# Write the flat dataset to the same directory it was just loaded from.
# NOTE(review): after this, "MathInstructDataset" no longer holds only the
# train/test layout, so earlier cells that index data["train"] may fail if
# re-run. Confirm this is intended, or save to a separate path.
concatenated_data.save_to_disk("MathInstructDataset")

Now we will create a new dataset with 3 splits, small, medium, and large

We will use the following token lengths for each split:
- small: [min, 1024)
- medium: [1024, 1536)
- large: [1536, max]

In [None]:
import concurrent.futures
from transformers import AutoTokenizer
from datasets import concatenate_datasets, load_from_disk

# Load the dataset stored under ./MathInstructDataset.
# NOTE(review): the code below iterates `data` row by row, so this presumably
# expects the flat (concatenated) dataset saved by the previous code cell
# rather than a train/test dict. Confirm.
data = load_from_disk("MathInstructDataset")

# Load the tokenizer used to count tokens in each row's "text" field.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

def tokenize_example(example):
    """Return the number of input ids the notebook-level `tokenizer` produces for ``example["text"]``."""
    encoded = tokenizer(example["text"])
    token_ids = encoded["input_ids"]
    return len(token_ids)

# Tokenize every row on a thread pool. Executor.map yields results in input
# order, so results[i] is the token count of data[i]. The unused `dic_sizes`
# list has been removed.
num_threads = 16  # Adjust the number of threads based on your CPU

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(executor.map(tokenize_example, data))

# Spot-check: token count of the first row.
print(results[0])

In [None]:
from datasets import Dataset, DatasetDict

# Load the dataset

import json

# Columns copied into each bin, in the order they appear in the JSON files.
COLUMNS = ("instruction", "context", "output", "text")

# One column-oriented dict per bin: column name -> list of values.
small_data = {column: [] for column in COLUMNS}
medium_data = {column: [] for column in COLUMNS}
large_data = {column: [] for column in COLUMNS}

# results[i] must be the token count of data[i]. A stale `results` from
# another run would silently put rows in the wrong bin, so fail loudly.
if len(results) != len(data):
    raise ValueError(
        f"token counts ({len(results)}) and dataset rows ({len(data)}) are out of sync; "
        "re-run the tokenization cell"
    )

# Pair each row with its token count directly instead of keeping a manual index.
for example, num_tokens in zip(data, results):
    if num_tokens < 1024:
        bucket = small_data      # [min, 1024)
    elif num_tokens < 1536:
        bucket = medium_data     # [1024, 1536)
    else:
        bucket = large_data      # [1536, max]
    for column in COLUMNS:
        bucket[column].append(example[column])

# Write each bin to its own JSON file; later cells read these files back one
# at a time.
for filename, bucket in (
    ("small_data.json", small_data),
    ("medium_data.json", medium_data),
    ("large_data.json", large_data),
):
    with open(filename, "w") as f:
        json.dump(bucket, f)

In [None]:
import datasets as ds
import json 

The following cells are run one at a time since each one requires a lot of memory

In [None]:
# This will take about 15GB of ram to run
# Read the column-oriented dict stored in small_data.json back into memory.
# `json` and `ds` are imported in the preceding imports-only cell.
with open("small_data.json", "r") as f:
    small_data = json.load(f)

# Build a Dataset from the dict of columns and write it to ./MathInstructSmall.
small_dataset = ds.Dataset.from_dict(small_data)
small_dataset.save_to_disk("MathInstructSmall")

In [None]:
# This will take about 10GB of ram to run
# Read the column-oriented dict stored in medium_data.json back into memory.
# `json` and `ds` are imported in the earlier imports-only cell.
with open("medium_data.json", "r") as f:
    medium_data = json.load(f)

# Build a Dataset from the dict of columns and write it to ./MathInstructMedium.
medium_dataset = ds.Dataset.from_dict(medium_data)
medium_dataset.save_to_disk("MathInstructMedium")

In [None]:
# This will take about 70GB of ram to run
# Read the column-oriented dict stored in large_data.json back into memory.
# `json` and `ds` are imported in the earlier imports-only cell.
with open("large_data.json", "r") as f:
    large_data = json.load(f)

# Build a Dataset from the dict of columns and write it to ./MathInstructLarge.
large_dataset = ds.Dataset.from_dict(large_data)
large_dataset.save_to_disk("MathInstructLarge")

Now we will combine these into a single dataset with 3 splits: small, medium, and large.

This will not take a lot of memory since the splits are already created as separate datasets

In [None]:
import datasets as ds
# Reload the three per-bin datasets saved by the cells above.
small, medium, large = (
    ds.load_from_disk(bin_dir)
    for bin_dir in ("MathInstructSmall", "MathInstructMedium", "MathInstructLarge")
)

# Bundle them as named splits of one DatasetDict and store it locally.
splits_by_name = {"small": small, "medium": medium, "large": large}
data_dict = ds.DatasetDict(splits_by_name)
data_dict.save_to_disk("MathInstructBinned")

Now we will push the datasets to huggingface

In [None]:
# Reload the binned splits saved under ./MathInstructBinned.
binned_data = ds.load_from_disk("MathInstructBinned")
# Set the HF_TOKEN environment variable for the upload.
# NOTE(review): `api_token` looks like a placeholder; substitute a real token
# at run time and never commit one to the notebook.
%env HF_TOKEN=api_token
# Upload the splits under the repo name "math-instruct-binned".
binned_data.push_to_hub("math-instruct-binned")

In [None]:
import datasets as ds
# Load the published binned dataset.
binned_data = ds.load_dataset("patrickjmcbride/math-instruct-binned")

# Row count of each split.
len_small, len_medium, len_large = (
    len(binned_data[split_name]) for split_name in ("small", "medium", "large")
)
total_rows = len_small + len_medium + len_large

# Share of all rows held by each split, in percent.
for label, n_rows in (("Small: ", len_small), ("Medium: ", len_medium), ("Large: ", len_large)):
    print(label, n_rows / total_rows * 100)

# Keep a local copy of the binned dataset.
binned_data.save_to_disk("MathInstructBinned")

In [None]:
# Absolute row count of each split, as computed in the previous cell.
for label, n_rows in (("Small: ", len_small), ("Medium: ", len_medium), ("Large: ", len_large)):
    print(label, n_rows)