In [5]:
import datasets
import numpy as np
import json

In [2]:
# Load the 'allenai/tulu-v2-sft-mixture' dataset through the `datasets` library.
# The repr in the next cell shows a DatasetDict with a single 'train' split.
hf_data = datasets.load_dataset('allenai/tulu-v2-sft-mixture')

In [3]:
hf_data

DatasetDict({
    train: Dataset({
        features: ['dataset', 'id', 'messages'],
        num_rows: 326154
    })
})

In [17]:
hf_data['train'][1]

{'dataset': 'flan_v2',
 'id': 'flan_v2_1',
 'messages': [{'role': 'user',
   'content': 'Q: A.S. Roma, chairman, James Pallotta; James Pallotta, birthPlace, Boston\nA: James Pallotta, who was born in Boston, is the chairman of A.S. Roma.\nQ: Egg Harbor Township, New Jersey, isPartOf, Atlantic County, New Jersey\nA: Egg Harbor Township is a township in Atlantic County, New Jersey, United States.\nQ: Hypermarcas, type, S.A. (corporation); Hypermarcas, numberOfEmployees, 10252\nA: Hypermarcas is an S.A. corporation which employs 10,252 people.\nQ: Singapore, language, English language; Ayam penyet, region, Singapore\nA:'},
  {'role': 'assistant',
   'content': 'English is one of the languages spoken in Singapore where ayam penyet comes from.'}]}

In [52]:
def reformat_new_entry(entry):
    """Convert an instruction/output record into a two-turn chat sample.

    Args:
        entry: Mapping that provides the 'instruction' and 'output' keys.

    Returns:
        A dict with a single 'messages' key: a list holding the user turn
        (the instruction) followed by the assistant turn (the output). Each
        turn is a dict with 'content' and 'role' keys, in that order.

    Raises:
        KeyError: when ``entry`` is a dict lacking either required key.
    """
    # Read both fields up front so a missing key fails before anything is built.
    prompt, answer = entry['instruction'], entry['output']

    return {
        'messages': [
            {'content': prompt, 'role': 'user'},
            {'content': answer, 'role': 'assistant'},
        ]
    }

# Load the raw benchmark dumps. Each file must contain a JSON list of records
# with 'instruction' and 'output' keys, since that is what reformat_new_entry
# reads. The encoding is given explicitly so that decoding does not depend on
# the platform's locale default (JSON interchange text is UTF-8).
with open('train_hs.json', 'r', encoding='utf-8') as f:
    hs = json.load(f)

with open('boolq.json', 'r', encoding='utf-8') as f:
    boolq = json.load(f)

with open('winogrande.json', 'r', encoding='utf-8') as f:
    wino = json.load(f)

# Convert every record into the chat-style {'messages': [...]} layout.
ref_hs = [reformat_new_entry(entry) for entry in hs]
ref_b = [reformat_new_entry(entry) for entry in boolq]
ref_w = [reformat_new_entry(entry) for entry in wino]

# Column-oriented container using the same three column names as the Tulu
# mixture loaded earlier in the notebook: 'dataset', 'id', 'messages'.
formatted_data = {
    'dataset': [],
    'id': [],
    'messages': []
}

# One pass per source instead of three copy-pasted loops. The index restarts
# at 0 for each source, so ids are '<SourceName>_<position within source>'.
for source_name, entries in (
    ('HellaSwag', ref_hs),
    ('BoolQ', ref_b),
    ('WinoGrande', ref_w),
):
    for i, entry in enumerate(entries):
        formatted_data['dataset'].append(source_name)
        formatted_data['id'].append(f'{source_name}_{i}')
        formatted_data['messages'].append(entry['messages'])

# Create Hugging Face Dataset
hf_dataset = datasets.Dataset.from_dict(formatted_data)

In [54]:
hf_dataset[-1]

{'dataset': 'WinoGrande',
 'id': 'WinoGrande_63237',
 'messages': [{'content': 'Please choose the correct answer to fill in the blank to complete the given sentence: Volleyball was a favorite sport for Kyle but not Justin because _ as was really tall.\n\nOption1: Kyle Option2: Justin Answer format: option1/option2',
   'role': 'user'},
  {'content': 'the correct answer is option1', 'role': 'assistant'}]}

In [4]:
np.unique(hf_data['train']['dataset'])

array(['code_alpaca', 'cot', 'flan_v2', 'gpt4_alpaca', 'hard_coded',
       'lima', 'oasst1', 'open_orca', 'science.evidence_inference',
       'science.qasper_truncated_4000', 'science.scierc_ner',
       'science.scierc_relation', 'science.scifact_json',
       'science.scitldr_aic', 'sharegpt', 'wizardlm'], dtype='<U29')

In [31]:
# Keep only the rows whose 'dataset' field is 'lima' or 'open_orca'.
# Despite its name, `lima` therefore also contains the OpenOrca examples.
lima = hf_data.filter(lambda row: row["dataset"] in ('lima', 'open_orca'))

In [32]:
lima['train'][0], hf_dataset[0]

({'dataset': 'lima',
  'id': 'lima_0',
  'messages': [{'role': 'user',
    'content': 'Can brain cells move? By movement I mean long distance migration (preferably within the brain only).'},
   {'role': 'assistant',
    'content': 'The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which th

In [55]:
from datasets import Dataset, DatasetDict, concatenate_datasets

# Rebuild both sources as fresh Dataset objects from plain Python columns,
# keeping the same three columns in the same order for each of them.
lima_dataset = Dataset.from_dict(
    {column: lima['train'][column] for column in ('dataset', 'id', 'messages')}
)
new_dataset = Dataset.from_dict(
    {column: hf_dataset[column] for column in ('dataset', 'id', 'messages')}
)

# Stack the filtered Tulu rows first, followed by the benchmark-derived rows.
concatenated_dataset = concatenate_datasets([lima_dataset, new_dataset])


In [60]:
# Wrap `new_dataset` alone as the 'train' split. `concatenated_dataset` is not
# used here, so the filtered Tulu rows are not part of `final_dataset`.
final_dataset = DatasetDict({'train': new_dataset})

In [61]:
final_dataset

DatasetDict({
    train: Dataset({
        features: ['dataset', 'id', 'messages'],
        num_rows: 112570
    })
})

In [49]:
from huggingface_hub import notebook_login


In [50]:
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [62]:
# Upload `final_dataset` to the 'VityaVitalich/hellaswag_boolq_winogrande' repository.
# NOTE(review): presumably relies on the credentials entered via notebook_login() — confirm.
final_dataset.push_to_hub("VityaVitalich/hellaswag_boolq_winogrande")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/113 [00:00<?, ?ba/s]

In [11]:
lima['train']['dataset'][1000:1050]

['lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'lima',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca',
 'open_orca']

In [26]:
# Load the 'VityaVitalich/LIMA' dataset; the following cells inspect its
# 'train' split.
loaded_lima = datasets.load_dataset("VityaVitalich/LIMA")

Downloading readme:   0%|          | 0.00/436 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1018 [00:00<?, ? examples/s]

In [27]:
loaded_lima['train']

Dataset({
    features: ['dataset', 'id', 'messages'],
    num_rows: 1018
})

In [29]:
loaded_lima['train'][1]

{'dataset': 'lima',
 'id': 'lima_1',
 'messages': [{'role': 'user',
   'content': 'In our computer systems lecture we were introduced to the MIPS processor. It was (re)developed over the course of the term and has in fact been quite easy to understand. It uses a RISC design, that is its elementary commands are regularly encoded and there are only few of them in order to keep the wires simple.\nIt was mentioned that CISC follows a different philosophy. I looked briefly at the x86 instruction set and was shocked. I can not image how anyone would want to build a processor that uses so complex a command set!\nSo I figure there have to be good arguments why large portions of the processor market use CISC architectures. What are they? '},
  {'role': 'assistant',
   'content': "There is a general historical trend.\nIn the olden days, memories were small, and so programs were perforce small. Also, compilers were not very smart, and many programs were written in assembler, so it was considered 