In [None]:
%pip install datasets
%pip install pyarrow
import datasets
from datasets import load_dataset
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Read the Hugging Face token from the local env file and log in with it.
# NOTE(review): if HF_TOKEN is not defined, os.getenv returns None and that is what login receives.
load_dotenv("key.env")
hf_token = os.getenv("HF_TOKEN")
login(hf_token)

# Load the corpus; "train" is the only split this dataset has.
dataset = load_dataset('lmsys/lmsys-chat-1m', trust_remote_code=True)

# Write that split to train_dataset_lmsys.parquet so later cells can read it back from disk.
train_split = dataset["train"]
train_split.to_parquet("train_dataset_lmsys.parquet")

In [None]:
# Split strategy switch read by the later cells:
#   True  -> per-language split through split_dataset_by_language
#   False -> plain random 80/20 split through train_test_split
# It also decides which output file names the splits are saved under.
split_by_lang = False

In [None]:
from datasets import Dataset, concatenate_datasets
from collections import defaultdict

#hands out a sequence chunk by chunk so that callers can work on one chunk at a time
def batch_group(group, batch_size=10000):
    """Yield consecutive slices of ``group`` with at most ``batch_size`` items each.

    The slices come out in order and together cover the whole sequence; the
    last one is shorter when ``len(group)`` is not a multiple of ``batch_size``.
    """
    total = len(group)
    yield from (group[start:start + batch_size] for start in range(0, total, batch_size))

#for the case "split_by_lang = True". Builds the train and test splits separately for each conversation language
#since the number of conversations is very big, splitting chunk by chunk should not have a major effect on the proportions
#NOTE: all rows are collected in Python lists while grouping, so peak memory grows with the size of the dataset
def split_dataset_by_language(dataset, test_size=0.2, seed=42, batch_size=10000):
    """Split ``dataset`` into train and test sets, language by language.

    Rows are grouped by their ``'language'`` value. Each group is cut into
    chunks of at most ``batch_size`` rows and every chunk is split on its own
    with ``Dataset.train_test_split``. A chunk holding a single row cannot be
    split, so it goes entirely to the train set; this covers both languages
    with one conversation and a one-row remainder at the end of a large group.

    Args:
        dataset: Iterable of row dicts that each carry a ``'language'`` key
            (in this notebook, a ``datasets.Dataset``).
        test_size: Forwarded to ``Dataset.train_test_split`` for every chunk.
        seed: Forwarded to ``Dataset.train_test_split``; the same seed is
            reused for every chunk.
        batch_size: Maximum number of rows per chunk; must be at least 1.

    Returns:
        A ``(combined_train, combined_test)`` tuple of datasets. The test
        dataset is empty (same columns as train) when no chunk was large
        enough to be split.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``dataset`` has no rows.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Reuse the source schema (when the input exposes one) so that every chunk is
    # rebuilt with identical features and the final concatenation cannot hit a
    # type mismatch caused by per-chunk type inference.
    features = getattr(dataset, "features", None)

    # group conversations by language
    grouped_by_language = defaultdict(list)
    for entry in dataset:
        grouped_by_language[entry['language']].append(entry)

    train_parts = []
    test_parts = []

    #train and test splits for each language
    for group in grouped_by_language.values():
        # A group no larger than batch_size comes back as one chunk, so small and
        # large languages share the same code path.
        for batch in batch_group(group, batch_size):
            batch_dict = {key: [entry[key] for entry in batch] for key in batch[0].keys()}
            batch_dataset = Dataset.from_dict(batch_dict, features=features)

            if len(batch) < 2: #a single row cannot be split: directly to train split
                train_parts.append(batch_dataset)
                continue

            split = batch_dataset.train_test_split(test_size=test_size, seed=seed)
            train_parts.append(split['train'])
            test_parts.append(split['test'])

    if not train_parts:
        raise ValueError("Cannot split an empty dataset")

    # combine all the subsets of the different languages
    combined_train = concatenate_datasets(train_parts)
    if test_parts:
        combined_test = concatenate_datasets(test_parts)
    else:
        # No chunk could be split: return an empty dataset with the train schema
        # instead of failing on an empty concatenation.
        combined_test = combined_train.select([])

    return combined_train, combined_test


In [None]:
from datasets import Dataset, concatenate_datasets
from collections import defaultdict
# Read back the full corpus that an earlier cell stored as Parquet
train_dataset = Dataset.from_parquet("train_dataset_lmsys.parquet")

if split_by_lang:
    # GENERATE CORPUS: SPLIT BY LANGUAGE
    combined_train, combined_test = split_dataset_by_language(train_dataset, batch_size=5000)
    train_path = "train_combined_splitbylanguage.parquet"
    test_path = "test_combined_splitbylanguage.parquet"
else:
    # GENERATE CORPUS: RANDOM SPLIT (80% train, 20% test)
    split = train_dataset.train_test_split(test_size=0.2, seed=42)
    combined_train, combined_test = split['train'], split['test']
    train_path = "train_combined_random.parquet"
    test_path = "test_combined_random.parquet"

# Store both splits in Parquet format; the file name records the strategy used
combined_train.to_parquet(train_path)
combined_test.to_parquet(test_path)

# Report the size of each split
print(f"Train set size: {len(combined_train)}")
print(f"Test set size: {len(combined_test)}")

In [None]:
from datasets import Dataset

def add_clean_columns(dataset):
    """
    Return the dataset extended with three plain-text columns built from 'conversation':
    - 'clean_input': the 'user' messages of the conversation, joined with single spaces.
    - 'clean_conversation': all 'user' messages followed by all 'assistant' messages,
      joined with single spaces (the turns are not interleaved).
    - 'clean_output': the 'assistant' messages of the conversation, joined with single spaces.

    Messages whose role is neither 'user' nor 'assistant' are left out of all three columns.
    """
    def _texts_for(turns, role):
        # Contents of the turns spoken by `role`, in their original order
        return [turn["content"] for turn in turns if turn["role"] == role]

    user_column = []
    assistant_column = []
    merged_column = []

    # Build one value per conversation for each of the new columns
    for turns in dataset["conversation"]:
        from_user = _texts_for(turns, "user")
        from_assistant = _texts_for(turns, "assistant")

        user_column.append(" ".join(from_user))
        assistant_column.append(" ".join(from_assistant))
        merged_column.append(" ".join(from_user + from_assistant))

    # Attach the columns one after another; add_column returns a new dataset each time
    new_columns = (
        ("clean_input", user_column),
        ("clean_conversation", merged_column),
        ("clean_output", assistant_column),
    )
    for column_name, column_values in new_columns:
        dataset = dataset.add_column(column_name, column_values)

    return dataset

# Add the clean text columns to the train and test splits generated by the split cell
# (combined_train / combined_test); the results are kept under new names.
combined_train_clean = add_clean_columns(combined_train)
combined_test_clean = add_clean_columns(combined_test)


In [None]:
#Save results: pick the file names and the confirmation message for the active split strategy
if split_by_lang:
    output_files = ("train_languagesplit.parquet", "test_languagesplit.parquet")
    saved_message = "Language split train and test saved"
else:
    output_files = ("train_randomsplit.parquet", "test_randomsplit.parquet")
    saved_message = "Random split train and test saved"

#write the cleaned train split first, then the cleaned test split
for cleaned_split, output_file in zip((combined_train_clean, combined_test_clean), output_files):
    cleaned_split.to_parquet(output_file)
print(saved_message)