In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import Dataset, load_dataset

# Create and save the raw LMSYS conversational dataset that the train and test splits are built from

# Hugging Face login: key.env is loaded into the environment, then HF_TOKEN is looked up there
load_dotenv("key.env")
login(os.getenv("HF_TOKEN"))

# trust_remote_code is intentionally not passed: the dataset card loads this dataset with a
# plain load_dataset call, so there is no need to opt in to executing repository-supplied code.
dataset = load_dataset('lmsys/lmsys-chat-1m')

# The dataset only has a "train" split; write it out to train_dataset_lmsys.parquet
dataset["train"].to_parquet("train_dataset_lmsys.parquet")

In [None]:
# Reload the full LMSYS dump from the Parquet file
train_dataset = Dataset.from_parquet("train_dataset_lmsys.parquet")

# Random partition: 80% train / 20% test, seeded with 42
split = train_dataset.train_test_split(seed=42, test_size=0.2)
combined_train, combined_test = split["train"], split["test"]

# Persist each partition in Parquet format
combined_train.to_parquet("train_combined_random.parquet")
combined_test.to_parquet("test_combined_random.parquet")

# Report how many rows ended up in each partition
print(f"Train set size: {len(combined_train)}")
print(f"Test set size: {len(combined_test)}")
    

In [None]:
def add_clean_columns(dataset):
    """
    Adds three flattened-text columns derived from the 'conversation' column.

    Each conversation is iterated as a sequence of message dicts that are read
    through their 'role' and 'content' keys. Messages whose role is neither
    'user' nor 'assistant' are ignored.

    New columns:
    -'clean_input': Texts of the 'user' messages, joined with a single space.
    -'clean_conversation': All 'user' texts followed by all 'assistant' texts,
      joined with a single space (the original turn order is not preserved).
    -'clean_output': Texts of the 'assistant' messages, joined with a single space.

    Args:
        dataset: Object exposing the conversations via dataset["conversation"]
            and an add_column(name, values) method that returns the extended
            dataset (e.g. a Hugging Face datasets.Dataset).

    Returns:
        The object returned by the final add_column call, i.e. the dataset
        extended with the three new columns.
    """
    clean_inputs = []
    clean_conversations = []
    clean_outputs = []

    for conversation in dataset["conversation"]:
        # Single pass over the messages, bucketing each text by speaker role
        user_texts = []
        assistant_texts = []
        for entry in conversation:
            role = entry["role"]
            if role == "user":
                user_texts.append(entry["content"])
            elif role == "assistant":
                assistant_texts.append(entry["content"])

        # A conversation can hold several turns per role, hence the joins
        clean_inputs.append(" ".join(user_texts))
        clean_outputs.append(" ".join(assistant_texts))
        # User texts first, then assistant texts: turn order is deliberately not maintained
        clean_conversations.append(" ".join(user_texts + assistant_texts))

    # Attach the new columns; each add_column call returns a new dataset object
    dataset = dataset.add_column("clean_input", clean_inputs)
    dataset = dataset.add_column("clean_conversation", clean_conversations)
    dataset = dataset.add_column("clean_output", clean_outputs)

    return dataset

# Derive the clean_* text columns for both generated splits (train first, then test)
combined_train_clean, combined_test_clean = (
    add_clean_columns(partition) for partition in (combined_train, combined_test)
)


In [None]:
# Persist the train and test splits returned by add_clean_columns as Parquet files
combined_train_clean.to_parquet("train_randomsplit.parquet")
combined_test_clean.to_parquet("test_randomsplit.parquet")