In [None]:
import os
import csv
import re
from dotenv import load_dotenv
from datasets import load_dataset
from huggingface_hub import login
from tqdm import tqdm
import tiktoken
from transformers import AutoTokenizer

#EVALUATE FERTILITY (TOKENS PER WORD) IN CONVERSATIONS AND LLM TRAINING DATASET

#Hugging Face login: the token is read from the HF_TOKEN variable declared in key.env
load_dotenv("key.env")
login(os.getenv("HF_TOKEN"))

#Hugging Face models whose tokenizers are evaluated:
hf_models = [
    "deepseek-ai/DeepSeek-R1",
    "meta-llama/Llama-3.1-8B",
    "google/gemma-2-9b",
    "mistralai/Mistral-7B-v0.1",
    "bigscience/bloom"
]
#OpenAI models whose tiktoken encodings are evaluated.
#These names are passed to tiktoken.encoding_for_model(), which only resolves names it knows:
#the GPT-4o model must be spelled "gpt-4o" (the previous "gpt4-o" spelling is rejected with a KeyError)
openai_models = ["gpt-4", "gpt-4o"]

#corpus to evaluate:
corpus = "c4_allenai" #"lmsys" to evaluate conversations, "c4_allenai" to evaluate training dataset


In [None]:
#reject unknown corpus names before trying to load anything
if corpus not in ("lmsys", "c4_allenai"):
    raise ValueError("Name of dataset not recognised. Use 'lmsys' or 'c4_allenai'.")

#load the selected corpus in streaming mode
if corpus == "c4_allenai":
    dataset = load_dataset("allenai/c4", "en", streaming=True) #english configuration only
    validation_dataset = dataset["validation"] #validation split is used because the train split is too big
else:
    dataset = load_dataset('lmsys/lmsys-chat-1m', trust_remote_code=True, streaming=True)

In [None]:
#word counter shared by every fertility computation
def count_words(text):
    """Return the number of words found in `text`.

    A word is a maximal run of regex word characters (letters, digits and
    underscore). This is used instead of str.split() so that punctuation and
    symbols are ignored and only alphanumeric sequences are counted.
    """
    return sum(1 for _ in re.finditer(r'\b\w+\b', text))

#word totals for the selected corpus (reused later as the denominator of the fertility ratios)
if corpus == "c4_allenai":
    #a single running total for the whole validation split
    total_words = 0

    for example in tqdm(validation_dataset, desc="Processing C4"):
        if "text" not in example: #skip records without a text field
            continue
        total_words += count_words(example["text"])

    print(f"Total number of words in the C4 corpus validation split (English): {total_words}")

elif corpus == "lmsys":
    #user (input) and assistant (output) words are tallied separately
    total_words_input = 0
    total_words_output = 0

    for conversation in tqdm(dataset["train"], desc="Processing English conversations"):
        #only english conversations, so that fertility is comparable with c4_allenai
        if conversation.get("language") != "English":
            continue
        #make sure the list of turns is present before reading it
        if "conversation" not in conversation:
            continue

        for entry in conversation["conversation"]:
            role = entry["role"]
            if role == "user":
                total_words_input += count_words(entry["content"])
            elif role == "assistant":
                total_words_output += count_words(entry["content"])

    print(f"Total number of words in user texts (English): {total_words_input}")
    print(f"Total number of words in assistant texts (English): {total_words_output}")

In [None]:
#fertility calculation for OpenAI tokenizers:
#fertility = total tokens / total words, using the word totals computed for the selected corpus
for openai_model in openai_models: #repeat process for each model
    encoder = tiktoken.encoding_for_model(openai_model) #load the tokenizer

    if corpus == "lmsys": #if the selected corpus is the conversational one
        #initialize counters: input (user) and output (assistant) tokens are counted separately
        total_tokens_input = 0
        total_tokens_output = 0

        for conversation in tqdm(dataset["train"], desc=f"Tokenizing LMSYS for {openai_model}"):
            #same filter as the word count: english conversations that expose their turns
            if conversation.get("language") != "English" or "conversation" not in conversation:
                continue

            for entry in conversation["conversation"]:
                #disallowed_special=() makes tiktoken encode any special-token marker found in the text
                #as ordinary text, exactly like the C4 branch below, so both corpora are measured the same way
                if entry["role"] == "user":
                    total_tokens_input += len(encoder.encode(entry["content"], disallowed_special=()))
                elif entry["role"] == "assistant":
                    total_tokens_output += len(encoder.encode(entry["content"], disallowed_special=()))

        print(f"Total tokens (user): {total_tokens_input}")
        print(f"Total tokens (assistant): {total_tokens_output}")

        fertility_input = total_tokens_input / total_words_input #fertility calculation input
        fertility_output = total_tokens_output / total_words_output #fertility calculation output

        result_row = {
            "model": openai_model,
            "total_words_input": total_words_input,
            "total_tokens_input": total_tokens_input,
            "fertility_input": round(fertility_input, 2), #only 2 decimal places
            "total_words_output": total_words_output,
            "total_tokens_output": total_tokens_output,
            "fertility_output": round(fertility_output, 2) #only 2 decimal places
        }

    elif corpus == "c4_allenai": #if the selected corpus is the training dataset
        #initialize counter
        total_tokens = 0

        for example in tqdm(validation_dataset, desc=f"Tokenizing C4 for {openai_model}"): #tokenize text by text
            if "text" not in example: #same guard as the word count, so both totals cover the same records
                continue
            total_tokens += len(encoder.encode(example["text"], disallowed_special=()))

        print(f"Total tokens (C4): {total_tokens}")

        fertility = total_tokens / total_words #fertility calculation c4

        result_row = {
            "model": openai_model,
            "total_words": total_words,
            "total_tokens": total_tokens,
            "fertility": round(fertility, 2) #only 2 decimal places
        }

    else:
        #without this branch an unknown corpus name would silently produce no results
        raise ValueError("Name of dataset not recognised. Use 'lmsys' or 'c4_allenai'.")

    #append the row for this model to the per-corpus results file (shared by both branches)
    csv_filename = f"evaluation_fertility_{corpus}.csv"
    file_exists = os.path.exists(csv_filename)

    with open(csv_filename, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(result_row.keys()))
        if not file_exists: #if the file did not exist before, the header needs to be written
            writer.writeheader()
        writer.writerow(result_row) #save results

In [None]:
#fertility calculation for HF models tokenizers:
#fertility = total tokens / total words, using the word totals computed for the selected corpus
for model_name in hf_models: #repeat process for each model
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) #load the tokenizer

    if corpus == "lmsys": #if the selected corpus is the conversational one
        #initialize counters: input (user) and output (assistant) tokens are counted separately
        total_tokens_input = 0
        total_tokens_output = 0

        for conversation in tqdm(dataset["train"], desc=f"Tokenizing LMSYS with {model_name}"):
            #same filter as the word count: english conversations that expose their turns
            if conversation.get("language") != "English" or "conversation" not in conversation:
                continue

            for entry in conversation["conversation"]:
                #add_special_tokens=False: only tokens produced from the text itself are counted.
                #With the default (True) a tokenizer may add BOS/EOS tokens to every text, which would
                #inflate the totals and would not be comparable with the tiktoken counts.
                if entry["role"] == "user": #input texts
                    total_tokens_input += len(tokenizer(entry["content"], padding=False, truncation=False, add_special_tokens=False)["input_ids"])
                elif entry["role"] == "assistant": #output texts
                    total_tokens_output += len(tokenizer(entry["content"], padding=False, truncation=False, add_special_tokens=False)["input_ids"])

        fertility_input = total_tokens_input / total_words_input #fertility calculation input
        fertility_output = total_tokens_output / total_words_output #fertility calculation output

        result_row = {
            "model": model_name,
            "total_words_input": total_words_input,
            "total_tokens_input": total_tokens_input,
            "fertility_input": round(fertility_input, 2), #only 2 decimal places
            "total_words_output": total_words_output,
            "total_tokens_output": total_tokens_output,
            "fertility_output": round(fertility_output, 2) #only 2 decimal places
        }

    elif corpus == "c4_allenai": #if the selected corpus is the training dataset
        #initialize counter
        total_tokens = 0

        for example in tqdm(validation_dataset, desc=f"Tokenizing C4 with {model_name}"): #tokenize text by text
            if "text" not in example: #same guard as the word count, so both totals cover the same records
                continue
            #add_special_tokens=False: do not count BOS/EOS tokens the tokenizer would otherwise add
            tokens = tokenizer(example["text"], padding=False, truncation=False, add_special_tokens=False)["input_ids"]
            total_tokens += len(tokens)

        fertility = total_tokens / total_words #fertility calculation c4

        result_row = {
            "model": model_name,
            "total_words": total_words,
            "total_tokens": total_tokens,
            "fertility": round(fertility, 2) #only 2 decimal places
        }

    else:
        #without this branch an unknown corpus name would silently produce no results
        raise ValueError("Name of dataset not recognised. Use 'lmsys' or 'c4_allenai'.")

    #append the row for this model to the per-corpus results file (shared by both branches)
    csv_filename = f"evaluation_fertility_{corpus}.csv"
    file_exists = os.path.exists(csv_filename)
    with open(csv_filename, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(result_row.keys()))
        if not file_exists: #if the file did not exist before, the header needs to be written
            writer.writeheader()
        writer.writerow(result_row) #save results