In [None]:
import os
import csv
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm

#EVALUATE THE TOKEN-COUNT GAIN/LOSS OF THE RETRAINED TOKENIZERS ON THE C4 CORPUS

#Authenticate against the Hugging Face Hub: key.env is loaded first,
#then HF_TOKEN is read from the environment and handed to login()
load_dotenv("key.env")
login(token=os.environ.get("HF_TOKEN"))

#Hub identifiers of the models whose tokenizers are evaluated;
#a loop over this list obtains all the results in one execution
hf_models = [
    "deepseek-ai/DeepSeek-R1",
    "meta-llama/Llama-3.1-8B",
    "google/gemma-2-9b",
    "mistralai/Mistral-7B-v0.1",
    "bigscience/bloom",
    "microsoft/phi-4",
]


In [None]:
#Open the English configuration of C4 in streaming mode (same corpus setup as in the first evaluation)
dataset = load_dataset(path="allenai/c4", name="en", streaming=True)
#Only the validation split is evaluated because the train split is too big
validation_dataset = dataset["validation"]

In [None]:
def tokenize_and_count(dataset, tokenizer, tokenizer_name):
    """Return the total number of token ids produced for a whole dataset.

    Args:
        dataset: iterable of examples; each example must expose its text
            under the "text" key.
        tokenizer: callable invoked as ``tokenizer(text, padding=False,
            truncation=False)`` whose result is indexed with "input_ids".
        tokenizer_name: label shown in the progress-bar description.

    Returns:
        The sum of ``len(input_ids)`` over every example (0 for an empty
        dataset).
    """
    #NOTE(review): the tokenizer is called with its default special-token
    #handling, so any tokens it adds are presumably included in the count - confirm
    progress = tqdm(dataset, desc=f"Tokenizing C4 with {tokenizer_name}")
    return sum(
        len(tokenizer(row["text"], padding=False, truncation=False)["input_ids"])
        for row in progress
    )

In [None]:
#Evaluate every model in hf_models: count the tokens produced on the C4 validation
#split by the original tokenizer and by its three retrained variants
#(conversation, input, output), then store counts and percentage gains in a CSV file
csv_filename = "loss_c4_retrained.csv" #file where the results are going to be saved
fieldnames = [ #column order of the csv file with the results
    "model",
    "tokens_original",
    "tokens_retrained_conversation", "gain_conversation",
    "tokens_retrained_input", "gain_input",
    "tokens_retrained_output", "gain_output"
]

with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader() #write the fieldnames
    file.flush() #make the header visible on disk before the long-running loop starts

    for model in hf_models:
        print(f"Processing model: {model}")
        model_name = model.split("/")[-1] #drop the organisation prefix of the hub id

        try:
            #load all four tokenizers before counting, so a missing retrained
            #tokenizer fails before any pass over the dataset is started
            original_tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
            tokenizer_input = AutoTokenizer.from_pretrained(f"conversational_tokenizers/tokenizer_{model_name}_input")
            tokenizer_output = AutoTokenizer.from_pretrained(f"conversational_tokenizers/tokenizer_{model_name}_output")
            tokenizer_conversation = AutoTokenizer.from_pretrained(f"conversational_tokenizers/tokenizer_{model_name}_conversation")

            #token counts (one full pass over the validation split per tokenizer)
            tokens_original = tokenize_and_count(validation_dataset, original_tokenizer, model_name + " (original)")
            tokens_retrained_conversation = tokenize_and_count(validation_dataset, tokenizer_conversation, model_name + " (conversation)")
            tokens_retrained_input = tokenize_and_count(validation_dataset, tokenizer_input, model_name + " (input)")
            tokens_retrained_output = tokenize_and_count(validation_dataset, tokenizer_output, model_name + " (output)")

            #compute gains as a percentage of the original count:
            #positive = the retrained tokenizer produced fewer tokens, negative = more
            gain_conversation = (1-tokens_retrained_conversation/tokens_original)*100
            gain_input = (1-tokens_retrained_input/tokens_original)*100
            gain_output = (1-tokens_retrained_output/tokens_original)*100

            #add to csv
            writer.writerow({
                "model": model_name,
                "tokens_original": tokens_original,
                "tokens_retrained_conversation": tokens_retrained_conversation,
                "gain_conversation": round(gain_conversation, 2),
                "tokens_retrained_input": tokens_retrained_input,
                "gain_input": round(gain_input, 2),
                "tokens_retrained_output": tokens_retrained_output,
                "gain_output": round(gain_output, 2)
            })
            #persist the row right away: each model needs four full passes over the
            #stream, so a finished result must not be lost if the kernel dies later
            file.flush()

        except Exception as e:
            #best effort: report the failure and continue with the next model
            print(f"Error processing {model}: {e}")