In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import Dataset
import csv
import pandas as pd
from collections import Counter, defaultdict
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

#study: optimization achieved by retrained tokenizers for different languages

#authenticate against the Hugging Face Hub with the token read from key.env
load_dotenv("key.env")
hf_token = os.getenv("HF_TOKEN")
login(hf_token)

#model processed in this execution; change the identifier and re-run the notebook
#to process another one, for example:
#  "mistralai/Mistral-7B-v0.1"
#  "meta-llama/Llama-3.1-8B"
#  "google/gemma-2-9b"
#  "deepseek-ai/DeepSeek-R1"
#  "bigscience/bloom"
#  "microsoft/phi"
model = "mistralai/Mistral-7B-v0.1"

In [None]:
#test split of the conversational dataset (file generated with create_corpus.ipynb)
test_split_path = "test_randomsplit.parquet"
test_corpus = Dataset.from_parquet(test_split_path)

In [None]:
#pie chart with the language distribution of the test split

#number of conversations per language, most frequent first
language_counts = Counter(test_corpus['language'])
sorted_languages = language_counts.most_common()
languages, counts = zip(*sorted_languages)

#share of each language, in percent
total = sum(counts)
percentages = [count / total * 100 for count in counts]

#keep the top n languages and merge the remaining ones into a single "others" slice
top_n = 10
others_percentage = sum(percentages[top_n:])
languages_modified = list(languages[:top_n])
percentages_modified = percentages[:top_n]
if len(languages) > top_n: #skip the slice when every language already fits in the top n (it would be an empty 0.0% wedge)
    languages_modified.append("others")
    percentages_modified.append(others_percentage)

#plot
plt.figure(figsize=(10, 10))
#plt.cm.get_cmap no longer exists in Matplotlib >= 3.9; plt.get_cmap accepts the same (name, lut) arguments
colors = plt.get_cmap("tab20", len(languages_modified)).colors

plt.pie(
    percentages_modified,
    labels=languages_modified,
    autopct='%1.1f%%',
    startangle=180,
    labeldistance=1.05,
    pctdistance=0.85,
    colors=colors,
    textprops={'fontsize': 19}
)

plt.show()

In [None]:
#tokenizers compared in this notebook (the retrained ones were created in "conversational_tokenizers_gain.ipynb")

#short model name: the part of the identifier after the last slash
model_name = model.rsplit("/", 1)[-1]

#reference tokenizer, downloaded from Hugging Face
original_tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)

#retrained tokenizers, read from the local retrained_conversational_tokenizers folder
input_tokenizer_path = f"retrained_conversational_tokenizers/tokenizer_{model_name}_input"
output_tokenizer_path = f"retrained_conversational_tokenizers/tokenizer_{model_name}_output"
conversation_tokenizer_path = f"retrained_conversational_tokenizers/tokenizer_{model_name}_conversation"

tokenizer_input = AutoTokenizer.from_pretrained(input_tokenizer_path)
tokenizer_output = AutoTokenizer.from_pretrained(output_tokenizer_path)
tokenizer_conversation = AutoTokenizer.from_pretrained(conversation_tokenizer_path)

In [None]:
#to test the number of tokens generated for input and output separately:
def get_test_corpus_input(test_dataset, batch_size=100):
    """Return a generator over the "clean_input" column of ``test_dataset``, batch by batch.

    Args:
        test_dataset: object supporting ``len()`` and slicing, where
            ``test_dataset[a:b]["clean_input"]`` gives the inputs of rows a..b-1.
        batch_size: number of rows per batch. Defaults to 100, the value that
            used to be hard-coded.

    Returns:
        A generator yielding one batch of inputs per slice; the last batch may be
        shorter than ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    column = "clean_input"
    return (
        test_dataset[start : start + batch_size][column]
        for start in range(0, len(test_dataset), batch_size)
    )

def get_test_corpus_output(test_dataset, batch_size=100):
    """Return a generator over the "clean_output" column of ``test_dataset``, batch by batch.

    Args:
        test_dataset: object supporting ``len()`` and slicing, where
            ``test_dataset[a:b]["clean_output"]`` gives the outputs of rows a..b-1.
        batch_size: number of rows per batch. Defaults to 100, the value that
            used to be hard-coded.

    Returns:
        A generator yielding one batch of outputs per slice; the last batch may be
        shorter than ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    column = "clean_output"
    return (
        test_dataset[start : start + batch_size][column]
        for start in range(0, len(test_dataset), batch_size)
    )

#to count the number of tokens in a corpus from a generator (like get_test_corpus_input(test_dataset) or get_test_corpus_output(test_dataset)): 
def tokenize_and_count_from_generator(generator, tokenizer):
    """Count the tokens ``tokenizer`` produces for all the texts yielded by ``generator``.

    Args:
        generator: iterable of batches, each batch being a list of texts
            (e.g. the result of get_test_corpus_input or get_test_corpus_output).
        tokenizer: callable that takes a list of texts plus the ``truncation`` and
            ``add_special_tokens`` keywords and returns a mapping whose
            "input_ids" entry holds one sequence of token ids per text.

    Returns:
        The total number of token ids over every batch (0 if there are no batches).
    """
    total_tokens = 0
    for fragment_list in generator:
        if not fragment_list: #nothing to tokenize in an empty batch
            continue
        batch_tokenized = tokenizer(
            fragment_list,
            truncation=False, #truncating to the tokenizer's maximum length would undercount long texts
            add_special_tokens=False
        )
        total_tokens += sum(len(ids) for ids in batch_tokenized["input_ids"]) #numerical identifiers of the tokens
    return total_tokens

In [None]:
#to check if latin is actually latin in the conversations:

def get_test_corpus_conversation(test_dataset, batch_size=100):
    """Return a generator over the "conversation" column of ``test_dataset``, batch by batch.

    Args:
        test_dataset: object supporting ``len()`` and slicing, where
            ``test_dataset[a:b]["conversation"]`` gives the conversations of rows a..b-1.
        batch_size: number of rows per batch. Defaults to 100, the value that
            used to be hard-coded.

    Returns:
        A generator yielding one batch of conversations per slice; the last batch
        may be shorter than ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    column = "conversation"
    return (
        test_dataset[start : start + batch_size][column]
        for start in range(0, len(test_dataset), batch_size)
    )

#collect the first conversations labelled "Latin" so the label can be checked by hand
max_latin_conversations = 100
latin_conversations = []

#flatten the batches into one stream of conversations; pairing that stream with the
#language column keeps both aligned without recomputing row offsets from the batch size
conversation_stream = (
    convo
    for chunk in get_test_corpus_conversation(test_corpus)
    for convo in chunk
)

for convo, lang in zip(conversation_stream, test_corpus["language"]):
    if lang == "Latin":
        latin_conversations.append(convo)
        if len(latin_conversations) == max_latin_conversations: #enough samples, stop reading the corpus
            break

#save the collected conversations to a file
with open("latin_conversations.txt", "w", encoding="utf-8") as f:
    for i, convo in enumerate(latin_conversations, 1):
        f.write(f"--- Conversation {i} ---\n{convo}\n\n")


In [None]:
#the three retrained tokenizers, keyed by kind, so they can be processed in a loop
retrained_tokenizers = {
    "input": tokenizer_input,
    "output": tokenizer_output,
    "conversation": tokenizer_conversation,
}

#per-language token counters kept for every tokenizer kind
count_names = ("original_input", "original_output", "new_input", "new_output")
all_token_counts = {}
for kind in retrained_tokenizers:
    all_token_counts[kind] = {name: defaultdict(int) for name in count_names}

#process in batches
batch_size = 100

def _token_lengths(tokenizer, texts):
    """Return the number of tokens ``tokenizer`` produces for each text in ``texts``."""
    #no truncation: a sequence cut at the tokenizer's maximum length would be undercounted,
    #which would bias the comparison between the original and the retrained tokenizers
    encoded = tokenizer(texts, truncation=False, add_special_tokens=False)
    return [len(ids) for ids in encoded["input_ids"]]

for i in range(0, len(test_corpus), batch_size):
    batch = test_corpus[i:i+batch_size]
    inputs = batch["clean_input"]
    outputs = batch["clean_output"]
    batch_languages = batch["language"]

    #original: reference to compare the performance of the new tokenizers
    orig_in = _token_lengths(original_tokenizer, inputs)
    orig_out = _token_lengths(original_tokenizer, outputs)

    #tokenize with each retrained tokenizer
    for kind, tokenizer in retrained_tokenizers.items():
        new_in = _token_lengths(tokenizer, inputs)
        new_out = _token_lengths(tokenizer, outputs)

        #accumulate token counts; the lists align because they all follow the order of the conversations in the batch
        kind_counts = all_token_counts[kind]
        for lang, oi, oo, ni, no in zip(batch_languages, orig_in, orig_out, new_in, new_out):
            kind_counts["original_input"][lang] += oi
            kind_counts["original_output"][lang] += oo
            kind_counts["new_input"][lang] += ni
            kind_counts["new_output"][lang] += no

#function to compute gain
def gain(orig, new):
    """Return the percentage of tokens saved by the new count relative to the original one.

    Args:
        orig: number of tokens produced by the original tokenizer.
        new: number of tokens produced by the retrained tokenizer.

    Returns:
        ``100 * (orig - new) / orig``; negative when ``new`` is larger than ``orig``.
        Returns 0 when ``orig`` is not positive, which avoids dividing by zero.
    """
    if orig > 0:
        return 100 * (orig - new) / orig
    return 0

#write one csv per retrained tokenizer inside a folder named after the model
output_dir = os.path.join("gains_by_language", model_name)
os.makedirs(output_dir, exist_ok=True)

for kind, counts in all_token_counts.items():
    old_input, old_output = counts["original_input"], counts["original_output"]
    new_input, new_output = counts["new_input"], counts["new_output"]

    def total_original_tokens(language):
        """Input plus output tokens of the original tokenizer for ``language``."""
        return old_input[language] + old_output[language]

    #languages with the most original tokens come first
    languages = sorted(old_input, key=total_original_tokens, reverse=True)

    #one row per language: token counts and gains for input and output
    results = [
        {
            "language": lang,
            "tokens_old_input": old_input[lang],
            "tokens_new_input": new_input[lang],
            "gain_input": gain(old_input[lang], new_input[lang]),
            "tokens_old_output": old_output[lang],
            "tokens_new_output": new_output[lang],
            "gain_output": gain(old_output[lang], new_output[lang]),
        }
        for lang in languages
    ]

    csv_path = os.path.join(output_dir, f"gain_tokenizer_{kind}.csv")
    pd.DataFrame(results).to_csv(csv_path, index=False) #save results
