In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
import csv
from itertools import combinations
from collections import Counter
from transformers import AutoTokenizer
from datasets import Dataset

#ANALYSIS AND COMPARISON OF THE TOKEN VOCABULARIES

#Hugging Face login: the token is read from the HF_TOKEN variable loaded from key.env
load_dotenv("key.env")
login(os.environ.get("HF_TOKEN"))

#model processed in this execution (one model per run); these ones or others can be used:
#"mistralai/Mistral-7B-v0.1"
#"meta-llama/Llama-3.1-8B"
#"google/gemma-2-9b"
#"deepseek-ai/DeepSeek-R1"
#"bigscience/bloom"
#"microsoft/phi"
model = "google/gemma-2-9b"

#last component of the model id; used to locate the local tokenizers in their folders
model_name = model.rsplit("/", 1)[-1]

In [None]:
#load tokenizers
#original tokenizer from Hugging Face
original_tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)

#tokenizers stored locally, one folder per variant (input, output, conversation)
retrained_dir = "retrained_conversational_tokenizers"
tokenizer_input, tokenizer_output, tokenizer_conversation = (
    AutoTokenizer.from_pretrained(f"{retrained_dir}/tokenizer_{model_name}_{variant}")
    for variant in ("input", "output", "conversation")
)

In [None]:
#computation of the percentage of common tokens between the different tokenizers of the same model:

#create output folder (if it does not exist yet)
output_dir = "comparison_vocabularies"
os.makedirs(output_dir, exist_ok=True)

print(f"Processing model: {model_name}")

#dictionary with the vocabulary (set of token strings) of each tokenizer
#it has its own name because a later cell binds `tokenizers` to the tokenizer
#objects; sharing the name would break that cell whenever this one is re-run after it
vocabularies = {
    "original": set(original_tokenizer.get_vocab().keys()),
    "input": set(tokenizer_input.get_vocab().keys()),
    "output": set(tokenizer_output.get_vocab().keys()),
    "conversation": set(tokenizer_conversation.get_vocab().keys()),
}

tokenizer_names = list(vocabularies.keys())
num_tokens = len(vocabularies["original"])  #take original as reference (all should have the same, or almost)
print(f"{model_name}: {num_tokens} tokens in the original vocabulary")

#prepare results: one row per unordered pair of tokenizers
results = []
for name1, name2 in combinations(tokenizer_names, 2):
    #intersection and common percentage (always with respect to the size of the original vocabulary)
    common_tokens = vocabularies[name1] & vocabularies[name2]
    percent_common = len(common_tokens) / num_tokens * 100
    results.append([name1, name2, len(common_tokens), f"{percent_common:.2f}"])

#save results in csv: one file per model inside the "comparison_vocabularies" folder
#the file is opened in "w" mode, so a previous file for the same model is overwritten
csv_filename = os.path.join(output_dir, f"tokenizer_comparison_{model_name}.csv")

with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["tokenizer 1", "tokenizer 2", "number shared tokens", "% common"])
    writer.writerows(results)

In [None]:
#load test split of the conversational dataset from a local parquet file
#test_corpus is read as a global by process_corpus further down
test_corpus = Dataset.from_parquet("test_randomsplit.parquet") #generated with create_corpus.ipynb

In [None]:
#to test input and output separately:
def _iter_column_batches(test_dataset, column, batch_size=100):
    """Return a generator over consecutive batches of one column of a dataset.

    test_dataset must support len() and slicing, and a slice must be
    indexable by column name (test_dataset[a:b][column]).
    Each yielded item is the content of `column` for up to `batch_size` rows.
    """
    return (
        test_dataset[start : start + batch_size][column]
        for start in range(0, len(test_dataset), batch_size)
    )

def get_test_corpus_input(test_dataset, batch_size=100):
    """Return a generator with the "clean_input" column in batches of batch_size rows."""
    return _iter_column_batches(test_dataset, "clean_input", batch_size)

def get_test_corpus_output(test_dataset, batch_size=100):
    """Return a generator with the "clean_output" column in batches of batch_size rows."""
    return _iter_column_batches(test_dataset, "clean_output", batch_size)
    
#to tokenize text from a generator and count the number of times that each token appears
def tokenize_and_analyze(generator, tokenizer):
    """Count how often each token string appears in the batches of text.

    generator: iterable whose items are batches (lists) of texts.
    tokenizer: callable that encodes a batch and exposes the ids under
        "input_ids"; it must also provide convert_ids_to_tokens.
    Returns a Counter that maps token string -> number of occurrences.
    """
    frequencies = Counter()
    for batch in generator:
        #special tokens are not added, so only tokens coming from the text are counted
        #NOTE(review): truncation=True presumably caps every text at the maximum
        #length of the tokenizer, so very long texts may be under-counted - confirm
        encoded = tokenizer(batch, truncation=True, add_special_tokens=False)
        for ids in encoded["input_ids"]:
            frequencies.update(tokenizer.convert_ids_to_tokens(ids))
    return frequencies

In [None]:
#tokenizers to evaluate, keyed by the name used in file names and printed results
#(the tokenizer objects were loaded in a previous cell)
tokenizers = dict(
    original=original_tokenizer,
    input=tokenizer_input,
    output=tokenizer_output,
    conversation=tokenizer_conversation,
)

In [None]:
#tokenize the test corpus and save the 10,000 most common tokens for each tokenizer:

#create output folder (if it does not exist yet); there is one sub-folder per model
#output_folder is read as a global by process_corpus
output_folder = f"token_frequencies/{model_name}"
os.makedirs(output_folder, exist_ok=True)

#helper function to analyze, save and compare
def process_corpus(get_corpus_fn, suffix):
    """Tokenize the test corpus with every tokenizer, export the top tokens and print overlaps.

    get_corpus_fn: callable that receives test_corpus and returns an iterable
        of batches of text (for example get_test_corpus_input).
    suffix: label of the part that is tokenized ("input" or "output"); it is
        used in the csv file names and in the printed headers.

    Reads the globals test_corpus, tokenizers and output_folder.
    Writes one csv per tokenizer into output_folder and prints the preliminary
    overlap percentages; nothing is returned.
    """
    top_k = 10000 #number of most frequent tokens kept for each tokenizer
    N_values = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000] #no value may exceed top_k

    #(token, frequency) pairs of each tokenizer, most frequent first
    #the ranking is computed once per tokenizer and sliced afterwards, instead of
    #ranking the whole counter again for every value of N
    top_tokens_all = {
        name: tokenize_and_analyze(get_corpus_fn(test_corpus), tokenizer).most_common(top_k)
        for name, tokenizer in tokenizers.items()
    }

    #save the 10,000 most frequent tokens to a csv for each tokenizer
    for name, top_tokens in top_tokens_all.items():
        csv_path = os.path.join(output_folder, f"tokenizer_{name}_top_tokens_in_{suffix}.csv") #generates, for each of the 4 tokenizers, two csv files (input and output)
        with open(csv_path, mode="w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile)
            #capitalized column names: the functions that read these files back
            #(load_token_counts, load_top_tokens) look up "Token" and "Frequency"
            writer.writerow(["Token", "Frequency"])
            writer.writerows(top_tokens)
        print(f"results exported to: {csv_path}")

    #comparison: original vs input, output, conversation
    #show preliminary results, but does not save them (will be saved later, after computing also the top tokens for complete conversations)
    original_ranked = [token for token, _ in top_tokens_all["original"]]
    for name in ["conversation", "input", "output"]:
        print(f"\noriginal-{name.upper()} ({suffix})") #suffix indicates if it is the input or the output what has been tokenized
        other_ranked = [token for token, _ in top_tokens_all[name]]
        for N in N_values:
            common = set(original_ranked[:N]) & set(other_ranked[:N])
            #the percentage is relative to N, even if a tokenizer produced fewer than N different tokens
            percent_common = len(common) / N * 100
            print(f"top {N}: {percent_common:.2f}%")

#run analysis for input and output
for corpus_fn, corpus_label in (
    (get_test_corpus_input, "input"),
    (get_test_corpus_output, "output"),
):
    process_corpus(corpus_fn, corpus_label)


In [None]:
#generate a third file of top tokens for each tokenizer: the one that combines input and output to extract the most common tokens in conversations

#folder where the csv files are located
#both names below are read as globals by combine_input_output_to_conversation
output_folder = f"token_frequencies/{model_name}"
tokenizer_names = ["original", "input", "output", "conversation"]

#to extract the token counts
def load_token_counts(filepath):
    """Read a csv of tokens and frequencies and return the counts as a Counter.

    filepath: csv file with a header row that contains a token column and a
        frequency column. The column names are matched without regard to case,
        so both "token,frequency" and "Token,Frequency" headers are accepted.
    Returns a Counter that maps token -> total frequency (the frequencies of
    repeated tokens are added).
    Raises KeyError if a column is missing and ValueError if a frequency is
    not an integer.
    """
    counts = Counter() #a dictionary
    #newline="" lets the csv module handle line endings itself, so tokens that
    #contain "\r" or "\n" are read back exactly as they were written
    with open(filepath, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        #map the lower-cased header names to the names actually used in the file
        by_lower = {field.strip().lower(): field for field in (reader.fieldnames or [])}
        token_col = by_lower.get("token", "Token")
        freq_col = by_lower.get("frequency", "Frequency")
        for row in reader:
            counts[row[token_col]] += int(row[freq_col])
    return counts

#to add the frequencies in input and output and extract the ones in conversation
def combine_input_output_to_conversation():
    """Write, for each tokenizer, a csv with the summed input and output token frequencies.

    Uses the globals tokenizer_names and output_folder. For every tokenizer the
    input and output csv files are loaded, their frequencies are added and the
    10,000 most frequent tokens are written to a new "conversation" csv.
    A tokenizer whose files cannot be found is reported and skipped.
    """
    for name in tokenizer_names:
        #the two files that are read and the new file that is generated for this tokenizer
        path_input = os.path.join(output_folder, f"tokenizer_{name}_top_tokens_in_input.csv")
        path_output = os.path.join(output_folder, f"tokenizer_{name}_top_tokens_in_output.csv")
        path_convo = os.path.join(output_folder, f"tokenizer_{name}_top_tokens_in_conversation.csv")
        try:
            #adding two Counters sums the frequency of every token
            #NOTE(review): the two files seem to hold only the top 10,000 tokens of each
            #part, so a token missing from one of them adds nothing from that part - confirm
            #that this approximation is acceptable
            merged = load_token_counts(path_input) + load_token_counts(path_output)
            ranked = merged.most_common(10000)

            with open(path_convo, mode="w", newline="", encoding="utf-8") as out_file:
                csv.writer(out_file).writerows([("Token", "Frequency"), *ranked])

            print(f"Combined input+output:  {path_convo}")
        except FileNotFoundError as missing:
            print(f"Files missing for {name}: {missing}")

#Execute: writes one "conversation" csv for each tokenizer in tokenizer_names
combine_input_output_to_conversation()


In [None]:
#from the csv files generated with the top 10,000 tokens of each tokenizer: comparison between the top tokens of the different tokenizers

#parameters (read as globals by analyze_overlap_from_csv)
#comparisons: pairs of tokenizers to compare; N_values: sizes of the "top N" sets;
#tokenizer_names: tokenizers whose csv files are loaded
comparisons = [("original", "input"), ("original", "output"), ("original", "conversation")]
N_values = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000]
tokenizer_names = ["original", "input", "output", "conversation"]

#load tokens from csv file
def load_top_tokens(filepath, max_tokens=10000):
    """Return the first max_tokens tokens of a csv file, in file order.

    filepath: csv file with a header row that contains a token column. The
        column name is matched without regard to case, so both "token" and
        "Token" headers are accepted.
    max_tokens: maximum number of tokens to return; 0 or less returns an empty list.
    Raises KeyError if the file has rows but no token column.
    """
    tokens = []
    #newline="" lets the csv module handle line endings itself, so tokens that
    #contain "\r" or "\n" are read back exactly as they were written
    with open(filepath, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        #map the lower-cased header names to the names actually used in the file
        by_lower = {field.strip().lower(): field for field in (reader.fieldnames or [])}
        token_col = by_lower.get("token", "Token")
        for row in reader:
            #checked before appending so that max_tokens=0 does not return one token
            if len(tokens) >= max_tokens:
                break
            tokens.append(row[token_col])
    return tokens

#analyze token overlap from saved csv files
def analyze_overlap_from_csv(output_folder, suffix):
    """Compare the top tokens stored in the csv files and save the overlap percentages.

    output_folder: folder that contains the csv files and receives the result.
    suffix: part of the corpus to analyze ("input", "output" or "conversation").

    Uses the globals tokenizer_names, comparisons and N_values. For every pair
    in comparisons and every N in N_values, the percentage of tokens shared by
    the two top-N sets (relative to N) is written to one csv file.
    """
    largest_n = max(N_values)

    #ranked tokens of each tokenizer, read from its csv file
    top_tokens = {
        name: load_top_tokens(
            os.path.join(output_folder, f"tokenizer_{name}_top_tokens_in_{suffix}.csv"),
            max_tokens=largest_n,
        )
        for name in tokenizer_names
    }

    #one row per comparison: its label followed by one percentage per N
    results = []
    for a, b in comparisons:
        ranked_a, ranked_b = top_tokens[a], top_tokens[b]
        percentages = []
        for N in N_values:
            shared = set(ranked_a[:N]) & set(ranked_b[:N])
            percentages.append(f"{len(shared) / N * 100:.2f}")
        results.append([f"{a}-{b}"] + percentages)

    #save results to csv: three new csv files for each model (input, output and conversation)
    output_csv = os.path.join(output_folder, f"comparison_overlap_{suffix}.csv")
    header = ["comparison"] + [f"top {n}" for n in N_values]
    with open(output_csv, mode="w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows([header] + results)
    print(f"Comparison saved to: {output_csv}")

#run for input, output and conversation
#for each model, the csv files generated previously with the most common tokens are in the folder "token_frequencies/{model_name}"
for corpus_part in ("input", "output", "conversation"):
    analyze_overlap_from_csv(f"token_frequencies/{model_name}", corpus_part)
