In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Load environment variables from the local "key.env" file, then log in to the
# Hugging Face Hub with the HF_TOKEN value (os.getenv returns None when it is unset).
load_dotenv("key.env")
login(os.getenv("HF_TOKEN"))
# Parameters for the experiment.
# `model` is the Hub identifier of the model whose tokenizer is loaded and retrained.
model = "mistralai/Mistral-7B-v0.1"
# Other model identifiers kept for reference; assign one to `model` to use it:
#"meta-llama/Llama-3.1-8B"
#"mistralai/Mistral-7B-v0.3"
#"01-ai/Yi-1.5-9B-Chat"
#"google/gemma-2-9b"
#"deepseek-ai/DeepSeek-R1"

only_input = False #True to train only with human-generated text (the "clean_input" column)
# True selects the *_languagesplit.parquet train/test files,
# False selects the *_randomsplit.parquet ones.
split_by_lang = False

In [2]:
from datasets import Dataset
# Load the train and test corpora for the selected split strategy.
# All four parquet files are generated with create_corpus.ipynb.
# The train file is read first, then the test file.
train_corpus, test_corpus = map(
    Dataset.from_parquet,
    (
        ("train_languagesplit.parquet", "test_languagesplit.parquet")
        if split_by_lang
        else ("train_randomsplit.parquet", "test_randomsplit.parquet")
    ),
)

In [3]:
from tqdm import tqdm
# Column used to train the tokenizer: "clean_input" holds only the "user" texts,
# "clean_conversation" holds both the "user" and "assistant" texts.
column = "clean_input" if only_input else "clean_conversation"

#to avoid loading all the texts into memory
def batch_iterator(dataset, batch_size=1000):
    """Yield the training texts of `dataset` one batch at a time.

    The dataset is sliced in steps of `batch_size` and, for each slice, the
    field named by the module-level `column` variable is yielded. `column` is
    looked up when each batch is produced, not when the iterator is created.
    A tqdm progress bar advances once per batch.

    Args:
        dataset: Object supporting `len()` and slicing, where a slice can be
            indexed by column name (presumably a `datasets.Dataset`).
        batch_size: Number of rows requested per batch.

    Yields:
        The `column` values of rows `[start, start + batch_size)`.
    """
    for start in tqdm(range(0, len(dataset), batch_size), desc="Training Progress"):
        yield dataset[start : start + batch_size][column]


#we always test the tokenization in both "user" and "assistant" texts
def get_test_corpus(test_dataset, batch_size=100):
    """Return a generator over the test texts, one batch at a time.

    The "clean_conversation" column is always used here, regardless of which
    column the tokenizer was trained on.

    Args:
        test_dataset: Object supporting `len()` and slicing, where a slice can
            be indexed by column name (presumably a `datasets.Dataset`).
        batch_size: Number of rows per batch. Defaults to 100. A value of 0
            makes `range` raise ValueError when this function is called.

    Returns:
        A generator yielding the "clean_conversation" values of rows
        `[start, start + batch_size)` for each batch start.
    """
    text_column = "clean_conversation"
    return (
        test_dataset[start : start + batch_size][text_column]
        for start in range(0, len(test_dataset), batch_size)
    )

In [4]:
def tokenize_and_count_from_generator(generator, tokenizer):
    """Count the tokens that `tokenizer` produces over all texts in `generator`.

    Args:
        generator: Iterable of batches, each batch being a list of strings.
        tokenizer: Callable that accepts a list of strings plus the
            `truncation` and `add_special_tokens` keyword arguments and returns
            a mapping whose "input_ids" entry holds one id sequence per text.

    Returns:
        The total number of token ids over every text of every batch
        (0 when `generator` yields nothing).
    """
    total_tokens = 0
    for fragment_list in generator:
        batch_tokenized = tokenizer(
            fragment_list,
            # Truncation must stay off: cutting texts at the tokenizer's maximum
            # length would under-count long texts and bias the comparison
            # between tokenizers.
            truncation=False,
            add_special_tokens=False,
        )
        # "input_ids" holds the numerical identifiers of the tokens, one list per text.
        total_tokens += sum(len(ids) for ids in batch_tokenized["input_ids"])
    return total_tokens

In [None]:
def create_filename(base_name):
    """Build the results CSV file name for the current experiment flags.

    Appends "_only_input" when the module-level `only_input` flag is set and
    "_split_by_lang" when `split_by_lang` is set (in that order), then the
    ".csv" extension.

    Args:
        base_name: File name stem, without extension.

    Returns:
        `base_name` followed by the applicable tags and ".csv".
    """
    # Collect one tag per active flag
    tags = []
    if only_input:
        tags.append("_only_input")
    if split_by_lang:
        tags.append("_split_by_lang")

    # Stem + tags + file extension
    return base_name + "".join(tags) + ".csv"

In [None]:
from transformers import AutoTokenizer
import csv
%pip install --upgrade transformers accelerate

print(f"Processing: {model}")

# Fetch the fast tokenizer published with the selected model
old_tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
print("Original tokenizer loaded")

# Retrain it on the training corpus, streamed batch by batch, targeting the
# same vocabulary size as the original tokenizer. `length` is the number of
# rows in the training corpus.
tokenizer = old_tokenizer.train_new_from_iterator(
    batch_iterator(train_corpus),
    old_tokenizer.vocab_size,
    length=len(train_corpus),
)
print("Retrained tokenizer completed")

In [None]:
# Token count of the test corpus under the retrained tokenizer
tokens_new = tokenize_and_count_from_generator(
    generator=get_test_corpus(test_corpus),
    tokenizer=tokenizer,
)
print("tokens_new obtained")

In [None]:
# Baseline: token count of the same test corpus under the original tokenizer
tokens_old = tokenize_and_count_from_generator(
    generator=get_test_corpus(test_corpus),
    tokenizer=old_tokenizer,
)
print("tokens_old obtained")
# Gain from retraining: percentage of tokens saved relative to the original
# tokenizer (positive when the retrained tokenizer needs fewer tokens)
gain = 100 * (1 - tokens_new / tokens_old)

In [None]:
import os
import csv

# Print the details of the model and token counts
print(f"Model: {model}, Old tokens: {tokens_old}, New tokens: {tokens_new}, Gain: {gain:.2f}%")

# Create the CSV file name (it reflects the only_input / split_by_lang flags)
csv_file = create_filename("conversational_tokenizers")

# The header is needed when the file is missing and also when it exists but is
# still empty; otherwise the rows would be appended to a headerless file.
file_exists = os.path.isfile(csv_file)
needs_header = not file_exists or os.path.getsize(csv_file) == 0

# Append mode creates the file when it does not exist, so a single mode covers
# both the first run and every later one.
with open(csv_file, mode="a", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["model", "tokens_old", "tokens_new", "gain"])

    if needs_header:
        writer.writeheader()

    # Write the single result row for this run
    writer.writerow({
        "model": model,
        "tokens_old": tokens_old,
        "tokens_new": tokens_new,
        "gain": round(gain, 2)  # Round the gain to 2 decimal places
    })

print(f"Results saved in {csv_file}")
