In [None]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Load variables from key.env (if present) and log in to the Hugging Face Hub
# with the value of HF_TOKEN.
load_dotenv("key.env")
login(os.getenv("HF_TOKEN"))
# Parameters for the experiment.
# `model` is the Hub identifier whose tokenizer is loaded and retrained.
model = "google/gemma-2-9b"
# Other model identifiers that can be assigned to `model`:
#"mistralai/Mistral-7B-v0.1"
#"meta-llama/Llama-3.1-8B"
#"mistralai/Mistral-7B-v0.3"
#"01-ai/Yi-1.5-9B-Chat"
#"google/gemma-2-9b"
#"deepseek-ai/DeepSeek-R1"

# Text column used to retrain the tokenizer; it also determines the results
# file name.
mode_training = "output" # one of "input", "output", "conversation"
# True -> use the *_languagesplit parquet files, False -> the *_randomsplit ones.
split_by_lang = False
# Number of training rows handed to the tokenizer trainer per batch.
batch_size = 100

In [None]:
from datasets import Dataset
# Resolve which pair of parquet files to read for the configured split
# strategy. Both pairs are generated with create_corpus.ipynb.
if not split_by_lang:
    train_path = "train_randomsplit.parquet"
    test_path = "test_randomsplit.parquet"
else:
    train_path = "train_languagesplit.parquet"
    test_path = "test_languagesplit.parquet"

train_corpus = Dataset.from_parquet(train_path)
test_corpus = Dataset.from_parquet(test_path)

In [None]:
from tqdm import tqdm


#to avoid loading all the texts into memory
def batch_iterator(dataset, batch_size=1000, mode = "conversation"): #used for training
    """Yield one text column of ``dataset`` in consecutive batches.

    The dataset is sliced ``batch_size`` rows at a time so that the texts are
    never materialised all at once. Progress is reported through ``tqdm``.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping from column name to the values of that column.
        batch_size: Number of rows per yielded batch.
        mode: Which column to read: ``"input"``, ``"output"`` or
            ``"conversation"``.

    Yields:
        The values of the selected column for each consecutive slice.

    Raises:
        KeyError: If ``mode`` is not one of the supported values. Because this
            is a generator function, the error surfaces on first iteration.
    """
    column_map = {
        "input": "clean_input",
        "output": "clean_output",  # make sure this column exists in the dataset
        "conversation": "clean_conversation",
    }
    column = column_map[mode]

    for start in tqdm(range(0, len(dataset), batch_size), desc="Training Progress"):
        yield dataset[start : start + batch_size][column]



#to test the number of tokens generated for input and output separately:

def get_test_corpus_input(test_dataset, batch_size=100):
    """Return a generator over batches of the ``clean_input`` column.

    Args:
        test_dataset: Object supporting ``len()`` and slice indexing, where a
            slice returns a mapping from column name to that column's values.
        batch_size: Number of rows per batch. Defaults to 100, the value that
            was previously hard-coded.

    Returns:
        A generator producing, for each consecutive slice of ``batch_size``
        rows, the ``clean_input`` values of that slice.
    """
    column = "clean_input"
    return (
        test_dataset[start : start + batch_size][column]
        for start in range(0, len(test_dataset), batch_size)
    )

def get_test_corpus_output(test_dataset, batch_size=100):
    """Return a generator over batches of the ``clean_output`` column.

    Args:
        test_dataset: Object supporting ``len()`` and slice indexing, where a
            slice returns a mapping from column name to that column's values.
        batch_size: Number of rows per batch. Defaults to 100, the value that
            was previously hard-coded.

    Returns:
        A generator producing, for each consecutive slice of ``batch_size``
        rows, the ``clean_output`` values of that slice.
    """
    column = "clean_output"
    return (
        test_dataset[start : start + batch_size][column]
        for start in range(0, len(test_dataset), batch_size)
    )
    

In [None]:
def tokenize_and_count_from_generator(generator, tokenizer):
    """Count the tokens a tokenizer produces over batches of texts.

    Args:
        generator: Iterable yielding batches (lists) of text fragments.
        tokenizer: Callable accepting a batch of texts plus the ``truncation``
            and ``add_special_tokens`` keyword arguments, and returning a
            mapping whose ``"input_ids"`` entry holds one id sequence per text.

    Returns:
        The total number of token ids over every text of every batch
        (0 when the generator yields nothing).
    """
    total_tokens = 0
    for fragment_list in generator:
        # Truncation is explicitly disabled: clipping long texts to the
        # tokenizer's maximum length would under-count their tokens and bias
        # any comparison between tokenizers. Special tokens are left out so
        # only the text itself is measured.
        batch_tokenized = tokenizer(
            fragment_list,
            truncation=False,
            add_special_tokens=False,
        )
        # "input_ids" holds the numerical identifiers of the tokens.
        total_tokens += sum(len(ids) for ids in batch_tokenized["input_ids"])
    return total_tokens

In [None]:
def create_filename(base_name, mode, split_by_lang):
    """Build the results CSV file name for an experiment configuration.

    Args:
        base_name: Leading part of the file name.
        mode: ``"input"``, ``"output"`` or ``"conversation"``; any other value
            raises ``KeyError``.
        split_by_lang: When truthy, ``"_split_by_lang"`` is appended before
            the extension.

    Returns:
        ``base_name`` followed by the mode tag, the optional split marker and
        the ``.csv`` extension.
    """
    mode_tags = {
        "input": "only_input",
        "output": "only_output",
        "conversation": "conversation",
    }
    mode_tag = mode_tags[mode]

    split_tag = "_split_by_lang" if split_by_lang else ""
    return base_name + f"_{mode_tag}" + split_tag + ".csv"

In [None]:
# %pip install --upgrade transformers accelerate
from transformers import AutoTokenizer
import csv


print(f"Processing: {model}")

# Fetch the fast tokenizer published with the selected model.
old_tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
print("Original tokenizer loaded")

# Retrain it on batches drawn from the training corpus, keeping the original
# vocabulary size; `length` is the number of rows in the training corpus.
tokenizer = old_tokenizer.train_new_from_iterator(
    batch_iterator(train_corpus, batch_size, mode_training),
    vocab_size=old_tokenizer.vocab_size,
    length=len(train_corpus),
)
print("Retrained tokenizer completed")

In [None]:
# Token count of the test-corpus inputs under the retrained tokenizer
tokens_new_input = tokenize_and_count_from_generator(
    generator=get_test_corpus_input(test_corpus),
    tokenizer=tokenizer,
)
print("tokens_new_input obtained")

In [None]:
# Token count of the test-corpus inputs under the original tokenizer
tokens_old_input = tokenize_and_count_from_generator(
    generator=get_test_corpus_input(test_corpus),
    tokenizer=old_tokenizer,
)
print("tokens_old_input obtained")
# Percentage of tokens saved on the input (user) texts by the retraining
gain_input = 100 * (1 - tokens_new_input / tokens_old_input)

In [None]:
# Token count of the test-corpus outputs under the retrained tokenizer
tokens_new_output = tokenize_and_count_from_generator(
    generator=get_test_corpus_output(test_corpus),
    tokenizer=tokenizer,
)
print("tokens_new_output obtained")

In [None]:
# Token count of the test-corpus outputs under the original tokenizer
tokens_old_output = tokenize_and_count_from_generator(
    generator=get_test_corpus_output(test_corpus),
    tokenizer=old_tokenizer,
)
print("tokens_old_output obtained")
# Percentage of tokens saved on the output (assistant) texts by the retraining
gain_output = 100 * (1 - tokens_new_output / tokens_old_output)

In [None]:
import os
import csv

# Print the details of the model and token counts

print(f"Modelo: {model}, Tokens antiguos input: {tokens_old_input}, Tokens nuevos input: {tokens_new_input}, Ganancia input: {gain_input:.2f}%, Tokens antiguos output: {tokens_old_output}, Tokens nuevos output: {tokens_new_output}, Ganancia output: {gain_output:.2f}% ")

# Create the CSV file name
csv_file = create_filename("conversational_tokenizers", mode_training, split_by_lang)

# Check if the file already exists
file_exists = os.path.isfile(csv_file)

# The header is needed when the file is missing, and also when it exists but
# is still empty; otherwise the rows would have no header at all.
needs_header = not file_exists or os.path.getsize(csv_file) == 0

# Append mode creates the file when it does not exist, so a single mode covers
# both the first run and every later run.
with open(csv_file, mode="a", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=[
        "model",
        "tokens_old_input", "tokens_new_input", "gain_input",
        "tokens_old_output", "tokens_new_output", "gain_output"
    ])
    if needs_header:
        writer.writeheader()

    # Write the single result row for this run
    writer.writerow({
        "model": model,
        "tokens_old_input": tokens_old_input,
        "tokens_new_input": tokens_new_input,
        "gain_input": round(gain_input, 2),  # Round the gain to 2 decimal places
        "tokens_old_output": tokens_old_output,
        "tokens_new_output": tokens_new_output,
        "gain_output": round(gain_output, 2),  # Round the gain to 2 decimal places
    })

print(f"Results saved in {csv_file}")
