In [None]:
# Montar Google Drive para acceder al dataset y otros archivos
from google.colab import drive
drive.mount('/content/drive')

# Instalar la biblioteca transformers si aún no está instalada
!pip install transformers

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW

# Ruta al archivo de datos
dataset_path = '/content/drive/MyDrive/GPI_PIA/Dataset_Final_Limpio_V2.txt'

Mounted at /content/drive


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Cargar el dataset
df = pd.read_csv(dataset_path, sep=',')

# Configurar el modelo y el tokenizer T5
model_name = 't5-small'  # Seleccionar el modelo preentrenado
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
df.head()

Unnamed: 0,Español,Mapudungun
0,se pierde en el agua,ñamkonki ta komu
1,y el que tiene espiritu malo,ka nielunga trafentun
2,por eso me cuesta llegar,feyta newe puwtuwekelan
3,ya ahí de repente me levanté,feymu witruiñpüramen
4,había un solo corazón,mülekefuy kiñe piwke


In [None]:
class TranslationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data.dropna().reset_index(drop=True)  # Eliminar filas con NaN
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data) * 2  # Duplicamos el tamaño para incluir ambas direcciones

    def __getitem__(self, index):
        real_index = index // 2    # Ajustar el índice real
        is_reverse = index % 2 == 1  # Indica si es una traducción inversa (de español a Mapu)

        if is_reverse:
            input_text = f"translate Spanish to Mapudungun: {self.data.iloc[real_index]['Español']}"
            target_text = self.data.iloc[real_index]['Mapudungun']
        else:
            input_text = f"translate Mapudungun to Spanish: {self.data.iloc[real_index]['Mapudungun']}"
            target_text = self.data.iloc[real_index]['Español']

        input_ids = self.tokenizer.encode(input_text, return_tensors='pt', max_length=self.max_length, truncation=True)
        target_ids = self.tokenizer.encode(target_text, return_tensors='pt', max_length=self.max_length, truncation=True)

        return {
            'input_ids': input_ids.flatten(),
            'target_ids': target_ids.flatten()
        }

In [None]:
# Preparar el dataset con DataLoader y padding dinámico
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    target_ids = [item['target_ids'] for item in batch]

    # Padding dinámico
    input_ids = pad_sequence(input_ids, batch_first=True)
    target_ids = pad_sequence(target_ids, batch_first=True)

    return {
        'input_ids': input_ids,
        'target_ids': target_ids
    }

In [None]:
# Crear DataLoader con collate_fn personalizado
dataset = TranslationDataset(df, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

In [None]:
# Verificar si hay una GPU disponible y configurar el dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [None]:
# Mover el modelo y los datos al dispositivo (GPU si está disponible)
model.to(device)

# Configurar el optimizador AdamW
optimizer = AdamW(model.parameters(), lr=1e-4)



In [None]:
# Configurar el modelo en modo de entrenamiento
model.train()

# Bucle de entrenamiento
num_epochs = 5  # Ejemplo, ajustar según sea necesario

In [None]:
for epoch in range(num_epochs):
    total_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        target_ids = batch['target_ids'].to(device)

        # Generar la salida del modelo
        outputs = model(input_ids=input_ids, labels=target_ids)

        loss = outputs.loss
        total_loss += loss.item()

        # Realizar la retropropagación y la actualización de parámetros
        loss.backward()
        optimizer.step()

    # Imprimir la pérdida promedio por epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader)}')

In [None]:
# Guardar el modelo entrenado si es necesario
model.save_pretrained('/content/drive/MyDrive/GPI_PIA/translator_model_V2')


print("Entrenamiento completado.")

Entrenamiento completado.


Se puede observar que en las 5 epochs de prueba, la loss constantemente disminuyó. Si se le aplican más epochs, el modelo podría ser más preciso, por lo tanto, se recomienda entrar en más epochs (se demoró 3 horas y 25 min aprox. en realizar las 5 epochs, ad portas de llegar al máximo de tiempo de ejecución de GPU gratis de Google Colab)

El resultado de las 5 epochs en términos de Loss es:
Epoch [1/5], Loss: 1.1375518550490418
Epoch [2/5], Loss: 0.8440102795699922
Epoch [3/5], Loss: 0.7538309259522425
Epoch [4/5], Loss: 0.7007055242611164
Epoch [5/5], Loss: 0.6624663313597254

Se espera que con al menos 10 epochs, la precisión del modelo aumente considerablemente.

Es recomendable, realizar la ejecución del entrenamiento con más epochs (al menos 10) en un entorno local QUE POSEA GPU.