In [1]:
# Instalar las bibliotecas necesarias si aún no están instaladas
# Ejecuta estas líneas de instalación en tu terminal o anótalas en un script separado de instalación
# !pip install transformers pandas torch

!pip3 install pandas
!pip3 install torch transformers
!pip3 install scikit-learn





[notice] A new release of pip available: 22.3 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score

# Configurar el dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Usando dispositivo: {device}")

# Configurar el modelo y el tokenizer T5
model_name = 't5-small'  # Seleccionar el modelo preentrenado
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Cargar el dataset
dataset_path = 'C:/Users/olate/OneDrive/Escritorio/PIA Traductor/Dataset_Final_Limpio_V2.txt'
df = pd.read_csv(dataset_path, sep=',')
df.head()

class TranslationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data.dropna().reset_index(drop=True)  # Eliminar filas con NaN
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data) * 2  # Duplicamos el tamaño para incluir ambas direcciones

    def __getitem__(self, index):
        real_index = index // 2    # Ajustar el índice real
        is_reverse = index % 2 == 1  # Indica si es una traducción inversa (de español a Mapu)

        if is_reverse:
            input_text = f"translate Spanish to Mapudungun: {self.data.iloc[real_index]['Español']}"
            target_text = self.data.iloc[real_index]['Mapudungun']
        else:
            input_text = f"translate Mapudungun to Spanish: {self.data.iloc[real_index]['Mapudungun']}"
            target_text = self.data.iloc[real_index]['Español']

        input_ids = self.tokenizer.encode(input_text, return_tensors='pt', max_length=self.max_length, truncation=True)
        target_ids = self.tokenizer.encode(target_text, return_tensors='pt', max_length=self.max_length, truncation=True)

        return {
            'input_ids': input_ids.flatten(),
            'target_ids': target_ids.flatten()
        }

# Preparar el dataset con DataLoader y padding dinámico
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    target_ids = [item['target_ids'] for item in batch]

    # Padding dinámico
    input_ids = pad_sequence(input_ids, batch_first=True)
    target_ids = pad_sequence(target_ids, batch_first=True)

    return {
        'input_ids': input_ids,
        'target_ids': target_ids
    }

# Crear DataLoader con collate_fn personalizado
dataset = TranslationDataset(df, tokenizer)

# Dividir el dataset en entrenamiento y validación
total_size = len(dataset)
train_size = int(0.8 * total_size) # 80% para entrenamiento
val_size = total_size - train_size # 20% para validación

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Crear DataLoader para entrenamiento y validación
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Función para calcular la precisión
def calculate_accuracy(predictions, targets):
    pred_id = torch.argmax(predictions, dim=-1)
    return accuracy_score(targets.cpu().numpy().flatten(), pred_id.cpu().numpy().flatten())

# Configurar el optimizador y el scheduler
optimizer = AdamW(model.parameters(), lr=1e-4)
num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Entrenamiento del modelo
best_val_loss = float('inf')
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_accuracy = 0

    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")

    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        target_ids = batch['target_ids'].to(device)

        outputs = model(input_ids=input_ids, labels=target_ids)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()
        accuracy = calculate_accuracy(logits, target_ids)
        total_accuracy += accuracy

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({
            'loss': f"{loss.item():.4f}",
            'accuracy': f"{accuracy:.4f}",
            'learning_rate': f"{scheduler.get_last_lr()[0]:.6f}"
        })
    
    avg_loss = total_loss / len(train_dataloader)
    avg_accuracy = total_accuracy / len(train_dataloader)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Average Loss: {avg_loss:.4f}")
    print(f"  Average Accuracy: {avg_accuracy:.4f}")
    print(f"  Learning Rate: {scheduler.get_last_lr()[0]:.6f}")

    # Evaluación en el conjunto de validación
    model.eval()
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            target_ids = batch['target_ids'].to(device)

            outputs = model(input_ids=input_ids, labels=target_ids)
            val_loss += outputs.loss.item()
            val_accuracy += calculate_accuracy(outputs.logits, target_ids)
    
    avg_val_loss = val_loss / len(val_dataloader)
    avg_val_accuracy = val_accuracy / len(val_dataloader)
    print(f" Validation Loss: {avg_val_loss:.4f}")
    print(f" Validation Accuracy: {avg_val_accuracy:.4f}")

    # Guardar el mejor modelo basado en la pérdida de validación
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        print(" Nuevo mejor modelo guardado")

print("Entrenamiento completado")

# Guardar el modelo entrenado
model.save_pretrained('/ruta/para/guardar/translator_model_final')
print("Modelo guardado")


  from .autonotebook import tqdm as notebook_tqdm


Usando dispositivo: cuda


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Epoch 1/5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40323/40323 [1:14:43<00:00,  8.99it/s, loss=1.3100, accuracy=0.7403, learning_rate=0.000080]


Epoch 1/5
  Average Loss: 1.1584
  Average Accuracy: 0.7542
  Learning Rate: 0.000080
 Validation Loss: 0.8619
 Validation Accuracy: 0.8029
 Nuevo mejor modelo guardado


Epoch 2/5: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40323/40323 [1:15:16<00:00,  8.93it/s, loss=1.0623, accuracy=0.7682, learning_rate=0.000060]


Epoch 2/5
  Average Loss: 0.8964
  Average Accuracy: 0.7948
  Learning Rate: 0.000060
 Validation Loss: 0.7661
 Validation Accuracy: 0.8193
 Nuevo mejor modelo guardado


Epoch 3/5:   1%|▉                                                                                                                                                                   | 231/40323 [00:26<1:15:49,  8.81it/s, loss=1.2123, accuracy=0.7535, learning_rate=0.000060]

In [1]:
import torch
print(torch.cuda.is_available())  # Esto debería devolver True si la GPU está disponible
print(torch.cuda.device_count())  # Esto debería mostrar cuántas GPUs hay disponibles
print(torch.cuda.get_device_name(0))  # Esto debería mostrar el nombre de tu GPU


True
1
NVIDIA GeForce RTX 3050
