In [2]:
# Importar las bibliotecas necesarias
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score


In [None]:
# Select the compute device: use the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Usando dispositivo: {device}")


In [None]:
# Read the Spanish-Mapudungun translation pairs from a CSV file
csv_file = 'ruta/a/tu/archivo.csv'  # Replace with the path to the actual dataset file
data = pd.read_csv(filepath_or_buffer=csv_file)

# Preview the first rows to confirm the file was loaded correctly
data.head(n=5)


In [None]:
# Custom dataset that tokenizes translation pairs on demand
class TranslationDataset(Dataset):
    """Map-style dataset that yields tokenized (source, target) translation pairs.

    Args:
        data: pandas DataFrame holding one translation pair per row.
        source_column: Name of the column containing the source text.
        target_column: Name of the column containing the target text.
        tokenizer: Optional tokenizer callable. When omitted, the notebook-level
            ``tokenizer`` variable is looked up each time an item is fetched,
            so it only has to exist before the dataset is iterated.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, data, source_column, target_column, tokenizer=None, max_length=128):
        self.data = data
        self.source_column = source_column
        self.target_column = target_column
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (translation pairs)."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` into fixed-length PyTorch tensors."""
        # Fall back to the notebook-level tokenizer when none was injected
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        return tok(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at position ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` (from the
            source text) and ``labels`` (token ids of the target text).
        """
        # Samplers hand out positions (0..len-1), not index labels, so use
        # positional lookup; `.loc` breaks when the DataFrame index is not 0..n-1
        # (for example after filtering rows).
        source_text = self.data[self.source_column].iloc[idx]
        target_text = self.data[self.target_column].iloc[idx]

        source_encoding = self._encode(source_text)
        target_encoding = self._encode(target_text)

        # The tokenizer returns tensors with a leading batch dimension of 1;
        # flatten() drops it so the DataLoader can stack the items itself.
        # NOTE: items are placed on `device` here, so on a GPU this dataset must
        # be loaded in the main process (DataLoader num_workers=0, no pinning).
        return {
            'input_ids': source_encoding['input_ids'].flatten().to(device),
            'attention_mask': source_encoding['attention_mask'].flatten().to(device),
            'labels': target_encoding['input_ids'].flatten().to(device)
        }


In [None]:
# Load the pretrained T5 tokenizer and model, then place the model on the selected device
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = model.to(device)


In [None]:
# Build the dataset and split it into training (80%) and validation (20%) subsets
translation_dataset = TranslationDataset(data, 'Español', 'Mapudungun')
total_size = len(translation_dataset)
train_size = int(0.8 * total_size)
val_size = total_size - train_size

# Seed the split so the same rows land in each subset on every run
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    translation_dataset, [train_size, val_size], generator=split_generator
)

# Create the DataLoaders for training and validation.
# TranslationDataset.__getitem__ returns tensors that are already on `device`.
# On a GPU, worker processes cannot create CUDA tensors after the parent has
# initialised CUDA, and CUDA tensors cannot be pinned, so batches are loaded in
# the main process without pinning.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, pin_memory=False)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=0, pin_memory=False)


In [None]:
# Token-level accuracy of the model's predictions
def calculate_accuracy(predictions, targets, ignore_index=-100):
    """Return the fraction of target tokens whose arg-max prediction is correct.

    Args:
        predictions: Tensor of scores whose last dimension ranges over the
            candidate token ids; the arg-max over that dimension is the
            predicted id.
        targets: Tensor of target ids with as many elements as ``predictions``
            has once its last dimension is reduced.
        ignore_index: Target value marking positions that must not be scored.
            Defaults to -100; targets without that value are scored in full.

    Returns:
        Accuracy as a Python float in [0, 1]; 0.0 when no position is scored.
    """
    # Flatten both sides so only the element count has to agree
    pred_ids = torch.argmax(predictions, dim=-1).reshape(-1)
    flat_targets = targets.reshape(-1).to(pred_ids.device)

    keep = flat_targets != ignore_index
    scored = int(keep.sum().item())
    if scored == 0:
        return 0.0

    correct = int((pred_ids[keep] == flat_targets[keep]).sum().item())
    return correct / scored


In [None]:
# Configure the optimizer and the learning-rate scheduler.
# The AdamW re-exported by transformers is deprecated in favour of
# torch.optim.AdamW. eps and weight_decay are passed explicitly because the
# PyTorch defaults (1e-8, 0.01) differ from the transformers defaults
# (1e-6, 0.0) this notebook was previously relying on.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
num_epochs = 2
# One scheduler step is taken per optimizer step, i.e. per training batch
num_training_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


In [None]:
# Training loop
def _run_epoch(dataloader, training):
    """Run one full pass over ``dataloader`` and return (mean loss, mean accuracy).

    When ``training`` is true the model is put in train mode and the optimizer
    and scheduler are stepped after every batch; otherwise the model is put in
    eval mode and no gradients are tracked. Both means are per-batch averages.
    """
    if training:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    total_acc = 0.0

    with torch.set_grad_enabled(training):
        for batch in dataloader:
            # No-op when the dataset has already placed the tensors on `device`
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Label positions set to -100 are ignored by the model's loss, so
            # padding no longer contributes to the loss.
            labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

            if training:
                optimizer.zero_grad()

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            if training:
                # Backward pass
                loss.backward()
                optimizer.step()
                scheduler.step()

            total_loss += loss.item()

            # Score only real target tokens; counting padded positions would
            # inflate the accuracy.
            real_tokens = labels != -100
            total_acc += calculate_accuracy(
                outputs.logits.detach()[real_tokens], labels[real_tokens]
            )

    # Guard against an empty dataloader (e.g. a validation split of size 0)
    num_batches = max(len(dataloader), 1)
    return total_loss / num_batches, total_acc / num_batches


for epoch in range(num_epochs):
    avg_train_loss, avg_train_acc = _run_epoch(train_dataloader, training=True)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}, Training Accuracy: {avg_train_acc}')

    # Validation
    avg_val_loss, avg_val_acc = _run_epoch(val_dataloader, training=False)
    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_acc}')
