In [48]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, AutoTokenizer
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu

In [49]:
# Experiment variables

input_data = "/xdisk/josorio1/aconverse/GEC_BETO/ling-582-fall-2024-course-project-code-skitcon/data/COWS-L2H-unlabeled-STRICT.txt"
model_name = "vgaraujov/bart-base-spanish"

epochs = 16
batch_size = 16

train_size = .8

In [50]:
# Dataset Definition

class Seq2SeqTextDataset(Dataset):
    def __init__(self, x, y, tokenizer, max_length=128):
        self.samples = list(zip(x, y))
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        source, target = self.samples[idx]
        source_enc = self.tokenizer(
            source,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        target_enc = self.tokenizer(
            target,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": source_enc["input_ids"].squeeze(),
            "attention_mask": source_enc["attention_mask"].squeeze(),
            "labels": target_enc["input_ids"].squeeze(),
        }

In [51]:
def load_data(file_path):
    samples = []

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        
    for i in range(0, len(lines), 3):  # Assumes blank line separates pairs
        if i + 1 < len(lines):
            source = lines[i].strip()
            target = lines[i + 1].strip()
            if source and target:
                samples.append((source, target))
    return samples

def eval_model_bleu(model, dataloader, device="cuda"):
    model.eval()
    references = []  # List of reference sentences (ground truth)
    hypotheses = []  # List of generated sentences (model predictions)

    with torch.no_grad():
        for batch in dataloader:
            # Move inputs to the device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Generate predictions
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)

            # Decode predictions and labels
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # Append references and hypotheses
            references.extend([[ref.split()] for ref in decoded_labels])
            hypotheses.extend([hyp.split() for hyp in decoded_preds])

    # Calculate BLEU score
    return corpus_bleu(references, hypotheses)

In [52]:
# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

In [53]:
# Load dataset

base_data = load_data(input_data)

train_x, test_x, train_y, test_y = train_test_split(*zip(*base_data), train_size=0.8, random_state=42)

train_dataset = Seq2SeqTextDataset(train_x, train_y, tokenizer)
test_dataset = Seq2SeqTextDataset(test_x, test_y, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
for epoch in range(epochs):
    
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss = total_loss / len(train_dataloader)

    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            total_loss += outputs.loss.item()
    test_loss = total_loss / len(test_dataloader)

    train_bleu_score = eval_model_bleu(model, train_dataloader, device=device)
    test_bleu_score = eval_model_bleu(model, test_dataloader, device=device)

    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {test_loss:.4f}")
    print(f"Train BLEU: {train_bleu_score:.4f}")
    print(f"Test BLEU: {test_bleu_score:.4f}")

# Save model
model.save_pretrained("./seq2seq_gec_model")

Epoch 1/16
Train Loss: 0.1444
Validation Loss: 0.0470
Train BLEU: 0.7829
Test BLEU: 0.7746
Epoch 2/16
Train Loss: 0.0457
Validation Loss: 0.0425
Train BLEU: 0.8045
Test BLEU: 0.7886
