<a href="https://colab.research.google.com/github/Solenabera/AOGEC-BERT/blob/main/AOGEC_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import random
import csv
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
import matplotlib.pyplot as plt

In [57]:
# Set random seed for reproducibility
random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7deb257c5510>

In [58]:
# Define the paths to your dataset file
dataset_file = "datasets.csv"

# Read the dataset file
with open(dataset_file, "r", encoding="latin1") as file:
    dataset_reader = csv.reader(file)
    next(dataset_reader)  # Skip the header row
    dataset = list(dataset_reader)

# Shuffle the dataset
random.shuffle(dataset)

In [59]:
# Split the dataset into train, validation, and test datasets
train_ratio = 0.8  # Ratio of examples for training
valid_ratio = 0.1  # Ratio of examples for validation
test_ratio = 0.1  # Ratio of examples for testing

train_size = int(len(dataset) * train_ratio)
valid_size = int(len(dataset) * valid_ratio)

train_dataset = dataset[:train_size]
valid_dataset = dataset[train_size:train_size + valid_size]
test_dataset = dataset[train_size + valid_size:]


In [60]:
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_sentence, target_sentence = self.data[index]

        # Remove "grammar:" prefix from input sentence
        input_sentence = input_sentence.replace("grammar: ", "")

        # Tokenize the input and target sentences
        input_tokens = self.tokenizer.tokenize(input_sentence)
        target_tokens = self.tokenizer.tokenize(target_sentence)

        input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)
        target_ids = self.tokenizer.convert_tokens_to_ids(target_tokens)

        return {
            "input_ids": input_ids,
            "target_ids": target_ids
        }

In [61]:
# Define a custom collate function
def collate_fn(batch):
    input_ids = [torch.tensor(item["input_ids"]) for item in batch]
    target_ids = [torch.tensor(item["target_ids"]) for item in batch]

    # Pad sequences to the same length
    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
    target_ids = torch.nn.utils.rnn.pad_sequence(target_ids, batch_first=True)

    return {
        "input_ids": input_ids,
        "target_ids": target_ids
    }

In [62]:
# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
# Create dataloaders for train, validation, and test datasets
train_dataset = CustomDataset(train_dataset, tokenizer)
valid_dataset = CustomDataset(valid_dataset, tokenizer)
test_dataset = CustomDataset(test_dataset, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

In [64]:
# Define the training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

train_losses = []
valid_losses = []

num_epochs = 1

for epoch in range(num_epochs):
    train_loss = 0.0
    valid_loss = 0.0

    model.train()
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        target_ids = batch["target_ids"].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, labels=target_ids)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids = batch["input_ids"].to(device)
            target_ids = batch["target_ids"].to(device)

            outputs = model(input_ids, labels=target_ids)
            loss = outputs.loss

            valid_loss += loss.item()

    # Calculate average training and validation loss
    train_loss /= len(train_dataloader)
    valid_loss /= len(valid_dataloader)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss}, Valid Loss: {valid_loss}")

ValueError: Expected input batch_size (16) to match target batch_size (368).

In [None]:
# Save the trained model
torch.save(model.state_dict(), "aogec_bert_model.pth")


In [None]:
# Plot the training and validation loss
plt.plot(range(1, num_epochs + 1), train_losses, label="Train Loss")
plt.plot(range(1, num_epochs + 1), valid_losses, label="Valid Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()