<a href="https://colab.research.google.com/github/Solenabera/AOGEC-BERT/blob/main/AOGEC_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
# Step 1: Load the T5 model and tokenizer
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Step 2: Define your custom dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        example = self.data.iloc[index]
        input_text = example['input']
        target_text = example['target']

        input_ids = self.tokenizer.encode(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        target_ids = self.tokenizer.encode(target_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

        return {
            'input_ids': input_ids.squeeze(),
            'attention_mask': input_ids.squeeze().gt(0),
            'labels': target_ids.squeeze()
        }

In [6]:
# Step 3: Split your dataset into training and validation sets
csv_file_path = 'Data - 04.csv'
dataset = pd.read_csv(csv_file_path)
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

In [7]:
# Step 4: Create custom datasets for training and validation
train_dataset = CustomDataset(train_data, tokenizer)
val_dataset = CustomDataset(val_data, tokenizer)

In [8]:
# Step 5: Create data loaders for training and validation
batch_size = 2  # Default 4
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [9]:
# Step 6: Measure the loss function before training
model.eval()

total_loss = 0

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        total_loss += loss.item()

average_loss = total_loss / len(val_dataloader)
print(f'Loss before training: {average_loss:.4f}')

Loss before training: 16.2931


In [10]:
# Step 7: Fine-tune the model
model.train()

optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 1 # Default 10
print("Training Started")
for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        print(total_loss)

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1} - Average Loss: {average_loss:.4f}')

print("Training Ended")



Training Started
Epoch: 0
12.405478477478027
25.731800079345703
38.628366470336914
51.75589370727539
65.03003215789795
77.35015296936035
Epoch 1 - Average Loss: 12.8917
Training Ended


In [11]:
# Step 8: Measure the loss function after training
model.eval()

total_loss = 0

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        total_loss += loss.item()

average_loss = total_loss / len(val_dataloader)
print(f'Loss after training: {average_loss:.4f}')

Loss after training: 12.3265


In [12]:
# Step 8: Generate grammar error corrections
model.eval()

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Generate corrections
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4)
        predicted_ids = outputs

        # Decode predictions into texts
        predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

        # Print predicted corrections
        for predicted_text in predicted_texts:
            print(f'Predicted correction: {predicted_text}')

Predicted correction: sareen doolaara kuma afur nyaattee maatii harka qullaatti hambise.
Predicted correction: False
Predicted correction: Otoo si maal gootu?


In [13]:
# Step 9: Save the fine-tuned model
model.save_pretrained('OutputModel')

In [14]:
model = T5ForConditionalGeneration.from_pretrained('OutputModel')
csv_file_path = 'Data - 04.csv'
dataset = pd.read_csv(csv_file_path)
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

test_dataset = CustomDataset(test_data, tokenizer)

batch_size = 2  # Default 4
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [15]:
MAX_LENGTH = 512

In [16]:
model.eval()

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=MAX_LENGTH)
        predicted_ids = outputs

        predicted_texts = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
        label_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

        for predicted_text, label_text in zip(predicted_texts, label_texts):
            print(f'Predicted: {predicted_text}')
            print(f'Correct: {label_text}')
            print('---')

Predicted: False
Correct: Sareen doolaara kuma afur nyaattee maatii harka qullaatti hambiste.
---
Predicted: False
Correct: Sareen doolaara kuma afur nyaatee maatii harka qullaatti hambise.
---
Predicted: Otoo si maal gootu?
Correct: Otoo isinii maal gootu?
---
