<a href="https://colab.research.google.com/github/Solenabera/AOGEC-BERT/blob/main/AOGEC_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install transformers



In [28]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd

In [29]:
# Step 1: Load the T5 model and tokenizer
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
# Step 2: Define your custom dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        example = self.data.iloc[index]
        input_text = example['input']
        target_text = example['target']

        input_ids = self.tokenizer.encode(input_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        target_ids = self.tokenizer.encode(target_text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')

        return {
            'input_ids': input_ids.squeeze(),
            'attention_mask': input_ids.squeeze().gt(0),
            'labels': target_ids.squeeze()
        }

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 3: Split your dataset into training and validation sets
csv_file_path = 'datasets.csv'
dataset = pd.read_csv(csv_file_path, encoding='latin1')
train_data, test_data = train_test_split(dataset, test_size=0.05, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.05, random_state=42)

print("Training set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Testing set size:", len(test_data))

Training set size: 2790
Validation set size: 147
Testing set size: 155


In [34]:
# Step 4: Create custom datasets for training and validation
train_dataset = CustomDataset(train_data, tokenizer)
val_dataset = CustomDataset(val_data, tokenizer)
test_dataset = CustomDataset(test_data, tokenizer)

print("Training set size:", len(train_dataset))
print("Validation set size:", len(val_dataset))
print("Testing set size:", len(test_dataset))

print(val_dataset)

Training set size: 2790
Validation set size: 147
Testing set size: 155
<__main__.CustomDataset object at 0x79bfb12ee260>


In [35]:
# Step 5: Create data loaders for training and validation
batch_size = 4  # Increase the batch size
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [36]:
# Step 6: Measure the loss function before training
model.eval()

total_loss = 0

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        total_loss += loss.item()

average_loss = total_loss / len(val_dataloader)
print(f'Loss before training: {average_loss:.4f}')

Loss before training: 17.9575


In [None]:
# Step 7: Fine-tune the model
model.train()

optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 2 # Default 10
print("Training Started")
for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        print(total_loss)

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1} - Average Loss: {average_loss:.4f}')

print("Training Ended")

Training Started
Epoch: 0




15.47948169708252
29.773025512695312
43.663923263549805
57.9139928817749
72.38865852355957
85.60444259643555
98.66634845733643
110.78164100646973
122.39564990997314
133.96028995513916
146.2484951019287
156.76288890838623
166.50126552581787
176.4260959625244
186.47482681274414
196.6559076309204
205.7172975540161
215.16497898101807
224.0402135848999
233.1852512359619
242.31921768188477
250.15030145645142
257.94907569885254
265.96845626831055
274.47233295440674
281.6438479423523
288.948118686676
297.51538133621216
305.2451319694519
311.8370966911316
320.0324749946594
327.40478324890137
333.38630867004395
339.78901386260986
345.86026525497437
351.19479990005493
358.13169288635254
363.85344982147217
369.0247507095337
374.421413898468
380.04563188552856
384.5905661582947
389.53480863571167
394.01132917404175
399.2731103897095
403.28811264038086
407.2824983596802
411.70711040496826
415.79375171661377
419.37093114852905
423.1544508934021
426.6487236022949
429.0176384449005
432.17870712280273
4

In [10]:
# Step 8: Measure the loss function after training
model.eval()

total_loss = 0

with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        total_loss += loss.item()

average_loss = total_loss / len(val_dataloader)
print(f'Loss after training: {average_loss:.4f}')

Loss after training: 8.2627


In [11]:
# Step 9: Save the fine-tuned model
model.save_pretrained('OutputModel')

In [17]:
# Step 8: Generate grammar error corrections
model = T5ForConditionalGeneration.from_pretrained('OutputModel')
model.eval()

predicted_texts = []
correct_texts = []
test_inputs = []

batch_size = 1
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

output_file = 'Testing_Data_Output.txt'

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True)

        # Generate corrections
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4)
        predicted_ids = outputs

        # Decode predictions into texts
        predicted_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
        correct_text = tokenizer.decode(labels[0], skip_special_tokens=True)

        # Store predictions, correct texts, and test inputs
        predicted_texts.append(predicted_text)
        correct_texts.append(correct_text)
        test_inputs.extend(input_texts)

with open(output_file, 'w') as f:
    for test_input, predicted_text, correct_text in zip(test_inputs, predicted_texts, correct_texts):
        print(f'Test Input: {test_input}')
        print(f'Predicted correction: {predicted_text}')
        print(f'Correct One: {correct_text}')

        f.write(f'Test Input: {test_input}\n')
        f.write(f'Predicted correction: {predicted_text}\n')
        f.write(f'Correct One: {correct_text}\n')

    f.close()

Test Input: grammar: Firoomsaan Leenca ajjeste.
Predicted correction: grammar: Firoomsaan Leenca ajjeste.
Correct One: Firoomsaan Leenca ajjeese.
Test Input: grammar: Isaan Leenca ajjeeste.
Predicted correction: grammar: Isaan Leenca ajjeeste.
Correct One: Isaan Leenca ajjeesan.
Test Input: grammar: Inni kaleessa dhufu.
Predicted correction: grammar: Inni kaleessa dhufu.
Correct One: Inni kaleessa dhufe.
Test Input: grammar: Isaan ulee cabsite.
Predicted correction: grammar: Isaan ulee cabsite.
Correct One: Isaan ulee cabsan.
Test Input: grammar: Guutaan kaleessa deema.
Predicted correction: grammar: Guutaan kaleessa deema.
Correct One: Guutaan kaleessa deeme.
Test Input: grammar: Isheen muka muran.
Predicted correction: grammar: Isheen muka muran.
Correct One: Isheen muka murte.
Test Input: grammar: Isaan kaleessa dhufte.
Predicted correction: grammar: Isaan kaleessa dhufte.
Correct One: Isaan kaleessa dhufan.
Test Input: grammar: Caaltuun muka mure.
Predicted correction: grammar: Caa

In [18]:
def calculate_precision_recall_f1(reference_sentences, hypothesis_sentences):
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for reference, hypothesis in zip(reference_sentences, hypothesis_sentences):
        # Split the sentences into words for comparison
        reference_words = reference.split()
        hypothesis_words = hypothesis.split()

        # Calculate true positives, false positives, and false negatives
        for word in hypothesis_words:
            if word in reference_words:
                true_positives += 1
                reference_words.remove(word)
            else:
                false_positives += 1

        false_negatives += len(reference_words)

    # Calculate precision, recall, and F1-score
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = (2 * precision * recall) / (precision + recall)

    return precision, recall, f1_score

# Remove the 'grammar: ' prefix from predicted sentences
predicted_sentences = [sentence.split(': ')[1] for sentence in predicted_texts]

# Remove the '.' at the end of predicted sentences
predicted_sentences = [sentence[:-1] for sentence in predicted_sentences]

precision, recall, f1_score = calculate_precision_recall_f1(correct_texts, predicted_sentences)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)

Precision: 0.6666666666666666
Recall: 0.6666666666666666
F1-score: 0.6666666666666666
