<a href="https://colab.research.google.com/github/Solenabera/AOGEC-T5/blob/main/AOGEC_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Step 1: Fetch the pretrained T5 checkpoint and the tokenizer that goes with it.
# Both are resolved from the same checkpoint name so they always stay in sync.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Step 2: Define your custom dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes (input, target) sentence pairs.

    Every item is a dict with three 1-D tensors of length ``max_length``:
    ``input_ids``, ``attention_mask`` (True on non-padding positions) and
    ``labels`` (token ids of the target sentence).

    Args:
        data: pandas DataFrame with an ``'input'`` and a ``'target'`` column.
        tokenizer: object exposing ``encode(...)`` and ``pad_token_id``.
        max_length: length every sequence is padded / truncated to.
        mask_label_padding: when True, padding positions in ``labels`` are
            replaced by -100, the index the Hugging Face seq2seq loss is
            documented to ignore. Off by default because such labels can no
            longer be passed straight to ``tokenizer.decode``; callers must
            restore the pad id first.
    """

    def __init__(self, data, tokenizer, max_length=128, mask_label_padding=False):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mask_label_padding = mask_label_padding

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` into a fixed-length 1-D tensor of token ids."""
        encoded = self.tokenizer.encode(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension whenever max_length == 1.
        return encoded.squeeze(0)

    def __getitem__(self, index):
        """Return the tokenized example stored at positional row ``index``."""
        example = self.data.iloc[index]
        input_ids = self._encode(example['input'])
        labels = self._encode(example['target'])

        # Ask the tokenizer for its pad id instead of assuming it is 0.
        pad_token_id = self.tokenizer.pad_token_id
        if self.mask_label_padding:
            labels = labels.masked_fill(labels.eq(pad_token_id), -100)

        return {
            'input_ids': input_ids,
            'attention_mask': input_ids.ne(pad_token_id),
            'labels': labels
        }

In [5]:
# Step 3: Carve the CSV into training, validation and test partitions
csv_file_path = 'datasets.csv'
dataset = pd.read_csv(csv_file_path, encoding='latin1')

# Hold out 5% of all rows for testing, then 5% of what is left for validation.
# The fixed random_state keeps both splits reproducible across runs.
train_data, test_data = train_test_split(dataset, test_size=0.05, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.05, random_state=42)

for split_label, split_frame in (
    ("Training", train_data),
    ("Validation", val_data),
    ("Testing", test_data),
):
    print(f"{split_label} set size:", len(split_frame))

Training set size: 2790
Validation set size: 147
Testing set size: 155


In [6]:
# Step 4: Wrap every split in a CustomDataset so it can feed a DataLoader
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_frame, tokenizer)
    for split_frame in (train_data, val_data, test_data)
)

# The wrapped sizes should match the DataFrame sizes reported by the split step.
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Testing set size: {len(test_dataset)}")

print(val_dataset)

Training set size: 2790
Validation set size: 147
Testing set size: 155
<__main__.CustomDataset object at 0x7b1243c7c0d0>


In [7]:
# Step 5: Batch the training and validation datasets
batch_size = 8  # number of examples per batch
# Only the training loader shuffles; validation keeps its order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [8]:
# Step 6: Baseline — mean per-batch validation loss of the model before fine-tuning
model.eval()

total_loss = 0

# No gradients are needed while only measuring the loss.
with torch.no_grad():
    for batch in val_dataloader:
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'],
        )
        total_loss += outputs.loss.item()

# Average over batches (not over individual examples).
average_loss = total_loss / len(val_dataloader)
print(f'Loss before training: {average_loss:.4f}')

Loss before training: 17.8952


In [None]:
# Step 7: Fine-tune the model
model.train()

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the values the transformers optimizer
# used by default so the optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
epochs = 2 # Default 10
print("Training Started")
for epoch in range(epochs):
    # Report epochs 1-based, consistently with the per-epoch summary below.
    print(f'Epoch: {epoch+1}')
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model return the training loss directly.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Running total of the batch losses seen so far in this epoch.
        print(total_loss)

    # Mean of the per-batch losses for this epoch.
    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1} - Average Loss: {average_loss:.4f}')

print("Training Ended")



Training Started
Epoch: 0
17.721576690673828
32.210777282714844
46.27025127410889
58.51090049743652
73.10059928894043
85.81281566619873
96.73740768432617
108.46265506744385
119.69819831848145
130.9886884689331
141.81834888458252
152.33846759796143
163.2640199661255
173.13247394561768
182.59901523590088
192.64951705932617
201.86326599121094
210.40295886993408
219.20622730255127
227.51930141448975
235.92709636688232
243.57718420028687
251.33315801620483
259.5055537223816
267.4879398345947
274.24182176589966
280.79689836502075
287.4851894378662
293.2657389640808
299.5186777114868
305.46791982650757
311.0170011520386
317.1637806892395
322.64324617385864
328.1463785171509
333.6443748474121
339.1126198768616
343.0228896141052
347.3229022026062
351.28184247016907
355.5818531513214
359.59937024116516
362.6938977241516
365.7818148136139
368.6666314601898
371.26216077804565
374.08408880233765
375.91876804828644
378.499022603035
380.4930566549301
383.61616933345795
386.70592725276947
388.36785590

In [None]:
# Step 8: Re-measure the mean per-batch validation loss once fine-tuning is done
model.eval()

total_loss = 0

# Evaluation only: gradient tracking is switched off.
with torch.no_grad():
    for batch in val_dataloader:
        model_inputs = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask'],
            'labels': batch['labels'],
        }
        total_loss += model(**model_inputs).loss.item()

# Same averaging as the pre-training measurement: total divided by batch count.
average_loss = total_loss / len(val_dataloader)
print(f'Loss after training: {average_loss:.4f}')

Loss after training: 8.2627


In [None]:
# Step 9: Save the fine-tuned model together with its tokenizer, so that the
# 'OutputModel' directory can be reloaded on its own.
model.save_pretrained('OutputModel')
tokenizer.save_pretrained('OutputModel')

In [None]:
# Step 10: Generate grammar error corrections for the test split
# Reload the weights written by the save step so this cell evaluates exactly
# what was stored on disk.
model = T5ForConditionalGeneration.from_pretrained('OutputModel')
model.eval()

predicted_texts = []
correct_texts = []
test_inputs = []

batch_size = 1
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

output_file = 'Testing_Data_Output.txt'

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Negative label ids (e.g. the -100 ignore index) cannot be decoded;
        # map them back to the pad id, which skip_special_tokens then drops.
        labels = labels.masked_fill(labels < 0, tokenizer.pad_token_id)

        # Generate corrections with beam search
        predicted_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128, num_beams=4)

        # Decode every row of the batch so the three lists stay aligned for
        # any batch_size, not just batch_size == 1.
        test_inputs.extend(tokenizer.batch_decode(input_ids, skip_special_tokens=True))
        predicted_texts.extend(tokenizer.batch_decode(predicted_ids, skip_special_tokens=True))
        correct_texts.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

# Explicit UTF-8 so the written file does not depend on the platform default;
# the with-block closes the file, no manual close() is needed.
with open(output_file, 'w', encoding='utf-8') as f:
    for test_input, predicted_text, correct_text in zip(test_inputs, predicted_texts, correct_texts):
        print(f'Test Input: {test_input}')
        print(f'Predicted correction: {predicted_text}')
        print(f'Correct One: {correct_text}')

        f.write(f'Test Input: {test_input}\n')
        f.write(f'Predicted correction: {predicted_text}\n')
        f.write(f'Correct One: {correct_text}\n')

Test Input: grammar: Firoomsaan Leenca ajjeste.
Predicted correction: grammar: Firoomsaan Leenca ajjeste.
Correct One: Firoomsaan Leenca ajjeese.
Test Input: grammar: Isaan Leenca ajjeeste.
Predicted correction: grammar: Isaan Leenca ajjeeste.
Correct One: Isaan Leenca ajjeesan.
Test Input: grammar: Inni kaleessa dhufu.
Predicted correction: grammar: Inni kaleessa dhufu.
Correct One: Inni kaleessa dhufe.
Test Input: grammar: Isaan ulee cabsite.
Predicted correction: grammar: Isaan ulee cabsite.
Correct One: Isaan ulee cabsan.
Test Input: grammar: Guutaan kaleessa deema.
Predicted correction: grammar: Guutaan kaleessa deema.
Correct One: Guutaan kaleessa deeme.
Test Input: grammar: Isheen muka muran.
Predicted correction: grammar: Isheen muka muran.
Correct One: Isheen muka murte.
Test Input: grammar: Isaan kaleessa dhufte.
Predicted correction: grammar: Isaan kaleessa dhufte.
Correct One: Isaan kaleessa dhufan.
Test Input: grammar: Caaltuun muka mure.
Predicted correction: grammar: Caa

In [None]:
def calculate_precision_recall_f1(reference_sentences, hypothesis_sentences):
    """Compute corpus-level word-overlap precision, recall and F1.

    Sentences are paired positionally and split on whitespace. Within each
    pair a hypothesis word counts as a true positive once for every
    occurrence that is also present in the reference (multiset matching);
    unmatched hypothesis words are false positives and unmatched reference
    words are false negatives. Counts are summed over all pairs before the
    ratios are taken.

    Args:
        reference_sentences: iterable of gold-standard sentences (str).
        hypothesis_sentences: iterable of predicted sentences (str), in the
            same order as ``reference_sentences``. Extra items in the longer
            iterable are ignored.

    Returns:
        Tuple ``(precision, recall, f1_score)`` of floats. A ratio whose
        denominator is zero (no words, or no overlap at all) is reported
        as 0.0 instead of raising ZeroDivisionError.
    """
    from collections import Counter  # local import keeps this cell self-contained

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for reference, hypothesis in zip(reference_sentences, hypothesis_sentences):
        reference_counts = Counter(reference.split())
        hypothesis_counts = Counter(hypothesis.split())

        # Multiset intersection: each reference word can be matched only once.
        matched = sum((reference_counts & hypothesis_counts).values())

        true_positives += matched
        false_positives += sum(hypothesis_counts.values()) - matched
        false_negatives += sum(reference_counts.values()) - matched

    predicted_total = true_positives + false_positives
    reference_total = true_positives + false_negatives

    # Guard every division so degenerate inputs yield 0.0 rather than crash.
    precision = true_positives / predicted_total if predicted_total else 0.0
    recall = true_positives / reference_total if reference_total else 0.0
    ratio_sum = precision + recall
    f1_score = (2 * precision * recall) / ratio_sum if ratio_sum else 0.0

    return precision, recall, f1_score

# Normalise predictions AND references the same way before scoring. Stripping
# the final '.' from only one side would leave the last word of every sentence
# unable to match (e.g. 'dhufe' vs 'dhufe.').
TASK_PREFIX = 'grammar: '

def _normalize_for_scoring(sentence):
    """Drop the task prefix and one trailing '.', each only if present."""
    sentence = sentence.strip()
    if sentence.startswith(TASK_PREFIX):
        sentence = sentence[len(TASK_PREFIX):]
    if sentence.endswith('.'):
        sentence = sentence[:-1]
    return sentence

predicted_sentences = [_normalize_for_scoring(sentence) for sentence in predicted_texts]
reference_sentences = [_normalize_for_scoring(sentence) for sentence in correct_texts]

precision, recall, f1_score = calculate_precision_recall_f1(reference_sentences, predicted_sentences)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)

Precision: 0.6666666666666666
Recall: 0.6666666666666666
F1-score: 0.6666666666666666


In [None]:
# NOTE(review): looks like a leftover smoke-test cell; nothing else in the
# visible notebook uses its output — confirm before removing.
print("Hello")