# Grammar Correction using LLM

In [1]:
import torch
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Dataset

In [4]:
# Grammar Correction Dataset : https://www.kaggle.com/datasets/satishgunjal/grammar-correction

data = pd.read_csv("./dataset/Grammar Correction.csv", sep=",")
data.head()

Unnamed: 0,Serial Number,Error Type,Ungrammatical Statement,Standard English
0,1,Verb Tense Errors,I goes to the store everyday.,I go to the store everyday.
1,2,Verb Tense Errors,They was playing soccer last night.,They were playing soccer last night.
2,3,Verb Tense Errors,She have completed her homework.,She has completed her homework.
3,4,Verb Tense Errors,He don't know the answer.,He doesn't know the answer.
4,5,Verb Tense Errors,The sun rise in the east.,The sun rises in the east.


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018 entries, 0 to 2017
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Serial Number            2018 non-null   int64 
 1   Error Type               2018 non-null   object
 2   Ungrammatical Statement  2018 non-null   object
 3   Standard English         2018 non-null   object
dtypes: int64(1), object(3)
memory usage: 63.2+ KB


In [6]:
# count of error types

error_counts = data['Error Type'].value_counts()
print(error_counts)

Error Type
Sentence Structure Errors                         103
Verb Tense Errors                                 100
Article Usage                                     100
Subject-Verb Agreement                            100
Spelling Mistakes                                 100
Preposition Usage                                  95
Punctuation Errors                                 60
Relative Clause Errors                             51
Ambiguity                                          50
Negation Errors                                    50
Tautology                                          50
Mixed Metaphors/Idioms                             50
Incorrect Auxiliaries                              50
Slang, Jargon, and Colloquialisms                  50
Gerund and Participle Errors                       50
Abbreviation Errors                                50
Lack of Parallelism in Lists or Series             50
Agreement in Comparative and Superlative Forms     49
Passive Voice Ove

## Model: T5

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# 80 : 20
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

# 10 : 10
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 1614
Validation set size: 202
Test set size: 202


## Tokenize data

In [10]:
def tokenize_data(data, tokenizer, max_length=128):
    inputs = tokenizer(
        data['Ungrammatical Statement'].tolist(),
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    outputs = tokenizer(
        data['Standard English'].tolist(),
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    return inputs, outputs

# tokenmize datasets
train_inputs, train_outputs = tokenize_data(train_data[['Ungrammatical Statement', 'Standard English']], tokenizer)
val_inputs, val_outputs = tokenize_data(val_data[['Ungrammatical Statement', 'Standard English']], tokenizer)
test_inputs, test_outputs = tokenize_data(test_data[['Ungrammatical Statement', 'Standard English']], tokenizer)

train_inputs = {key: value.to(device) for key, value in train_inputs.items()}
train_outputs = train_outputs['input_ids'].to(device)

val_inputs = {key: value.to(device) for key, value in val_inputs.items()}
val_outputs = val_outputs['input_ids'].to(device)

test_inputs = {key: value.to(device) for key, value in test_inputs.items()}
test_outputs = test_outputs['input_ids'].to(device)

In [11]:
train_inputs

{'input_ids': tensor([[  451, 15687,   160,  ...,     0,     0,     0],
         [  216,   228,    43,  ...,     0,     0,     0],
         [   27,    43,     3,  ...,     0,     0,     0],
         ...,
         [ 1902,     5, 12587,  ...,     0,     0,     0],
         [   37,  1595,   562,  ...,     0,     0,     0],
         [   37,   167,   167,  ...,     0,     0,     0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

## Data Loader

In [12]:
from torch.utils.data import DataLoader, TensorDataset

# tensordatasets
train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_outputs)
val_dataset = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'], val_outputs)
test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_outputs)

# dataloaders
train_loader = DataLoader(train_dataset, batch_size=8)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

In [13]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f02d7bfbd30>

## Fine-Tune T5 model

In [17]:
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# fine-tuning loop
model.train()

epochs = 3

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # training
    model.train()
    train_loss = []
    for batch in tqdm(train_loader, desc='Training..'):
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

    print(f"Training Loss: {np.mean(train_loss)}")

    # validation
    model.eval()
    val_loss = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating..'):
            input_ids, attention_mask, labels = batch
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            val_loss.append(outputs.loss.item())

    print(f"Validation Loss: {np.mean(val_loss)}")

Epoch 1/3


Training..: 100%|██████████| 202/202 [01:44<00:00,  1.94it/s]


Training Loss: 0.04381610620490248


Validating..: 100%|██████████| 26/26 [00:03<00:00,  6.60it/s]


Validation Loss: 0.023145177777713306
Epoch 2/3


Training..: 100%|██████████| 202/202 [01:44<00:00,  1.94it/s]


Training Loss: 0.0223785596162007


Validating..: 100%|██████████| 26/26 [00:03<00:00,  6.66it/s]


Validation Loss: 0.019431156473449215
Epoch 3/3


Training..: 100%|██████████| 202/202 [01:44<00:00,  1.93it/s]


Training Loss: 0.016164672245170073


Validating..: 100%|██████████| 26/26 [00:03<00:00,  6.65it/s]

Validation Loss: 0.017873825863576852





In [18]:
# evaluate on test set

model.eval()
test_loss = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        test_loss += outputs.loss.item()

avg_test_loss = test_loss / len(test_loader)
print(f"Test Loss: {avg_test_loss}")

Test Loss: 0.015719354600430682


In [19]:
# save model and tokenizer

save_directory = "./models/t5_model_finetuned"

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./models/t5_model_finetuned_with_error_type


In [21]:
## convert to zip file

# import shutil
# zip_file_path = "./models/t5_model_finetuned.zip"
# shutil.make_archive(zip_file_path[:-4], 'zip', save_directory)

'/content/models/t5_model_finetuned_with_error_type.zip'

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# load model and tokenizer
load_directory = "./models/t5_model_finetuned"

loaded_model = T5ForConditionalGeneration.from_pretrained(load_directory)
loaded_tokenizer = T5Tokenizer.from_pretrained(load_directory)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = loaded_model.to(device)

print("Model and tokenizer loaded successfully!")

## Model Inference

In [28]:
def correct_sentence(input_sentence, model, tokenizer):
    model.eval()

    # tokenize input sentence
    inputs = tokenizer(input_sentence, return_tensors="pt").to(device)

    # output corrected sentence
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], max_length=50)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# inference
input_sentence = "I am gone to school."
corrected_sentence = correct_sentence(input_sentence, model, tokenizer)
print(f"Corrected Sentence: {corrected_sentence}")

Corrected Sentence: I am going to school.
