In [43]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import pandas as pd
import Levenshtein
import evaluate
from torch.utils.data import DataLoader, Dataset
from typing import List

In [28]:
device = 0 if torch.cuda.is_available() else -1  
print(f"Device set to use: {'cuda:0' if device==0 else 'cpu'}")

Device set to use: cuda:0


In [29]:
MODEL_NAME = "vennify/t5-base-grammar-correction"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [30]:
df=pd.read_csv('/kaggle/input/grammar-correction/Grammar Correction.csv')
df.drop(columns={'Serial Number','Error Type'},inplace=True)
df.head()

Unnamed: 0,Ungrammatical Statement,Standard English
0,I goes to the store everyday.,I go to the store everyday.
1,They was playing soccer last night.,They were playing soccer last night.
2,She have completed her homework.,She has completed her homework.
3,He don't know the answer.,He doesn't know the answer.
4,The sun rise in the east.,The sun rises in the east.


In [31]:
df2=df.copy()

In [32]:
corrector = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

Device set to use cuda:0


In [33]:
input_text = df['Ungrammatical Statement'].iloc[0]

predicted = corrector(input_text, max_new_tokens=128)[0]['generated_text']

print("Input:    ", input_text)
print("Corrected:", predicted)
print("Corrected:", df['Standard English'].iloc[0])

Input:     I goes to the store everyday.
Corrected: I go to the store everyday.
Corrected: I go to the store everyday.


In [None]:

texts = df2['Ungrammatical Statement'].tolist()

results = corrector(texts, max_new_tokens=64, batch_size=16)  
preds = [r['generated_text'] for r in results]

df2['predicted'] = preds


In [35]:
df2

Unnamed: 0,Ungrammatical Statement,Standard English,predicted
0,I goes to the store everyday.,I go to the store everyday.,I go to the store everyday.
1,They was playing soccer last night.,They were playing soccer last night.,They were playing soccer last night.
2,She have completed her homework.,She has completed her homework.,She has completed her homework.
3,He don't know the answer.,He doesn't know the answer.,He doesn't know the answer.
4,The sun rise in the east.,The sun rises in the east.,The sun rises in the east.
...,...,...,...
2013,"The festival celebrates music, culture, and to...","The festival celebrates music, culture, and br...","The festival celebrates music, culture, and to..."
2014,The seminar will address topics such as career...,The seminar will address topics such as career...,The seminar will address topics such as career...
2015,The research examines the effects of climate c...,The research examines the effects of climate c...,The research examines the effects of climate c...
2016,"The report highlights the need for investment,...","The report highlights the need for investment,...","The report highlights the need for investment,..."


In [None]:

bleu_metric = evaluate.load("bleu")

y_true = df2['Standard English'].tolist()
y_pred = df2['predicted'].tolist()

references = [[ref] for ref in y_true]

bleu_score = bleu_metric.compute(predictions=y_pred, references=references)
print("BLEU Score (pre-finetuning):", bleu_score["bleu"])


BLEU Score (pre-finetuning): 0.7062136835289252


In [None]:


def precision_recall_fbeta(y_true: List[List[str]], y_pred: List[List[str]], beta: float = 0.5):
    
    assert len(y_true) == len(y_pred), "Predictions and references must have the same length"
    
    total_correct, total_pred, total_true = 0, 0, 0
    
    for ref, pred in zip(y_true, y_pred):
        ref_set, pred_set = set(ref), set(pred)
        
        correct = len(ref_set & pred_set)
        total_correct += correct
        total_pred += len(pred_set)
        total_true += len(ref_set)
    
    precision = total_correct / total_pred if total_pred > 0 else 0
    recall = total_correct / total_true if total_true > 0 else 0
    
    if precision + recall == 0:
        fbeta = 0
    else:
        fbeta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    
    return precision, recall, fbeta



In [45]:

precision, recall, f0_5 = precision_recall_fbeta(df2['Standard English'], df2['predicted'], beta=0.5)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F0.5 Score: {f0_5:.4f}")

Precision: 0.9681
Recall: 0.9681
F0.5 Score: 0.9681


In [42]:
df2['NormalizedEditDistance'] = df2.apply(
    lambda row: Levenshtein.distance(row['Standard English'], row['predicted']) / max(len(row['Standard English']), 1),
    axis=1
)

average_normalized = df2['NormalizedEditDistance'].mean()
print("Average Normalized Edit Distance:", average_normalized)

Average Normalized Edit Distance: 0.15826667032936773


In [46]:
losses = []

for idx, row in df2.iterrows():
    inputs = tokenizer(row['Ungrammatical Statement'], return_tensors="pt", padding=True, truncation=True).to(device)
    labels = tokenizer(row['Standard English'], return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Important: set labels input_ids
    labels_input_ids = labels["input_ids"]
    
    outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=labels_input_ids)
    loss = outputs.loss.item()
    losses.append(loss)

average_loss = sum(losses) / len(losses)
print("Average Cross-Entropy Loss:", average_loss)

Average Cross-Entropy Loss: 0.40006999447627956
