# Grammar Correction

[Thanks to Theitcrow's notebook](https://www.kaggle.com/code/kevinbnisch/grammar-errors-threshold-and-features-aes/notebook)

In [1]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoConfig
import numpy as np
import re
from nltk import sent_tokenize
import re

In [2]:
os.chdir("../../")

In [3]:
from lib.paths import Paths
from lib.utils.utils import seed_everything

In [4]:
seed_everything()

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Model: 
1. https://huggingface.co/juancavallotti/t5-base-gec
2. https://huggingface.co/shashank2123/t5-finetuned-for-GEC
3. https://huggingface.co/fenffef/t5-base-gec

In [6]:
tokenizer = AutoTokenizer.from_pretrained("Unbabel/gec-t5_small")
model = T5ForConditionalGeneration.from_pretrained("Unbabel/gec-t5_small").to(device)

In [7]:
model.save_pretrained("output/T5")
tokenizer.save_pretrained("output/T5/tokenizer")

('output/T5/tokenizer/tokenizer_config.json',
 'output/T5/tokenizer/special_tokens_map.json',
 'output/T5/tokenizer/spiece.model',
 'output/T5/tokenizer/added_tokens.json',
 'output/T5/tokenizer/tokenizer.json')

In [8]:
def correct_sentence(sentences):
    sentences = [
        f"Fix grammatical errors, if any, in this sentence: {sentence}"
        for sentence in sentences
    ]
    input_ids = tokenizer(
        sentences,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).input_ids.to(device)
    outputs = model.generate(input_ids, max_length=128)
    del input_ids
    corrected_sentences = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    del outputs
    return corrected_sentences

In [9]:
correct_sentence([
    "When I grow up, I starti to understand what he said is quite right.",
    "When I grow up, I starti to understand what he said is quite right.",
])

['When I grow up, I start to understand what he said is quite right.',
 'When I grow up, I start to understand what he said is quite right.']

In [10]:
train_df = pd.read_csv(
    Paths.COMPETITION_TRAIN_CSV_PATH,
    usecols=["essay_id", "full_text"],
)

In [11]:
def data_preprocessing(x: str) -> str:
    x = re.sub(r"<[^>]*>", "", x)
    x = re.sub("@\w+", "", x)
    x = re.sub("'\d+", "", x)
    x = re.sub("\d+", "", x)
    x = re.sub(r"http\S+", "", x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x

In [12]:
def process_sentence(df: pd.DataFrame) -> pd.DataFrame:
    df["sentence"] = df["full_text"].map(lambda x: sent_tokenize(x))
    df = df.explode("sentence").reset_index(drop=True)
    df["sentence"] = df["sentence"].map(data_preprocessing)
    return df

In [13]:
sentence_df = process_sentence(train_df.copy(deep=True))
sentence_df.drop(columns=["full_text"], inplace=True)
sentence_df.shape

(330422, 2)

In [14]:
assert (
    train_df.essay_id.unique().shape == sentence_df.essay_id.unique().shape
), f"Expected: {train_df.essay_id.unique().shape}, Got: {sentence_df.essay_id.unique().shape}"

In [15]:
sentence_df["len"] = sentence_df.sentence.map(len)

In [22]:
sentence_df.drop(index=sentence_df[(sentence_df.len < 10)].index, inplace=True)

In [23]:
assert (
    train_df.essay_id.unique().shape == sentence_df.essay_id.unique().shape
), f"Expected: {train_df.essay_id.unique().shape}, Got: {sentence_df.essay_id.unique().shape}"

In [24]:
batch_size = 2048

In [25]:
corrected_sentences = None

for i in range(0, sentence_df.shape[0], batch_size):
    start, end = i, i + batch_size
    sentences = sentence_df["sentence"].iloc[start:end]
    corrected = correct_sentence(sentences)
    corrected = np.array(corrected).flatten()

    if corrected_sentences is None:
        corrected_sentences = corrected
    else:
        corrected_sentences = np.hstack([corrected_sentences, corrected]).flatten()

In [26]:
corrected_sentences.shape

(328524,)

In [27]:
corrected_sentences[:10]

array(['Many people have cars where they live.',
       "The thing they don't know is that when you use a car a lot of things can happen like you can get inaccidet or the smoke that the car has is bad to breathe on if someone is walking, but in VAUBAN, Germany they don't have that problem because percent of Vauban's families do not own cars, and percent sell a car to move there.",
       'Fix grammatical errors, if any, in this sentence: Street parking, driveways and home garages are forbidden on the outskirts of a neighbourhood near the French and Swiss borders.',
       'You probably won\'t see a car in Vauban\'s streets because they are completely "car free" but if some that lives in VAUBAN that owns car ownership is allowed, but there are only two places that you can park a large garage at the edge of the development, where a car owner buys a space but it is not cheap to buy one. They sell the space for your car for $, along with a home.',
       'The vauban people completed this i

In [28]:
sentence_df["corrected"] = corrected_sentences

In [29]:
sentence_df.sample(3)

Unnamed: 0,essay_id,sentence,len,corrected
270156,d055690,The author is using a problem and solution met...,50,"Fix grammatical errors, if any, in this senten..."
276314,d586afa,"In the article of ""The Challenge of Exploring ...",186,"In the article of ""The Challenge of Exploring ..."
119894,5e20c0f,When student dont understand somthing this sys...,177,"When students dont understand something, if an..."


In [30]:
sentence_df.loc[sentence_df.corrected.isna(), "corrected"] = sentence_df.loc[
    sentence_df.corrected.isna(), "sentence"
]

In [31]:
def post_process(text):
    pattern = r"sentence\s*:\s*"
    match = re.search(pattern, text, re.IGNORECASE)

    if match:
        text = text[match.end():]
        
    return text.strip()

In [32]:
sentence_df["corrected"] = sentence_df["corrected"].map(post_process)

In [33]:
assert (
    train_df.essay_id.unique().shape == sentence_df.essay_id.unique().shape
), f"Expected: {train_df.essay_id.unique().shape}, Got: {sentence_df.essay_id.unique().shape}"

In [34]:
train_df.loc[~train_df.index.isin(sentence_df.index), "full_text"]

26       Many people in todays society tend to travel b...
348      In the article "The Challenge of Exploring Ven...
349      a computer can not tell if your happy or if yo...
431      Kids across America probably know someone who ...
568      The advantages of limiting car is great becaus...
                               ...                        
15453    The electorian collage is a very popular thing...
15465    The future iscoming soon everday. And everyday...
15852    In the article "The Challenge of Exploring Ven...
16663    The author supports people to study Venus is b...
16856    Being able to detect other peoples and even yo...
Name: full_text, Length: 97, dtype: object

In [35]:
sentence_df.to_csv(
    "data/feature_engg/grammar_correct.csv",
    index=False,
    columns=["essay_id", "sentence", "corrected"],
)