# Grammar Correction

[Thanks to Theitcrow's notebook](https://www.kaggle.com/code/kevinbnisch/grammar-errors-threshold-and-features-aes/notebook)

In [1]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoConfig
import numpy as np
import re
from nltk import sent_tokenize

In [2]:
os.chdir("../../")

In [3]:
from lib.paths import Paths
from lib.utils.utils import seed_everything
from lib.model.gec import correct_sentence, process_sentence, correct_all_sentences

In [4]:
seed_everything()

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Model: 
1. https://huggingface.co/juancavallotti/t5-base-gec
2. https://huggingface.co/shashank2123/t5-finetuned-for-GEC
3. https://huggingface.co/fenffef/t5-base-gec

In [6]:
tokenizer = AutoTokenizer.from_pretrained("Unbabel/gec-t5_small")
model = T5ForConditionalGeneration.from_pretrained("Unbabel/gec-t5_small").to(device)

In [7]:
model.save_pretrained("output/T5")
tokenizer.save_pretrained("output/T5/tokenizer")

('output/T5/tokenizer/tokenizer_config.json',
 'output/T5/tokenizer/special_tokens_map.json',
 'output/T5/tokenizer/spiece.model',
 'output/T5/tokenizer/added_tokens.json',
 'output/T5/tokenizer/tokenizer.json')

In [9]:
correct_sentence(
    model,
    tokenizer,
    device,
    [
        "When I grow up, I starti to understand what he said is quite right.",
        "When I grow up, I starti to understand what he said is quite right.",
    ],
)

['When I grow up, I start to understand what he said is quite right.',
 'When I grow up, I start to understand what he said is quite right.']

In [10]:
train_df = pd.read_csv(
    Paths.COMPETITION_TRAIN_CSV_PATH,
    usecols=["essay_id", "full_text"],
)

In [13]:
sentence_df = process_sentence(train_df.copy(deep=True))
sentence_df.drop(columns=["full_text"], inplace=True)
sentence_df.shape

(330422, 2)

In [14]:
assert (
    train_df.essay_id.unique().shape == sentence_df.essay_id.unique().shape
), f"Expected: {train_df.essay_id.unique().shape}, Got: {sentence_df.essay_id.unique().shape}"

In [15]:
sentence_df["len"] = sentence_df.sentence.map(len)

In [22]:
sentence_df.drop(index=sentence_df[(sentence_df.len < 10)].index, inplace=True)

In [23]:
assert (
    train_df.essay_id.unique().shape == sentence_df.essay_id.unique().shape
), f"Expected: {train_df.essay_id.unique().shape}, Got: {sentence_df.essay_id.unique().shape}"

In [25]:
sentence_df["corrected"]  = correct_all_sentences(
    model,
    tokenizer,
    device,
    sentence_df,
    batch_size=2048,
)

sentence_df["corrected"].sample(3)

In [30]:
sentence_df.loc[sentence_df.corrected.isna(), "corrected"] = sentence_df.loc[
    sentence_df.corrected.isna(), "sentence"
]

In [32]:
sentence_df["corrected"] = sentence_df["corrected"].map(post_process)

In [33]:
assert (
    train_df.essay_id.unique().shape == sentence_df.essay_id.unique().shape
), f"Expected: {train_df.essay_id.unique().shape}, Got: {sentence_df.essay_id.unique().shape}"

In [34]:
train_df.loc[~train_df.index.isin(sentence_df.index), "full_text"]

26       Many people in todays society tend to travel b...
348      In the article "The Challenge of Exploring Ven...
349      a computer can not tell if your happy or if yo...
431      Kids across America probably know someone who ...
568      The advantages of limiting car is great becaus...
                               ...                        
15453    The electorian collage is a very popular thing...
15465    The future iscoming soon everday. And everyday...
15852    In the article "The Challenge of Exploring Ven...
16663    The author supports people to study Venus is b...
16856    Being able to detect other peoples and even yo...
Name: full_text, Length: 97, dtype: object

In [35]:
sentence_df.to_csv(
    "data/feature_engg/grammar_correct.csv",
    index=False,
    columns=["essay_id", "sentence", "corrected"],
)