In [1]:
# ======================================
# 📦 CELL 1–3: LOAD, AUGMENT, FINETUNE AND SAVE
# ======================================
from datasets import load_dataset, DatasetDict, concatenate_datasets
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import nltk
import nlpaug.augmenter.word as naw
import torch

# Download NLTK resources for nlpaug
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# 1. Load and filter dataset
raw_dataset = load_dataset("grammarly/coedit")
selected_tasks = ["gec", "clarity", "simplification", "paraphrase"]
raw_dataset = raw_dataset.filter(lambda x: x["task"] in selected_tasks)

def add_prefix(example):
    example["input"] = f"{example['task']}: {example['src']}"
    example["output"] = example['tgt']
    return example

raw_dataset = raw_dataset.map(add_prefix)

# 2. Use 5% for train/val
dataset = DatasetDict({
    "train": raw_dataset["train"].shuffle(seed=42).select(range(int(0.05 * len(raw_dataset["train"])))),
    "validation": raw_dataset["validation"].shuffle(seed=42).select(range(int(0.05 * len(raw_dataset["validation"]))))
})

# 3. Data augmentation on source
syn_aug = naw.SynonymAug(aug_src='wordnet')

def augment_data(example):
    try:
        example['input'] = syn_aug.augment(example['input'])
    except:
        pass
    return example

augmented = dataset["train"].select(range(1000)).map(augment_data)
dataset["train"] = concatenate_datasets([dataset["train"], augmented])

# 4. Tokenization
model_name = "vennify/t5-base-grammar-correction"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

def preprocess(example):
    model_inputs = tokenizer(example["input"], max_length=128, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(example["output"], max_length=128, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = dataset["train"].map(preprocess, batched=True)
tokenized_val = dataset["validation"].map(preprocess, batched=True)

# 5. Training setup
args = TrainingArguments(
    output_dir="./multitask-gec-finetuned",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model)
)

# 6. Train and save
trainer.train()
trainer.save_model("./multitask-gec-finetuned")
tokenizer.save_pretrained("./multitask-gec-finetuned")




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VUONGLOCTRUONG\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VUONGLOCTRUONG\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Filter:   0%|          | 0/69071 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1712 [00:00<?, ? examples/s]

Map:   0%|          | 0/47885 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VUONGLOCTRUONG\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VUONGLOCTRUONG\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VUONGLOCTRUONG\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VUONGLOCTRUONG\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\VUONGLOCTRUONG\AppData\Ro

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1813,0.285653
2,0.1829,0.275821
3,0.1655,0.270408


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./multitask-gec-finetuned\\tokenizer_config.json',
 './multitask-gec-finetuned\\special_tokens_map.json',
 './multitask-gec-finetuned\\spiece.model',
 './multitask-gec-finetuned\\added_tokens.json')

In [1]:
# ======================================
# 📦 CELL 4: CUSTOM EVALUATION (GLEU & ERRANT)
# ======================================
import nltk
from nltk.translate.gleu_score import corpus_gleu
import sacrebleu
import errant
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load model and tokenizer
model_dir = "./multitask-gec-finetuned"

# If tokenizer files not present, load from base model and save
try:
    tokenizer = T5Tokenizer.from_pretrained(model_dir)
except OSError:
    print("Tokenizer files not found in finetuned directory. Copying from base model...")
    tokenizer = T5Tokenizer.from_pretrained("vennify/t5-base-grammar-correction")
    tokenizer.save_pretrained(model_dir)

model = T5ForConditionalGeneration.from_pretrained(model_dir).to("cuda" if torch.cuda.is_available() else "cpu")

# Load validation dataset for evaluation
from datasets import load_dataset

dataset = load_dataset("grammarly/coedit")
dataset = dataset.filter(lambda x: x["task"] == "gec")
dataset = dataset["validation"].shuffle(seed=42).select(range(int(0.05 * len(dataset["validation"]))))

def add_prefix(example):
    example["input"] = f"gec: {example['src']}"
    example["output"] = example['tgt']
    return example

dataset = dataset.map(add_prefix)

# Generate predictions
preds = []
refs = []
for ex in dataset:
    input_ids = tokenizer(ex["input"], return_tensors="pt").input_ids.to(model.device)
    output = model.generate(input_ids, max_length=128)
    pred = tokenizer.decode(output[0], skip_special_tokens=True)
    preds.append(pred)
    refs.append(ex["output"])

# GLEU (NLTK)
gleu_score = corpus_gleu([[ref.split()] for ref in refs], [pred.split() for pred in preds])
print("🟢 NLTK GLEU Score:", round(gleu_score * 100, 2))

# BLEU (SacreBLEU)
sacrebleu_score = sacrebleu.corpus_bleu(preds, [refs])
print("🟢 SacreBLEU Score:", round(sacrebleu_score.score, 2))

# ERRANT
def save_for_errant(preds, refs):
    with open("preds.txt", "w", encoding="utf-8") as f_pred, open("refs.txt", "w", encoding="utf-8") as f_ref:
        for pred, ref in zip(preds, refs):
            f_pred.write(pred.strip() + "\n")
            f_ref.write(ref.strip() + "\n")

    os.system("errant_parallel -orig refs.txt -cor preds.txt -out output.m2")
    os.system("errant_compare output.m2")

save_for_errant(preds, refs)

# ======================================
# 📦 CELL 5: HYBRID INFERENCE (LANGUAGETOOL + T5)
# ======================================
import language_tool_python
from difflib import SequenceMatcher
import re

tool = language_tool_python.LanguageTool('en-US')

# Post-process to fix "more + adj" → "adj-er"
def simplify_comparative(text):
    pattern = r"\bmore ([a-zA-Z]+?)\b"
    irregular = {
        "good": "better", "bad": "worse", "far": "farther",
        "angry": "angrier", "simple": "simpler", "little": "less"
    }
    blocked = {"better", "worse", "less", "more"}  # Already comparative or invalid

    def replace(match):
        word = match.group(1)
        if word in blocked:
            return word
        if word in irregular:
            return irregular[word]
        elif len(word) <= 6 and not word.endswith("ly"):
            if word.endswith("y"):
                return word[:-1] + "ier"
            return word + "er"
        return "more " + word

    return re.sub(pattern, replace, text)

# Fix conditional type III (If I had ..., I will → I would)
def fix_conditional_third(text):
    return re.sub(r"(If I had .*?), I will", r"\1, I would", text)

# Fix time expressions: since months → for months
def fix_time_expressions(text):
    return re.sub(r"\bsince (days|weeks|months|years)\b", r"for \1", text)

# Highlight and count remaining common grammar patterns
def detect_remaining_errors(text):
    patterns = [
        r"\bmore [a-zA-Z]+?\b",
        r"\b[a-zA-Z]+\s+(don't|doesn't|didn't)\s+[a-zA-Z]+\b",
        r"\b[a-zA-Z]+\s+have\s+[a-zA-Z]+ed\b",
        r"\bsince (days|weeks|months|years)\b"
    ]
    count = 0
    for pat in patterns:
        matches = re.findall(pat, text)
        count += len(matches)
    return count

def hybrid_correct_and_diff(text):
    # Step 1: LanguageTool correction
    lt_corrected = tool.correct(text)

    # Step 2: T5 model refinement
    input_ids = tokenizer("gec: " + lt_corrected, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(input_ids, max_length=128)
    corrected = tokenizer.decode(output[0], skip_special_tokens=True)

    # Step 3: Postprocessing (comparative simplification + conditional fixes + time fixes)
    post_corrected = simplify_comparative(corrected)
    post_corrected = fix_conditional_third(post_corrected)
    post_corrected = fix_time_expressions(post_corrected)

    # Step 4: Show diffs
    input_tokens = text.strip().split()
    output_tokens = post_corrected.strip().split()
    matcher = SequenceMatcher(None, input_tokens, output_tokens)
    changes = [(" ".join(input_tokens[i1:i2]), " ".join(output_tokens[j1:j2]))
               for tag, i1, i2, j1, j2 in matcher.get_opcodes() if tag != "equal"]

    print("🔍 Original :", text)
    print("🛠 LanguageTool pre-fix:", lt_corrected)
    print("✅ Corrected:", post_corrected)
    print(f"🔄 {len(changes)} change(s):")
    for old, new in changes:
        print(f"   - '{old}' → '{new}'")

    # Step 5: Auto-analysis of remaining patterns
    missed = detect_remaining_errors(post_corrected)
    if "since" in post_corrected and "for" not in post_corrected:
        print("⚠️ Possible misuse of 'since' instead of 'for'. Consider reviewing time expressions.")
    print(f"🚨 Remaining suspicious patterns: {missed}")

    return post_corrected


🟢 NLTK GLEU Score: 52.71
🟢 SacreBLEU Score: 55.12


In [3]:
hybrid_correct_and_diff("Although he studied hard, but he failed the exam")

🔍 Original : Although he studied hard, but he failed the exam
🛠 LanguageTool pre-fix: Although he studied hard, but he failed the exam
✅ Corrected: Although he studied hard, he failed the exam.
🔄 2 change(s):
   - 'but' → ''
   - 'exam' → 'exam.'
🚨 Remaining suspicious patterns: 0


'Although he studied hard, he failed the exam.'

In [None]:
hybrid_correct_and_diff("Many student doesn't understand how climate change affect they daily life, so they continues to ignore scientific advice.")
hybrid_correct_and_diff("The people is more angry about government policies, which increase rapidly the tension between public and authority.")
hybrid_correct_and_diff("This method is more efficient and more simple than the previous one, but it takes more longer to process.")

🔍 Original : Many student doesn't understand how climate change affect they daily life, so they continues to ignore scientific advice.
🛠 LanguageTool pre-fix: Many students doesn't understand how climate change affect they daily life, so they continue to ignore scientific advice.
✅ Corrected: Many students don't understand how climate change affects their daily life, so they continue to ignore scientific advice.
🔄 3 change(s):
   - 'student doesn't' → 'students don't'
   - 'affect they' → 'affects their'
   - 'continues' → 'continue'
🚨 Remaining suspicious patterns: 1
🔍 Original : The people is more angry about government policies, which increase rapidly the tension between public and authority.
🛠 LanguageTool pre-fix: The people are more angry about government policies, which increase rapidly the tension between public and authority.
✅ Corrected: The people are angrier about government policies, which increase rapidly the tension between public and authority.
🔄 1 change(s):
   - 'is m

'This method is more efficient and simpler than the previous one, but it takes longer to process.'

In [2]:
complex_sentences = [
    "Despite of the fact that she studied hardly, she didn't passed the exam which she was preparing since months.",
    "The informations that was shared during the meeting wasn't accurate and needs to be verified.",
    "Every students in the class have submitted their assignments late because of the teacher’s unclear instructions.",
    "There is many reasons why the project was failed, but none of them were discussed during the review.",
    "He suggested to postpone the meeting because he have another appointment at the same time.",
    "If I would have known about the delay, I will not have rushed to finish my work.",
    "The technology are changing so fast that even experts is finding difficult to keep up.",
    "More better solutions should have been considered before making such irreversible decisions.",
    "She was so much tired that she fell asleep without eating anything, which it surprised her roommates.",
    "While people expects governments to act faster, but the bureaucratic process slow things down significantly."
]
for i, sent in enumerate(complex_sentences, 1):
    print(f"\n🧪 Test #{i}")
    hybrid_correct_and_diff(sent)



🧪 Test #1
🔍 Original : Despite of the fact that she studied hardly, she didn't passed the exam which she was preparing since months.
🛠 LanguageTool pre-fix: Despite the fact that she hardly studied, she didn't pass the exam which she was preparing since months.
✅ Corrected: Despite the fact that she hardly studied, she didn't pass the exam which she was preparing for months.
🔄 4 change(s):
   - 'of' → ''
   - 'studied hardly,' → 'hardly studied,'
   - 'passed' → 'pass'
   - 'since' → 'for'
🚨 Remaining suspicious patterns: 1

🧪 Test #2
🔍 Original : The informations that was shared during the meeting wasn't accurate and needs to be verified.
🛠 LanguageTool pre-fix: The information that was shared during the meeting wasn't accurate and needs to be verified.
✅ Corrected: The information that was shared during the meeting wasn't accurate and needs to be verified.
🔄 1 change(s):
   - 'informations' → 'information'
🚨 Remaining suspicious patterns: 0

🧪 Test #3
🔍 Original : Every students in 

In [None]:
from datasets import load_dataset

dataset = load_dataset("grammarly/coedit")

print(dataset)

for i in range (3):
    print(dataset["train"][i])

DatasetDict({
    train: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 69071
    })
    validation: Dataset({
        features: ['_id', 'task', 'src', 'tgt'],
        num_rows: 1712
    })
})
{'_id': '1', 'task': 'gec', 'src': 'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.', 'tgt': 'For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.'}
{'_id': '2', 'task': 'gec', 'src': 'Improve the grammaticality: As the number of people grows, the need of habitable environment is unquestionably essential.', 'tgt': 'As the number of people grows, the need for a habitable environment is unquestionably increasing.'}
{'_id': '3', 'task': 'gec', 'src': 'Improve the grammaticality of this sentence: Besides 

In [None]:
from datasets import load_dataset
from itertools import chain

# Load dataset
dataset = load_dataset("grammarly/coedit")

all_data = chain(dataset["train"], dataset["validation"])

all_tasks = set(example["task"] for example in all_data)

for task in sorted(all_tasks):
    print("-", task)


- clarity
- coherence
- gec
- neutralize
- paraphrase
- simplification
