In [None]:
!pip install transformers datasets sentencepiece scikit-learn
!pip install pandas datasets --quiet

In [None]:
#----------------------------------- DATASET EXPANSION ------------------------------------

import pandas as pd
import random

# Load your original file (make sure it's uploaded)
import io

from google.colab import files

uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload surfaces as a bare StopIteration below.
    raise ValueError("No file was uploaded; re-run this cell and choose the dataset CSV.")

# Parse the uploaded bytes directly instead of re-opening the file by name:
# Colab may store a re-uploaded file under a de-duplicated name such as
# "data (1).csv", in which case opening the original name reads a stale copy.
uploaded_name = next(iter(uploaded))  # first uploaded file
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

# Grammar error generation
subjects = ["he", "she", "they", "we", "it", "i", "john", "mary"]
verbs = ["go", "eat", "run", "walk", "play", "has", "do", "is", "was"]
objects = ["to school", "the food", "fast", "every day", "in the park", "a dog", "some books", "well"]

# Subjects that take third-person-singular verb forms ("she goes", "John has").
_THIRD_PERSON_SINGULAR = frozenset({"he", "she", "it", "john", "mary"})

# Base-form verbs that only need an -s/-es ending for third-person-singular subjects.
_REGULAR_BASE_VERBS = frozenset({"go", "eat", "run", "walk", "play", "do"})

# Irregular verbs: each known form maps to the forms required by
# ("i", third-person-singular subjects, every other subject).
_IRREGULAR_AGREEMENT = {
    "has": ("have", "has", "have"),
    "is": ("am", "is", "are"),
    "was": ("was", "was", "were"),
    "were": ("was", "was", "were"),
}


def _agree_verb(subj, verb):
    """Return the form of ``verb`` that agrees with ``subj``; unknown verbs are returned unchanged."""
    if verb in _IRREGULAR_AGREEMENT:
        first_person, third_singular, plural = _IRREGULAR_AGREEMENT[verb]
        if subj == "i":
            return first_person
        if subj in _THIRD_PERSON_SINGULAR:
            return third_singular
        # Any remaining subject ("we", "they") is treated as plural.
        return plural
    if verb in _REGULAR_BASE_VERBS and subj in _THIRD_PERSON_SINGULAR:
        # "go" -> "goes", "do" -> "does"; everything else just takes "s".
        return verb + "es" if verb.endswith("o") else verb + "s"
    return verb


def generate_sentence_pairs(n, seed=None):
    """Generate ``n`` random (incorrect, correct) sentence pairs for grammar correction.

    Each sentence is "<subject> <verb> <object>" drawn from the module-level
    ``subjects``, ``verbs`` and ``objects`` lists. The "incorrect" sentence uses
    the drawn words verbatim in lower case; the "correct" sentence capitalizes
    the subject and replaces the verb with the form that agrees with it. When
    the drawn verb already agrees, the two sentences differ only in
    capitalization.

    Args:
        n: Number of pairs to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global random state is
            left untouched; when ``None`` the global ``random`` module is used.

    Returns:
        A list of ``(incorrect, correct)`` string tuples of length ``n``.
    """
    rng = random if seed is None else random.Random(seed)
    pairs = []
    for _ in range(n):
        subj = rng.choice(subjects)
        verb = rng.choice(verbs)
        obj = rng.choice(objects)

        incorrect = f"{subj} {verb} {obj}"
        correct = f"{subj.capitalize()} {_agree_verb(subj, verb)} {obj}"
        pairs.append((incorrect, correct))
    return pairs

# Add synthetic pairs to the uploaded dataset.
# NOTE(review): the earlier comment said 995 while the code generated 95; the
# count is kept at 95 here — change NUM_SYNTHETIC_PAIRS if 995 was intended.
NUM_SYNTHETIC_PAIRS = 95
EXPANDED_CSV_PATH = "grammar_dataset_expanded.csv"

random.seed(42)  # fixed seed so re-running the cell regenerates the same pairs
new_data = generate_sentence_pairs(NUM_SYNTHETIC_PAIRS)
df_extra = pd.DataFrame(new_data, columns=["incorrect", "correct"])
df_full = pd.concat([df, df_extra], ignore_index=True)

# Save and download: both calls share one path so the browser receives the
# expanded file that was just written rather than some other CSV.
df_full.to_csv(EXPANDED_CSV_PATH, index=False)
files.download(EXPANDED_CSV_PATH)


In [None]:
#------------------------------------------ GRAMMAR CORRECTION T5 FINE-TUNING ------------------------------------------------

from google.colab import files
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

# Upload dataset
import io

uploaded = files.upload()  # upload your grammar_dataset.csv
if not uploaded:
    # Without this guard an empty upload surfaces as a bare StopIteration below.
    raise ValueError("No file was uploaded; re-run this cell and choose the dataset CSV.")

# Parse the uploaded bytes directly instead of re-opening the file by name:
# Colab may store a re-uploaded file under a de-duplicated name such as
# "data (1).csv", in which case opening the original name reads a stale copy.
uploaded_name = next(iter(uploaded))  # first uploaded file
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
# Keep only the two training columns and drop rows missing either sentence.
df = df[["incorrect", "correct"]].dropna()

# Prepare dataset
tokenizer = T5Tokenizer.from_pretrained("t5-base")

def preprocess(example):
    """Tokenize one dataset row into model inputs plus loss-masked labels.

    Args:
        example: A row with string fields "incorrect" (source sentence) and
            "correct" (target sentence).

    Returns:
        The tokenizer output for the "gec: "-prefixed source sentence, padded
        or truncated to 128 tokens, with an added "labels" entry holding the
        target token ids in which every padding id is replaced by -100.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 128}

    features = tokenizer("gec: " + example["incorrect"], **encode_options)
    target_ids = tokenizer(example["correct"], **encode_options)["input_ids"]

    # Padding positions in the target are marked with -100 (the label value
    # Hugging Face models skip when computing the loss).
    features["labels"] = [
        -100 if token_id == tokenizer.pad_token_id else token_id
        for token_id in target_ids
    ]
    return features


# Directory that receives the fine-tuned model and tokenizer.
OUTPUT_DIR = "./t5-grammar-custom"

# Wrap the DataFrame as a Hugging Face dataset and tokenize it row by row.
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(preprocess)

# Start from the pretrained checkpoint that matches the tokenizer.
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# save_strategy="no": the final model is written explicitly after training
# (see the save_pretrained calls below).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_strategy="no",
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
trainer.train()

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded with from_pretrained().
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)


In [None]:
#--------------------------- TEST MODEL --------------------------------

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the fine-tuned tokenizer and model from the directory they were saved to.
model_path = "./t5-grammar-custom"
tokenizer, model = (
    loader.from_pretrained(model_path)
    for loader in (T5Tokenizer, T5ForConditionalGeneration)
)

def correct_grammar(text, **generate_overrides):
    """Return the model's corrected version of ``text``.

    The input is prefixed with "gec: " (the same prefix used during
    fine-tuning), encoded with the module-level ``tokenizer`` and decoded from
    the first sequence produced by the module-level ``model``.

    Args:
        text: The sentence to correct.
        **generate_overrides: Optional keyword arguments forwarded to
            ``model.generate``; they replace the defaults below. The defaults
            sample (``do_sample=True``), so output can vary between calls —
            pass e.g. ``do_sample=False`` for repeatable results.

    Returns:
        The decoded output string with special tokens removed.
    """
    input_text = "gec: " + text
    # Move the ids to wherever the model lives so generation also works when
    # the model has been placed on a GPU; this is a no-op on CPU.
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

    generation_options = {
        "max_length": 100,
        "num_return_sequences": 1,
        "repetition_penalty": 3.0,
        "no_repeat_ngram_size": 3,
        "temperature": 0.7,
        "do_sample": True,
    }
    generation_options.update(generate_overrides)

    outputs = model.generate(input_ids, **generation_options)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Quick check: run one ungrammatical sentence through the model and show the result.
sample_input = "she go school every day"
corrected_output = correct_grammar(sample_input)
print("Corrected:", corrected_output)
