In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the CSV from, and save the
# model to, paths under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Set WANDB_DISABLED for this process so Weights & Biases reporting stays off
# (presumably read by the transformers integration — TODO confirm for the installed version).
os.environ["WANDB_DISABLED"] = "true"

In [None]:
!pip install transformers
!pip install datasets



In [None]:
# NOTE(review): this cell repeats the install commands of the preceding cell
# verbatim; it looks redundant and can likely be deleted.
!pip install transformers
!pip install datasets



In [None]:
import pandas as pd
from datasets import Dataset

# Load the sentence / correction pairs from Google Drive
file_path = '/content/drive/MyDrive/ML_project/merged_data.csv'
df = pd.read_csv(file_path)

# Rename columns to a generic seq2seq input/target layout and keep only those two
df = df.rename(columns={'sentence': 'input', 'corrections': 'target'})
df = df[['input', 'target']]

# Drop incomplete pairs: a NaN on either side would survive the string
# concatenation below as NaN and later break tokenization.
df = df.dropna(subset=['input', 'target']).reset_index(drop=True)

# Prepend the task prefix to every source sentence. Inference must build its
# input with the same prefix.
df['input'] = "fix grammar: " + df['input']

# Convert to a Hugging Face Dataset without carrying the pandas index along
dataset = Dataset.from_pandas(df, preserve_index=False)

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the BART tokenizer and the seq2seq model from the local checkpoint directory
checkpoint_dir = './fine_tuned_bart'
tokenizer = BartTokenizer.from_pretrained(checkpoint_dir)
model = BartForConditionalGeneration.from_pretrained(checkpoint_dir)

# Tokenization function for the BART seq2seq model
def tokenize_function(examples, max_length=64):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with an 'input' list (source sentences) and a
            'target' list (corrected sentences).
        max_length: Fixed length that both inputs and targets are truncated
            and padded to. Defaults to 64.

    Returns:
        The tokenizer output for the inputs, with an added 'labels' entry that
        holds the target token ids. Padding positions in the labels are
        replaced by -100.
    """
    # Tokenize the source sentences
    model_inputs = tokenizer(
        examples['input'],
        max_length=max_length,
        truncation=True,
        padding='max_length'
    )

    # Tokenize target sentences (labels)
    labels = tokenizer(
        examples['target'],
        max_length=max_length,
        truncation=True,
        padding='max_length'
    )

    # Targets are padded to a fixed length. Replace pad ids with -100 so the
    # cross-entropy loss ignores them; otherwise the model is mostly rewarded
    # for predicting padding and the reported loss is misleadingly low.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

# Tokenize the whole dataset in batches and drop the raw text columns afterwards
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['input', 'target'],
)

Map:   0%|          | 0/2772 [00:00<?, ? examples/s]

In [None]:
# Show the feature (column) definitions of the tokenized dataset as a sanity check
print(tokenized_dataset.features)

{'input': Value(dtype='string', id=None), 'target': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [None]:
from transformers import TrainingArguments, Trainer, BartForConditionalGeneration

# Reload the checkpoint so that re-running this cell starts from the saved
# weights rather than from a model object already changed by earlier cells.
model = BartForConditionalGeneration.from_pretrained('./fine_tuned_bart')

# Hold out 10% of the examples for evaluation. Evaluating on the training set
# itself would make "validation loss" (and therefore load_best_model_at_end)
# measure memorization instead of generalization. The seed keeps the split
# reproducible across runs.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=8,          # More epochs for small dataset
    per_device_train_batch_size=4,  # Stable gradient updates
    per_device_eval_batch_size=4,   # Matches training batch size
    warmup_steps=100,           # Short warmup
    weight_decay=0.01,          # Prevent overfitting
    logging_dir='./logs',
    logging_steps=10,           # Frequent logging
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",  # Evaluate after every epoch
    save_strategy="epoch",        # Save checkpoints after every epoch
    learning_rate=5e-5,         # Balanced learning rate
    load_best_model_at_end=True, # Automatically load the best model
    metric_for_best_model="loss", # Optimize for validation loss
    greater_is_better=False,     # Lower loss is better
    report_to="none"            # Disable W&B integration
)


# Define the Trainer
trainer = Trainer(
    model=model,                            # The BART seq2seq model loaded above
    args=training_args,                     # Training arguments (defined earlier)
    train_dataset=dataset_splits['train'],  # 90% of the tokenized examples
    eval_dataset=dataset_splits['test'],    # Held-out 10%, never seen during training
    tokenizer=tokenizer,                    # Tokenizer used for preprocessing
    compute_metrics=None                    # Optional: Define custom metrics if needed
)


  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer built in the previous cell
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1267,0.064202
2,0.1164,0.041861
3,0.0329,0.028717
4,0.0659,0.020747
5,0.0345,0.015386
6,0.0405,0.012216
7,0.0112,0.010205
8,0.0192,0.009703


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=5544, training_loss=0.1636846757428062, metrics={'train_runtime': 1754.2454, 'train_samples_per_second': 12.641, 'train_steps_per_second': 3.16, 'total_flos': 1688031272632320.0, 'train_loss': 0.1636846757428062, 'epoch': 8.0})

In [None]:
import torch

# Correct a sentence using the fine-tuned model
def correct_sentence(sentence):
    """Return the model's grammar-corrected version of ``sentence``.

    Args:
        sentence: The raw, possibly ungrammatical sentence.

    Returns:
        The decoded text generated by the model, with special tokens removed.
    """
    # Same task prefix that was prepended to the training inputs
    input_text = f"fix grammar: {sentence}"

    # Move the encoded inputs to wherever the model actually lives. Choosing the
    # device from torch.cuda.is_available() alone breaks when the model is on
    # the CPU while a GPU is present.
    inputs = tokenizer(input_text, return_tensors="pt", max_length=64, truncation=True).to(model.device)

    # Make sure dropout is disabled so generation is deterministic
    model.eval()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=64, num_beams=5, early_stopping=True)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test: run one deliberately ungrammatical sentence through
# correct_sentence and print it next to the model's output.
test_sentence = "I is happy to saw you."
corrected = correct_sentence(test_sentence)
print("Original Sentence:", test_sentence)
print("Corrected Sentence:", corrected)


Original Sentence: I is happy to saw you.
Corrected Sentence: I am happy to see you.


In [None]:
# Hand-written sentences for a quick accuracy check.
# Each entry is a tuple of (ungrammatical sentence, expected correction).
test_cases = [
    ("She will goes to the market yesterday but forgot her wallet.", "She went to the market yesterday, but forgot her wallet."),
    ("Me and him goes to store and buyed candies.", "He and I went to the store and bought candies."),
    ("He don't knows nothing about fixing cars.", "He doesn't know anything about fixing cars."),
    ("Although he was tired. Decided to keep working.", "Although he was tired, he decided to keep working."),
    ("Their going too the park later, isn't it?", "They're going to the park later, aren't they?"),
    ("The dog bark loudly every morning.", "The dog barks loudly every morning."),
    ("He didn't knew about the meeting.", "He didn't know about the meeting."),
    ("Me and her went to the cinema.", "She and I went to the cinema."),
    ("She don't wants to go out tonight.", "She doesn't want to go out tonight."),
    ("We was watching a movie yesterday.", "We were watching a movie yesterday."),
    ("I didn't see nobody at the party.", "I didn't see anybody at the party."),
    ("She is good in math.", "She is good at math."),
    ("The flowers smells nice.", "The flowers smell nice."),
    ("Running fast, the finish line was reached.", "Running fast, he reached the finish line."),
    ("She buyed a new dress last week.", "She bought a new dress last week."),
    ("Them is going to the event.", "They are going to the event."),
    ("He was so tired, he slept all the day.", "He was so tired that he slept all day."),
    ("I seen him at the mall yesterday.", "I saw him at the mall yesterday."),
    ("She enjoy to read books.", "She enjoys reading books."),
    ("Her don't like spicy food.", "She doesn't like spicy food."),
    ("They was playing soccer when I arrived.", "They were playing soccer when I arrived."),
    ("Him is taller than me.", "He is taller than me."),
    ("The book were on the table.", "The book was on the table."),
    ("He don't have no money.", "He doesn't have any money."),
    ("The cat was sitting on a tree who was very high.", "The cat was sitting on a tree that was very high."),
    ("She is married with a lawyer.", "She is married to a lawyer."),
    ("He doesn't knows where the library is.", "He doesn't know where the library is."),
    ("While walking through the park, the birds sang beautifully.", "While walking through the park, he heard the birds singing beautifully."),
    ("Me and my friend likes pizza.", "My friend and I like pizza."),
    ("He was interested on the new project.", "He was interested in the new project."),
    ("I cannot able to do it.", "I cannot do it."),
    ("She is knowing the answer.", "She knows the answer."),
    ("If I was you, I would apologize.", "If I were you, I would apologize."),
    ("The movie was too much long.", "The movie was too long."),
    ("I am agree with your opinion.", "I agree with your opinion."),
    ("She said me to come early.", "She told me to come early."),
    ("The train leave at 5 PM every day.", "The train leaves at 5 PM every day."),
    ("I have seen her yesterday.", "I saw her yesterday."),
    ("He suggested me to join the team.", "He suggested that I join the team."),
    ("They was not ready for the trip.", "They were not ready for the trip."),
    ("Neither of the boys are responsible.", "Neither of the boys is responsible."),
    ("She sings good.", "She sings well."),
    ("He is very interesting in science.", "He is very interested in science."),
    ("Me went to the party alone.", "I went to the party alone."),
    ("He didn't knew what to say.", "He didn't know what to say."),
    ("The team are playing well.", "The team is playing well."),
    ("You should to try harder.", "You should try harder."),
    ("He is more taller than me.", "He is taller than me."),
    ("She do her homework every evening.", "She does her homework every evening."),
    ("Each of the students were given a book.", "Each of the students was given a book."),
    ("He doesn't likes coffee.", "He doesn't like coffee."),
    ("This place is more better than the other one.", "This place is better than the other one."),
    ("I wish I was there.", "I wish I were there."),
    ("He has went to the market.", "He has gone to the market."),
    ("The dog is barking on the stranger.", "The dog is barking at the stranger."),
    ("He explained me the problem.", "He explained the problem to me."),
    ("She hasn't no time to talk.", "She has no time to talk."),
    ("I am looking forward to meet you.", "I am looking forward to meeting you."),
    ("She can sings beautifully.", "She can sing beautifully."),
    ("He don't likes working on weekends.", "He doesn't like working on weekends."),
    ("They didn't had lunch yet.", "They haven't had lunch yet."),
    ("He is very tired, but he keep working.", "He is very tired, but he keeps working."),
    ("This is the house where I was born in.", "This is the house where I was born."),
    ("He suggested me that I should join.", "He suggested that I should join."),
    ("She do not wants to leave.", "She does not want to leave.")
]

# Test the model: run every case through correct_sentence and score it by
# exact string match (after stripping surrounding whitespace) against the
# expected correction.
correct_predictions = 0
incorrect_predictions = []

for incorrect, expected in test_cases:
    corrected = correct_sentence(incorrect)
    print(f"Original: {incorrect}")
    print(f"Expected: {expected}")
    print(f"Corrected: {corrected}\n")

    if corrected.strip() == expected.strip():
        correct_predictions += 1
    else:
        # Keep the full triple so the mismatches can be listed after the loop
        incorrect_predictions.append((incorrect, expected, corrected))

# Calculate Quick Accuracy as a percentage; report 0.0 for an empty test set
# instead of raising ZeroDivisionError.
if test_cases:
    accuracy = (correct_predictions / len(test_cases)) * 100
else:
    accuracy = 0.0

# Display Results
print(f"\n✅ Quick Estimated Accuracy: {accuracy:.2f}%")
print(f"❌ Number of Incorrect Predictions: {len(incorrect_predictions)}\n")

# Display Incorrect Predictions
if incorrect_predictions:
    print("🚨 **Incorrect Predictions:**")
    for i, (incorrect, expected, corrected) in enumerate(incorrect_predictions, 1):
        print(f"{i}. Original: {incorrect}")
        print(f"   Expected: {expected}")
        print(f"   Corrected: {corrected}\n")

Original: She will goes to the market yesterday but forgot her wallet.
Expected: She went to the market yesterday, but forgot her wallet.
Corrected: She went to the market yesterday, but forgot her wallet.

Original: Me and him goes to store and buyed candies.
Expected: He and I went to the store and bought candies.
Corrected: He and I went to the store and bought candies.

Original: He don't knows nothing about fixing cars.
Expected: He doesn't know anything about fixing cars.
Corrected: He doesn't know anything about fixing cars.

Original: Although he was tired. Decided to keep working.
Expected: Although he was tired, he decided to keep working.
Corrected: Although he was tired, he decided to keep working.

Original: Their going too the park later, isn't it?
Expected: They're going to the park later, aren't they?
Corrected: They're going to the park later, isn't it?

Original: The dog bark loudly every morning.
Expected: The dog barks loudly every morning.
Corrected: The dog barks 

In [None]:
# Write the model weights and the tokenizer files into the same Drive folder
final_model_dir = '/content/drive/MyDrive/ML_project/final_model'
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('/content/drive/MyDrive/ML_project/final_model/tokenizer_config.json',
 '/content/drive/MyDrive/ML_project/final_model/special_tokens_map.json',
 '/content/drive/MyDrive/ML_project/final_model/spiece.model',
 '/content/drive/MyDrive/ML_project/final_model/added_tokens.json')