In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [3]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [6]:
import pandas as pd
from datasets import Dataset

# Load dataset
file_path = '/content/drive/MyDrive/ML_project/ML_project/merged_data.csv'
df = pd.read_csv(file_path)

# Rename columns to match T5 input-output format
df = df.rename(columns={'sentence': 'input', 'corrections': 'target'})

# Add a prefix to the input column to guide the T5 model
df['input'] = "fix grammar: " + df['input']

# Drop any unnecessary columns (if present)
df = df[['input', 'target']]

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load tokenizer and model for Grammarly CoEdit
tokenizer = AutoTokenizer.from_pretrained('grammarly/coedit-large')
model = AutoModelForSeq2SeqLM.from_pretrained('grammarly/coedit-large')

# Tokenization function for T5
def tokenize_function(examples):
    # Tokenize the input and target text
    model_inputs = tokenizer(
        examples['input'],
        max_length=64,
        truncation=True,
        padding='max_length'
    )

    # Tokenize target sentences (labels)
    labels = tokenizer(
        examples['target'],
        max_length=64,
        truncation=True,
        padding='max_length'
    )

    # Add labels to the model inputs
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['input', 'target'])


tokenizer_config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Map:   0%|          | 0/2772 [00:00<?, ? examples/s]

In [8]:
print(tokenized_dataset.features)

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [9]:
from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM
from transformers import Trainer

# Load the Grammarly CoEdit model
model = AutoModelForSeq2SeqLM.from_pretrained('grammarly/coedit-large')

# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',           # Save model and checkpoints
    num_train_epochs=15,              # More epochs for small datasets
    per_device_train_batch_size=8,    # Slightly larger batch size for stability
    per_device_eval_batch_size=8,     # Same as train batch size
    warmup_steps=500,                 # More warmup steps for smoother training
    weight_decay=0.01,                # Regularization to prevent overfitting
    logging_dir='./logs',             # Log directory
    logging_steps=10,                 # Log metrics every 10 steps
    evaluation_strategy="epoch",      # Evaluate after every epoch
    save_strategy="epoch",            # Save checkpoints after every epoch
    save_total_limit=2,               # Keep only the last 2 checkpoints
    learning_rate=2e-5,               # Slightly lower learning rate
    load_best_model_at_end=True,      # Automatically load the best model
    metric_for_best_model="loss",     # Optimize for validation loss
    greater_is_better=False,          # Lower loss is better
    # predict_with_generate=True,     # Removed from TrainingArguments
    fp16=True,                        # Enable mixed precision training for speed (if supported by GPU)
    report_to="none"                  # Disable Weights & Biases integration
)


# Define the Trainer
trainer = Trainer(
    model=model,                       # The pre-trained BERT model
    args=training_args,                # Training arguments (defined earlier)
    train_dataset=tokenized_dataset,   # Tokenized dataset for training
    eval_dataset=tokenized_dataset,    # Use the same dataset for evaluation (if no separate eval dataset)
    tokenizer=tokenizer,               # Tokenizer used for preprocessing
    compute_metrics=None               # Optional: Define custom metrics if needed
)

  trainer = Trainer(


In [10]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,
4,0.0,
5,0.0,
6,0.0,
7,0.0,
8,0.0,
9,0.0,
10,0.0,


TrainOutput(global_step=5205, training_loss=0.0, metrics={'train_runtime': 3362.2576, 'train_samples_per_second': 12.367, 'train_steps_per_second': 1.548, 'total_flos': 1.197857362673664e+16, 'train_loss': 0.0, 'epoch': 15.0})

In [11]:
import torch

# Correct a sentence using the fine-tuned model
def correct_sentence(sentence):
    input_text = f"fix grammar: {sentence}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=64, truncation=True).to('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=64, num_beams=5, early_stopping=True)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test Example
test_sentence = "I is happy to saw you."
corrected = correct_sentence(test_sentence)
print("Original Sentence:", test_sentence)
print("Corrected Sentence:", corrected)


Original Sentence: I is happy to saw you.
Corrected Sentence: I am happy to see you.


In [12]:
model.save_pretrained('/content/drive/MyDrive/ML_project/coedit')
tokenizer.save_pretrained('/content/drive/MyDrive/ML_project/coedit')

('/content/drive/MyDrive/ML_project/coedit/tokenizer_config.json',
 '/content/drive/MyDrive/ML_project/coedit/special_tokens_map.json',
 '/content/drive/MyDrive/ML_project/coedit/spiece.model',
 '/content/drive/MyDrive/ML_project/coedit/added_tokens.json',
 '/content/drive/MyDrive/ML_project/coedit/tokenizer.json')