
# **Install libraries**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV is later read from a path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets tqdm pandas sentencepiece transformers transformers[torch]



In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [None]:
# Check we have a GPU and check the memory size of the GPU
!nvidia-smi

Fri Oct 20 02:09:52 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Import packages**

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Set a seed**

In [None]:
import random
import numpy as np
import torch
import datasets

In [None]:
def set_seed(seed):
  """Seed the Python, NumPy and PyTorch random number generators with `seed`."""
  # Each library keeps its own generator, so every one of them is seeded in turn.
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

set_seed(42)



```
# This is formatted as code
```

# ***C4-200M dataset***

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
# Read the sentence-pair CSV from the mounted Drive, then keep only the first half of its rows.
df = pd.read_csv(
    '/content/drive/MyDrive/dataset.csv',
    encoding='ISO-8859-1',
)

half_size = df.shape[0] // 2
# `df` and `trimmed_df` both refer to the truncated frame from here on.
df = trimmed_df = df.iloc[:half_size]

In [None]:
df.shape

(500000, 2)

In [None]:
df.head()

Unnamed: 0,input,output
0,"Medell he, Ohio W. Shannon Kansas,R. C.","Medell, Ohio; W. Shannon. Kansas; R. C."
1,quarter of 1999 $ 25 million was repaid under this credit facility.,"quarter of 1999, $25 million was repaid under this credit facility."
2,It used as service center by the Block office for collection of Muster Rolls from the VECs on also for disbursement of wages slip along with verified Muster Rolls after MIS entry - for payment of wages to the MGNREGA workers to minimize the transportation and miscellaneous cost of VEC functionaries.,It can be used as a service center by the Block office for collection of Muster Rolls from the VECs and also for disbursement of wages slip along with verified Muster Rolls after MIS entry for payment of wages to the MGNREGA workers to minimize the transportation and miscellaneous cost of VEC functionaries.
3,"Tom offered two this time, one of old restaurants and another of new ones and the lists reminded me of the award I received the day before and about the capriciousness of such choices.","Tom offered two this time, one of old restaurants and another of new ones; and the lists reminded me of the award I had received the day before and about the capriciousness of such choices."
4,You can see 'Spring beauties' at The Botanical Gardens of Asheville (www.ashevillebotanicalgardens.org) combined a multitude of common natural berries from our region.,"You can see Spring Beauties at The Botanical Gardens of Asheville (www.ashevillebotanicalgardens.org), along with a multitude of common and rare native plants of our region."


In [None]:
from transformers import (
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
  )

from torch.utils.data import Dataset, DataLoader

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and seq2seq model are loaded from the same pretrained checkpoint id.
checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def calc_token_len(example):
    """Return how many token ids the notebook-level `tokenizer` produces for `example`."""
    encoded = tokenizer(example)
    token_ids = encoded.input_ids
    return len(token_ids)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation. An explicit random_state pins the
# shuffle, so the split no longer depends on the state of the global NumPy RNG
# (and therefore on which cells happened to run before this one).
train_df, test_df = train_test_split(df, test_size=0.10, shuffle=True, random_state=42)
train_df.shape, test_df.shape

((450000, 2), (50000, 2))

In [None]:
# Add a column holding the tokenized length of every test input (computed with calc_token_len),
# used below to inspect the length distribution.
test_df['input_token_len'] = test_df['input'].apply(calc_token_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (971 > 512). Running this sequence through the model will result in indexing errors


In [None]:
test_df.head()

Unnamed: 0,input,output,input_token_len
104241,"Access to Information Programme Foundation is a registered non pro organization in Sofia under for Persons and Family Act, Reg. No 13 849/96, Sofia City Court, Bulgaria.","Access to Information Programme Foundation is a registered non-profit organization in Sofia under the Persons and Family Act, Reg. No. 13 849/96, Sofia City Court, Bulgaria.",38
199676,The problem will go beyond technologies though.,The problem goes beyond technology though.,9
140199,"During the last four decades due to rapidly growing populations and many other social and political factors, many developing countries have not been able to accumulate capital for agricultural and rural development rapidly enough.","During the last four decades, due to rapidly growing populations and many other social and political factors, many developing countries have not been able to accumulate capital for agricultural and rural development rapidly enough.",39
132814,2019 promises to be an exceptional year for golf in Ireland with Lahinch GC hosting Dubai Duty Free Irish Open July 4thto 7th and Royal Portrush in Northern Ireland hosting the '14.th Open Championship in 18thto 21st.,2019 promises to be an exceptional year for golf in Ireland with Lahinch GC hosting the Dubai Duty Free Irish Open July 4thto 7th and Royal Portrush in Northern Ireland hosting the 148thOpen Championship July 18thto 21st.,54
408697,Take a Tour Through Parts of Our History,Take a Tour Through Parts of Our History!,11


In [None]:
test_df['input_token_len'].describe()

count    50000.000000
mean        34.082100
std         27.610413
min          2.000000
25%         17.000000
50%         27.000000
75%         43.000000
max       1299.000000
Name: input_token_len, dtype: float64

### We will use a maximum length of 64 tokens: the 75th percentile of input length above is 43 tokens, so 64 covers the vast majority of examples

In [None]:
# Import under an alias: a bare `from datasets import Dataset` rebinds the name
# `Dataset`, which this notebook also imports from torch.utils.data and uses as
# the base class of GrammarDataset.
from datasets import Dataset as HFDataset

# Wrap the pandas train/test splits as Hugging Face datasets.
train_dataset = HFDataset.from_pandas(train_df)
test_dataset = HFDataset.from_pandas(test_df)

In [None]:
test_dataset

Dataset({
    features: ['input', 'output', 'input_token_len', '__index_level_0__'],
    num_rows: 50000
})

### Load the Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
class GrammarDataset(Dataset):
    """Map-style dataset that tokenizes (input, output) sentence pairs on access.

    Every item is a dict with ``input_ids`` and ``attention_mask`` for the
    source sentence and ``labels`` holding the token ids of the target
    sentence.

    Args:
        dataset: Indexable collection whose items expose ``'input'`` and
            ``'output'`` text fields.
        tokenizer: Callable tokenizer. It is invoked with the text plus the
            ``max_length``, ``truncation``, ``padding`` and
            ``return_attention_mask`` keyword arguments and must return a
            mapping with ``input_ids`` and ``attention_mask`` entries.
        print_text: When True, print the length of every returned field each
            time an item is fetched (debugging aid).
        max_len: Maximum number of tokens kept per sequence; longer sequences
            are truncated.
    """

    def __init__(self, dataset, tokenizer, print_text=False, max_len=64):
        self.dataset = dataset
        # Items are left unpadded by default; the notebook pads per batch with
        # a data collator instead.
        self.pad_to_max_length = False
        self.tokenizer = tokenizer
        self.print_text = print_text
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one string with this dataset's length and padding settings."""
        # Use the tokenizer handed to the constructor rather than a global of
        # the same name, which may have been rebound to a different tokenizer.
        return self.tokenizer(
            text,
            max_length=self.max_len,
            # Explicit truncation keeps the max_length cap without the
            # "truncation was not explicitly activated" warning.
            truncation=True,
            padding="max_length" if self.pad_to_max_length else False,
            return_attention_mask=True,
        )

    def tokenize_data(self, example):
        """Tokenize the source and target text of one example.

        Args:
            example: Mapping with ``'input'`` (source text) and ``'output'``
                (target text).

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        tokenized_inputs = self._encode(example['input'])
        tokenized_targets = self._encode(example['output'])

        return {
            "input_ids": tokenized_inputs['input_ids'],
            "attention_mask": tokenized_inputs['attention_mask'],
            "labels": tokenized_targets['input_ids'],
        }

    def __getitem__(self, index):
        inputs = self.tokenize_data(self.dataset[index])

        if self.print_text:
            for key, value in inputs.items():
                print(key, len(value))

        return inputs

In [None]:
# Smoke test: build the dataset with print_text=True so fetching an item prints each field's length,
# then show one tokenized example.
dataset = GrammarDataset(test_dataset, tokenizer, True)
print(dataset[1])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


input_ids 9
attention_mask 9
labels 8
{'input_ids': [37, 682, 56, 281, 1909, 2896, 713, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [37, 682, 1550, 1909, 748, 713, 5, 1]}


### Define Evaluator

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=f093789fca21eb937b45cc93c2b762cc9e66c473747d7e8e58eff726f95ed1b2
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# NOTE(review): `rouge_metric` is assigned again further down via `evaluate.load("rouge")`,
# so this binding is overwritten when the notebook runs top to bottom.
# `datasets.load_metric` appears to emit a warning here (see the cell output) —
# presumably a deprecation notice; confirm and consider dropping this cell.
from datasets import load_metric
rouge_metric = load_metric("rouge")

  rouge_metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [None]:
# Load the ROUGE metric through the `evaluate` package; compute_metrics reads this `rouge_metric` global.
from evaluate import load
rouge_metric = load("rouge")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

### Train Model

In [None]:
# Batch collator: pads each batch to its longest sequence and returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding='longest',
    return_tensors='pt',
)

In [None]:
pip install transformers[torch]



In [None]:
pip install accelerate -U



In [None]:
pip show accelerate


Name: accelerate
Version: 0.23.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: sylvain@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, torch
Required-by: 


In [None]:
# Training configuration for the Seq2SeqTrainer.
batch_size = 10
args = Seq2SeqTrainingArguments(
    output_dir="/content/",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    # Without an explicit limit, evaluation-time generation falls back to the
    # model's generation-config default length; use the same 64-token budget
    # that the dataset applies when tokenizing.
    generation_max_length=64,
    # fp16 mixed precision is disabled: T5-family checkpoints are known to
    # overflow in fp16, and the run recorded in this notebook reports a
    # training loss of 0.0 with no validation loss at step 500.
    fp16=False,
    # Effective train batch size per device = batch_size * gradient_accumulation_steps.
    gradient_accumulation_steps=6,
    # eval_steps and save_steps are kept equal so that every evaluated step
    # also has a checkpoint, as load_best_model_at_end requires.
    eval_steps=500,
    save_steps=500,
    load_best_model_at_end=True,
)

In [None]:
import nltk
nltk.download('punkt')
import numpy as np

def compute_metrics(eval_pred):
    """Score generated corrections against the references with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        The result of ``rouge_metric.compute`` for the decoded texts.
    """
    predictions, labels = eval_pred

    # -100 marks ignored/padded positions and cannot be decoded. It may appear
    # in the predictions as well as in the labels, so map it to the pad token
    # in both before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Build the 🤗 Seq2SeqTrainer from the model, the training arguments, the
# tokenizing dataset wrappers for both splits, the collator and the metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=GrammarDataset(train_dataset, tokenizer),
    eval_dataset=GrammarDataset(test_dataset, tokenizer),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning. The run recorded below was stopped manually (KeyboardInterrupt) after the first evaluation.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
500,0.0,,0.390941,0.259529,0.367211,0.368588




KeyboardInterrupt: ignored

In [None]:
# Save the trainer's current model under the relative directory 't5large_gec_model'.
trainer.save_model('t5large_gec_model')

I have uploaded this model to the Hugging Face Hub, so we can load it from there and run inference with it.

## Testing

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Checkpoint id loaded for inference.
model_name = 'deep-learning-analytics/GrammarCorrector'
# Run on the GPU when one is available, otherwise on the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE: the next two lines rebind the notebook-level `tokenizer` and `model` names that the
# training cells above also use; re-running those cells afterwards would pick up these objects.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def correct_grammar(input_text, num_return_sequences=1):
  """Generate corrected versions of `input_text` with beam search.

  Args:
    input_text: The sentence to correct.
    num_return_sequences: How many candidate corrections to return
      (defaults to 1).

  Returns:
    List of decoded candidate strings, `num_return_sequences` long.
  """
  batch = tokenizer(
      [input_text],
      truncation=True,
      padding='max_length',
      max_length=64,
      return_tensors="pt",
  ).to(torch_device)
  translated = model.generate(
      **batch,
      max_length=64,
      # Beam search cannot return more sequences than it has beams, so widen
      # the beam when more than the default 4 candidates are requested.
      num_beams=max(4, num_return_sequences),
      num_return_sequences=num_return_sequences,
      # No `temperature`: it only affects sampling-based decoding, and this
      # call uses plain beam search.
  )
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text

In [None]:
# Example: sentence with a subject-verb agreement error; request two candidate corrections.
text = 'He are moving here.'
print(correct_grammar(text, num_return_sequences=2))

In [None]:
# Example: sentence with an incorrect past-tense form; request a single correction.
text = 'Cat drinked milk'
print(correct_grammar(text, num_return_sequences=1))