In [18]:
import csv
import os
import random
from itertools import chain
from string import punctuation

import datasets
import nltk
import numpy as np
import pandas as pd
import torch
import time
from datasets import Dataset as dDataset
from datasets import load_metric
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import pickle as pkl
from tqdm import tqdm
from icecream import ic
from transformers import (DataCollatorForSeq2Seq, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments, T5ForConditionalGeneration,
                          T5Tokenizer, BertTokenizer, BertModel)
from jiwer import wer
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer

In [2]:
# Notebook-wide settings: set the WANDB_DISABLED environment variable and
# lift the pandas column-width display limit.
os.environ["WANDB_DISABLED"] = "true"
pd.set_option('display.max_colwidth', None)

# Name of the pretrained checkpoint loaded later in the notebook.
model_name = 't5-base'

# Use CUDA when torch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
import os
import pandas as pd
from tqdm import tqdm

# Build (or reload) the working subset of the corpus. The subset is cached in
# dataframePandasGrammar.csv so later runs do not touch the raw file.
if not os.path.exists("dataframePandasGrammar.csv"):
    # Read only the first 100,000 rows of the header-less raw CSV rather than
    # loading the whole file and slicing it afterwards.
    df = pd.read_csv('C4_200M_1M.csv', header=None, nrows=100000)

    # Name the two columns, then drop rows with any missing value. The chain
    # returns a new frame instead of mutating a slice in place.
    df = df.set_axis(["input", "output"], axis=1).dropna()

    # Cache the cleaned subset for subsequent runs.
    df.to_csv("dataframePandasGrammar.csv", index=False)
else:
    # Reuse the cached subset.
    df = pd.read_csv("dataframePandasGrammar.csv")


In [4]:
# Load the pretrained tokenizer and model for `model_name`; the model is moved
# to `device` right after loading.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
model

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [5]:
def calc_token_len(example):
    """Return how many token ids the module-level `tokenizer` yields for `example`."""
    encoded = tokenizer(example)
    token_ids = encoded.input_ids
    return len(token_ids)

In [6]:
# Hold out 20% of the rows as `val_df`, then 20% of the remainder as `test_df`.
# A fixed random_state keeps the partitions identical across kernel restarts:
# this notebook caches the fine-tuned weights and the test predictions on disk,
# so a fresh shuffle on every run could place training rows in the test split.
train_df, val_df = train_test_split(
    df, test_size=0.20, shuffle=True, random_state=42)
train_df, test_df = train_test_split(
    train_df, test_size=0.20, shuffle=True, random_state=42)
ic(train_df.shape, val_df.shape, test_df.shape)

ic| train_df.shape: (64000, 2)
    val_df.shape: (20000, 2)
    test_df.shape: (16000, 2)


((64000, 2), (20000, 2), (16000, 2))

In [7]:
# Token count of every validation input, stored as a new column.
val_df['input_token_len'] = val_df['input'].map(calc_token_len)

In [8]:
# Convert the pandas splits into `datasets.Dataset` objects.
# Note: `test_dataset` is built from the validation frame (`val_df`).
train_dataset, test_dataset = (
    dDataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [9]:
class LangDataset(Dataset):
    """Map-style dataset that tokenizes input/output text pairs on access.

    Args:
        dataset: Indexable collection whose items provide ``'input'`` and
            ``'output'`` entries.
        tokenizer: Callable used to encode both texts. It is called with
            ``padding``, ``truncation``, ``max_length`` and
            ``return_attention_mask`` keyword arguments and must return a
            mapping with ``'input_ids'`` and ``'attention_mask'``.
        print_text: Accepted for backward compatibility; currently unused.
    """

    def __init__(self, dataset, tokenizer, print_text=False):
        self.dataset = dataset
        # When True every example is padded to `max_len`; when False no
        # padding is applied at this stage.
        self.maxPad = False
        self.tokenizer = tokenizer
        # Upper bound on the encoded length; longer texts are truncated.
        self.max_len = 64

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def _encode(self, text):
        """Encode one text with the instance tokenizer and length settings."""
        # `padding=` replaces the deprecated `pad_to_max_length=` flag, and
        # truncation is requested explicitly instead of relying on the
        # implicit fallback that is triggered by passing `max_length` alone.
        padding = 'max_length' if self.maxPad else False
        return self.tokenizer(text,
                              padding=padding,
                              truncation=True,
                              max_length=self.max_len,
                              return_attention_mask=True)

    def tokenize_data(self, example):
        """Encode one example into model features.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` taken from the
            encoded ``example['input']`` and ``labels`` taken from the
            ``input_ids`` of the encoded ``example['output']``.
        """
        input_, target_ = example['input'], example['output']

        # Use the tokenizer handed to this instance, not a module-level one.
        tokenized_inputs = self._encode(input_)
        tokenized_targets = self._encode(target_)

        inputs = {"input_ids": tokenized_inputs['input_ids'],
                  "attention_mask": tokenized_inputs['attention_mask'],
                  "labels": tokenized_targets['input_ids']
                  }

        return inputs

    def __getitem__(self, index):
        """Return the encoded features for the example at `index`."""
        return self.tokenize_data(self.dataset[index])

# Stand-alone instance wrapping `test_dataset`.
dataset = LangDataset(dataset=test_dataset, tokenizer=tokenizer, print_text=True)


In [10]:
# ROUGE metric object used by `compute_metrics` during evaluation.
rouge_metric = load_metric(path="rouge")

  rouge_metric = load_metric("rouge")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [11]:
# Collator that pads each batch to its longest sequence and returns tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding='longest',
    return_tensors='pt',
)

# Training-related settings, gathered in one mapping before being handed to
# Seq2SeqTrainingArguments.
batch_size = 10
training_options = dict(
    output_dir="./weights",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=10,
    predict_with_generate=True,
    fp16=True,
    gradient_accumulation_steps=5,
    eval_steps=9000,
    save_steps=2000,
)
args = Seq2SeqTrainingArguments(**training_options)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (as percentages) and mean generated length.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays;
            ``predictions`` may also arrive as a tuple whose first element is
            the token-id array.

    Returns:
        dict mapping each ROUGE key to its mid F-measure times 100, plus
        ``gen_len`` (mean count of non-pad prediction tokens), all rounded to
        four decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a vocabulary id: swap it for the
    # pad token in both arrays so decoding works and padding is not counted
    # as generated text.
    predictions = np.where(predictions != -100, predictions,
                           tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(
        predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    result = rouge_metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(
        pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [13]:
# Wrap both splits in the on-access tokenizing dataset, then build the trainer.
train_split = LangDataset(train_dataset, tokenizer)
eval_split = LangDataset(test_dataset, tokenizer)
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune and save only when no 't5' path exists yet.
weights_missing = not os.path.exists('t5')
if weights_missing:
    trainer.train()
    trainer.save_model('t5')

# Always reload the saved weights and rebind `model` to them.
print('Loading model')
model = T5ForConditionalGeneration.from_pretrained('./t5/')
model.to(device)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Step,Training Loss,Validation Loss


Loading model


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [14]:

def correct_grammar(input_text, num_return_sequences=1, max_length=64,
                    num_beams=4):
    """Generate corrected versions of `input_text` with the module-level model.

    Args:
        input_text: Sentence to correct.
        num_return_sequences: Number of candidate outputs to return.
        max_length: Token limit applied both to the encoded input and to the
            generated output.
        num_beams: Beam width passed to `model.generate`.

    Returns:
        List of decoded candidate strings with special tokens removed.
    """
    batch = tokenizer([input_text], truncation=True, padding='max_length',
                      max_length=max_length, return_tensors="pt")
    batch = batch.to(device)
    # No `temperature` here: sampling is not enabled for this beam search,
    # so a temperature value would have no effect on the result.
    translated = model.generate(**batch, max_length=max_length,
                                num_beams=num_beams,
                                num_return_sequences=num_return_sequences)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text


## Testing

In [20]:
# Generate predictions for the first 1000 test rows and cache them as CSV.
# Rows go to a temporary file that is renamed only after the loop finishes,
# so an interrupted run cannot leave a truncated './grammarOutput.csv' that
# the existence check would later treat as a complete cache.
if not os.path.exists('./grammarOutput.csv'):
    test_df = test_df[:1000]
    partial_path = './grammarOutput.csv.partial'
    # newline='' lets the csv module control line endings itself.
    with open(partial_path, 'w', newline='') as grammarOutputFile:
        grammarOutputWriter = csv.DictWriter(
            grammarOutputFile, fieldnames=['input', 'output', 'truth'])
        grammarOutputWriter.writeheader()

        for index, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Testing"):
            inp = row['input']
            prediction = correct_grammar(inp, num_return_sequences=1)[0]
            # Metrics are computed afterwards from the saved CSV rows.
            grammarOutputWriter.writerow(
                {'input': inp, 'output': prediction, 'truth': row['output']})

    os.replace(partial_path, './grammarOutput.csv')
# print('WER: ', total_wer/len(test_df))
# print('Fake WER: ', total_fake_wer/len(test_df))

# Score the cached predictions against the reference text. Every metric is
# also computed for the unmodified input ("fake") as a baseline.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
total_rouge_scores = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
total_rouge_scoresFake = {"rouge1": 0, "rouge2": 0, "rougeL": 0}

# Read every row up front inside a context manager so the file handle is
# closed before scoring starts; newline='' lets the csv module handle line
# endings inside quoted fields.
with open('./grammarOutput.csv', 'r', newline='') as grammarFile:
    grammarReader = csv.DictReader(grammarFile)
    data = list(grammarReader)

total_wer = 0
total_fake_wer = 0
correct = []
predicted = []
Inps = []
for row in tqdm(data, desc="Calculating Stats"):
    truth = row['truth']
    prediction = row['output']
    inp = row['input']

    # Keep the raw texts for the embedding comparison further down.
    correct.append(truth)
    predicted.append(prediction)
    Inps.append(inp)

    rougeScore = scorer.score(truth, prediction)
    fakeRougeScore = scorer.score(truth, inp)

    for key in rougeScore:
        total_rouge_scores[key] += rougeScore[key].fmeasure
        total_rouge_scoresFake[key] += fakeRougeScore[key].fmeasure

    total_wer += wer(truth, prediction)
    total_fake_wer += wer(truth, inp)


print('WER: ', total_wer/len(data))
print('Fake WER: ', total_fake_wer/len(data))
# Relative change of the prediction WER with respect to the baseline WER.
percentChange = (total_wer-total_fake_wer)/total_fake_wer
print(f"Percent change: {percentChange*100}%")
avgRouge = {}
avgRougeFake = {}
for key in total_rouge_scores:
    avgRouge[key] = total_rouge_scores[key]/len(data)
    avgRougeFake[key] = total_rouge_scoresFake[key]/len(data)
print('Avg ROUGE: ', avgRouge)
print('Avg Fake ROUGE: ', avgRougeFake)

# Relative change, in percent, of each averaged ROUGE score over the baseline.
percChangeRouge = {}
for key in avgRouge:
    percChangeRouge[key] = 100 * \
        (avgRouge[key]-avgRougeFake[key])/avgRougeFake[key]
print(f'Percent change ROUGE: {percChangeRouge}')

# BLEU
# bleuCorrect = [[text.split()] for text in correct]
# bleuPredicted = [text.split() for text in predicted]
# blueInps = [text.split() for text in Inps]
# actualBleu = corpus_bleu(bleuCorrect, bleuPredicted)
# print('BLEU: ', actualBleu)

# fakeBleu = corpus_bleu(bleuCorrect, blueInps)
# print("Fake BLEU: ", fakeBleu)

# print(f"Percent change: {((actualBleu-fakeBleu)/fakeBleu)*100}%")

# BERTScore
def cosine_similarity(vector1, vector2):
    """Cosine similarity between vectors, or between the rows of two matrices.

    Args:
        vector1: 1-D vector of length d, or 2-D array of shape (n, d).
        vector2: 1-D vector of length d, or 2-D array of shape (m, d).

    Returns:
        A scalar for two 1-D inputs; an (n, m) array for two 2-D inputs,
        where entry [i, j] is the cosine similarity of row i of `vector1`
        and row j of `vector2`; a 1-D array when exactly one input is 1-D.
    """
    rows1 = np.atleast_2d(np.asarray(vector1))
    rows2 = np.atleast_2d(np.asarray(vector2))

    # Normalise each row by its own length. A single norm taken over a whole
    # matrix would scale every entry by the same constant and would not give
    # a cosine.
    norm1 = np.linalg.norm(rows1, axis=1, keepdims=True)   # shape (n, 1)
    norm2 = np.linalg.norm(rows2, axis=1, keepdims=True)   # shape (m, 1)
    similarity = np.dot(rows1, rows2.T) / (norm1 * norm2.T)

    # Drop the axes that were added for 1-D inputs so the result keeps the
    # shape that a plain dot product of the original arguments would have.
    if np.ndim(vector2) == 1:
        similarity = similarity[:, 0]
    if np.ndim(vector1) == 1:
        similarity = similarity[0]
    return similarity

tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
model_bert = BertModel.from_pretrained("bert-base-uncased")
model_bert.eval()


def _bert_sentence_embeddings(texts, batch_size=32):
    """Return one mean-pooled BERT vector per text as a 2-D numpy array.

    Texts are encoded in chunks of `batch_size` with autograd disabled, so
    memory use does not grow with the total number of texts. Pooling averages
    only positions whose attention mask is 1, which keeps each vector
    independent of how much padding its batch needed.
    """
    pooled_batches = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            encoded = tokenizer_bert(texts[start:start + batch_size],
                                     return_tensors="pt", padding=True,
                                     truncation=True)
            hidden = model_bert(**encoded).last_hidden_state
            mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            # Sum over real tokens only, then divide by their count.
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            pooled_batches.append(pooled)
    return torch.cat(pooled_batches, dim=0).numpy()


# Obtain the representation vectors
embeddings_correct = _bert_sentence_embeddings(correct)
embeddings_predicted = _bert_sentence_embeddings(predicted)
embeddings_inps = _bert_sentence_embeddings(Inps)

similarity_bert = cosine_similarity(embeddings_correct, embeddings_predicted)
similarity_fake = cosine_similarity(embeddings_correct, embeddings_inps)

# Only element [0][0] of each result is printed.
print("Similarity between the texts: {:.4f}".format(similarity_bert[0][0]))
print("Fake Similarity between the texts: {:.4f}".format(similarity_fake[0][0]))

Calculating Stats:   0%|          | 0/1000 [00:00<?, ?it/s]

Calculating Stats: 100%|██████████| 1000/1000 [00:01<00:00, 672.94it/s]


WER:  0.25430218068180593
Fake WER:  0.2726298614615928
Percent change: -6.722550743902578%
Avg ROUGE:  {'rouge1': 0.8747341358004327, 'rouge2': 0.763919816176895, 'rougeL': 0.8673250689402646}
Avg Fake ROUGE:  {'rouge1': 0.8712120802953993, 'rouge2': 0.7395022535201441, 'rougeL': 0.8632321464185431}
Percent change ROUGE: {'rouge1': 0.4042707378253019, 'rouge2': 3.301891581874097, 'rougeL': 0.4741392612291661}


: 