In [None]:
import nlpaug.augmenter.char as nac

import nlpaug.augmenter.word as naw

import nlpaug.augmenter.sentence as nas

import nlpaug.flow as nafc

from nlpaug.util import Action

In [None]:
from datasets import load_dataset

# Training subset: the first 1000 and the last 1000 rows of the IMDB train split.
imdb_train = load_dataset('imdb', split="train[:1000]+train[-1000:]")

# Test subset: the first 500 and the last 500 rows of the IMDB test split.
imdb_test = load_dataset('imdb', split="test[:500]+test[-500:]")

# Validation subset: rows 500-999 plus the 500 rows that precede the final 500
# of the test split, i.e. slices that sit next to (not inside) the ones used
# for imdb_test above.
imdb_val = load_dataset('imdb', split="test[500:1000]+test[-1000:-500]")

# Cell output: the shape of each of the three subsets.
imdb_train.shape, imdb_test.shape, imdb_val.shape

In [None]:
from sklearn.metrics import (accuracy_score,

precision_recall_fscore_support)

def compute_metrics(pred):
  """Compute accuracy and macro-averaged precision/recall/F1 for a prediction object.

  Args:
    pred: Object exposing `label_ids` (reference labels) and `predictions`
      (per-class scores; the arg-max over the last axis is taken as the
      predicted class). Passed to `Trainer` as its `compute_metrics` callback.

  Returns:
    dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall', in that order.
  """
  y_true = pred.label_ids
  y_pred = pred.predictions.argmax(-1)

  # The fourth value returned (support) is not needed here.
  scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
  macro_precision, macro_recall, macro_f1 = scores[0], scores[1], scores[2]

  # Key order is kept stable: a later cell selects result columns by position.
  metrics = {}
  metrics['Accuracy'] = accuracy_score(y_true, y_pred)
  metrics['F1'] = macro_f1
  metrics['Precision'] = macro_precision
  metrics['Recall'] = macro_recall
  return metrics

def tokenize_it(e):
  """Tokenize the 'text' entry of `e` with the notebook-level `tokenizer`.

  Args:
    e: Mapping with a 'text' entry; used as the function given to
      `Dataset.map(..., batched=True)` elsewhere in this notebook.

  Returns:
    Whatever `tokenizer(...)` returns for that text, with padding and
    truncation both enabled.
  """
  # `tokenizer` is resolved at call time, so it only has to exist before
  # the first `.map(tokenize_it, ...)` call, not before this definition.
  tokenizer_options = dict(padding=True, truncation=True)
  encoded = tokenizer(e['text'], **tokenizer_options)
  return encoded

In [None]:
import nlpaug.augmenter.word as naw

# --- Character-level augmenters -------------------------------------------

# aug1: substitute characters by keyboard distance.
aug1 = nac.KeyboardAug(
    aug_word_p=0.2,
    aug_char_max=2,
    aug_word_max=4,
)

# aug2-aug4: random character insert / swap / delete, at most one character.
aug2 = nac.RandomCharAug(action="insert", aug_char_max=1)
aug3 = nac.RandomCharAug(action="swap", aug_char_max=1)
aug4 = nac.RandomCharAug(action="delete", aug_char_max=1)

# --- Word-level augmenters ------------------------------------------------

# aug5: spelling errors.
aug5 = naw.SpellingAug()

# aug6 / aug7: contextual word insertion and substitution using
# the 'bert-base-uncased' model.
aug6 = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="insert",
)
aug7 = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
)

# aug8: WordNet-based synonym replacement.
aug8 = naw.SynonymAug(aug_src='wordnet')

# aug9: random word augmentation with the augmenter's default action
# (the original note describes this as random word deletion).
aug9 = naw.RandomWordAug()

# aug10: back-translation English -> German -> English, run on the 'cuda' device.
aug10 = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en',
    device='cuda',
)

In [None]:
def augment_it(text, label, augmenters=None):
  """Produce one augmented variant of `text` per augmenter.

  Args:
    text: The input string to augment.
    label: The label attached to `text`; it is repeated once per variant.
    augmenters: Optional iterable of objects exposing `augment(text)`.
      When omitted, the notebook-level augmenters `aug1` ... `aug10` are
      used, in that order.

  Returns:
    A tuple `(texts, labels)`: the list of augmented strings and a list of
    the same length holding `label` for each of them.
  """
  if augmenters is None:
    # Explicit references instead of eval("aug" + str(i)): same objects and
    # order, but no string evaluation and a clear NameError if one is missing.
    augmenters = (aug1, aug2, aug3, aug4, aug5,
                  aug6, aug7, aug8, aug9, aug10)

  result = []
  for augmenter in augmenters:
    augmented = augmenter.augment(text)
    # Depending on the nlpaug release, `augment` may hand back either a list
    # of strings or a bare string; indexing a bare string with [0] would keep
    # only its first character, so that case is handled separately.
    if isinstance(augmented, str):
      result.append(augmented)
    else:
      result.append(augmented[0])

  return result, [label] * len(result)

In [None]:
import pandas as pd

# Training subset as a DataFrame; the loop below reads its `text` and `label` columns.
imdb_df = pd.DataFrame(imdb_train)

texts = []
labels = []

# Augment a 10% sample of the training rows. The fixed random_state makes the
# row selection repeatable across "Restart & Run All" (the augmenters
# themselves may still be stochastic).
for r in imdb_df.sample(frac=0.1, random_state=42).itertuples(index=False):
  aug_texts, aug_labels = augment_it(r.text, r.label)
  texts.extend(aug_texts)
  labels.extend(aug_labels)

# One row per augmented text, with the same two columns as imdb_df uses above.
aug_df = pd.DataFrame({"text": texts, "label": labels})

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the augmented rows would reuse index labels already present in imdb_df.
imdb_augmented = pd.concat([imdb_df, aug_df], ignore_index=True)

# Cell output: shapes before and after adding the augmented rows.
imdb_df.shape, imdb_augmented.shape

In [None]:
# `Dataset` is needed for Dataset.from_pandas below; only `load_dataset`
# is imported from `datasets` earlier in the notebook.
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification

model_path = 'bert-base-uncased'

tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Selects which training set feeds the Trainer:
#   True  -> IMDB training subset plus the augmented rows
#   False -> IMDB training subset only (baseline)
# Previously both variants were assigned to `enc_train` back to back, so the
# augmented encoding was always overwritten and never used.
USE_AUGMENTED_TRAIN = True

# imdb train data with augmentation.
# preserve_index=False keeps the pandas index from becoming an extra column.
imdb_augmented2 = Dataset.from_pandas(imdb_augmented, preserve_index=False)

if USE_AUGMENTED_TRAIN:
  enc_train = imdb_augmented2.map(tokenize_it, batched=True, batch_size=1000)
else:
  # imdb train data without augmentation
  enc_train = imdb_train.map(tokenize_it, batched=True, batch_size=1000)

enc_test = imdb_test.map(tokenize_it, batched=True, batch_size=1000)

enc_val = imdb_val.map(tokenize_it, batched=True, batch_size=1000)

# Classification head on top of the same checkpoint, with explicit
# id <-> label-name mappings stored in the model config.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    id2label={0: "NEG", 1: "POS"},
    label2id={"NEG": 0, "POS": 1},
)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: 3 epochs, batch size 16 for both training and
# evaluation, mixed precision (fp16), outputs written to ./MyIMDBModel.
# NOTE(review): load_best_model_at_end=True is set without an explicit
# evaluation/save strategy; some transformers releases appear to require the
# two strategies to match when this flag is on - confirm against the
# installed version.
training_args = TrainingArguments(
    output_dir='./MyIMDBModel',
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    load_best_model_at_end=True,
)

# Trainer wired to the encoded train/validation sets and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=enc_train,
    eval_dataset=enc_val,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate the trained model on each encoded dataset, in train/val/test order.
q = [
    trainer.evaluate(eval_dataset=data)
    for data in [enc_train, enc_val, enc_test]
]

# Cell output: one row per dataset, restricted to the first five result columns.
pd.DataFrame(q, index=["train", "val", "test"]).iloc[:, :5]