In [None]:
# Colab-only: expose Google Drive inside the runtime at /content/drive so the
# notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the data folder on the mounted Drive so the
# relative file names used later (e.g. 'train.tsv') resolve against it.
%cd /content/drive/MyDrive/711_as3/dataAug/

In [None]:
! pip install transformers

In [5]:
import pandas as pd
import numpy as np

# Training set location, relative to the current working directory.
train_csv_path = 'train.tsv'

# Tab-separated, read with header=None so pandas assigns integer column
# labels (0, 1, ...) instead of consuming the first row as a header.
original_train = pd.read_csv(
    train_csv_path,
    header=None,
    sep='\t',
)

In [7]:
# Column 0 becomes the inputs and column 1 the targets.
X_train, y_train = original_train[0], original_train[1]

In [None]:
# Sanity check: shapes of both series first, then their contents.
print(X_train.shape, y_train.shape, sep='\n')
print(X_train, y_train, sep='\n')

# Augmentation using ProtAugmenter

In [None]:
! pip install nlpaug==1.1.7

In [None]:
from tqdm.notebook import tqdm

# Hook tqdm into pandas; this is what makes `progress_apply` available below.
tqdm.pandas()

# Re-assemble inputs and targets into one frame with explicit column names.
data = dict(text=X_train, label=y_train)
base_train = pd.DataFrame.from_dict(data)
base_train.shape

In [12]:
from nlpaug.augmenter.word import WordAugmenter

class ProtAugmenter(WordAugmenter):
    """nlpaug augmenter that paraphrases text with a pretrained seq2seq model.

    The tokenizer and model are loaded with ``from_pretrained`` from the
    ``tdopierre/ProtAugment-ParaphraseGenerator`` checkpoint. Every call to
    :meth:`substitute` runs beam search and returns ``n_data`` decoded
    sequences for the given input.
    """

    MODEL_NAME = "tdopierre/ProtAugment-ParaphraseGenerator"

    def __init__(self,n_data = 1, num_beams = 10,
        name='ProtAugmenter', device=None):
        """Load the paraphrase model and remember the generation settings.

        Args:
            n_data: Number of sequences requested from ``generate`` per call
                (passed as ``num_return_sequences``).
            num_beams: Beam width passed to ``generate``.
            name: Augmenter name forwarded to the nlpaug base class.
            device: Torch device string for the model and inputs. ``None``
                (default) selects ``'cuda'`` when available, else ``'cpu'``.

        Raises:
            ValueError: If ``n_data`` is smaller than 1 or larger than
                ``num_beams`` (beam search cannot return more sequences than
                it keeps beams).
        """
        # Fail fast here instead of on the first row of a long `apply` run.
        if n_data < 1:
            raise ValueError(f"n_data must be >= 1, got {n_data}")
        if n_data > num_beams:
            raise ValueError(
                f"n_data ({n_data}) cannot exceed num_beams ({num_beams})"
            )

        # `name` was previously accepted but silently dropped.
        super().__init__(
            action='substitute',
            name=name)

        # Imported lazily so the heavy dependencies are only needed when an
        # augmenter is actually built.
        import torch
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

        if device is None:
            # Previously hard-coded to 'cuda', which crashes on CPU-only runtimes.
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.MODEL_NAME)
        model = model.to(device)

        self.model = model
        # NOTE: this replaces the tokenizer attribute set by the base class
        # with the Hugging Face tokenizer used by `substitute`.
        self.tokenizer = tokenizer
        self.n_data = n_data
        self.num_beams = num_beams
        # Kept under its own name so the base class's `device` attribute is
        # left exactly as the base initializer set it.
        self.model_device = device

    def substitute(self, data, n=1):
        """Generate paraphrases for ``data``.

        Args:
            data: Text to paraphrase; it is handed to the tokenizer as-is and
                truncated to the tokenizer's maximum length.
            n: Unused. Present only to match the action signature of the base
                class; the number of outputs is controlled by ``n_data``.

        Returns:
            list[str]: Decoded sequences with special tokens removed,
            ``n_data`` per input sequence.
        """
        batch = self.tokenizer(data, return_tensors='pt', truncation=True).to(self.model_device)
        generated_ids = self.model.generate(
            batch['input_ids'],
            # Pass the mask explicitly when the tokenizer provides one so
            # generate() does not have to infer it.
            attention_mask=batch.get('attention_mask'),
            num_return_sequences=self.n_data,
            num_beams=self.num_beams,
        )
        result = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        return result

In [None]:
# Ask the augmenter for five generated sequences per input text.
aug_prot = ProtAugmenter(n_data=5)

# Work on a copy so the original frame keeps its two columns.
ProtAugm_train = base_train.copy()
ProtAugm_train['paraphrase'] = ProtAugm_train['text'].progress_apply(aug_prot.augment)

# Each cell of 'paraphrase' holds the augmenter's output for one row; explode
# it to one row per generated item, then renumber the index from 0.
ProtAugm_train = ProtAugm_train.explode('paraphrase')
ProtAugm_train = ProtAugm_train.reset_index(drop=True)
ProtAugm_train

In [15]:
# Keep only the generated text and its label, with the text column first.
columns_titles = ["paraphrase", "label"]
augmented_train = (
    ProtAugm_train
    .drop(columns=['text'])
    .reindex(columns=columns_titles)
)

In [None]:
# Give the original rows the same column name as the augmented rows so the
# two frames stack cleanly.
# NOTE: `base_train` is rebound here and no longer has a 'text' column, so the
# augmentation cell above cannot be re-run without rebuilding `base_train`.
base_train = base_train.rename({'text': 'paraphrase'}, axis='columns')

# Augmented rows first, originals after; the index is renumbered from 0.
final_train = pd.concat(
    [augmented_train, base_train],
    ignore_index=True,
    axis=0,
)
final_train.shape

In [17]:
# Write the combined set as a header-less, index-less TSV (the same layout the
# input file was read with). `header=False` is the documented way to omit the
# header row; `header=None` only worked because None is falsy.
final_train.to_csv(
    'train_Prot_augmented.tsv',
    sep="\t",
    encoding='utf-8',
    index=False,
    header=False,
)