# GoEmotions Dataset

In [4]:
import pandas as pd
import numpy as np

# Location of the tab-separated training split (relative to the notebook).
train_csv_path = 'train.tsv'

# The file carries no header row, so pandas labels the columns 0, 1, ...
original_train = pd.read_csv(
    train_csv_path,
    delimiter='\t',
    header=None,
)

In [None]:
# Preview the first ten rows of the raw training table.
original_train.iloc[:10]

In [6]:
# Column 0 is used as the input text and column 1 as its label
# (they are later combined under the keys 'text' and 'label').
X_train, y_train = original_train[0], original_train[1]

In [None]:
# Sanity check: shapes first, then the contents of both series.
print(X_train.shape, y_train.shape, sep="\n")
print(X_train, y_train, sep="\n")

In [None]:
# Label distribution: absolute counts, most frequent label first.
y_train.value_counts(sort=True, ascending=False)

# Augmentation using EDA (Easy Data Augmentation)

In [None]:
import nltk

# NLTK data packages fetched up front; presumably these back the POS tagging
# and WordNet lookups done by the synonym augmenter configured later on.
nltk.download('averaged_perceptron_tagger')
# NLTK >= 3.9 looks the English tagger up under the "_eng" name, so fetch it
# as well; nltk.download reports an unavailable package instead of raising,
# which keeps this cell usable with older setups too.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [12]:
# build the EDA-style augmentation pipeline (random swap, random delete, synonym replacement); it is applied to the train data and saved to a file in later cells
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action
import nlpaug.flow as naf

# EDA-style pipeline: three word-level augmenters run in sequence, each one
# wrapped in its own `Sometimes` flow (random swap, random delete, WordNet
# synonym replacement).
aug_eda = naf.Sequential([
    naf.Sometimes([word_augmenter])
    for word_augmenter in (
        naw.RandomWordAug(action="swap"),
        naw.RandomWordAug(action="delete"),
        naw.SynonymAug(aug_src='wordnet'),
    )
])


In [None]:
from tqdm.notebook import tqdm

# Register tqdm's progress-aware pandas methods (e.g. progress_apply).
tqdm.pandas()

# Assemble the un-augmented training frame with named columns.
data = dict(text=X_train, label=y_train)
base_train = pd.DataFrame(data)
base_train.shape

In [20]:
# Number of augmented variants requested for every original text.
rep = 5

# Work on a copy so the un-augmented frame stays untouched.
eda_train = base_train.copy()

# progress_apply is the tqdm-instrumented counterpart of apply (made available
# by the earlier tqdm.pandas() call); it yields the same result while showing a
# progress bar for this long-running pass.
# Assumes aug_eda.augment returns a list of variants per text -- TODO confirm
# for the installed nlpaug version.
eda_train['paraphrase'] = eda_train['text'].progress_apply(
    lambda text: aug_eda.augment(text, n=rep)
)

# One row per variant: explode repeats the other columns for each list item,
# then the duplicated index is replaced by a fresh 0..N-1 range.
eda_train = eda_train.explode('paraphrase').reset_index(drop=True)

In [None]:
# Preview the first twenty augmented rows.
eda_train.iloc[:20]

In [None]:
# drop the text col
augmented_train = eda_train.drop(columns=['text'])

columns_titles = ["paraphrase", "label"]
augmented_train = augmented_train.reindex(columns=columns_titles)
augmented_train

In [None]:
# concat augmented and original
base_train = base_train.rename(columns={'text': 'paraphrase'})
final_train = pd.concat([augmented_train, base_train], axis=0, ignore_index=True)
final_train.shape

# Create TSV file from augmented dataset

In [31]:
# Write the combined (augmented + original) data as a tab-separated file
# without index or header row, mirroring how the input file was read.
# `header` is documented as bool or list of str, so pass False explicitly
# rather than relying on None being falsy.
final_train.to_csv(
    'train_EDA_augmented.tsv',
    sep="\t",
    encoding='utf-8',
    index=False,
    header=False,
)