In [7]:
import pandas as pd
from nltk.tokenize import sent_tokenize
pd.set_option('display.max_colwidth', None)

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [8]:
def get_new_df(df: pd.DataFrame) -> pd.DataFrame:
    """Split every answer into sentences, producing one row per sentence.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``'Answer'`` column; each value is passed to
        ``sent_tokenize``.

    Returns
    -------
    pd.DataFrame
        Two columns: ``'Answer'`` holds a single sentence and ``'Index'``
        holds the 0-based row position in ``df`` of the answer that the
        sentence was taken from.
    """
    texts = []
    indexes = []
    # Iterate by position, not by label: ``df['Answer'][i]`` is a label
    # lookup and breaks (KeyError or wrong row) as soon as ``df`` does not
    # carry a default 0..n-1 index, e.g. after filtering or sorting.
    for position, answer in enumerate(df['Answer']):
        for sentence in sent_tokenize(answer):
            texts.append(sentence)
            indexes.append(position)

    # Build the frame in one shot instead of assigning Series into an
    # empty frame column by column.
    return pd.DataFrame({'Answer': texts, 'Index': indexes})

In [9]:
# Read the FAQ file and its companion test file from the local data folder.
df, test_df = map(pd.read_csv, ("./data/FAQs.csv", "./data/FAQs_test.csv"))

# Expand each answer into one row per sentence.
new_df = get_new_df(df)

In [10]:
# Take the sentence stored under row label 4 as the running example
# for the augmentation cells below.
text = new_df.loc[4, 'Answer']
text

'He had one sister named Maja.'

## Text Augmentation

In [11]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action
import os

In [12]:
# Replace words with WordNet synonyms; aug_p=0.5 is the augmentation
# probability handed to nlpaug.
aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.5)
augmented_text = aug.augment(text)

# Show the input sentence and the augmenter's result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
He had one sister named Maja.
Augmented Text:
['Atomic number 2 experience one and only sister named Genus maja.']


In [13]:
from nltk.corpus import wordnet

# Flatten every lemma of every WordNet synset returned for "sibling".
lemmas = [
    lemma
    for synset in wordnet.synsets("sibling")
    for lemma in synset.lemmas()
]

# Lemma names serve as the synonym candidates; for antonyms keep only the
# first antonym of each lemma that has any.
synonyms = [lemma.name() for lemma in lemmas]
antonyms = [lemma.antonyms()[0].name() for lemma in lemmas if lemma.antonyms()]

# De-duplicate for display: synonyms on the first line, antonyms on the second.
print(set(synonyms), set(antonyms), sep="\n")


{'sibling', 'sib'}
set()


Note:
- WordNet returns no useful synonyms for "sibling" (only "sibling" itself and the abbreviation "sib") and no antonyms.

#### Using Contextual Word Embedding

In [14]:
# Substitute words using the 'bert-base-uncased' model through nlpaug's
# contextual word-embedding augmenter.
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
)
augmented_text = aug.augment(text)

# Show the input sentence and the augmenter's result, one item per line.
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
He had one sister named Maja.
Augmented Text:
['she saves one girl named maja.']
