# Data Augmentation Library Test

Reference: [Data Augmentation in NLP](https://bhuvanagopalakrishna-basapur.medium.com/data-augmentation-in-nlp-b09e919daab5)

There are three standard methods to generate new data in NLP:

- **Paraphrasing**
- **Noising**
- **Sampling**

![Alt text](https://miro.medium.com/max/1400/0*SPfMHZhYOrZRVdMC "a title")

# AugLy

In [1]:
import augly.text as textaugs

In [2]:
# Sample sentence (Spanish) reused by the augmentation cells below.
# Disabled alternative: an English sample sentence.
#input_text = "Hello, world! How are you today?"
input_text = "Mi vieja mula ya no es lo que era"

In [3]:
# Noising: inject simulated typos into the sample sentence.
# In the recorded run "mula" became "mual" and a stray "#" appeared before "lo".
print(textaugs.simulate_typos(input_text))

Mi vieja mual ya no es #lo que era


In [4]:
"""
You can optionally pass in a metadata list, to which metadata about the
augmentation will be appended including kwargs and intensity (defined based on
the kwargs for each augmentation).
"""
# Start from an empty list so it only holds the metadata of the call below.
meta = []
# Word-level "fun font" replacement; the recorded output restyled "vieja" and "que".
print(
    textaugs.replace_fun_fonts(
        input_text, vary_fonts=True, granularity="word", metadata=meta
    )
)
# Bare last expression: the notebook displays the collected metadata.
meta

Mi v̲i̲e̲j̲a̲ mula ya no es lo 𝓆𝓊𝑒 era


[{'name': 'replace_fun_fonts',
  'input_type': 'string',
  'src_length': 1,
  'dst_length': 38,
  'aug_p': 0.3,
  'aug_min': 1,
  'aug_max': 10000,
  'granularity': 'word',
  'vary_fonts': True,
  'fonts_path': 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\augly\\assets\\text\\fun_fonts.json',
  'n': 1,
  'priority_words': None,
  'intensity': 30.0}]

In [5]:
# Noising: replace characters with look-alike Unicode characters, using the
# class-based transform. aug_word_p=0.6 is echoed back in the metadata shown below.
# Reset the list so it only holds the metadata of this call.
meta = []
aug = textaugs.ReplaceSimilarUnicodeChars(aug_word_p=0.6)
print(aug(input_text, metadata=meta))
# Bare last expression: the notebook displays the collected metadata.
meta

ʍi vieja muᏝa ya no eṧ Ⓛo que erǺ


[{'name': 'replace_similar_unicode_chars',
  'input_type': 'string',
  'src_length': 1,
  'dst_length': 33,
  'aug_char_p': 0.3,
  'aug_word_p': 0.6,
  'min_char': 2,
  'aug_char_min': 1,
  'aug_char_max': 1000,
  'aug_word_min': 1,
  'aug_word_max': 1000,
  'n': 1,
  'mapping_path': 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\augly\\assets\\text\\letter_unicode_mapping.json',
  'priority_words': None,
  'intensity': 18.0}]

In [6]:
# Swap gendered words. This cell uses its own English sentence instead of
# the shared Spanish sample.
gendered_text = "She has two brothers, but she always wanted a sister"

# In the recorded run every gendered word was swapped
# (She -> He, brothers -> sisters, she -> he, sister -> brother).
aug = textaugs.SwapGenderedWords(aug_word_p=1.0)
print(aug(gendered_text))

He has two sisters, but he always wanted a brother


In [7]:
# Contractions, demonstrated on an English sentence.
# Note: the recorded result is a one-element list (["I'm Julieta"]), not a bare string.
aug = textaugs.Contractions(aug_p=1.0)
print(aug("I am Julieta"))

["I'm Julieta"]


In [8]:
# Noising: insert random punctuation characters.
# The input is a list of texts; the recorded output is a list with one entry per input.
texts = ["hello world", "bye planet"]
# Disabled alternative: run on the single Spanish sample sentence instead.
#texts = input_text

# NOTE(review): the name says "synonyms", but the value holds the
# punctuation-augmented texts; consider renaming if nothing else depends on it.
augmented_synonyms = textaugs.insert_punctuation_chars(
    texts,
    granularity="all",
    cadence=5.0,
    vary_chars=True,
)
# Bare last expression: the notebook displays the augmented list.
augmented_synonyms

['hello: worl!d', 'bye p-lanet']

In [9]:
# Mirror the text (analogous to a flip in computer vision).
# The recorded result is a one-element list whose string is reversed and wrapped
# in the control characters U+202E ... U+202C.
print(textaugs.replace_bidirectional(input_text))

['\u202eare euq ol se on ay alum ajeiv iM\u202c']


In [10]:
# Replace characters with visually similar ASCII sequences.
# In the recorded run the leading "M" became "IVI".
print(textaugs.replace_similar_chars(input_text))

IVIi vieja mula ya no es lo que era


In [11]:
# Render the text upside down. The recorded output is the sentence reversed,
# written with flipped look-alike characters.
print(textaugs.replace_upside_down(input_text))

ɐɹǝ ǝnb ol sǝ ou ɐʎ ɐlnɯ ɐɾǝᴉʌ ᴉW


In [12]:
# Split some words with an inserted space.
# In the recorded run "vieja" became "vie ja" and "mula" became "m ula".
print(textaugs.split_words(input_text))

Mi vie ja m ula ya no es lo que era


In [13]:
from augly.text.composition import Compose, OneOf
from augly.text.functional import (
    apply_lambda,
    change_case,
    contractions,
    get_baseline,
    insert_punctuation_chars,
    insert_whitespace_chars,
    insert_zero_width_chars,
    merge_words,
    replace_bidirectional,
    replace_fun_fonts,
    replace_similar_chars,
    replace_similar_unicode_chars,
    replace_upside_down,
    replace_words,
    simulate_typos,
    split_words,
    swap_gendered_words,
)

In [14]:
# apply_lambda with default arguments: the recorded output is identical to the input.
print(apply_lambda(input_text))

Mi vieja mula ya no es lo que era


In [15]:
# change_case with default arguments. The recorded result is a one-element list
# in which some words were upper-cased and others left untouched.
print(change_case(input_text))

['MI VIEJA mula ya NO es LO QUE era']


In [16]:
# get_baseline: the recorded output is identical to the input sentence.
print(get_baseline(input_text))

Mi vieja mula ya no es lo que era


In [17]:
# insert_punctuation_chars with default arguments. In the recorded run a "!" was
# inserted between every pair of characters, and the result is a one-element list.
print(insert_punctuation_chars(input_text))

['M!i! !v!i!e!j!a! !m!u!l!a! !y!a! !n!o! !e!s! !l!o! !q!u!e! !e!r!a']


In [18]:
# insert_whitespace_chars with default arguments. In the recorded run a tab ("\t")
# was inserted between every pair of characters; the result is a one-element list.
print(insert_whitespace_chars(input_text))

['M\ti\t \tv\ti\te\tj\ta\t \tm\tu\tl\ta\t \ty\ta\t \tn\to\t \te\ts\t \tl\to\t \tq\tu\te\t \te\tr\ta']


In [19]:
# insert_zero_width_chars with default arguments. In the recorded run U+2062 was
# inserted between every pair of characters; the result is a one-element list.
print(insert_zero_width_chars(input_text))

['M\u2062i\u2062 \u2062v\u2062i\u2062e\u2062j\u2062a\u2062 \u2062m\u2062u\u2062l\u2062a\u2062 \u2062y\u2062a\u2062 \u2062n\u2062o\u2062 \u2062e\u2062s\u2062 \u2062l\u2062o\u2062 \u2062q\u2062u\u2062e\u2062 \u2062e\u2062r\u2062a']


In [20]:
# Merge adjacent words by dropping the space between them.
# In the recorded run "no es" became "noes" and "que era" became "queera".
print(merge_words(input_text))

Mi vieja mula ya noes lo queera


In [21]:
# Functional form of the look-alike Unicode replacement, with default arguments.
# In the recorded run one character changed in each of two words ("que", "era").
print(replace_similar_unicode_chars(input_text))

Mi vieja mula ya no es lo ⓠue ໂra


In [22]:
# replace_words with default arguments: the recorded output is identical to the input.
# NOTE(review): presumably a word mapping must be supplied for any replacement to occur - confirm.
print(replace_words(input_text))

Mi vieja mula ya no es lo que era


# Thesaurus

In [23]:
from py_thesaurus import Thesaurus

In [24]:
# Word to look up in the thesaurus.
input_word = "dog"

# Thesaurus wrapper for that word; queried by the cells below.
new_instance = Thesaurus(input_word)

In [25]:
# Get the synonyms according to part of speech
# Default part of speech is noun
# NOTE: in the recorded run all three lookups printed an empty list.

print(new_instance.get_synonym())

print(new_instance.get_synonym(pos='verb'))

print(new_instance.get_synonym(pos='adj'))

[]
[]
[]


In [26]:
# Get the definitions
# NOTE: the recorded run printed "Give a non-empty argument" followed by an empty list.

print(new_instance.get_definition())

Give a non-empty argument
[]


In [27]:
# Get the antonyms
# NOTE: the recorded run printed an empty list.

print(new_instance.get_antonym())

[]


# WordNet

In [28]:
import nltk
# Fetch the WordNet corpus; the recorded log reports it was already up to date.
nltk.download('wordnet')
# Disabled: additional download of the 'omw-1.4' package.
#nltk.download('omw-1.4')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MegaTecnologia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
def syn(word, lch_threshold=2.26):
    """Yield WordNet synsets whose LCH similarity to a sense of ``word`` is high.

    Each synset returned by ``wordnet.synsets(word)`` is compared, via
    ``lch_similarity``, against every synset from ``wordnet.all_synsets()``.
    This is an exhaustive scan and can take a long time.

    Args:
        word: Word to look up in WordNet.
        lch_threshold: Minimum score (inclusive) for a pair to be reported.
            The default value was found empirically.

    Yields:
        ``(sense, candidate, score)`` tuples with ``score >= lch_threshold``.
    """
    for net1 in wordnet.synsets(word):
        for net2 in wordnet.all_synsets():
            try:
                lch = net1.lch_similarity(net2)
            except Exception:
                # Pairs that cannot be compared are skipped. Only Exception is
                # caught, so KeyboardInterrupt can still stop this long scan.
                continue
            # A missing score cannot be compared to the threshold; skip it
            # instead of raising TypeError.
            if lch is None:
                continue
            if lch >= lch_threshold:
                yield (net1, net2, lch)

In [30]:
# Print every (sense, candidate, score) tuple that syn() yields for 'love'.
# NOTE: the recorded output is long and is cut off mid-line in the saved notebook.
for x in syn('love'):
    print (x)

(Synset('love.n.01'), Synset('feeling.n.01'), 2.538973871058276)
(Synset('love.n.01'), Synset('conditioned_emotional_response.n.01'), 2.538973871058276)
(Synset('love.n.01'), Synset('emotion.n.01'), 2.9444389791664407)
(Synset('love.n.01'), Synset('worship.n.02'), 2.9444389791664407)
(Synset('love.n.01'), Synset('anger.n.01'), 2.538973871058276)
(Synset('love.n.01'), Synset('fear.n.01'), 2.538973871058276)
(Synset('love.n.01'), Synset('fear.n.03'), 2.538973871058276)
(Synset('love.n.01'), Synset('anxiety.n.02'), 2.538973871058276)
(Synset('love.n.01'), Synset('joy.n.01'), 2.538973871058276)
(Synset('love.n.01'), Synset('love.n.01'), 3.6375861597263857)
(Synset('love.n.01'), Synset('agape.n.02'), 2.9444389791664407)
(Synset('love.n.01'), Synset('agape.n.01'), 2.9444389791664407)
(Synset('love.n.01'), Synset('filial_love.n.01'), 2.9444389791664407)
(Synset('love.n.01'), Synset('ardor.n.02'), 2.9444389791664407)
(Synset('love.n.01'), Synset('amorousness.n.01'), 2.9444389791664407)
(Synset

# Semantic Embeddings

In [31]:
from gensim.models import KeyedVectors

In [32]:
# Load previously saved vectors from 'word2vec.kv' (a relative path).
# NOTE(review): the recorded load log mentions 'word2vec.kv.wv.*', which suggests
# the file holds a full Word2Vec model rather than bare KeyedVectors - confirm.
loaded_word_vectors = KeyedVectors.load('word2vec.kv')

loading KeyedVectors object from word2vec.kv
loading wv recursively from word2vec.kv.wv.* with mmap=None
setting ignored attribute cum_table to None
Word2Vec lifecycle event {'fname': 'word2vec.kv', 'datetime': '2022-04-27T17:58:47.276885', 'gensim': '4.0.1', 'python': '3.8.10 | packaged by conda-forge | (default, May 11 2021, 06:25:23) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'loaded'}


In [34]:
# Top-3 nearest neighbours of "reporte"; the recorded output is a list of (word, score) pairs.
# NOTE(review): this relies on the loaded object exposing a `.wv` attribute - confirm
# against how 'word2vec.kv' was saved.
loaded_word_vectors.wv.most_similar("reporte", topn=3)

[('gr55', 0.9576658010482788),
 ('provision', 0.9559593200683594),
 ('informes', 0.9514330625534058)]

# Conclusions

![Alt text](https://miro.medium.com/max/1400/0*HRDJYmZ1FD0q83CD "a title")