In [None]:
# 安装可参考https://github.com/makcedward/nlpaug
# 最新版安装: pip install numpy git+https://github.com/makcedward/nlpaug.git

# NLP数据增强
# https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb

In [1]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [17]:
# Sample sentence reused by the augmentation examples in the following cells.
text = "The quick brown fox jumps over the lazy dog ."
print(text)

The quick brown fox jumps over the lazy dog .


In [4]:
# Character augmenters: OCR Augmenter, Keyboard Augmenter, Random Augmenter.
# OcrAug substitutes characters using predefined OCR error mappings;
# n=3 requests three augmented variants of the same sentence.
aug = nac.OcrAug()
augmented_texts = aug.augment(text, n=3)
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Texts:
['The quick brown f0x jumps over the lazy dog.', 'The qoicr brown fox jumps uvek the lazy dog.', 'The quick brown fux jumps 0vek the lazy d09.']


In [5]:
# Substitute characters according to keyboard distance.
aug = nac.KeyboardAug()
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The !uic. br(wM fox jumps ov23 the lazy dog.


In [6]:
# Insert characters at random positions.
aug = nac.RandomCharAug(action="insert")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The ^quitck bZroown fox jumps over the la&zky dog.


In [7]:
# Substitute characters at random.
aug = nac.RandomCharAug(action="substitute")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The qufJk brown fox jQ%ps over the 7aoy dog.


In [8]:
# Swap characters at random.
aug = nac.RandomCharAug(action="swap")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The uiqck brown fox jumps over the lzya dog.


In [9]:
# Delete characters at random.
aug = nac.RandomCharAug(action="delete")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick row fox ump oe the lazy dog.


In [None]:
# Word Augmenter: Spelling Augmenter, Word Embeddings Augmenter, TF-IDF Augmenter, Contextual Word Embeddings Augmenter
# Synonym Augmenter, Antonym(反义) Augmenter, Random Word Augmenter, Split Augmenter, Back Translation Augmenter, Reserved Word Augmenter
	# Word2vecAug, GloVeAug和FasttextAug使用词嵌入来找到最相似的一组词
	# BertAug使用语言模型来预测可能的目标词
	# WordNetAug使用统计方法找到相似词组合

In [13]:
# Substitute words using a spelling-mistake dictionary; n=3 requests three variants.
aug = naw.SpellingAug()
augmented_texts = aug.augment(text, n=3)
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Texts:
['Thy quick BRWN fox jumps over they lazy dog.', 'The quick broun fox jumps other the lazi dog.', 'Thr quick brown fox jumps overt the last dog.']


In [7]:
# Insert words chosen by word-embedding similarity.
# Prerequisites:
#   1. pip install "gensim>=4.1.2"   (quote the specifier so the shell does not treat ">" as a redirect)
#   2. a pre-trained embedding file located under model_dir (see the download hints below).
# Download hints (left commented out; run once if the model file is missing):
#   from nlpaug.util.file.download import DownloadUtil
#   DownloadUtil.download_word2vec(dest_dir=model_dir)  # word2vec / GoogleNews-vectors-negative300.bin; this download hit an HTTPSConnection error
#   DownloadUtil.download_glove(model_name='glove.6B', dest_dir='.')  # GloVe; the download may take a long time
#   DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='.')  # fastText; this download works
# NOTE(review): the hints above download into '.', while the augmenter below reads
# from model_dir — move the file or pass dest_dir=model_dir.
model_dir = "temp/nlpaug/"
fasttext_vec_path = model_dir + 'wiki-news-300d-1M.vec'
aug = naw.WordEmbsAug(model_type='fasttext', model_path=fasttext_vec_path, action="insert")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick brown 37.85 fox jumps Lubartów over competition. the lazy dog.


In [8]:
# Substitute words chosen by word-embedding similarity.
# Relies on model_dir having been set by the word-embedding insert cell.
fasttext_vec_path = model_dir + 'wiki-news-300d-1M.vec'
aug = naw.WordEmbsAug(model_type='fasttext', model_path=fasttext_vec_path, action="substitute")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The obvious goldish fox jumps over these lazy dog.


In [None]:
# Insert words selected via TF-IDF.
# Requires the file tfidfaug_w2idf.txt (presumably inside MODEL_DIR — TODO confirm).
import os
os.environ["MODEL_DIR"] = "temp/nlpaug/"
tfidf_model_dir = os.getenv("MODEL_DIR")
aug = naw.TfIdfAug(model_path=tfidf_model_dir, action="insert")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

In [None]:
# Substitute words selected via TF-IDF.
# Relies on `os` and the MODEL_DIR environment variable set by the TF-IDF insert cell.
tfidf_model_dir = os.getenv("MODEL_DIR")
aug = naw.TfIdfAug(model_path=tfidf_model_dir, action="substitute")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

In [3]:
# Insert words using contextual word embeddings.
# Prerequisite: pip install "torch>=1.6.0" "transformers>=4.11.3" sentencepiece
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
the quick moving brown fox jumps over by the lazy brown dog.


In [4]:
# Substitute words using contextual word embeddings.
# The same augmentation is run once per pretrained model, in this order, so the
# printed results can be compared; only the model identifier differs between runs.
for model_path in ('bert-base-uncased', 'distilbert-base-uncased', 'roberta-base'):
    aug = naw.ContextualWordEmbsAug(model_path=model_path, action="substitute")
    augmented_text = aug.augment(text)
    print("Original:")
    print(text)
    print("Augmented Text:")
    print(augmented_text)

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
the quick brown fox jumps near the frozen river.


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
another quick brown boar jumps over the deer dog.


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The dirty brown fox jumps over the lazy stump —


In [8]:
# Substitute words with WordNet synonyms.
# Prerequisites:
#   pip install "nltk>=3.4.5"
#   nltk.download('averaged_perceptron_tagger')
#   nltk.download('wordnet')
#   nltk.download('omw-1.4')
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The agile brown dodger jumps over the faineant dog.


In [18]:
# Substitute words with PPDB synonyms.
# The PPDB file is downloaded from http://paraphrase.org/#/download
import os
os.environ["MODEL_DIR"] = 'temp/nlpaug/'
ppdb_path = os.path.join(os.environ["MODEL_DIR"], 'ppdb-2.0-s-all')
aug = naw.SynonymAug(aug_src='ppdb', model_path=ppdb_path)
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The rapid brown fox climbs over the lazy dog.


In [9]:
# Substitute words with their antonyms.
# A separate sample (_text) is used here so the shared `text` sentence stays untouched.
aug = naw.AntonymAug()
_text = 'Good boy'
augmented_text = aug.augment(_text)
print("Original:", _text, "Augmented Text:", augmented_text, sep="\n")

Original:
Good boy
Augmented Text:
Bad boy


In [10]:
# Swap words at random.
aug = naw.RandomWordAug(action="swap")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
Quick the fox brown over jumps the lazy dog.


In [11]:
# Delete words at random (RandomWordAug is constructed with its default action here).
aug = naw.RandomWordAug()
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick brown fox jumps dog.


In [12]:
# Delete a random run of contiguous words.
aug = naw.RandomWordAug(action='crop')
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick brown fox jumps dog.


In [13]:
# Randomly split words into two tokens.
aug = naw.SplitAug()
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog .
Augmented Text:
The quick brown fox ju mps o ver the la zy dog.


In [14]:
# Back Translation Augmenter: augments text by passing it through two translation models.
# Prerequisite: pip install "torch>=1.6.0" "transformers>=4.11.3" sentencepiece
translation_models = {
    'from_model_name': 'facebook/wmt19-en-de',
    'to_model_name': 'facebook/wmt19-de-en',
}
back_translation_aug = naw.BackTranslationAug(**translation_models)
# Left as the cell's final expression so the notebook displays the result.
back_translation_aug.augment(text)

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/308k [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'The speedy brown fox jumps over the lazy dog.'

In [15]:
# Reserved Word Augmenter: replace a reserved token with another member of its group.
# The sample lives in its own variable so the notebook-wide `text` sentence is not
# overwritten; cells that run after this one keep augmenting the original sentence.
reserved_text = 'Fwd: Mail for solution'
# Each inner list is one group of tokens handed to ReservedAug as interchangeable.
reserved_tokens = [
    ['FW', 'Fwd', 'F/W', 'Forward'],
]
reserved_aug = naw.ReservedAug(reserved_tokens=reserved_tokens)
augmented_text = reserved_aug.augment(reserved_text)

print("Original:")
print(reserved_text)
print("Augmented Text:")
print(augmented_text)

Original:
Fwd: Mail for solution
Augmented Text:
Forward: Mail for solution


In [None]:
# Sentence Augmentation: 
	# Contextual Word Embeddings for Sentence Augmenter
	# Abstractive Summarization Augmenter
	# First: pip install "torch>=1.6.0" "transformers>=4.11.3" sentencepiece
	# 与官方运行结果存在不一致

In [None]:
# Insert sentences using contextual word embeddings (GPT2 or XLNet).
# Each stanza loads one pretrained model and augments the current `text`.

# XLNet
aug = nas.ContextualWordEmbsForSentenceAug(model_path='xlnet-base-cased')
augmented_texts = aug.augment(text)
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

# GPT-2
aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2')
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

# DistilGPT-2
aug = nas.ContextualWordEmbsForSentenceAug(model_path='distilgpt2')
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")