In [1]:
import os

# Root folder for everything this notebook caches, relative to the working directory.
artifacts_path = os.path.join(os.curdir, 'artifacts/')
# Model files (e.g. the word2vec vectors used further down) are looked up under the artifacts folder.
models_path = os.path.join(artifacts_path, 'models/')

In [2]:
import pandas as pd
import os

# Location of the labelled Twitter data. The default is the original author's
# local mount; set the TWITTER_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
base_path = os.environ.get("TWITTER_DATA_DIR", "/media/ohtar10/Adder-Storage/datasets/twitter/2013/")
labels_path = os.path.join(base_path, 'labeling-completed/tweet-labels.parquet')
tweets = pd.read_parquet(labels_path, engine='pyarrow')
tweets.head()

Unnamed: 0,id,tweet,category,other
0,383319220961828867,english boy writes with his notebook positione...,other_topic,writing ways
1,383294856249888768,"The girls i talk to are like the jobs i get, i...",other_topic,relationships
2,383099313616060416,This I'd what sister are for. @ jolly-hill htt...,na,
3,383357888271360000,UOENO,na,
4,383358030852550657,@russmillerdrums thanks for taking time to tal...,na,


## Original working data set

In [3]:
# Category labels that are kept for the working data set.
filters = [
    "home",
    "office",
    "music",
    "health",
    "tech",
    "clothing",
    "games",
    "books",
    "movies",
    "sports",
    "other_product",
]

def category_match(string, filters):
    """Tell whether any comma-separated label in `string` appears in `filters`.

    Parameters
    ----------
    string : str
        Comma-separated category labels, e.g. ``"sports, tech"``. Whitespace
        around each label is ignored.
    filters : iterable of str
        Category labels to look for.

    Returns
    -------
    bool
        True if at least one label is in `filters`; False otherwise,
        including when `string` is not a string (e.g. a missing value).
    """
    if not isinstance(string, str):
        # A missing label (None/NaN) cannot match anything; previously this
        # raised AttributeError on `.split`.
        return False
    categories = {label.strip() for label in string.split(',')}
    return not categories.isdisjoint(filters)

# Keep only the rows whose category labels include at least one entry of `filters`.
tweets = tweets[tweets['category'].map(lambda labels: category_match(labels, filters))]
tweets.head()

Unnamed: 0,id,tweet,category,other
21,383430944650055680,#WeLoveLA Pat Haden meets with NCAA to seek ea...,sports,
37,383303035138482176,I had like perfect internet service and then i...,tech,
39,383292826219335681,Oh! Got it :) “@_R0YAL_: @cy_dieyi d legoo is ...,music,
41,383322559640379392,We were us is my song right now !❤️❤️❤️❤️❤️❤️❤️,music,
48,383368197853810688,“@justincepriano: I don't understand how peopl...,games,


In [4]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

# Number of augmented variants requested per tweet; passed as `n` to every aug.augment(...) call below.
n_augment = 3

### Keyboard typo augmentation
**Note:** 14 min for ~8k records with 3 new examples augmentation

In [5]:
%%time

key_aug_path = os.path.join(artifacts_path, 'tweets/tweets_key_aug.parquet')
if os.path.exists(key_aug_path):
    key_aug = pd.read_parquet(key_aug_path, engine='pyarrow')
else:
    aug = nac.KeyboardAug()
    key_aug = tweets.apply(lambda row: pd.Series({'id': row['id'], 
                                                        'tweet': aug.augment(row['tweet'], n=n_augment), 
                                                        'category': row['category'], 
                                                        'other': row['other']}), 
                                                        axis=1).explode('tweet')
    key_aug.to_parquet(key_aug_path, engine='pyarrow', index=False)
    
key_aug.head()

CPU times: user 57.6 s, sys: 7.76 s, total: 1min 5s
Wall time: 14min 43s


Unnamed: 0,id,tweet,category,other
21,383430944650055680,# WeLoveLA Pat HaVen meets wOth NCAA to Qeek e...,sports,
21,383430944650055680,# seLobeLA Pat Hwden meets with NVAA to seeo e...,sports,
21,383430944650055680,"# WeLkve,A Pat Haden meeRs wity NCAA to sRek e...",sports,
37,383303035138482176,I had like Lertect internet serGide and theJ i...,tech,
37,383303035138482176,I had lJke perffvt internet service and then i...,tech,


### Embedding substitution
**Note:** 5h for ~8k records with 3 new examples augmentation

In [6]:
%%time

emb_subs_path = os.path.join(artifacts_path, 'tweets/tweets_emb_subs.parquet')
if os.path.exists(emb_subs_path):
    emb_subs = pd.read_parquet(emb_subs_path, engine='pyarrow')
else:
    aug = naw.WordEmbsAug(model_type='word2vec', action='substitute', model_path=os.path.join(models_path, 'GoogleNews-vectors-negative300'))
    emb_subs = tweets.apply(lambda row: pd.Series({'id': row['id'],
                                                'tweet': aug.augment(row['tweet'], n=n_augment),
                                                'category': row['category'],
                                                'other': row['other']}), axis=1).explode('tweet')
    emb_subs.to_parquet(emb_subs_path, engine='pyarrow', index=False)

emb_subs.head()                                              

CPU times: user 1d 5h 41min 22s, sys: 6min 28s, total: 1d 5h 47min 50s
Wall time: 5h 14s


Unnamed: 0,id,tweet,category,other
21,383430944650055680,# WeLoveLA Pat Haden satisfies with AIAW to se...,sports,
21,383430944650055680,# WeLoveLA Pat Haden meets whom California_Int...,sports,
21,383430944650055680,# WeLoveLA Pat Kaveinga meets bringing nonscho...,sports,
37,383303035138482176,I had weird golden internet service and into i...,tech,
37,383303035138482176,I gave like perfect internet service and then ...,tech,


### Embedding insertion
**Note:** 3h 51m for ~8k records with 3 new examples augmentation

In [7]:
%%time

emb_insert_path = os.path.join(artifacts_path, 'tweets/tweets_emb_insert.parquet')
if os.path.exists(emb_insert_path):
    emb_insert = pd.read_parquet(emb_insert_path, engine='pyarrow')
else:
    aug = naw.WordEmbsAug(model_type='word2vec', action='insert', model_path=os.path.join(models_path, 'GoogleNews-vectors-negative300'))
    emb_insert = tweets.apply(lambda row: pd.Series({'id': row['id'],
                                                'tweet': aug.augment(row['tweet'], n=n_augment),
                                                'category': row['category'],
                                                'other': row['other']}), axis=1).explode('tweet')
    emb_insert.to_parquet(emb_insert_path, engine='pyarrow', index=False)

emb_insert.head()  

CPU times: user 3h 46min 22s, sys: 7 s, total: 3h 46min 29s
Wall time: 3h 51min 58s


Unnamed: 0,id,tweet,category,other
21,383430944650055680,# WeLoveLA Pat Imagine Haden meets with NCAA E...,sports,
21,383430944650055680,# WeLoveLA Pat unsubsidized Haden meets Guy wi...,sports,
21,383430944650055680,# equaled WeLoveLA Roachton Pat Haden Shares m...,sports,
37,383303035138482176,Schiphol I Jan had like perfect internet servi...,tech,
37,383303035138482176,I had FAST like perfect internet service Behol...,tech,


### Synonym substitution
**Note:** 14 min for ~8k records with 3 new examples augmentation

In [8]:
%%time

synonim_aug_path = os.path.join(artifacts_path, 'tweets/tweets_synonim_aug.parquet')
if os.path.exists(synonim_aug_path):
    synonim_aug = pd.read_parquet(synonim_aug_path, engine='pyarrow')
else:
    aug = naw.SynonymAug(aug_src='wordnet')
    synonim_aug = tweets.apply(lambda row: pd.Series({'id': row['id'],
                                                'tweet': aug.augment(row['tweet'], n=n_augment),
                                                'category': row['category'],
                                                'other': row['other']}), axis=1).explode('tweet')
    synonim_aug.to_parquet(synonim_aug_path, engine='pyarrow', index=False)

synonim_aug.head()   

CPU times: user 2min 5s, sys: 8.79 s, total: 2min 14s
Wall time: 14min 41s


Unnamed: 0,id,tweet,category,other
21,383430944650055680,# WeLoveLA Pat Haden meet with NCAA to try eas...,sports,
21,383430944650055680,# WeLoveLA Pat Haden meet with NCAA to seek re...,sports,
21,383430944650055680,# WeLoveLA Pat Haden meet with NCAA to seek ea...,sports,
37,383303035138482176,I make comparable perfect cyberspace service a...,tech,
37,383303035138482176,I had like everlasting internet service and th...,tech,


### Context insertion with BERT
**Note:** 1h 47min for ~8k records with 3 new examples augmentation

In [9]:
%%time

ctx_insert_path = os.path.join(artifacts_path, 'tweets/tweets_ctx_insert_aug.parquet')

if os.path.exists(ctx_insert_path):
    ctx_insert = pd.read_parquet(ctx_insert_path, engine='pyarrow')
else:
    aug = naw.ContextualWordEmbsAug(action='insert')
    ctx_insert = tweets.apply(lambda row: pd.Series({'id': row['id'], 
                                                        'tweet': aug.augment(row['tweet'], n=n_augment), 
                                                        'category': row['category'], 
                                                        'other': row['other']}), 
                                                        axis=1).explode('tweet')
    ctx_insert.to_parquet(ctx_insert_path, engine='pyarrow', index=False)
ctx_insert.head()

CPU times: user 10h 40min 36s, sys: 2min 42s, total: 10h 43min 18s
Wall time: 1h 47min 20s


Unnamed: 0,id,tweet,category,other
21,383430944650055680,# kevin welovela and pat al haden first meets ...,sports,
21,383430944650055680,# welovela pat haden regularly meets daily wit...,sports,
21,383430944650055680,# welovela group pat john haden meets with nca...,sports,
37,383303035138482176,i had like perfect internet service and then.....,tech,
37,383303035138482176,and i had like perfect normal internet service...,tech,


### Context substitution with BERT
**Note:** 1h 36 min for ~8k records with 3 new examples augmentation

In [10]:
%%time

ctx_subs_path = os.path.join(artifacts_path, 'tweets/tweets_ctx_substitute_aug.parquet')

if os.path.exists(ctx_subs_path):
    ctx_subs = pd.read_parquet(ctx_subs_path, engine='pyarrow')
else:
    aug = naw.ContextualWordEmbsAug(action='substitute')
    ctx_subs = tweets.apply(lambda row: pd.Series({'id': row['id'], 
                                                        'tweet': aug.augment(row['tweet'], n=n_augment), 
                                                        'category': row['category'], 
                                                        'other': row['other']}), 
                                                        axis=1).explode('tweet')
    ctx_subs.to_parquet(ctx_subs_path, engine='pyarrow', index=False)
ctx_subs.head()

CPU times: user 9h 33min 13s, sys: 2min 34s, total: 9h 35min 48s
Wall time: 1h 36min


Unnamed: 0,id,tweet,category,other
21,383430944650055680,# 7 pat had meets before ncaa referees seek ea...,sports,
21,383430944650055680,# peggy lee boone meets with ncaa that seek en...,sports,
21,383430944650055680,# president pat robertson tells his ncaa to po...,sports,
37,383303035138482176,i could like perfect internet service and here...,tech,
37,383303035138482176,i had pretty perfect telephone service so then...,tech,


### Sentence augmentation with XLNet
**Note:** 13h for ~8k records with 3 new examples augmentation

In [11]:
%%time

ctx_sent_aug_path = os.path.join(artifacts_path, 'tweets/tweets_ctx_sent_aug.parquet')

if os.path.exists(ctx_sent_aug_path):
    ctx_sent_aug = pd.read_parquet(ctx_sent_aug_path, engine='pyarrow')
else:
    aug = nas.ContextualWordEmbsForSentenceAug(model_path='xlnet-base-cased')
    ctx_sent_aug = tweets.apply(lambda row: pd.Series({'id': row['id'], 
                                                        'tweet': aug.augment(row['tweet'], n=n_augment), 
                                                        'category': row['category'], 
                                                        'other': row['other']}), 
                                                        axis=1).explode('tweet')
    ctx_sent_aug.to_parquet(ctx_sent_aug_path, engine='pyarrow', index=False)
ctx_sent_aug.head()

CPU times: user 3d 5h 27min 16s, sys: 28min 19s, total: 3d 5h 55min 35s
Wall time: 13h 51s


Unnamed: 0,id,tweet,category,other
21,383430944650055680,#WeLoveLA Pat Haden meets with NCAA to seek ea...,sports,
21,383430944650055680,#WeLoveLA Pat Haden meets with NCAA to seek ea...,sports,
21,383430944650055680,#WeLoveLA Pat Haden meets with NCAA to seek ea...,sports,
37,383303035138482176,I had like perfect internet service and then i...,tech,
37,383303035138482176,I had like perfect internet service and then i...,tech,


### Put it all together

In [12]:
all_tweets_path = os.path.join(artifacts_path, 'tweets/all_tweets.parquet')

if os.path.exists(all_tweets_path):
    # Cache hit: reuse the combined data set written by a previous run.
    all_tweets = pd.read_parquet(all_tweets_path, engine='pyarrow')
else:
    # Make sure the output folder exists before writing.
    os.makedirs(os.path.dirname(all_tweets_path), exist_ok=True)
    # Stack the original tweets with every augmented variant. The source
    # frames carry overlapping index labels, so a fresh index is assigned;
    # this also matches the frame returned by the cached branch, since the
    # parquet file is written without an index.
    all_tweets = pd.concat(
        [
            tweets,
            key_aug,
            emb_subs,
            emb_insert,
            synonim_aug,
            ctx_insert,
            ctx_subs,
            ctx_sent_aug,
        ],
        ignore_index=True,
    )
    all_tweets.to_parquet(all_tweets_path, engine='pyarrow', index=False)