In [1]:
import pandas as pd
from transformers import pipeline
import spacy
import random
nlp = spacy.load("en_core_web_sm")

In [2]:
train_data = pd.read_csv("data_v2/train_data.csv")

In [3]:
train_data["sentiment"].value_counts()

1    3137
0     373
Name: sentiment, dtype: int64

In [4]:
unmasker_1 = pipeline("fill-mask", model="roberta-base")
unmasker_2 = pipeline("fill-mask", model="bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
train_data = train_data[train_data["sentiment"]==0]

In [6]:
train_data.shape

(373, 3)

In [42]:
def get_sents(x):
    for sent in nlp(x).sents:
        yield sent.text


In [26]:
def get_random_word_added_1(input_text):
    augmented_texts = []
    
    for sent in get_sents(input_text):
        orig_text_list = sent.split(" ")
        len_input = len(orig_text_list)
        if len_input > 3:
            rand_indx = random.randint(1, len_input-2)
            orig_text_list = orig_text_list[:rand_indx] + ["<mask>"] + orig_text_list[rand_indx:]
            new_text = " ".join(orig_text_list)
            augmented_text = unmasker_1(new_text)[0]['sequence']
            augmented_texts.append(augmented_text)
        else:
            augmented_texts.append(sent)
        
    text = ". ".join(augmented_texts)
    return text

In [31]:
from spacy.lang.en.stop_words import STOP_WORDS

In [27]:
def get_random_word_added_2(input_text):
    augmented_texts = []
    
    for sent in get_sents(input_text):
        orig_text_list = sent.split(" ")
        len_input = len(orig_text_list)
        if len_input > 3:
            rand_indx = random.randint(1, len_input-2)
            orig_text_list = orig_text_list[:rand_indx] + ["[MASK]"] + orig_text_list[rand_indx:]
            new_text = " ".join(orig_text_list)
            augmented_text = unmasker_2(new_text)[0]['sequence']
            augmented_texts.append(augmented_text)
        else:
            augmented_texts.append(sent)
        
    text = ". ".join(augmented_texts)
    return text

In [28]:
%%time 

train_data["augmented_text_1"] = train_data["reviewText"].apply(lambda x: get_random_word_added_1(x))

CPU times: user 2min 6s, sys: 35.2 s, total: 2min 41s
Wall time: 1min 59s


In [29]:
%%time 

train_data["augmented_text_2"] = train_data["reviewText"].apply(lambda x: get_random_word_added_2(x))

CPU times: user 1min 56s, sys: 33.1 s, total: 2min 29s
Wall time: 1min 47s


In [30]:
train_data

Unnamed: 0,reviewText,overall,sentiment,augmented_text_1,augmented_text_2
7,Used it in my Galaxy Note 2 for the past 16 mo...,2.0,0,Used it regularly in my Galaxy Note 2 for the ...,Used it in my Galaxy Note 2 for about the past...
15,Hi:I ordered two card and they arrived the nex...,1.0,0,Hi:I ordered these two card and they arrived t...,Hi : I ordered two card and they all arrived t...
19,Worked great in my galaxy s4 and died about 2 ...,1.0,0,Worked great in my galaxy s4 and died about 2 ...,Worked great in my galaxy s4 and died about 2 ...
24,Thus microSD card worked fine for a year in my...,2.0,0,Thus microSD card worked fine for a year in us...,Thus microSD card worked fine for a year in my...
32,I have an old SanDisk SD card that still works...,1.0,0,I also have an old SanDisk SD card that still ...,"I have an old, SanDisk SD card that still work..."
...,...,...,...,...,...
3482,Everything transfered over to my new SD card f...,1.0,0,Everything transfered over to my new SD card f...,Everything transfered over to my new SD card f...
3489,I purchased this card (the 64GB version) to us...,1.0,0,I purchased just this card (the 64GB version) ...,I purchased this card ( the 64GB version ) to ...
3493,I bought two of these Memory Cards for use wit...,2.0,0,I bought two of these Memory Cards for use wit...,I bought two of these new Memory Cards for use...
3497,"I bought this for a Galaxy Note 8.0, and while...",3.0,0,"I bought this for a Galaxy Note 8.0, and while...","I bought this for a Galaxy Note 8. 0, and whil..."


In [38]:
augmented_train_data_1 = train_data[["augmented_text_1", "overall", "sentiment"]]
augmented_train_data_1.rename(columns={"augmented_text_1": "reviewText"}, inplace=True)
augmented_train_data_2 = train_data[["augmented_text_2", "overall", "sentiment"]]
augmented_train_data_2.rename(columns={"augmented_text_2": "reviewText"}, inplace=True)
augmented_training_data = pd.concat([augmented_train_data_1, augmented_train_data_2], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  augmented_train_data_1.rename(columns={"augmented_text_1": "reviewText"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  augmented_train_data_2.rename(columns={"augmented_text_2": "reviewText"}, inplace=True)


In [39]:
augmented_training_data

Unnamed: 0,reviewText,overall,sentiment
0,Used it regularly in my Galaxy Note 2 for the ...,2.0,0
1,Hi:I ordered these two card and they arrived t...,1.0,0
2,Worked great in my galaxy s4 and died about 2 ...,1.0,0
3,Thus microSD card worked fine for a year in us...,2.0,0
4,I also have an old SanDisk SD card that still ...,1.0,0
...,...,...,...
741,Everything transfered over to my new SD card f...,1.0,0
742,I purchased this card ( the 64GB version ) to ...,1.0,0
743,I bought two of these new Memory Cards for use...,2.0,0
744,"I bought this for a Galaxy Note 8. 0, and whil...",3.0,0


In [41]:
augmented_training_data.to_csv("data_v2/Aug_RandomInsertion.csv", index=False)