In [1]:
import pandas as pd
from transformers import pipeline
import spacy
import random
nlp = spacy.load("en_core_web_sm")

In [2]:
val_data = pd.read_csv("data_v2/validation_data.csv")

In [3]:
val_data["sentiment"].value_counts()

1    392
0     47
Name: sentiment, dtype: int64

In [4]:
unmasker_1 = pipeline("fill-mask", model="roberta-base")
unmasker_2 = pipeline("fill-mask", model="bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
val_data = val_data[val_data["sentiment"]==0]

In [6]:
val_data.shape

(47, 3)

In [13]:
def get_sents(x):
    for sent in nlp(x).sents:
        yield sent.text


In [8]:
def get_random_word_added_1(input_text):
    augmented_texts = []
    
    for sent in get_sents(input_text):
        orig_text_list = sent.split(" ")
        len_input = len(orig_text_list)
        if len_input > 3:
            rand_indx = random.randint(1, len_input-2)
            orig_text_list = orig_text_list[:rand_indx] + ["<mask>"] + orig_text_list[rand_indx:]
            new_text = " ".join(orig_text_list)
            augmented_text = unmasker_1(new_text)[0]['sequence']
            augmented_texts.append(augmented_text)
        else:
            augmented_texts.append(sent)
        
    text = ". ".join(augmented_texts)
    return text

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS

In [10]:
def get_random_word_added_2(input_text):
    augmented_texts = []
    
    for sent in get_sents(input_text):
        orig_text_list = sent.split(" ")
        len_input = len(orig_text_list)
        if len_input > 3:
            rand_indx = random.randint(1, len_input-2)
            orig_text_list = orig_text_list[:rand_indx] + ["[MASK]"] + orig_text_list[rand_indx:]
            new_text = " ".join(orig_text_list)
            augmented_text = unmasker_2(new_text)[0]['sequence']
            augmented_texts.append(augmented_text)
        else:
            augmented_texts.append(sent)
        
    text = ". ".join(augmented_texts)
    return text

In [14]:
%%time 

val_data["augmented_text_1"] = val_data["reviewText"].apply(lambda x: get_random_word_added_1(x))

CPU times: user 16.9 s, sys: 5.51 s, total: 22.4 s
Wall time: 16.5 s


In [15]:
%%time 

val_data["augmented_text_2"] = val_data["reviewText"].apply(lambda x: get_random_word_added_2(x))

CPU times: user 16 s, sys: 5.44 s, total: 21.4 s
Wall time: 15.8 s


In [16]:
val_data

Unnamed: 0,reviewText,overall,sentiment,augmented_text_1,augmented_text_2
0,It's the wrong one because I thought I was get...,3.0,0,It's the exact wrong one because I thought I w...,It's the wrong one because I thought I was get...
5,UPDATE:This card stopped working as of late Fe...,1.0,0,UPDATE:This card stopped working as of late Fe...,UPDATE : This smart card stopped working as of...
33,I do not know if the GS3's have issues with re...,3.0,0,I do not know if the older GS3's have issues w...,I don't even know if the GS3's have issues wit...
37,Great little card. The only problem I have ex...,3.0,0,Great little card.. The only problem I have e...,Great little card.. The only problem that I ha...
54,Bought this card for my Galaxy S4 and after in...,2.0,0,Bought this card for my Galaxy S4 and after in...,"Bought this card for my Galaxy S4 and, after i..."
59,But since I can't say with any certainty wheth...,3.0,0,But since I can't say with any real certainty ...,But since I can't say anything with any certai...
61,Gave it 4 stars because I don't know if the pr...,3.0,0,Gave it 4 stars because I don't know if the re...,Gave it 4 stars because I don't know if the on...
65,SanDisk makes a lot of claims about this card ...,3.0,0,"SanDisk makes a lot of claims about this card,...","SanDisk makes a lot of claims about this card,..."
71,Doesn't work in my s2 skyrocket but works in m...,3.0,0,Doesn't work in my s2x skyrocket but works in ...,"Doesn't work in my s2 skyrocket, but works in ..."
75,"I connected it to my galaxy S4, but every now ...",3.0,0,"I connected it to my galaxy S4, but only every...","I connected with it to my galaxy S4, but every..."


In [17]:
augmented_val_data_1 = val_data[["augmented_text_1", "overall", "sentiment"]]
augmented_val_data_1.rename(columns={"augmented_text_1": "reviewText"}, inplace=True)
augmented_val_data_2 = val_data[["augmented_text_2", "overall", "sentiment"]]
augmented_val_data_2.rename(columns={"augmented_text_2": "reviewText"}, inplace=True)
augmented_valing_data = pd.concat([augmented_val_data_1, augmented_val_data_2], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  augmented_val_data_1.rename(columns={"augmented_text_1": "reviewText"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  augmented_val_data_2.rename(columns={"augmented_text_2": "reviewText"}, inplace=True)


In [18]:
augmented_valing_data

Unnamed: 0,reviewText,overall,sentiment
0,It's the exact wrong one because I thought I w...,3.0,0
1,UPDATE:This card stopped working as of late Fe...,1.0,0
2,I do not know if the older GS3's have issues w...,3.0,0
3,Great little card.. The only problem I have e...,3.0,0
4,Bought this card for my Galaxy S4 and after in...,2.0,0
...,...,...,...
89,I was really excited when this came out I have...,3.0,0
90,I loaded this memory card into my TF700T and c...,2.0,0
91,06.NOV.2012 -. Bought the card. 12. JAN. 2014 ...,1.0,0
92,Lasted for about 8 months of daily use in my A...,3.0,0


In [19]:
augmented_valing_data.to_csv("data_v2/Aug_Val_RandomInsertion.csv", index=False)