In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset
# Load the "imdb" dataset through the `datasets` library; only the 'train'
# split's 'text' column is read from it further down in this notebook.
dataset = load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import numpy as np

## Generating training set

In [None]:
# Wrap the raw review strings of the training split in a pandas Series
# (default 0..N-1 index) and preview the first five entries.
text_train = pd.Series(dataset["train"]["text"])
text_train.head(5)

0    I rented I AM CURIOUS-YELLOW from my video sto...
1    "I Am Curious: Yellow" is a risible and preten...
2    If only to avoid making this type of film in t...
3    This film was probably inspired by Godard's Ma...
4    Oh, brother...after hearing about this ridicul...
dtype: object

In [None]:
# Directory for Colab (optional, use if running Colab)
# Mounts Google Drive at /content/gdrive/ so that files stored on Drive are
# reachable through the local filesystem. The import lives in this cell
# (presumably because google.colab is only available on Colab — skip the
# cell when running elsewhere).
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
import os
# Switch the working directory to the project folder on the mounted Drive so
# the relative paths used later ("pickle/...", "data/...") resolve against it.
# NOTE(review): this absolute path only exists on Colab with Drive mounted.
os.chdir('/content/gdrive/MyDrive/NLP_project')

In [None]:
# Sanity check: show the current working directory.
!pwd

/content/gdrive/MyDrive/NLP_project


In [None]:
# Draw a reproducible random subset of n_train_samples reviews from the WHOLE
# training split.
#
# The permutation must cover every row of text_train and only then be
# truncated. Permuting range(n_train_samples) instead would merely shuffle the
# first 8000 rows, so no review beyond row 7999 could ever be selected.
# NOTE(review): the IMDB train split appears to be stored grouped by label
# (the first rows previewed earlier are all negative reviews), in which case
# sampling only the first 8000 rows yields a single sentiment class — confirm.
np.random.seed(1803)
n_train_samples = 8000
indices = np.random.permutation(len(text_train))[:n_train_samples]
train_samples = text_train[indices]
train_samples

3264    This is the second British Rank film to adapt ...
3453    This movie features an o.k. score and a not ba...
2632    I don't know why, but for some sick reason, I ...
6825    A well-intentioned movie about Sonja Horowitz ...
4087    The plot was dull, the girls were sickening an...
                              ...                        
7975    Sitting down to watch the 14th season of the B...
7751    Journey to the Center of the Earth is the stor...
2606    Almost every plot detail in this movie is illo...
7394    It makes the actors in Hollyoaks look like the...
3908    Where do I start? Per the title of this film I...
Length: 8000, dtype: object

In [None]:
import pickle

In [None]:
# Persist the sampled training texts (a pandas Series) as a pickle file.
# The path is relative to the current working directory; the "pickle/" folder
# is expected to exist already.
train_samples.to_pickle("pickle/imdb_bert_train.pickle")

## Generating test set

In [None]:
# Read the attack-results table (judging by the file name: TextFooler run
# against bert-base-uncased on IMDB) and preview its first rows. The comma
# separator is pandas' default, so it is not spelled out.
df = pd.read_csv(
    "data/imdb/bert/textfooler/bert-base-uncased-imdb_textfooler.csv"
)
df.head()

Unnamed: 0,original_text,perturbed_text,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
0,[[Once]] again Mr. Costner has dragged out a [...,[[Whenever]] again Mr. Costner has dragged out...,0.000241,0.815027,0,1,0,469,Successful
1,[[This]] is an example of why the majority of ...,[[These]] is an example of why the majority of...,0.000183,0.554409,0,1,0,670,Successful
2,"First of all I hate those [[moronic]] rappers,...",First of all I hate those [[senseless]] rapper...,0.000289,0.716852,0,1,0,643,Successful
3,Not even the Beatles could write songs everyon...,Not even the Beatles could write songs everyon...,0.000303,0.960098,0,1,0,441,Successful
4,Brass pictures (movies is not a fitting word f...,Brass pictures (movies is not a fitting word f...,0.000311,0.952665,0,1,0,301,Successful


In [None]:
# Row count of the attack-results table; used below as the test-set size.
n_test = df.shape[0]
n_test

10000

In [None]:
def clean_text(text):
  """Return `text` with every "[[" and "]]" marker removed.

  In the attack CSV the double square brackets appear to wrap individual
  words (e.g. "[[Once]] again"); only the brackets are dropped, the wrapped
  words are kept.
  """
  for marker in ("[[", "]]"):
    text = text.replace(marker, "")
  return text

In [None]:
# Strip the [[ ]] markers from both text columns, then preview the result.
for text_column in ('original_text', 'perturbed_text'):
  df[text_column] = df[text_column].map(clean_text)
df.head()

Unnamed: 0,original_text,perturbed_text,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
0,Once again Mr. Costner has dragged out a movie...,Whenever again Mr. Costner has dragged out a s...,0.000241,0.815027,0,1,0,469,Successful
1,This is an example of why the majority of acti...,These is an example of why the majority of act...,0.000183,0.554409,0,1,0,670,Successful
2,"First of all I hate those moronic rappers, who...","First of all I hate those senseless rappers, w...",0.000289,0.716852,0,1,0,643,Successful
3,Not even the Beatles could write songs everyon...,Not even the Beatles could write songs everyon...,0.000303,0.960098,0,1,0,441,Successful
4,Brass pictures (movies is not a fitting word f...,Brass pictures (movies is not a fitting word f...,0.000311,0.952665,0,1,0,301,Successful


In [None]:
# Scenario 1
# Shuffle the row labels reproducibly. The first n_adv shuffled rows
# contribute their perturbed text, every remaining row contributes its
# original text; the two parts are stacked with the perturbed texts first.
# NOTE(review): rows are picked without looking at result_type — confirm that
# every selected perturbed_text should be treated as adversarial.
np.random.seed(1803)
n_adv = 2000
indices = np.random.permutation(n_test)
indices_adv, indices_clean = indices[:n_adv], indices[n_adv:]
text_adv = df.loc[indices_adv, 'perturbed_text']
text_clean = df.loc[indices_clean, 'original_text']
text_test = pd.concat([text_adv, text_clean])
text_test

3098    So, I'm wondering while watching this film, di...
288     Although not a big Coen brothers fan, me am an...
1686    me am in a theatres nightclubs at my pupil and...
5350    The Thirdly Stooges enjoys always been some of...
8615    (Possible ?? spoilers included, but nothing cr...
                              ...                        
7552    When Melville's "Pierre; or The Ambiguities" h...
634     This movie is funny if you're the gentleman wh...
3723    Snakes on a Train is a movie I rented due to t...
7751    Should we take the opening shot as a strange f...
2606    A friend and I went to see this movie. We have...
Length: 10000, dtype: object

In [None]:
# Flag list aligned with text_test by position: 1 for the first n_adv entries
# (the perturbed texts come first in text_test), 0 for all later entries.
adv_series = [int(position < n_adv) for position in range(n_test)]
adv_series[1998:2002]

[1, 1, 0, 0]

In [None]:
# Assemble the test frame. text_test is a Series, so its shuffled index
# becomes the frame's index; adv_series is a plain list and is attached to
# those rows by position.
dict_test = dict(text=text_test, adversarial=adv_series)
df_test = pd.DataFrame(dict_test)
df_test

Unnamed: 0,text,adversarial
3098,"So, I'm wondering while watching this film, di...",1
288,"Although not a big Coen brothers fan, me am an...",1
1686,me am in a theatres nightclubs at my pupil and...,1
5350,The Thirdly Stooges enjoys always been some of...,1
8615,"(Possible ?? spoilers included, but nothing cr...",1
...,...,...
7552,"When Melville's ""Pierre; or The Ambiguities"" h...",0
634,This movie is funny if you're the gentleman wh...,0
3723,Snakes on a Train is a movie I rented due to t...,0
7751,Should we take the opening shot as a strange f...,0


In [None]:
# Persist the test frame (columns 'text' and 'adversarial') as a pickle file.
# The path is relative to the current working directory; the "pickle/" folder
# is expected to exist already.
df_test.to_pickle("pickle/imdb_bert_test_textfooler.pickle")