In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from datasets import load_dataset
dataset = load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

%matplotlib inline

## Generating training set

In [4]:
text_train = pd.Series(dataset['train']['text'])
text_train[0:5]

0    I rented I AM CURIOUS-YELLOW from my video sto...
1    "I Am Curious: Yellow" is a risible and preten...
2    If only to avoid making this type of film in t...
3    This film was probably inspired by Godard's Ma...
4    Oh, brother...after hearing about this ridicul...
dtype: object

In [5]:
label_train = pd.Series(dataset['train']['label'])
label_train[0:5]

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [6]:
# Directory for Colab (optional, use if running Colab)
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [7]:
import os
os.chdir('/content/gdrive/MyDrive/NLP_project')

In [8]:
!pwd

/content/gdrive/MyDrive/NLP_project


In [11]:
def create_train_data(dataset, list_seed=[1,2,3], n_train_samples=8000):
  ''' As default, we create 3 training sets of size 8000 for quantitative analysis. 
  Input:
  list_seed : list of seeds used
  n_train_samples : length of training sets
  '''
  # Create DataFrame 
  text_train = pd.Series(dataset['train']['text'])
  label_train = pd.Series(dataset['train']['label'])

  for seed in list_seed: 
    np.random.seed(seed)
    indices = np.random.permutation(n_train_samples)
    text_train_samples = text_train[indices]
    label_train_samples = label_train[indices]
    df_samples = pd.DataFrame({'text': text_train_samples, 'label':label_train_samples})
    file_name = "pickle/imdb_train" + str(seed) +".pickle"
    df_samples.to_pickle(file_name)
    print(file_name + ' succesfully created')
  return None

In [12]:
create_train_data(dataset)

pickle/imdb_train1.pickle succesfully created
pickle/imdb_train2.pickle succesfully created
pickle/imdb_train3.pickle succesfully created


## Generating test set for BERT and textfooler

In [14]:
df = pd.read_csv("data/imdb/bert/textfooler/bert-base-uncased-imdb_textfooler.csv",sep=',')
df.head()

Unnamed: 0,original_text,perturbed_text,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
0,[[Once]] again Mr. Costner has dragged out a [...,[[Whenever]] again Mr. Costner has dragged out...,0.000241,0.815027,0,1,0,469,Successful
1,[[This]] is an example of why the majority of ...,[[These]] is an example of why the majority of...,0.000183,0.554409,0,1,0,670,Successful
2,"First of all I hate those [[moronic]] rappers,...",First of all I hate those [[senseless]] rapper...,0.000289,0.716852,0,1,0,643,Successful
3,Not even the Beatles could write songs everyon...,Not even the Beatles could write songs everyon...,0.000303,0.960098,0,1,0,441,Successful
4,Brass pictures (movies is not a fitting word f...,Brass pictures (movies is not a fitting word f...,0.000311,0.952665,0,1,0,301,Successful


In [15]:
n_test = len(df)
n_test

10000

In [16]:
from sklearn.metrics import accuracy_score

accuracy_score(df['ground_truth_output'], df['original_output'])

0.9306

In [17]:
df['result_type'].value_counts()

Successful    9175
Skipped        694
Failed         131
Name: result_type, dtype: int64

In [18]:
def clean_text(text):
  text = text.replace("[[", "")
  text = text.replace("]]", "")
  return text

In [19]:
df['original_text'] = df['original_text'].map(clean_text)
df['perturbed_text'] = df['perturbed_text'].map(clean_text)
df.head()

Unnamed: 0,original_text,perturbed_text,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
0,Once again Mr. Costner has dragged out a movie...,Whenever again Mr. Costner has dragged out a s...,0.000241,0.815027,0,1,0,469,Successful
1,This is an example of why the majority of acti...,These is an example of why the majority of act...,0.000183,0.554409,0,1,0,670,Successful
2,"First of all I hate those moronic rappers, who...","First of all I hate those senseless rappers, w...",0.000289,0.716852,0,1,0,643,Successful
3,Not even the Beatles could write songs everyon...,Not even the Beatles could write songs everyon...,0.000303,0.960098,0,1,0,441,Successful
4,Brass pictures (movies is not a fitting word f...,Brass pictures (movies is not a fitting word f...,0.000311,0.952665,0,1,0,301,Successful


In [28]:
# Scenario 1

np.random.seed(1)
indices = np.random.permutation(n_test)
n_max_adv = 2000
indices_adv = indices[:n_max_adv]
indices_clean = indices[n_max_adv:]
df_adv = df.loc[indices_adv,:]

mask_1 = df_adv['result_type']=='Successful' # Select only successful attacks
mask_2 = df_adv['original_output'] == df_adv['ground_truth_output'] # Select only correctly classified examples
df_adv = df_adv[mask_1 & mask_2]
n_adv = len(df_adv) # Number of attack samples

indices_clean_samples = np.random.choice(indices_clean, n_adv)
df_clean = df.loc[indices_clean_samples,:]

In [29]:
df_adv

Unnamed: 0,original_text,perturbed_text,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
9953,"Gilmore Girls is one of the funniest, most cle...","Gilmore Girls is one of the funniest, most cle...",0.000233,0.641318,1,0,1,993,Successful
3850,There is something about Doug McLure's appeara...,There is something about Doug McLure's appeara...,0.000235,0.505358,0,1,0,736,Successful
4962,"A little while ago, I stumbled upon this DVD w...","A little while ago, I stumbled upon this DVD w...",0.002581,0.508748,0,1,0,666,Successful
3886,This is one of the worst movies I have ever se...,These is one of the worst movies I have ever s...,0.000202,0.515732,0,1,0,1943,Successful
8517,This is definitely one of the ultimate cult cl...,This is definitely one of the ultimate cult cl...,0.017978,0.745149,1,0,1,584,Successful
...,...,...,...,...,...,...,...,...,...
6200,The Falcon and the Snowman is based on a true ...,The Falcon and the Snowman is based on a prese...,0.000222,0.901656,1,0,1,281,Successful
3919,"This is what used to be called a ""women's pict...","This is what used to be called a ""women's pict...",0.000174,0.782556,0,1,0,390,Successful
7903,"Simply put, there are two parts of this series...","Solo put, there are three parte of this series...",0.001443,0.570861,1,0,1,468,Successful
2242,I saw this movie in the theatre and it was a t...,me saw this theatres in the theatre and it was...,0.000371,0.500394,0,1,0,410,Successful


In [30]:
df_clean

Unnamed: 0,original_text,perturbed_text,original_score,perturbed_score,original_output,perturbed_output,ground_truth_output,num_queries,result_type
3435,The worst ever Korean movie! The plot is ridic...,The finest ever Korean movie! The plot is ridi...,0.000188,0.992230,0,1,0,197,Successful
2316,"If I heard the male lead say ""This is madness!...","Until I heard the male lead say ""This is loca!...",0.000197,0.613869,0,1,0,397,Successful
7183,I wonder if I could take sitting through a who...,I wonder if I could take sitting through a who...,0.993204,0.993204,0,0,1,1,Skipped
8656,"Terrific movie: If you did not watch yet, you ...","Terrific movie: If you ai not surveilling yet,...",0.000306,0.527497,1,0,1,562,Successful
6436,"Don't watch this film while, or soon after, ea...","Don't watch this film while, or soon after, ea...",0.996598,0.996598,0,0,1,1,Skipped
...,...,...,...,...,...,...,...,...,...
5609,This is one of the best romantic movies I have...,That is one of the biggest soppy flick I got e...,0.001051,0.229166,1,1,1,799,Failed
6163,Anna Christie (1930)<br /><br />Anna Christie ...,Anna Christie (1930)<br /><br />Anna Christie ...,0.050629,0.660517,1,0,1,314,Successful
480,I saw this on a screener DVD a couple months b...,I saw this on a screener DVD a couple months b...,0.000229,0.501745,0,1,0,321,Successful
3003,This is a very strange product from Hollywood....,This is a very strange product from Celebs. Ap...,0.000228,0.520891,0,1,0,321,Successful


In [35]:
adv_text = df_adv['perturbed_text']
adv_bool = np.ones(len(adv_text), dtype=int)
clean_text = df_clean['original_text']
clean_bool = np.zeros(len(clean_text), dtype=int)

text_test = pd.concat([adv_text, clean_text])
bool_test = np.concatenate([adv_bool, clean_bool])
df_test = pd.DataFrame({'text': text_test, 'adversarial': bool_test})

df_test

Unnamed: 0,text,adversarial
9953,"Gilmore Girls is one of the funniest, most cle...",1
3850,There is something about Doug McLure's appeara...,1
4962,"A little while ago, I stumbled upon this DVD w...",1
3886,These is one of the worst movies I have ever s...,1
8517,This is definitely one of the ultimate cult cl...,1
...,...,...
5609,This is one of the best romantic movies I have...,0
6163,Anna Christie (1930)<br /><br />Anna Christie ...,0
480,I saw this on a screener DVD a couple months b...,0
3003,This is a very strange product from Hollywood....,0
