In [1]:
!pip install transformers torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Mount Google Drive at /content/gdrive/ so that later cells can reach the
# project files stored there.
# Optional: only needed when running on Google Colab (the `google.colab`
# package is presumably unavailable elsewhere, so skip this cell in that case).
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [3]:
import os
# Work from the project folder on the mounted Drive: the cells below use paths
# relative to it (pickle/..., embeddings/...) and import the local `embeddings`
# module.
os.chdir('/content/gdrive/MyDrive/NLP_project')

In [4]:
# Sanity check: show the current working directory after the chdir above.
!pwd

/content/gdrive/MyDrive/NLP_project


In [5]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
import torch

torch.cuda.is_available()

True

In [6]:
# Pick the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

In [12]:
from embeddings import get_embeddings

def create_train_embeddings(transformers, device=device):
  '''Embed the IMDB training set with a fine-tuned encoder and save the result.

  Input : transformers = {'bert', 'roberta'}, selects the TextAttack IMDB
          checkpoint (and its matching tokenizer) used as encoder
          device = torch device that hosts the model and is forwarded to
          get_embeddings (defaults to the notebook-level `device`)
  Output : None. The embedding matrix is written to
           embeddings/imdb_train_<transformers>_emb.npy
  Raises : AssertionError if `transformers` is not supported, or if the matrix
           returned by get_embeddings does not have shape (n_texts, 768)
  '''
  import os  # local import keeps this cell independent of earlier cells

  assert transformers=='bert' or transformers=='roberta'

  # Announce the source before reading it, so a failing load is easy to trace.
  print('Loading from pickle/imdb_train.pickle')
  df_train = pd.read_pickle("pickle/imdb_train.pickle")

  if transformers=='bert':
    model_name = 'textattack/bert-base-uncased-imdb'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
  else:
    model_name = 'textattack/roberta-base-imdb'
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaModel.from_pretrained(model_name)

  # Host the model on the requested device. An unconditional .cuda() ignores
  # `device` and raises on CPU-only machines.
  model = model.to(device)

  # NOTE(review): get_embeddings is defined in embeddings.py (not visible
  # here); it is assumed to return one row per input text -- the assertion
  # below enforces that.
  train_matrix = get_embeddings(df_train['text'], model, tokenizer, device)
  # 768 is the hidden size of both base-sized checkpoints.
  assert train_matrix.shape==(len(df_train),768)

  file_name = 'embeddings/imdb_train_' + transformers + '_emb.npy'
  os.makedirs(os.path.dirname(file_name), exist_ok=True)
  # np.save opens the target itself in binary mode; wrapping it in a separate
  # text-mode open() on the same path only leaves a second, unused handle.
  np.save(file_name, train_matrix)
  print(file_name, 'successfully created')

  return None

In [13]:
# Build and save the training-set embeddings once per encoder.
for transformers in ('bert', 'roberta'):
  create_train_embeddings(transformers)

Loading from pickle/imdb_train.pickle


Some weights of the model checkpoint at textattack/bert-base-uncased-imdb were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_train_bert_emb.npy successfully created
Loading from pickle/imdb_train.pickle


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/559 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_train_roberta_emb.npy successfully created


In [16]:
def create_test_embeddings(transformers, attack, device=device):
  '''Embed one attacked IMDB test set with a fine-tuned encoder and save it.

  Input : transformers = {'bert', 'roberta'}, selects the TextAttack IMDB
          checkpoint (and its matching tokenizer) used as encoder
          attack = {'textfooler', 'pwws', 'bae', 'tf-adj'}, selects which
          test pickle (pickle/imdb_<transformers>_test_<attack>.pickle) is read
          device = torch device that hosts the model and is forwarded to
          get_embeddings (defaults to the notebook-level `device`)
  Output : None. The embedding matrix is written to
           embeddings/imdb_<transformers>_test_<attack>_emb.npy
  Raises : AssertionError if `transformers` or `attack` is not supported, or if
           the matrix returned by get_embeddings does not have shape
           (n_texts, 768)
  '''
  import os  # local import keeps this cell independent of earlier cells

  assert transformers=='bert' or transformers=='roberta'
  assert attack in ['textfooler', 'pwws', 'bae', 'tf-adj']

  df_path = "pickle/imdb_" + transformers + '_test_' + attack +'.pickle'
  # Announce the source before reading it, so a failing load is easy to trace.
  print('Loading from ', df_path)
  df_test = pd.read_pickle(df_path)

  if transformers=='bert':
    model_name = 'textattack/bert-base-uncased-imdb'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
  else:
    model_name = 'textattack/roberta-base-imdb'
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaModel.from_pretrained(model_name)

  # Host the model on the requested device. An unconditional .cuda() ignores
  # `device` and raises on CPU-only machines.
  model = model.to(device)

  # NOTE(review): get_embeddings is defined in embeddings.py (not visible
  # here); it is assumed to return one row per input text -- the assertion
  # below enforces that.
  test_matrix = get_embeddings(df_test['text'], model, tokenizer, device)
  # 768 is the hidden size of both base-sized checkpoints.
  assert test_matrix.shape==(len(df_test),768)

  file_name = 'embeddings/imdb_' + transformers + '_test_' + attack + '_emb.npy'
  os.makedirs(os.path.dirname(file_name), exist_ok=True)
  # np.save opens the target itself in binary mode; wrapping it in a separate
  # text-mode open() on the same path only leaves a second, unused handle.
  np.save(file_name, test_matrix)
  print(file_name, 'successfully created')

  return None

In [17]:
# Build and save the test-set embeddings for every encoder / attack pair.
for transformers in ('bert', 'roberta'):
  for attack in ('textfooler', 'pwws', 'bae', 'tf-adj'):
    create_test_embeddings(transformers, attack, device=device)

Loading from  pickle/imdb_bert_test_textfooler.pickle


Some weights of the model checkpoint at textattack/bert-base-uncased-imdb were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_bert_test_textfooler_emb.npy successfully created
Loading from  pickle/imdb_bert_test_pwws.pickle


Some weights of the model checkpoint at textattack/bert-base-uncased-imdb were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_bert_test_pwws_emb.npy successfully created
Loading from  pickle/imdb_bert_test_bae.pickle


Some weights of the model checkpoint at textattack/bert-base-uncased-imdb were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_bert_test_bae_emb.npy successfully created
Loading from  pickle/imdb_bert_test_tf-adj.pickle


Some weights of the model checkpoint at textattack/bert-base-uncased-imdb were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_bert_test_tf-adj_emb.npy successfully created
Loading from  pickle/imdb_roberta_test_textfooler.pickle


Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_roberta_test_textfooler_emb.npy successfully created
Loading from  pickle/imdb_roberta_test_pwws.pickle


Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_roberta_test_pwws_emb.npy successfully created
Loading from  pickle/imdb_roberta_test_bae.pickle


Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_roberta_test_bae_emb.npy successfully created
Loading from  pickle/imdb_roberta_test_tf-adj.pickle


Some weights of the model checkpoint at textattack/roberta-base-imdb were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


embeddings/imdb_roberta_test_tf-adj_emb.npy successfully created
