In [None]:
# Install the Hugging Face `datasets` package. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install datasets

In [None]:
# Upgrade `accelerate` to the latest release. `%pip` (rather than `! pip`)
# installs into the environment of the running kernel.
%pip install -U accelerate

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive. Later cells use relative paths of the
# form "drive/MyDrive/...", which resolve to this mount only while the working
# directory is /content.
drive.mount('/content/drive')

Mounted at /content/drive


# Tokenize the training examples

In [None]:
from transformers import BertModel, Trainer, DataCollatorWithPadding, AutoTokenizer
from datasets import Dataset
import numpy as np
import pandas as pd

# Location of the training set (a JSON-lines file; it is read with lines=True
# further down). The path is relative to the current working directory.
train_path = "drive/MyDrive/subtaskA_train_monolingual.jsonl"
# Checkpoint name handed to AutoTokenizer.from_pretrained below.
model_name = "bert-base-uncased"

def preprocess_function(examples, **fn_kwargs):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.
        **fn_kwargs: Must contain ``tokenizer``, a callable that is invoked as
            ``tokenizer(texts, truncation=True, padding=True)``.

    Returns:
        Whatever the tokenizer returns for the batch.

    Raises:
        KeyError: If ``tokenizer`` is not supplied or ``examples`` has no
            ``"text"`` entry.
    """
    tokenize = fn_kwargs['tokenizer']
    texts = examples["text"]
    encoded = tokenize(texts, truncation=True, padding=True)
    return encoded

# Fetch the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Read the JSON-lines training file into a DataFrame, then wrap it as a Dataset.
train = pd.read_json(train_path, lines=True)
train_dataset = Dataset.from_pandas(train)

# Tokenize batch by batch; the tokenizer reaches preprocess_function via fn_kwargs.
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={'tokenizer': tokenizer},
)


# Save the tokens in drive.

We extract only the columns relevant for model prediction. They are saved to Drive with pickle for easy reuse.


In [8]:
import pickle

# Remove the columns that the embedding step does not read.
only_tokens = tokenized_train_dataset.remove_columns(
    ['text', 'source', 'model', 'token_type_ids', 'label']
)

# Write the trimmed dataset to Drive so tokenization need not be repeated.
with open("drive/MyDrive/tokenized_bert_truncated.pkl", 'wb') as token_sink:
    pickle.dump(only_tokens, token_sink)

# Read the tokens back from the same file.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("drive/MyDrive/tokenized_bert_truncated.pkl", 'rb') as token_source:
    only_tokens = pickle.load(token_source)

# Generate training sequence embeddings

We run the model on the training dataset batches and extract the [CLS] embeddings generated.
The embeddings are stored in a list which is saved in a file using pickle.

In [9]:
from transformers import BertModel
import torch
from tqdm import tqdm

# Use the first GPU when one is present; otherwise fall back to the CPU so the
# cell still runs (slowly) on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name).to(device)
model.eval()  # make inference behaviour explicit (no dropout)
batch_size = 8
batches = []  # not used in this cell; kept so the cell still defines the same names
predictions = []  # one entry per batch: a list of [CLS] vectors (lists of floats)

# Only forward passes are needed, so autograd is switched off: no graph is
# built and intermediate activations are not retained.
with torch.no_grad():
    for i in tqdm(range(0, len(only_tokens), batch_size)):
        # Slice the dataset once per batch and reuse it for both columns.
        batch = only_tokens[i:i + batch_size]
        # NOTE(review): building a tensor from nested lists requires every
        # sequence in the slice to have the same length; this relies on the
        # padding applied during tokenization, so confirm if batch_size changes.
        inp_ids = torch.tensor(batch["input_ids"], dtype=torch.int64, device=device)
        att_masks = torch.as_tensor(batch["attention_mask"], device=device)
        # last_hidden_state is all that is read, so the per-layer hidden
        # states are not requested.
        preds = model(inp_ids, attention_mask=att_masks)
        # Position 0 of each sequence is the [CLS] token; move its vector to
        # the CPU and convert it to plain Python lists.
        embeddings = preds.last_hidden_state[:, 0, :].cpu().tolist()
        # Appended per batch (not extended): a later cell flattens one level.
        predictions.append(embeddings)



100%|██████████| 14970/14970 [1:10:11<00:00,  3.55it/s]


Some processing of the generated list to flatten the batches. Also, saving the list for reuse later.

In [15]:
from itertools import chain

# Flatten one level: `predictions` holds one list of embedding vectors per
# batch; the result is a single list with one vector per example.
final_embeds = [vector for batch_vectors in predictions for vector in batch_vectors]

In [17]:
# Persist the flattened embedding list to Drive so the forward passes over the
# training set do not have to be repeated.
with open("drive/MyDrive/embeddings512.pkl", 'wb') as embeddings_sink:
    pickle.dump(final_embeds, embeddings_sink)