In [None]:
%pip install transformers
%pip install pytorch_transformers

In [9]:
import os
import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()

import torch
from keras_preprocessing.sequence import pad_sequences
from transformers import BertForSequenceClassification, BertTokenizer

# Init hardware resource

In [10]:
# If there's a GPU available...
if torch.cuda.is_available():
  
  # Tell PyTorch to use the GPU.
  device = torch.device("cuda")

  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
  # If not...
  print('No GPU available, using the CPU instead.')
  device = torch.device("cpu")

No GPU available, using the CPU instead.


# Load data

In [11]:
dtypes = {
    'id': str,
    'news': str
}

csv_file = '../../data/news_with_header.csv'
df = pd.read_csv(csv_file, dtype=dtypes)
df.head(2)

Unnamed: 0,id,news
0,026f4c15-37a9-459d-a944-03bde29a5c59,அமைச்சின் பணிகளை முன்னெடுப்பதற்கு கௌர அமைச்சர்...
1,0df3b73b-b08a-4357-bd6d-3bd94f8d4e58,இவ்வமைச்சு இல 40 புத்கமுவ வீதி இராஜகிரிய எனு...


# Load the pretained model

In [12]:
# The name of the folder containing the model files.
pretrained_model = 'bert-base-multilingual-uncased'

# Load our fine-tuned model, and configure it to return the "hidden states",
# from which we will be taking our text embeddings.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model,
    output_hidden_states=True,  # Whether the model returns all hidden-states.
)

# Load the tokenizer.
tokenizer = BertTokenizer.from_pretrained(pretrained_model)

# Copy the model to the GPU.
model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

# Embedder

In [13]:
def news_to_embedding(tokenizer, model, in_text):
    #   Uses the provided BERT `model` and `tokenizer` to generate a vector 
    #   representation of the input string, `in_text`.
    #   Returns the vector stored as a numpy ndarray.
  
    # ===========================
    #    STEP 1: Tokenization
    # ===========================
    tokens = tokenizer.tokenize(in_text)
    if (len(tokens) > 510):
        tokens = tokens[:128] + tokens[-382:]

    MAX_LEN = 512

    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Truncate the sentence to MAX_LEN if necessary.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end. (After truncating!)
    #   (4) Map tokens to their IDs.
    input_ids = tokenizer.encode(
        tokens,                   # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=MAX_LEN,       # Truncate all sentences.
    )
    
    # Pad our input tokens. Truncation was handled above by the `encode`
    # function, which also makes sure that the `[SEP]` token is placed at the
    # end *after* truncating.
    # Note: `pad_sequences` expects a list of lists, but we only have one
    # piece of text, so we surround `input_ids` with an extra set of brackets.
    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long",
                            truncating="post", padding="post")
    
    # Remove the outer list.
    input_ids = results[0]
    
    # Create attention masks
    attn_mask = [int(i > 0) for i in input_ids]
    
    # Cast to tensors.
    input_ids = torch.tensor(input_ids)
    attn_mask = torch.tensor(attn_mask)

    # Add an extra dimension for the "batch" (even though there is only one
    # input in this batch.)
    input_ids = input_ids.unsqueeze(0)
    attn_mask = attn_mask.unsqueeze(0)

    # ===========================
    #    STEP 2: BERT Model
    # ===========================

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Copy the inputs to the GPU
    # Note -- I got stuck here for a while because I didn't assign the result
    # back to the variable! Geez!
    input_ids = input_ids.to(device)
    attn_mask = attn_mask.to(device)

    # Telling the model not to build the backwards graph will make this
    # a little quicker.
    with torch.no_grad():
        # Forward pass, return hidden states and predictions.
        # This will return the logits rather than the loss because we have
        # not provided labels.
        logits, encoded_layers = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attn_mask,
            return_dict=False)

    # Retrieve our sentence embedding--take the `[CLS]` embedding from the final
    # layer.
    layer_i = 12 # The last BERT layer before the classifier.
    batch_i = 0  # Only one input in the batch.
    token_i = 0  # The first token, corresponding to [CLS]

    # Grab the embedding.
    vec = encoded_layers[layer_i][batch_i][token_i]
    
    # Move to the CPU and convert to numpy ndarray.
    vec = vec.detach().cpu().numpy()

    return(vec)

# Test the embedder

In [14]:
# Get the onw news from the list.
input_text = df['news'].iloc[10]

print('Getting embedding for news:\n\n', input_text)

# Use the BERT model and tokenizer to generate an embedding for `input_text`.
vec = news_to_embedding(tokenizer, model, input_text)

print('\nDone. Embedding shape:', str(vec.shape))
print('\nDone. news embedding vector:\n', str(vec))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Getting embedding for news:

 அரச உத்தியோகத்தர்கள் மொழித் தேர்ச்சி பெற்றுக் கொள்வதற்கு அமைவாக முன்வைக்கப்பட்டுள்ள உத்தியோகத்தர் குழு அறிக்கை சிபாரிசுகளை நடைமுறைப்படுத்தல்

Done. Embedding shape: (768,)

Done. news embedding vector:
 [-8.16217661e-02  1.38339503e-02  2.86057275e-02 -3.13600665e-03
 -2.15890378e-01  1.07254073e-01 -8.52226615e-02  1.71499103e-02
 -1.95386875e+00 -1.36038959e-02 -4.05059606e-02 -1.08759195e-01
  6.14710450e-02  6.89011738e-02  9.64766592e-02  1.60595238e-01
 -2.40740031e-02 -1.11191869e-02  4.91555408e-02 -1.79351699e-02
 -2.90736314e-02  1.41032159e-01  7.86092281e-02 -8.89550447e-02
  6.35270834e-01 -2.79472023e-03  1.31536990e-01 -3.50437090e-02
 -2.09349632e+00  3.91056463e-02 -2.51639366e-01  2.48275846e-02
 -1.74585786e-02 -1.33073270e-01 -1.50353406e-02 -1.19470246e-02
 -1.14890188e-02  1.58056426e+00 -6.75819516e-02 -3.13006416e-02
  3.43747735e-02  4.67215478e-03 -8.61967131e-02  9.16192159e-02
 -2.98888981e-02 -3.70589830e-02 -2.96759903e-02  7.

# Batch process news to extract embeddings

In [None]:
# Store the set of embeddings.
embeddings = []

num_news = len(df['news'])

print('Generating news embeddings for all {:,} news...'.format(num_news))

# For each row of the dataframe...
for idx, row in tqdm(df.iterrows()):
  # Vectorize this news.
  vec = news_to_embedding(tokenizer, model, row.news)

  # Store the embeddings.
  embeddings.append(vec)

In [None]:
# Convert the list of vectors into a 2D array.
vecs = np.stack(embeddings)
vecs.shape

In [None]:
output_dir = '../../data/vectors/'

# Create output directory if needed
if not os.path.exists(output_dir):
  os.makedirs(output_dir)

# Use numpy to write out the matrix of embeddings.
print(f'Saving vec to: {output_dir}embeddings.npy')
np.save(f'{output_dir}embeddings.npy', vecs)