In [1]:
# ! pip install torch transformers



In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [7]:
import sys

sys.path.insert(0, '../../../')
from notebooks.utils import _ARTICLES_2020, load_json, export_model_as_feature

2022-01-17 09:21:09.418394: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from transformers import RobertaTokenizer, RobertaModel


# Load pre-trained model tokenizer (vocabulary)
TOKENIZER = RobertaTokenizer.from_pretrained('roberta-base')

# Load pre-trained model (weights)
MODEL = RobertaModel.from_pretrained("roberta-base")

# MODEL.cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def get_roberta_embeddings(text):
    # Define a new example sentence with multiple meanings of the word "bank"
    # text = "After stealing money from the bank vault, the bank robber was seen " \
    #     "fishing on the Mississippi river bank."

    encoded_dict = TOKENIZER.encode(
                        text,                       # article to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 510,           # Pad & truncate all articles.
                        padding = True,
                        truncation = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    difference = 510 - encoded_dict.shape[1]
    if difference:
        encoded_dict = torch.cat(
                (encoded_dict, torch.Tensor([[0] * difference])),
                dim=-1
        )

    # Convert inputs to PyTorch tensors
    tokens_tensor = encoded_dict.to(device)
    # Put the model in "evaluation" mode, meaning feed-forward operation.
    MODEL.eval()

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers.
    with torch.no_grad():

        outputs = MODEL(tokens_tensor.long())

        hidden_states = outputs.last_hidden_state
    # `hidden_states` has shape [1, 510, 768]

    # `token_vecs` is a tensor with shape [510, 768]
    token_vecs = hidden_states[0]

    # Calculate the average of all 510 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)

    return sentence_embedding.tolist()

In [6]:
get_roberta_embeddings('This is a stupid test.')[:10]

[-0.04898376762866974,
 0.08267765492200851,
 -0.008879145607352257,
 -0.11193916201591492,
 0.07080987840890884,
 -0.06627613306045532,
 -0.050851985812187195,
 0.0062030707485973835,
 0.06748633831739426,
 -0.08248450607061386]

In [13]:
import zipfile
import json


with zipfile.ZipFile(_ARTICLES_2020, 'r') as zip_fd:
    title_embeddings, body_embeddings = {}, {}
    for site_file in [f for f in zip_fd.namelist() if f.endswith('.json')]:
        site = os.path.basename(site_file).replace('.json', '')
        print('Generate embeddings for', site)
        with zip_fd.open(site_file) as site_fd:
            articles_for_site = json.load(site_fd)
            title_embed, body_embed  = [], []
            for article in articles_for_site:
                title_embed.append(get_roberta_embeddings(article['title']))
                body_embed.append(get_roberta_embeddings(article['body']))

        title_embeddings[site] = torch.Tensor(title_embed).mean(dim=0).tolist()
        body_embeddings[site] = torch.Tensor(body_embed).mean(dim=0).tolist()

    export_model_as_feature(title_embeddings, 'roberta_title_embeddings_768d.json')
    export_model_as_feature(body_embeddings, 'roberta_body_embeddings_768d.json')

Generate embeddings for inthesetimes.com
Generate embeddings for shareblue.com
