In [None]:
%pip install dask
%pip install dask[dataframe]
%pip install dask[complete]
%pip install dask[distributed]
%pip install cloudpickle
%pip install graphviz

In [None]:
%pip install transformers
%pip install pytorch_transformers

In [None]:
# Imports for the whole notebook: standard library first, then third-party.
# They are deliberately NOT wrapped in a catch-all try/except: a missing
# package should stop the notebook here with a clear ImportError instead of
# surfacing later as a confusing NameError.
import json
import math
import os

import dask
import dask.dataframe as dd
import dask.multiprocessing
import numpy as np
import pandas as pd
import torch
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from tqdm.auto import tqdm  # picks the notebook widget when available, plain text otherwise
from transformers import BertModel, BertTokenizer

# Register `progress_apply` / `progress_map` on pandas objects.
tqdm.pandas()

In [None]:
# Column -> type mapping used as the `dtype` argument when reading the CSV:
# both the article id and the article text are read as strings.
dtypes = dict.fromkeys(('id', 'news'), str)

In [None]:
# Read the news CSV, applying the column types declared in `dtypes`,
# then display the resulting frame as the cell output.
df = pd.read_csv(
    filepath_or_buffer='../data/news_with_header.csv',
    dtype=dtypes,
)
df

In [None]:
# Load the pretrained tokenizer and model.
# The checkpoint name is defined once so the tokenizer and the model can never
# drift apart (they must share the same vocabulary).
PRETRAINED_MODEL_NAME = 'bert-base-multilingual-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, do_lower_case=True)
# output_hidden_states=True makes the forward pass also return the hidden
# states of every layer, which `get_vectors` reads.
model = BertModel.from_pretrained(PRETRAINED_MODEL_NAME, output_hidden_states=True)

In [None]:
# Switch the model to evaluation (inference) mode. Per the PyTorch docs this
# only affects layers that behave differently during training, such as dropout;
# it does not disable gradient tracking (that is done with `torch.no_grad()`).
model.eval()

In [None]:
def get_vectors(row):
    """Compute and save the BERT embedding-layer vectors for one news article.

    The article text is encoded with the module-level ``tokenizer`` (special
    tokens added, truncated and padded to 512 ids), run through the
    module-level ``model``, and the first entry of the returned hidden states
    is written to ``../data/vectors/<id>.npy``.

    Args:
        row: Record supporting ``row['news']`` (article text) and
            ``row['id']`` (used as the output file name), e.g. a row yielded
            by ``df.iterrows()``.

    Returns:
        str: Path of the ``.npy`` file that was written.
    """
    # `encode` already returns vocabulary ids, so no ids -> tokens -> ids
    # round trip is needed before building the tensor.
    indexed_tokens = tokenizer.encode(
        row['news'],
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # The model expects a batch dimension: shape (1, number_of_ids).
    tokens_tensor = torch.tensor([indexed_tokens])

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(tokens_tensor)

    # Because the model was loaded with `output_hidden_states=True`, the third
    # item of the output holds the hidden states of all layers. See
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

    # Entry 0 is the output of the embedding layer (the initial embeddings).
    # NOTE(review): the full forward pass is executed although only the
    # embedding output is kept -- confirm whether deeper layers are needed.
    word_embed = hidden_states[0]

    output_path = '../data/vectors/{id}.npy'.format(id=row['id'])
    # `np.save` does not create missing directories; `exist_ok=True` also keeps
    # this safe when several tasks run concurrently.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Convert explicitly to a NumPy array instead of relying on implicit
    # tensor -> array coercion inside `np.save`.
    np.save(output_path, word_embed.detach().cpu().numpy())
    return output_path

In [None]:
# Build one lazy dask task per article. `dask.delayed` only records the call;
# `get_vectors` does not run until the tasks are computed.
# `total` is passed explicitly because `df.iterrows()` is a generator with no
# length, so tqdm could otherwise only show a running count.
parallel_tasks = [
    dask.delayed(get_vectors)(row)
    for _, row in tqdm(df.iterrows(), total=len(df), desc='Building tasks')
]