In [6]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from transformers import XLMTokenizer, XLMWithLMHeadModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# download tokenizer and model
tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-clm-enfr-1024 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# display supported language and id
tokenizer.lang2id

{'en': 0, 'fr': 1}

In [20]:
# create an example input
input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1
input_ids

tensor([[    0,  4018,  5545, 51104,    32,   308,    18,     1]])

In [7]:
# retrieve language id -> 0
language_id = tokenizer.lang2id["en"] 

# torch.tensor([0, 0, 0, ..., 0])
langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])

# We reshape it to be of size (batch_size, sequence_length)
langs = langs.view(1, -1)  # is now of shape [1, sequence_length] (we have a batch size of 1)\

langs, langs.shape

(tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), torch.Size([1, 8]))

In [8]:
# pass the input ids and language embedding to the model
outputs = model(input_ids, langs=langs)
outputs

MaskedLMOutput(loss=None, logits=tensor([[[ -6.3665,  10.7176,  -6.3754,  ...,  -6.1219,  -6.3775,  -6.0470],
         [-13.8973,  -3.1509, -13.8749,  ..., -14.6975, -12.9415, -13.8252],
         [ -9.2461,   4.8444,  -9.6047,  ..., -10.5616,  -8.2646,  -9.9838],
         ...,
         [-12.1487,   9.7573, -12.0884,  ..., -13.0167, -11.7700, -10.6408],
         [-11.5067,   4.1504, -11.6619,  ..., -12.0130, -11.2718, -10.8311],
         [ -8.4061,   6.6820,  -8.5521,  ...,  -8.2401,  -8.5305,  -8.3125]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [None]:
hidden_states

('logits', tensor([[[ -6.3665,  10.7176,  -6.3754,  ...,  -6.1219,  -6.3775,  -6.0470],
         [-13.8973,  -3.1509, -13.8749,  ..., -14.6975, -12.9415, -13.8252],
         [ -9.2461,   4.8444,  -9.6047,  ..., -10.5616,  -8.2646,  -9.9838],
         ...,
         [-12.1487,   9.7573, -12.0884,  ..., -13.0167, -11.7700, -10.6408],
         [-11.5067,   4.1504, -11.6619,  ..., -12.0130, -11.2718, -10.8311],
         [ -8.4061,   6.6820,  -8.5521,  ...,  -8.2401,  -8.5305,  -8.3125]]],
       grad_fn=<ViewBackward0>))


### Try sentence transformer

In [3]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

In [4]:
%%time
contents = pd.read_csv('./data/content.csv')
contents['title'] = contents['title'].fillna('No title exist')
title_sentences = list(contents.title.values)
title_sentences[:10]

CPU times: user 4.58 s, sys: 460 ms, total: 5.04 s
Wall time: 7.93 s


['Sumar números de varios dígitos: 48,029+233,930 ',
 'Trovare i fattori di un numero',
 'Sumar curvas de demanda',
 'Nado de aproximação',
 'geometry-m3-topic-a-overview.pdf',
 '5.12E: Regulation of the Calvin Cycle',
 'Reflexionemos sobre lo que vemos y escuchamos',
 'अंग्रेजी ओके प्लीज 1.2',
 '4.E: Genomes and Chromosomes (Exercises)',
 'La banca 12: los bonos del tesoro (deuda pública)']

In [17]:
%%time
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Tokenize sentences
encoded_input = tokenizer(title_sentences[:10], padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[-0.0394,  0.0246, -0.0035,  ...,  0.0677,  0.0117, -0.0330],
        [-0.0439, -0.0270, -0.0044,  ...,  0.0714,  0.0114, -0.0300],
        [-0.0217,  0.0136, -0.0046,  ...,  0.0016, -0.0647, -0.0364],
        ...,
        [-0.0347,  0.0291, -0.0065,  ...,  0.0083, -0.0276, -0.0389],
        [-0.0064,  0.0167, -0.0035,  ...,  0.0052, -0.0050, -0.0418],
        [-0.0396,  0.0388, -0.0047,  ...,  0.0280, -0.0131,  0.0163]])
CPU times: user 1.06 s, sys: 0 ns, total: 1.06 s
Wall time: 47.3 ms


In [18]:
**encoded_input

SyntaxError: invalid syntax (2809074728.py, line 1)