In [0]:
# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

In [0]:
#
# Code to generate sentence representations from a pretrained model.
# This can be used to initialize a cross-lingual classifier, for instance.
#

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
% bookmark HOME "/content/drive/My Drive/African_Translator/from_github/XLM" 
%cd -b HOME

(bookmark:HOME) -> /content/drive/My Drive/African_Translator/from_github/XLM
/content/drive/.shortcut-targets-by-id/1PdPd2d1303vfDNzoJKH48Ww6NcPS5YrJ/African_Translator/from_github/XLM


In [3]:
import os
import torch

from src.utils import AttrDict
from src.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from src.model.transformer import TransformerModel

FAISS library was not found.
FAISS not available. Switching to standard nearest neighbors search implementation.


## Reload a pretrained model

In [4]:
#model_path = 'models/mlm_100_1280.pth'
model_path = '/content/drive/My Drive/African_Translator/from_github/XLM/dumped/test_enfr_mlm/16r7b9ka67/best-valid_mlm_ppl.pth'
reloaded = torch.load(model_path)
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

Supported languages: en, fr


## Build dictionary / update parameters / build model

In [14]:
# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
params.n_words = len(dico)
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
model = TransformerModel(params = params, dico = dico, is_encoder = True, with_output = True)
model.eval()
model.load_state_dict(reloaded['model'])

<All keys matched successfully>

In [12]:
print(model)

TransformerModel(
  (position_embeddings): Embedding(512, 1024)
  (lang_embeddings): Embedding(2, 1024)
  (embeddings): Embedding(9659, 1024, padding_idx=2)
  (layer_norm_emb): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
  (attentions): ModuleList(
    (0): MultiHeadAttention(
      (q_lin): Linear(in_features=1024, out_features=1024, bias=True)
      (k_lin): Linear(in_features=1024, out_features=1024, bias=True)
      (v_lin): Linear(in_features=1024, out_features=1024, bias=True)
      (out_lin): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (1): MultiHeadAttention(
      (q_lin): Linear(in_features=1024, out_features=1024, bias=True)
      (k_lin): Linear(in_features=1024, out_features=1024, bias=True)
      (v_lin): Linear(in_features=1024, out_features=1024, bias=True)
      (out_lin): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (2): MultiHeadAttention(
      (q_lin): Linear(in_features=1024, out_features=1024, bias=True)
      (


## Get sentence representations

Sentences have to be in the BPE format, i.e. tokenized sentences on which you applied fastBPE.

Below you can see an example for English and French sentences.

In [0]:
# Below is one way to bpe-ize sentences
codes = "" # path to the codes of the model
fastbpe = os.path.join(os.getcwd(), 'tools/fastBPE/fast')

def to_bpe(sentences):
    # write sentences to tmp file
    with open('/tmp/sentences.bpe', 'w') as fwrite:
        for sent in sentences:
            fwrite.write(sent + '\n')
    
    # apply bpe to tmp file
    os.system('%s applybpe /tmp/sentences.bpe /tmp/sentences %s' % (fastbpe, codes))
    
    # load bpe-ized sentences
    sentences_bpe = []
    with open('/tmp/sentences.bpe') as f:
        for line in f:
            sentences_bpe.append(line.rstrip())
    
    return sentences_bpe


In [10]:
# Below are already BPE-ized sentences

# list of (sentences, lang)
sentences = [
    'once he had worn trendy italian leather shoes and jeans from paris that had cost three hundred euros .', # en
    'Le français est la seule langue étrangère proposée dans le système éducatif .', # fr
]

# bpe-ize sentences
sentences = to_bpe(sentences)
print('\n\n'.join(sentences))

# check how many tokens are OOV
n_w = len([w for w in ' '.join(sentences).split()])
n_oov = len([w for w in ' '.join(sentences).split() if w not in dico.word2id])
print('Number of out-of-vocab words: %s/%s' % (n_oov, n_w))

# add </s> sentence delimiters
sentences = [(('</s> %s </s>' % sent.strip()).split()) for sent in sentences]

once he had worn trendy italian leather shoes and jeans from paris that had cost three hundred euros . blablandnodo

Le français est la seule langue étrangère proposée dans le système éducatif .
Number of out-of-vocab words: 13/33


### Create batch

In [0]:
bs = len(sentences)
slen = max([len(sent) for sent in sentences])

word_ids = torch.LongTensor(slen, bs).fill_(params.pad_index)
for i in range(len(sentences)):
    sent = torch.LongTensor([dico.index(w) for w in sentences[i]])
    word_ids[:len(sent), i] = sent

lengths = torch.LongTensor([len(sent) for sent in sentences])
                             
# NOTE: No more language id (removed it in a later version)
# langs = torch.LongTensor([params.lang2id[lang] for _, lang in sentences]).unsqueeze(0).expand(slen, bs) if params.n_langs > 1 else None
langs = None


### Forward

In [11]:
tensor = model('fwd', x=word_ids, lengths=lengths, langs=langs, causal=False).contiguous()
print(tensor.size())

tensor([[[ 1.0043,  1.3356,  1.1830,  ...,  0.3952, -0.1379,  0.2779],
         [ 0.9566,  1.4682,  1.3344,  ...,  0.5169, -0.4460,  0.1688]],

        [[ 1.3772,  0.3458,  1.4081,  ...,  2.5558,  0.6799,  0.5485],
         [ 0.7248,  0.6524,  1.9140,  ...,  2.4721,  0.2964,  0.0548]],

        [[ 1.5902,  0.5192, -0.1281,  ...,  0.1359, -2.1216, -0.2077],
         [ 0.9734,  0.9857,  1.1935,  ...,  2.1771, -0.8057,  0.2012]],

        ...,

        [[ 0.4444,  0.6856,  1.2785,  ...,  1.8183,  0.4155,  1.1290],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000]],

        [[ 1.1060,  1.5340,  0.6061,  ...,  2.5929, -0.9947, -0.1349],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000]],

        [[ 0.2880,  1.1748,  0.9947,  ...,  0.0865, -0.5463,  0.6289],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.0000, -0.0000]]],
       grad_fn=<CopyBackwards>)


In [0]:
tensor = model('predict', x=word_ids, lengths=lengths, langs=langs, causal=False).contiguous()

The variable `tensor` is of shape `(sequence_length, batch_size, model_dimension)`.

`tensor[0]` is a tensor of shape `(batch_size, model_dimension)` that corresponds to the first hidden state of the last layer of each sentence.

This is this vector that we use to finetune on the GLUE and XNLI tasks.