In [70]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer of the multilingual DistilBERT checkpoint (the same
# checkpoint name is used for the encoder further down).
tokenize = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
# The sentence is given as a list of words (is_split_into_words=True) and the
# encoding is returned as PyTorch tensors (return_tensors='pt').
tok_sent = tokenize(["Arturo","devra","retenter","demain"], is_split_into_words=True, return_tensors='pt')

In [71]:
# Inspect the ids: the 4 input words become 9 ids, so some words were split
# into several sub-tokens (the first and last ids are presumably the
# tokenizer's special tokens -- TODO confirm).
print(tok_sent['input_ids'])

tensor([[  101, 33141, 10104, 39210, 32641, 25446, 10268, 18073,   102]])


In [72]:
# Load the pretrained encoder for the same checkpoint as the tokenizer.
model = AutoModel.from_pretrained("distilbert/distilbert-base-multilingual-cased")
# Inference only, so gradient tracking is disabled.
with torch.no_grad():    
    # [0] selects the first (here: only) sequence of the batch.
    emb_sent = model(**tok_sent)['last_hidden_state'][0]
# One row per sub-token id of the encoded sentence.
print(emb_sent.shape)

torch.Size([9, 768])


In [73]:
import numpy as np
from collections import defaultdict

def token_alignment(input_sent, tok_sent, upos_filter=None):
    '''
    Map each word of a sentence to the positions of its sub-tokens.

    input_sent: sentence from the corpus, as a list of words. When
        upos_filter is given, each item must be a (word, upos) pair so that
        the tag is available for filtering.
    tok_sent: output of the pre-trained model's tokenizer for this sentence;
        only its word_ids() method is used.
    upos_filter: optional collection of UPOS tags (a single tag string is
        also accepted). None (default) keeps every word; otherwise only the
        words whose tag is in the collection are kept (case-insensitive).

    Returns a dict {word index: [sub-token positions]}. A word without any
    sub-token gets an empty list.
    '''
    # Group the sub-token positions by owning word in a single pass.
    # Positions whose word id is None belong to no word and are skipped.
    positions_by_word = dict()
    for pos, word_id in enumerate(tok_sent.word_ids()):
        if word_id is not None:
            positions_by_word.setdefault(word_id, []).append(pos)

    # Normalise the filter once; None means "no filtering".
    if isinstance(upos_filter, str):
        upos_filter = (upos_filter,)
    allowed = None if upos_filter is None else {tag.upper() for tag in upos_filter}

    tok_alignment = dict()
    for idx, item in enumerate(input_sent):
        if allowed is not None:
            _, upos = item  # items are (word, upos) pairs when filtering
            if upos.upper() not in allowed:
                continue
        tok_alignment[idx] = positions_by_word.get(idx, [])
    return tok_alignment

In [85]:
# tok_alignment = token_alignment(input_sent, tok_sent)
# for k,v in tok_alignment.items():
#     print(input_sent[k])
#     for i in v:
#         print(tok_sent.word_ids()[i])

In [138]:
'''
trouver le(s) embedding(s) correspondants à l’aide de word_ids. Créez ensuite un seul
embedding pour ce mot en moyennant les embeddings contextuels de ses sub-tokens. Cet embedding sera
associé à l’étiquette super-sense du mot. 
'''
from conllu import parse_incr

def load_corpus(in_file):
    '''
    Read a CoNLL-U file and collect word forms and UPOS tags per sentence.

    in_file: path to the CoNLL-U file (read as UTF-8).

    Returns (word_sent, upos_sent): two dicts keyed by the 0-based sentence
    index, holding respectively the list of token["form"] values and the
    list of token["upos"] values of that sentence, in file order.
    '''
    word_sent = {}
    upos_sent = {}
    # parse_incr reads the file lazily, so the sentences are consumed while
    # the file is still open; the with-block then closes the handle.
    with open(in_file, encoding='UTF-8') as corpus_file:
        for i, sent in enumerate(parse_incr(corpus_file)):
            word_sent[i] = [token["form"] for token in sent]
            upos_sent[i] = [token["upos"] for token in sent]
    return word_sent, upos_sent

import numpy as np
from collections import defaultdict

def token_alignment(input_sent, input_upos, tok_sent, emb_sent,
                    upos_filter=("NOUN", "NUM", "PROPN")):
    '''
    Align the words of one sentence with their sub-tokens and build one
    embedding per word by averaging the embeddings of its sub-tokens.

    input_sent: list of word forms of the sentence (from the conllu corpus).
    input_upos: list of UPOS tags, parallel to input_sent.
    tok_sent: output of the pre-trained model's tokenizer for this sentence;
        only its word_ids() method is used.
    emb_sent: tensor with one row per sub-token position of tok_sent; it is
        indexed with a list of positions and averaged with .mean(dim=0).
    upos_filter: collection of UPOS tags to keep (a single tag string is also
        accepted; comparison is case-insensitive). Defaults to NOUN, NUM and
        PROPN. Pass None to keep every word.

    Returns (tok_alignment, word_emb):
        tok_alignment: {word index: [sub-token positions]}
        word_emb: {word index: mean of emb_sent over those positions}
    '''
    # upos_filter is an explicit parameter so the function does not depend on
    # a global of that name being present in the kernel.
    if isinstance(upos_filter, str):
        upos_filter = (upos_filter,)
    allowed = None if upos_filter is None else {tag.upper() for tag in upos_filter}

    # Group the sub-token positions by owning word in a single pass.
    # Positions whose word id is None belong to no word and are skipped.
    positions_by_word = dict()
    for pos, word_id in enumerate(tok_sent.word_ids()):
        if word_id is not None:
            positions_by_word.setdefault(word_id, []).append(pos)

    tok_alignment = dict()
    for idx, _ in enumerate(input_sent):
        if allowed is not None and input_upos[idx].upper() not in allowed:
            continue
        tok_alignment[idx] = positions_by_word.get(idx, [])

    # One vector per kept word: the mean of its sub-token embeddings.
    # NOTE(review): a word with no sub-token (empty position list) averages
    # over zero rows -- confirm this cannot happen for your inputs.
    word_emb = {
        idx: emb_sent[positions].mean(dim=0)
        for idx, positions in tok_alignment.items()
    }

    return tok_alignment, word_emb

In [139]:
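### Questions:
### Why is there no RNN in this lab? ===> Answer: the embeddings are already contextual.
### Why use the mean? ===> Answer: a word can be split into several sub-tokens; averaging their embeddings gives a single vector per word.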

In [149]:
# Load the corpus: both results map a 0-based sentence index to a list
# (word forms in input_sent, UPOS tags in upos_sent).
input_sent, upos_sent = load_corpus("../PSTAL-MORPHtagging/pstal-etu/sequoia/sequoia-ud.parseme.frsemcor.simple.small")

In [150]:
import torch
from transformers import AutoTokenizer, AutoModel

# Same tokenizer checkpoint as in the earlier exploration cell.
tokenize = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
# Tokenize only the first corpus sentence (a list of words, hence
# is_split_into_words=True).
tok_sent = tokenize(input_sent[0], is_split_into_words=True, return_tensors='pt')

# Same encoder checkpoint as in the earlier exploration cell.
model = AutoModel.from_pretrained("distilbert/distilbert-base-multilingual-cased")
# Inference only, so gradient tracking is disabled.
with torch.no_grad():    
    # [0] selects the first sequence of the batch.
    emb_sent = model(**tok_sent)['last_hidden_state'][0]
# NOTE(review): tok_sent / emb_sent describe sentence 0 only.
print(emb_sent.shape)

torch.Size([47, 768])


In [161]:
# Build the alignment and the per-word embeddings sentence by sentence.
# Each sentence is tokenized and encoded on its own: reusing the tok_sent /
# emb_sent of sentence 0 would align every other sentence against the wrong
# sub-tokens and embeddings.
tok_alignment, emb_alignment = dict(), dict()
for idx, words in input_sent.items():
    sent_tok = tokenize(words, is_split_into_words=True, return_tensors='pt')
    with torch.no_grad():  # inference only
        sent_emb = model(**sent_tok)['last_hidden_state'][0]
    tok_alignment[idx], emb_alignment[idx] = token_alignment(words, upos_sent[idx], sent_tok, sent_emb)

In [176]:
# Word indices of sentence 0 for which an embedding was stored.
emb_alignment[0].keys()

dict_keys([1, 5, 9, 12, 16, 19, 22, 23, 25, 29, 31])

In [180]:
# emb_alignment[0][9]

In [184]:
# def get_pairs(emb_alignment, ):
# Pair the embedding of each selected word with its super-sense label.
in_file = "../PSTAL-MORPHtagging/pstal-etu/sequoia/sequoia-ud.parseme.frsemcor.simple.small"
pairs = []
# parse_incr reads lazily, so the corpus is consumed inside the with-block,
# which then closes the file handle.
with open(in_file, encoding='UTF-8') as corpus_file:
    for i, sent in enumerate(parse_incr(corpus_file)):
        for idx, tok in enumerate(sent):
            # Only words with these UPOS tags are looked up in emb_alignment.
            if tok["upos"] in ["NOUN", "NUM", "PROPN"]:
                pairs.append((emb_alignment[i][idx], tok["frsemcor:noun"]))

In [186]:
# First (word embedding, super-sense label) pair.
pairs[0]

(tensor([ 2.4548e-02, -3.2410e-01,  6.6246e-01,  3.7163e-01, -5.6628e-02,
         -2.4369e-01, -4.9498e-01,  7.4060e-01, -7.5084e-01,  6.2616e-01,
         -3.4168e-02,  3.4419e-02, -5.5544e-01,  3.7847e-01, -2.1963e-01,
         -3.7520e-01,  4.0847e-01, -2.4299e-01,  2.0511e-03, -3.3328e-01,
         -3.5438e-01, -4.1518e-02,  2.1685e-01,  1.3224e-01,  8.6031e-03,
          9.5864e-02, -5.0794e-01, -1.3520e-01, -1.1673e-01, -8.2052e-01,
          3.4336e-01, -2.4322e-01, -2.8327e-01,  1.1261e+00,  7.0449e-01,
          2.4583e-01, -1.8570e-02, -2.3374e-02,  2.8395e-01,  6.2698e-02,
          2.3254e-01,  3.2569e-01, -4.4565e-01, -2.0823e-01,  1.1893e-01,
         -4.5199e-01,  2.8266e-01, -7.1197e-01,  5.8547e-01,  1.0857e+00,
         -1.0936e+00, -1.4291e-01,  1.7317e-01,  1.4788e-01, -2.6592e-01,
          9.7784e-02, -5.6060e-01,  1.4198e-01,  1.8125e-01, -7.0146e-01,
          6.4720e-01, -2.6596e-01,  7.1156e-01, -4.2304e-01, -4.6336e-01,
         -4.3222e-01,  1.3419e-01, -4.