In [13]:
import numpy as np
from conllu import parse_incr
from collections import defaultdict

def load_corpus(in_file):
    """Read a CoNLL-U file and return the word forms and UPOS tags of every sentence.

    in_file: path of the CoNLL-U file, read as UTF-8.

    Returns a pair of dicts ``(word_sent, upos_sent)``. Both are keyed by the
    0-based position of the sentence in the file; the values are parallel
    lists holding the "form" and the "upos" field of each token.
    """
    word_sent = {}
    upos_sent = {}
    # parse_incr is consumed lazily, so the whole loop has to run while the
    # file is still open; the `with` block closes it afterwards.
    with open(in_file, encoding='UTF-8') as corpus_file:
        for i, sent in enumerate(parse_incr(corpus_file)):
            word_sent[i] = [token["form"] for token in sent]
            upos_sent[i] = [token["upos"] for token in sent]

    return word_sent, upos_sent

def token_alignment(input_sent, input_upos, tok_sent, emb_sent,
                    valid_upos=("NOUN", "NUM", "PROPN")):
    '''
    Map the selected words of one sentence to their sub-token positions and
    pool the sub-token embeddings into one vector per word.

    input_sent: list of the words of the sentence (from the conllu corpus)
    input_upos: list of UPOS tags, parallel to input_sent
    tok_sent: tokenizer output for this same sentence; must provide word_ids()
    emb_sent: sub-token embeddings, indexed on their first axis by the
              positions of tok_sent.word_ids()
    valid_upos: UPOS tags of the words to keep (compared case-insensitively).
                The full UPOS inventory is: ADJ, ADP, ADV, AUX, CCONJ, DET,
                INTJ, NOUN, NUM, PART, PRON, PROPN, PUNCT, SCONJ, SYM, VERB, X

    Returns (tok_alignment, word_emb):
      tok_alignment: dict word index -> list of sub-token positions
      word_emb: dict word index -> mean of emb_sent over those positions

    A word whose tag is not a string (e.g. None) is skipped.
    NOTE(review): a kept word with no sub-token gets the mean of an empty
    selection, which is presumably a NaN vector -- confirm if that can occur.
    '''
    wanted_tags = {tag.upper() for tag in valid_upos}

    # numpy array so that np.where can be used; word_ids() may contain None
    # entries, which never compare equal to a word index.
    word_ids = np.array(tok_sent.word_ids())

    tok_alignment = {}
    for word_idx in range(len(input_sent)):
        upos = input_upos[word_idx]
        if isinstance(upos, str) and upos.upper() in wanted_tags:
            tok_alignment[word_idx] = np.where(word_ids == word_idx)[0].tolist()

    # one embedding per kept word: average over its sub-tokens
    word_emb = {
        word_idx: emb_sent[positions].mean(dim=0)
        for word_idx, positions in tok_alignment.items()
    }

    return tok_alignment, word_emb

In [139]:
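### Questions:
### Why is there no RNN in this lab? ===> Answer: the pre-trained transformer already produces contextual embeddings.
### Why use the mean? ===> Answer: a word can be split into several sub-tokens; averaging their vectors gives a single fixed-size vector per word.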

In [14]:
# Load the small corpus: two dicts keyed by sentence index, holding the list of
# word forms and the parallel list of UPOS tags of each sentence.
input_sent, upos_sent = load_corpus("../pstal-etu/sequoia/sequoia-ud.parseme.frsemcor.simple.small")

In [15]:
import torch
from transformers import AutoTokenizer, AutoModel

# Single place for the checkpoint name so tokenizer and model cannot diverge.
MODEL_NAME = "distilbert/distilbert-base-multilingual-cased"

# The first sentence is already split into words, hence is_split_into_words=True.
tokenize = AutoTokenizer.from_pretrained(MODEL_NAME)
tok_sent = tokenize(input_sent[0], is_split_into_words=True, return_tensors='pt')

model = AutoModel.from_pretrained(MODEL_NAME)
# Inference only: no gradients are needed to read the hidden states.
with torch.no_grad():
    model_output = model(**tok_sent)
    # [0] selects the only sequence of the batch.
    emb_sent = model_output['last_hidden_state'][0]
print(emb_sent.shape)

torch.Size([47, 768])


In [16]:
# Align and embed every sentence of the corpus.
# Each sentence must be tokenized and encoded on its own: the module-level
# tok_sent / emb_sent only describe sentence 0, so reusing them for the other
# sentences would pair their words with the wrong sub-token positions.
tok_alignment, emb_alignment = dict(), dict()
for idx, words in input_sent.items():
    sent_encoding = tokenize(words, is_split_into_words=True, return_tensors='pt')
    with torch.no_grad():
        sent_embeddings = model(**sent_encoding)['last_hidden_state'][0]
    tok_alignment[idx], emb_alignment[idx] = token_alignment(
        words, upos_sent[idx], sent_encoding, sent_embeddings
    )

In [17]:
# Word indices of sentence 0 that received an embedding (words tagged NOUN, NUM or PROPN).
emb_alignment[0].keys()

dict_keys([1, 5, 9, 12, 16, 19, 22, 23, 25, 29, 31])

In [21]:
def get_pairs(in_file, emb_alignment):
    """Build the (embedding, label) pairs for the selected words of a corpus.

    in_file: path of the CoNLL-U file, read as UTF-8.
    emb_alignment: dict sentence index -> dict word index -> embedding, as
        produced by token_alignment for each sentence of the same file.

    Returns a list of ``(embedding, label)`` tuples, one per token tagged
    NOUN, NUM or PROPN, where the label is the token's "frsemcor:noun" field.
    Raises KeyError if emb_alignment has no entry for a selected token.
    """
    target_upos = {"NOUN", "NUM", "PROPN"}
    pairs = []
    # The parser is lazy, so iterate while the file is open; `with` closes it.
    with open(in_file, encoding='UTF-8') as corpus_file:
        for i, sent in enumerate(parse_incr(corpus_file)):
            for idx, tok in enumerate(sent):
                upos = tok["upos"]
                # Same case-insensitive filter as token_alignment, so both
                # functions select exactly the same tokens.
                if isinstance(upos, str) and upos.upper() in target_upos:
                    pairs.append((emb_alignment[i][idx], tok["frsemcor:noun"]))
    return pairs

In [22]:
# Same corpus file as the one the embeddings were computed from, so that the
# sentence and word indices used by get_pairs match those of emb_alignment.
in_file = "../pstal-etu/sequoia/sequoia-ud.parseme.frsemcor.simple.small"

# List of (word embedding, "frsemcor:noun" label) tuples.
pairs = get_pairs(in_file, emb_alignment)

In [24]:
def create_dataloader(dataset, batch_size, shuffle_mode):
    """Wrap ``dataset`` in a torch ``DataLoader``.

    dataset: the examples to batch.
    batch_size: number of examples per batch.
    shuffle_mode: passed as the loader's ``shuffle`` flag.

    Returns the configured ``DataLoader``.
    """
    from torch.utils import data as torch_data

    loader_options = {"batch_size": batch_size, "shuffle": shuffle_mode}
    return torch_data.DataLoader(dataset, **loader_options)

In [25]:
# Batches of 32 (embedding, label) pairs, with shuffling enabled.
dataloader = create_dataloader(pairs, 32, True)

In [37]:
import torch
from transformers import AutoTokenizer, AutoModel

# The tokenizer, the model and the embeddings of sentence 0 were already built
# by an identical cell earlier in the notebook; reuse those objects instead of
# loading the checkpoint a second time.
print(emb_sent.shape)

torch.Size([47, 768])


In [39]:
# Width of the model's hidden vectors; it equals the last dimension of emb_sent.
model.config.hidden_size

768

In [43]:
# Collect every distinct value of the "frsemcor:noun" field over the full corpus.
sense_values = set()
# The parser reads lazily, so iterate inside the `with` block that closes the file.
with open("../pstal-etu/sequoia/sequoia-ud.parseme.frsemcor.simple.full", encoding='UTF-8') as corpus_file:
    for sent in parse_incr(corpus_file):
        for tok in sent:
            sense_values.add(tok["frsemcor:noun"])

In [46]:
# Display the collected labels.
# NOTE(review): a set has no stable order, so this listing can differ between
# kernel runs; use sorted(sense_values) before deriving label indices from it.
list(sense_values)

['Feeling',
 'Cognition',
 'Relation',
 'Artifact',
 'Group',
 'Phenomenon',
 'Quantity',
 'Part',
 'Substance',
 'Body',
 'Person',
 'Food',
 'Animal',
 'Communication',
 'Act',
 'Object',
 'Institution',
 'Attribute',
 'Plant',
 'Possession',
 'Time',
 'Tops',
 '*',
 'State',
 'Event']

In [48]:
# Sanity check: converting the set of labels yields a plain Python list.
type(list(sense_values))

list

In [49]:
# Show the full configuration of the loaded model.
model.config

DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "dtype": "float32",
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.57.3",
  "vocab_size": 119547
}