# Preprocessing, extracting and averaging embeddings

This notebook covers task 2, the "easy" option: for each pattern we extract the contextualized embeddings of the verb, average them, and then cluster these averages. The resulting classification is to be compared with the one obtained from static embeddings (https://github.com/Rapazebu/Clustering-Verb-Meanings-in-Italian).

Install and import stuff

In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install simplemma

Note: you may need to restart the kernel to use updated packages.


In [3]:
import json 
import pandas as pd 
import numpy as np 
import simplemma
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import nltk 
from nltk.stem import SnowballStemmer
from codecs import *

Clustering contextualized embeddings gathered by sense. 

Steps:

1.   Filtro pattern con + di 30 istanze: `Ps = [abbaiare_1, abbaiare_2 ecc]`
2.   A ogni frase se pattern è accettabile appendo il token lemmatizzato e l'embedding: 
`Ls = [label, sentence, token, embedding]`
3.   Per ogni label appendo la media dei vettori: 
`Vs = [(abbaiare_1, np.average(vettore1, vettore2, vettore3)), (abbaiare_2, media) ecc]`
4. Metto tutto in un pandas dataframe
5.   Do i suddetti vettori in pasto al kmeans (lo stesso che ho usato per gli embeddings statici)
6. Comparo qualitativamente e bang il clustering sui vettori è fatto



In [4]:
def ReturnDatabase(filename):
    """Load the annotated corpus and keep only the usable rows.

    The JSON document at *filename* must hold its rows under the top-level
    ``"data"`` key. Rows are read positionally: ``row[0]`` is the verb and
    ``row[1]`` the pattern identifier.

    A row is kept only when all of the following hold:

    * the pattern identifier is neither ``'x'`` nor ``'u'``;
    * the pattern identifier contains no ``'.'``;
    * the verb contains no ``'_'``.

    Args:
        filename: path of the JSON file to read.

    Returns:
        list: the surviving rows, in their original order.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if the document has no ``"data"`` key.
    """
    # Binary mode lets json.load detect the encoding (UTF-8/16/32) itself
    # instead of decoding with the platform's locale encoding, and the
    # ``with`` block guarantees the handle is closed.
    with open(filename, "rb") as f:
        rows = json.load(f)["data"]
    return [
        row
        for row in rows
        if row[1] != 'x'
        and row[1] != 'u'
        and '.' not in row[1]
        and '_' not in row[0]
    ]

def count(dataClean):
    """Count how many rows belong to each (verb, pattern) pair.

    Args:
        dataClean: iterable of rows whose first two items are the verb and
            the pattern identifier.

    Returns:
        dict: maps ``(row[0], row[1])`` to its number of occurrences,
        e.g. ``{(abbaiare, 1): 3, (abbaiare, 2): 5}``.
    """
    tally = {}
    for row in dataClean:
        key = (row[0], row[1])
        tally[key] = tally.get(key, 0) + 1
    return tally

def getsentences(data, patterns, min_count=30):
    """Select the sentences that belong to sufficiently frequent patterns.

    Args:
        data: iterable of rows read positionally as
            ``(verb, pattern, sentence, ...)``.
        patterns: dict mapping ``(verb, pattern)`` to its number of
            instances, as returned by ``count``.
        min_count: a pattern is kept only when it has strictly more than
            this many instances. Defaults to 30, the value that used to be
            hard-coded.

    Returns:
        list: ``(verb, pattern, sentence)`` triples, in input order.
    """
    # A pattern missing from `patterns` has no counted instance, so it is
    # treated as having 0 of them rather than raising KeyError.
    return [
        (row[0], row[1], row[2])
        for row in data
        if patterns.get((row[0], row[1]), 0) > min_count
    ]

def gettokens_SIMPLEMMA(Ls):
    """Locate the verb's surface form in each sentence with simplemma.

    Every sentence is split on single spaces; the first word whose simplemma
    lemma (Italian language data) equals the verb is taken as the verb's
    occurrence. Sentences without such a word are dropped.

    Args:
        Ls: iterable of ``(verb, pattern, sentence)`` items.

    Returns:
        list: ``(verb, pattern, matched_word, sentence)`` tuples.
    """
    italian = simplemma.load_data('it')
    found = []
    for item in Ls:
        verb, pattern, sentence = item[0], item[1], item[2]
        # Lazily scan the words and stop at the first one that lemmatizes
        # to the verb.
        hit = next(
            (
                word
                for word in sentence.split(" ")
                if simplemma.lemmatize(word, italian) == verb
            ),
            None,
        )
        if hit is not None:
            found.append((verb, pattern, hit, sentence))
    return found

def gettokens_SNOWBALL(Ls):
    """Locate the verb's surface form in each sentence by comparing stems.

    Every sentence is split on single spaces; the first word whose Italian
    Snowball stem equals the stem of the verb is taken as the verb's
    occurrence. Sentences without such a word are dropped.

    Args:
        Ls: iterable of ``(verb, pattern, sentence)`` items.

    Returns:
        list: ``(verb, pattern, matched_word, sentence)`` tuples.
    """
    stemmer = SnowballStemmer('italian')
    found = []
    for item in Ls:
        verb, pattern, sentence = item[0], item[1], item[2]
        words = sentence.split(" ")
        target = stemmer.stem(verb)
        # Lazily scan the words and stop at the first one sharing the stem.
        hit = next(
            (word for word in words if stemmer.stem(word) == target),
            None,
        )
        if hit is not None:
            found.append((verb, pattern, hit, sentence))
    return found


In [5]:
# get embeddings 
 
def get_word_idx(sent: str, word: str):
    """Return the position of the first occurrence of *word* in *sent*.

    The sentence is split on single spaces and positions are counted from 0.

    Raises:
        ValueError: if *word* is not one of the space-separated words.
    """
    words = sent.split(" ")
    position = words.index(word)
    return position

# Device shared by the functions below: CUDA when PyTorch reports it as
# available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_hidden_states(encoded, token_ids_word, model, layers):
    """Compute one vector for a word from the model's hidden states.

    The encoded input is run through *model* without gradient tracking. The
    hidden states of the requested *layers* are stacked and summed, size-1
    dimensions are squeezed away, and the rows selected by *token_ids_word*
    (the sub-word tokens of the word of interest) are averaged.

    Args:
        encoded: tokenizer output, unpacked as keyword arguments to *model*.
        token_ids_word: index selecting the rows of the word's tokens.
        model: model whose output exposes ``hidden_states``.
        layers: indices of the hidden-state layers to sum.

    Returns:
        The mean over the selected token rows.
    """
    with torch.no_grad():
        model_output = model(**encoded)

    all_layers = model_output.hidden_states
    chosen = [all_layers[layer] for layer in layers]
    # Sum the chosen layers element-wise, then drop the size-1 dimensions.
    summed = torch.stack(chosen).sum(0).squeeze().to(device)
    # Keep only the rows of the requested word and average them.
    return summed[token_ids_word].mean(dim=0)
 
def get_word_vector(sent, idx, tokenizer, model, layers):
    """Return the contextual vector of the word at position *idx* in *sent*.

    *idx* is a position among the space-separated words of *sent* (what
    ``get_word_idx`` returns). The sentence is therefore handed to the
    tokenizer already split on spaces (``is_split_into_words=True``), so
    that ``word_ids()`` numbers words the same way. Tokenizing the raw
    string instead lets the tokenizer count punctuation marks as separate
    words, which shifts the numbering and can select the wrong token.

    Args:
        sent: the sentence, with words separated by single spaces.
        idx: position of the word of interest among ``sent.split(" ")``.
        tokenizer: tokenizer providing ``encode_plus`` and ``word_ids()``.
        model: model passed on to ``get_hidden_states``.
        layers: hidden-state layer indices passed on to
            ``get_hidden_states``.

    Returns:
        The vector computed by ``get_hidden_states`` for the word's tokens.

    Raises:
        ValueError: if no token maps to word *idx*; averaging an empty
            selection would otherwise silently yield NaNs.
    """
    words = sent.split(" ")
    encoded = tokenizer.encode_plus(
        words, is_split_into_words=True, return_tensors="pt"
    ).to(device)
    # get all token idxs that belong to the word of interest
    token_ids_word = np.where(np.array(encoded.word_ids()) == idx)
    if token_ids_word[0].size == 0:
        raise ValueError("no token found for word index %r" % (idx,))
    return get_hidden_states(encoded, token_ids_word, model, layers)
 
# Tokenizer/model pairs that were already loaded, keyed by model name.
_PRETRAINED_CACHE = {}


def _load_pretrained(model_name):
    """Return the (tokenizer, model) pair for *model_name*, loading it once."""
    if model_name not in _PRETRAINED_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(
            model_name, output_hidden_states=True
        ).to(device)
        _PRETRAINED_CACHE[model_name] = (tokenizer, model)
    return _PRETRAINED_CACHE[model_name]


def main(sent, tok, layers=None, model_name="dbmdz/bert-base-italian-xxl-cased"):
    """Return the contextual embedding of the word *tok* inside *sent*.

    Args:
        sent: the sentence, with words separated by single spaces.
        tok: the word to embed; it must be one of the space-separated words
            of *sent*.
        layers: hidden-state layer indices to sum; the last four when None.
        model_name: name of the pretrained model to use.

    Returns:
        The word embedding, moved to the CPU.

    Raises:
        ValueError: if *tok* is not one of the words of *sent*.
    """
    # Use last four layers by default
    layers = [-4, -3, -2, -1] if layers is None else layers
    # This function is called once per sentence: reuse the loaded tokenizer
    # and model instead of reloading them on every call.
    tokenizer, model = _load_pretrained(model_name)
    idx = get_word_idx(sent, tok)
    word_embedding = get_word_vector(sent, idx, tokenizer, model, layers).cpu()
    return word_embedding

In [6]:
# appends the vector to the list, i. e. returns a list label, sentence, vector

def getvectors(Ms):
    """Compute the verb embedding for every sentence.

    Args:
        Ms: iterable of ``(verb, pattern, matched_word, sentence)`` tuples.

    Returns:
        list: ``(label, sentence, embedding)`` tuples, where ``label`` is
        ``verb + "_" + pattern``. Sentences whose embedding cannot be
        computed are reported and left out.
    """
    Ns = []
    for x in Ms:
        verb, pattern, token, sentence = x[0], x[1], x[2], x[3]
        try:
            embedding = main(sentence, token)
        except Exception as err:
            # Best effort: one failing sentence must not abort the run, but
            # say which one was skipped and why. Catching Exception (not a
            # bare except) keeps KeyboardInterrupt able to stop the loop.
            print("skipped (%s: %s): %s" % (type(err).__name__, err, sentence))
            continue
        print(sentence)
        Ns.append((verb + "_" + pattern, sentence, embedding))
    return Ns

In [7]:
# takes the list label-sentence-vector and creates a dictionary label:average of vectors for label

def getaverage(Ns):
    """Average the embeddings that share a label.

    Args:
        Ns: iterable of ``(label, sentence, embedding)`` tuples; each
            embedding must provide ``.numpy()``.

    Returns:
        dict: maps each label to the element-wise mean (a NumPy array) of
        its embeddings, with labels in order of first appearance.
    """
    grouped = {}
    for item in Ns:
        grouped.setdefault(item[0], []).append(item[2])
    return {
        label: np.mean([vector.numpy() for vector in vectors], axis=0)
        for label, vectors in grouped.items()
    }


In [8]:
#writing stuff

def getmetadata(filename, Ns):
    """Write how many embeddings were collected for each label.

    One line per label is written to *filename* in the form
    ``label<TAB>count``, with labels in order of first appearance.

    Args:
        filename: path of the file to (over)write, encoded as UTF-8.
        Ns: iterable of items whose first element is the label.
    """
    # counts how many embeddings we have for each pattern
    counts = {}
    for item in Ns:
        counts[item[0]] = counts.get(item[0], 0) + 1
    # The encoding is passed by keyword so the call is valid for the builtin
    # open as well as for codecs.open; the with block closes the file even
    # if a write fails.
    with open(filename, "w", encoding="utf-8") as fh:
        for label, n in counts.items():
            fh.write(label + "\t" + str(n) + "\n")

def writefile(filename, avgs):
    """Write the averaged vectors to a tab-separated file.

    The first line is the header ``label<TAB>0<TAB>1 ... <TAB>768``. Each
    following line holds a label followed by one tab-separated value per
    vector component.

    NOTE(review): 1 is added to every component before it is written. This
    is kept unchanged so that new files stay comparable with files written
    earlier; confirm the offset is intended.

    NOTE(review): the header always names 769 value columns, whatever the
    length of the vectors. The vectors are presumably 768-dimensional, which
    would leave one header column without data; confirm.

    Args:
        filename: path of the file to (over)write, encoded as UTF-8.
        avgs: dict mapping each label to an iterable of numbers.
    """
    header = "label" + "".join("\t" + str(i) for i in range(769)) + "\n"
    # The encoding is passed by keyword so the call is valid for the builtin
    # open as well as for codecs.open; the with block closes the file even
    # if a write fails.
    with open(filename, "w", encoding="utf-8") as fh:
        fh.write(header)
        for label, vector in avgs.items():
            fh.write(label)
            fh.write("".join("\t" + str(value + 1) for value in vector))
            fh.write("\n")


In [9]:
# Mount Google Drive in the Colab runtime at /content/drive.
# NOTE(review): the cells below read and write paths under "G:\My Drive",
# not under this mount point; presumably they were run on a local machine.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Ora gli facciamo computare gli embeddings, in modo che sputi fuori una lista LABEL SENTENCE EMBEDDINGS, es abbaiare_1, frase, emb:

In [9]:
# Load the filtered corpus, count the instances of every (verb, pattern) pair
# and collect the sentences of the patterns that pass the frequency filter.
# The path is a raw string: its backslashes are path separators, and "\M" /
# "\T" are invalid escape sequences in a normal string literal.
data = ReturnDatabase(r"G:\My Drive\TESI codici\TPAS corpus.json") # change here
patterns = count(data)
Ls = getsentences(data, patterns)

In [None]:
# Split the sentence list into chunks that are processed one at a time; the
# names appear to be the initial letters of the verbs in each chunk.
# "# done" marks the chunks that have already been processed.
#
# NOTE(review): a slice excludes its stop index, so consecutive chunks leave
# a one-element gap: Ls[:4070] ends at index 4069 and Ls[4071:7288] starts
# at 4071, so Ls[4070] is in neither. The same happens at every boundary
# below. Confirm whether those sentences should be included, and in which
# chunk.
# NOTE(review): S runs to the end of Ls, and T, U, V and Z are full copies
# of Ls; presumably placeholders until their boundaries are filled in.
AB = Ls[:4070]       # done
AF = Ls[4071:7288]   # done
AN = Ls[7289:13142]  
B = Ls[13143:15275] # done
CA = Ls[15276:20069] # done
CO = Ls[20070:25159] 
D = Ls[25160:31353] 
E = Ls[31354:34033] # done
F = Ls[34034:36770] # done
G = Ls[36771:38131] # done
I = Ls[38132:45111]
L = Ls[45112:46207] # done
M = Ls[46208:48972] # done
N = Ls[48973:49659] # done
O = Ls[49660:51542] # done
P = Ls[51543:63858]
R = Ls[63859:86199]
S = Ls[86200:]
T = Ls[:]
U = Ls[:]
V = Ls[:]
Z = Ls[:]
# Process one chunk: find the verb token in each sentence, embed it, average
# the embeddings per label and write both output files.
# The paths are raw strings: their backslashes are path separators, and
# "\M" / "\T" / "\V" are invalid escape sequences in a normal string literal.
Ms = gettokens_SIMPLEMMA(N)      # change here
Ns = getvectors(Ms)
avgs = getaverage(Ns)
writefile(r"G:\My Drive\TESI codici\Vectors_N.csv", avgs) # change here
getmetadata(r"G:\My Drive\TESI codici\Metadata_N.csv", Ns) # change here

In [49]:
# Recompute the averages from the current Ns and write the two output files.
# The paths are raw strings: their backslashes are path separators, not
# escape sequences.
avgs = getaverage(Ns)
writefile(r"G:\My Drive\TESI codici\Vectors_G.csv", avgs) # change here
getmetadata(r"G:\My Drive\TESI codici\Metadata_G.csv", Ns) # change here

In [44]:
# Write the current avgs and Ns to the two output files.
# The paths are raw strings: their backslashes are path separators, not
# escape sequences.
writefile(r"G:\My Drive\TESI codici\Vectors_AF.csv", avgs) # change here
getmetadata(r"G:\My Drive\TESI codici\Metadata_AF.csv", Ns) # change here