- https://www.nltk.org/howto/wordnet.html WordNet examples

In [216]:
from nltk.corpus import wordnet as wn
import numpy as np

### Utilities

In [217]:
def get_synsets(word):
    """Return the list of WordNet synsets that NLTK reports for ``word``."""
    synsets = wn.synsets(word)
    return synsets

def get_all_hypernyms(synset):
    """Collect the hypernyms of ``synset`` while climbing towards the root.

    At every level all direct hypernyms are recorded, but only the first one
    is followed upwards, so this is a single-chain walk rather than a full
    traversal of the hypernym graph.
    TODO (original author's note): extend the walk to follow more than just
    the first hypernym at each level.

    Args:
        synset: object exposing ``hypernyms()`` that returns a list of
            objects with the same method (e.g. an NLTK ``Synset``).

    Returns:
        list: hypernyms in the order they were encountered, nearest first.
        The starting synset itself is not included.
    """
    ret_list = []
    # Nodes already climbed through; stops the walk if a hypernym link
    # leads back to a node that was already visited (cycle guard).
    visited = {synset}
    hypernyms = synset.hypernyms()
    while hypernyms and hypernyms[0] not in visited:
        ret_list.extend(hypernyms)
        visited.add(hypernyms[0])
        hypernyms = hypernyms[0].hypernyms()
    return ret_list

def lowest_common_subsumer(syn1, syn2):
    """Return the first lowest common hypernym of two synsets, or ``None``.

    Delegates to ``syn1.lowest_common_hypernyms(syn2)`` and evaluates it only
    once per call.

    Args:
        syn1: object exposing ``lowest_common_hypernyms(other)``.
        syn2: the other synset.

    Returns:
        The first element of the returned list, or ``None`` when the list is
        empty.
    """
    common = syn1.lowest_common_hypernyms(syn2)
    return common[0] if common else None

def depth(syn):
    """Return ``syn.min_depth()``, or 0 when ``syn`` is falsy (e.g. ``None``)."""
    if not syn:
        return 0
    return syn.min_depth()

def lowest_common_subsumer_2(syn1, syn2):
    """Approximate lowest common subsumer based on ``get_all_hypernyms``.

    Climbs the hypernym hierarchy of both synsets; ``get_all_hypernyms`` only
    follows the first hypernym at each level (a deliberate shortcut by the
    original author to keep the computation cheap), so the result is an
    approximation of the true lowest common subsumer.

    Each synset is treated as a subsumer of itself, so identical synsets and
    ancestor/descendant pairs resolve to the expected synset instead of one
    of its hypernyms.

    Args:
        syn1: first synset (hashable, exposes ``hypernyms()``).
        syn2: second synset.

    Returns:
        The first candidate on ``syn1``'s chain (nearest first) that also
        appears on ``syn2``'s chain, or ``None`` if the chains do not meet.
    """
    candidates = [syn1] + get_all_hypernyms(syn1)
    # Set membership instead of scanning the second list for every candidate.
    syn2_chain = set([syn2] + get_all_hypernyms(syn2))

    for candidate in candidates:
        if candidate in syn2_chain:
            return candidate
    return None

def max_path():
    """Return the largest ``max_depth()`` over every synset in WordNet.

    Original author's note: this always returns 19; to speed up execution the
    value is stored in a constant instead of being recomputed.

    Returns:
        int: the maximum depth found, or 0 if there are no synsets.
    """
    # max_depth() is evaluated exactly once per synset.
    return max((synset.max_depth() for synset in wn.all_synsets()), default=0)

def length(syn1, syn2):
    """Return the shortest path distance between two synsets, or 0.

    Original author's note: WordNet has no paths between nouns and verbs, so
    verbs should probably be filtered out.

    Args:
        syn1: object exposing ``shortest_path_distance(other)``.
        syn2: the other synset.

    Returns:
        The value of ``syn1.shortest_path_distance(syn2)`` when it is truthy,
        otherwise 0. A falsy distance (``None`` or 0) is therefore always
        reported as 0.
    """
    # Query the distance once; the lookup is not free.
    distance = syn1.shortest_path_distance(syn2)
    return distance if distance else 0

In [218]:
# Precomputed value of max_path(), hard-coded so the full WordNet scan is not
# repeated on every run.
MAX_PATH = 19

### Wu & Palmer

In [219]:
def wu_palmer(syn1, syn2):
    """Wu & Palmer similarity: 2 * depth(LCS) / (depth(syn1) + depth(syn2)).

    The subsumer comes from ``lowest_common_subsumer_2``. When the summed
    depth is zero, 0.001 is used as the denominator to avoid dividing by zero.
    """
    subsumer = lowest_common_subsumer_2(syn1, syn2)
    total_depth = depth(syn1) + depth(syn2)
    denominator = 0.001 if total_depth == 0 else total_depth
    return 2 * depth(subsumer) / denominator

### Shortest Path

In [220]:
def shortest_path(syn1, syn2):
    """Shortest-path similarity: ``2 * MAX_PATH - length(syn1, syn2)``.

    Args:
        syn1: first synset.
        syn2: second synset.

    Returns:
        ``2 * MAX_PATH - length`` when a path of non-zero length exists,
        ``2 * MAX_PATH`` (the maximum) when both arguments are the same
        synset, and 0 when no path is found.
    """
    distance = length(syn1, syn2)
    if distance:
        return 2 * MAX_PATH - distance
    # length() reports 0 both when there is no path and when the distance is
    # really 0; only the latter (identical synsets) deserves the top score.
    return 2 * MAX_PATH if syn1 == syn2 else 0

### Leacock & Chodorow

In [221]:
def leakcock_chodorow(syn1, syn2):
    """Leacock & Chodorow similarity: ``-log(length / (2 * MAX_PATH))``.

    Args:
        syn1: first synset.
        syn2: second synset.

    Returns:
        The similarity value, or -1000 (a sentinel for "very low similarity")
        when ``length`` returns 0.
    """
    distance = length(syn1, syn2)
    if not distance:
        # NOTE(review): length() is also 0 for identical synsets, which end
        # up with the sentinel too; confirm whether they should score highest.
        return -1000  # -1000 marks a very low similarity value
    # Parenthesised on purpose: the denominator is the whole 2 * MAX_PATH,
    # `distance / 2 * MAX_PATH` would multiply by MAX_PATH instead.
    return -np.log(distance / (2 * MAX_PATH))

### Execution

Read the word pairs from WordSim353.csv, skipping the header row.

In [222]:
# Rows of WordSim353.csv, each split on commas into a list of string fields.
dataset = []

# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
with open(r'C:\Users\andre\Desktop\Università\Magistrale\TLN\TLN-LAB\nlp-UniTO-2021-22\Radicioni\data\WordSim353.csv', 'r') as f:
    # Drop the first line (header row).
    word_sim = f.readlines()[1:]
    for line in word_sim:
        # Skip blank lines: they would produce a one-field row and break
        # code that indexes the second column.
        if not line.strip():
            continue
        # Fields are kept exactly as read, so the last field still carries
        # the line terminator.
        dataset.append(line.split(','))

Get the WordNet synsets for both words of each pair in the dataset.

In [223]:
# Synsets of the first and second word of every dataset row, index-aligned
# with `dataset`. Comprehensions avoid binding a loop variable named `tuple`,
# which would shadow the builtin for the rest of the notebook.
syns_1 = [get_synsets(row[0]) for row in dataset]
syns_2 = [get_synsets(row[1]) for row in dataset]

Compute the similarity with the three measures defined above over all combinations of synsets of the two words in each pair of the input file.
For each pair, keep the maximum value of each measure.

In [None]:
# Per word pair: best score of each measure over all synset combinations.
wp = []  # Wu & Palmer
sp = []  # shortest path
lc = []  # Leacock & Chodorow

for senses_1, senses_2 in zip(syns_1, syns_2):
    # Starting values double as the score reported when either word has no
    # synsets (the inner loops then never run).
    max_wu = 0
    max_sp = 0
    max_lc = -1000

    for syn_a in senses_1:
        for syn_b in senses_2:
            # Each measure is evaluated once per synset pair.
            max_wu = max(max_wu, wu_palmer(syn_a, syn_b))
            max_sp = max(max_sp, shortest_path(syn_a, syn_b))
            max_lc = max(max_lc, leakcock_chodorow(syn_a, syn_b))

    wp.append(max_wu)
    sp.append(max_sp)
    lc.append(max_lc)

print(wp)
print(sp)
print(lc)

evaluation methods