### Sense Similarity

In [51]:
from nltk.corpus import wordnet as wn
import numpy as np
synset1 = wn.synsets('cat')[0]
synset2 = wn.synsets('dog')[0]

In [37]:
#calcola la profondità di un synset, prendendo il percorso più lungo tra i suoi iperonimi
def depth(synset):
    hypernym_paths = synset.hypernym_paths()
    max_depth = max(len(path) for path in hypernym_paths)
    return max_depth

#calcola il lowest common hypernym tra due synset e ne calcola la profondità
def lcs_depth(synset1, synset2):
    lcs = synset1.lowest_common_hypernyms(synset2)
    if lcs: return depth(lcs[0])
    else: return 0

### Wu-Palmer

### $sim_{wup}(s_1,s_2)=\frac{2*\text{depth(LCS)}}{\text{depth}(s_1)+\text{depth}(s_2)}$

In [40]:
#implementazione metrica di similarità di Wu-Palmer
def wup_similarity(synset1, synset2):
    depthLCS = lcs_depth(synset1, synset2)
    depth1 = depth(synset1)
    depth2 = depth(synset2)
    similarity = (2 * depthLCS) / (depth1 + depth2)
    return similarity

print(wup_similarity(synset1,synset2))

0.8571428571428571


### Shortest Path

### $sim_{path}(s_1,s_2)=2*\text{depthMax}-\text{len}(s_1,s_2)$

In [41]:
#Max depth=20
max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())

20

In [78]:
#implementazione metrica di similarità di Shortest Path
def path_similarity(synset1, synset2):
    len=synset1.shortest_path_distance(synset2) 
    if len is None: return 0
    return (2*20-len)
print(path_similarity(synset1,synset2))

36


### Leakcook - Chodorow

### $sim_{LC}(s_1,s_2)=-log\frac{\text{len}(s_1,s_2)+1}{2*\text{depthMax}+1}$

In [82]:
#implementazione metrica di similarità di Leacock-Chodorow
def lc_similarity(synset1, synset2):
    len=synset1.shortest_path_distance(synset2) 
    if len is None: return 0
    return -np.log((len+1)/((2*20)+1))

print(lc_similarity(synset1,synset2))

2.1041341542702074


---

In [85]:
#per calcolare la similarità tra due termini,
#prendo la similarità maggiore tra tutte le combinazioni di synset dei due termini
def term_similarity(term1,term2,metric):
    synsets1 = wn.synsets(term1)
    synsets2 = wn.synsets(term2)
    best_similarity=0
    for s1 in synsets1:
        for s2 in synsets2:
            similarity = metric(s1,s2)
            if similarity > best_similarity:
                best_similarity = similarity
    return best_similarity

Wu Palmer: 0.8571428571428571
Shortest Path: 36
Leakcoock Chodorow 2.1041341542702074


In [95]:
import pandas as pd
sim_df = pd.read_csv('../data/WordSim353.csv')

wup_sim = []
path_sim = []
lc_sim = []
for index, row in sim_df.iterrows():
    wup_sim.append(term_similarity(row['term1'], row['term2'],wup_similarity))
    path_sim.append(term_similarity(row['term1'], row['term2'],path_similarity))
    lc_sim.append(term_similarity(row['term1'], row['term2'],lc_similarity))

sim_df['wup_sim'] = wup_sim
sim_df['path_sim'] = path_sim
sim_df['lc_sim'] = lc_sim
sim_df

Unnamed: 0,term1,term2,human_sim,wup_sim,path_sim,lc_sim
0,love,sex,6.77,0.923077,39,3.020425
1,tiger,cat,7.35,0.965517,39,3.020425
2,tiger,tiger,10.00,1.000000,40,3.713572
3,book,paper,7.46,0.875000,38,2.614960
4,computer,keyboard,7.62,0.823529,37,2.327278
...,...,...,...,...,...,...
348,shower,flood,6.03,0.636364,36,2.104134
349,weather,forecast,8.34,0.133333,27,1.074515
350,disaster,area,6.25,0.500000,32,1.516347
351,governor,office,6.34,0.526316,31,1.410987


In [112]:
def pearson_correlation(X, Y):
    mean_X = np.mean(X)
    mean_Y = np.mean(Y)
    cov = np.mean((X - mean_X) * (Y - mean_Y))
    std_X = np.std(X, ddof=1)
    std_Y = np.std(Y, ddof=1)
    pearson_corr = cov / (std_X * std_Y)
    return pearson_corr

def rank_data(X):
    sorted_X = sorted([(value, index) for index, value in enumerate(X)])
    ranks = [0] * len(X)
    for i, (_, index) in enumerate(sorted_X):
        ranks[index] = i + 1
    return ranks

def spearman_correlation(X, Y):
    X_rank = rank_data(X)
    Y_rank = rank_data(Y)
    pearson_corr_rank = pearson_correlation(X_rank, Y_rank)
    return pearson_corr_rank

In [121]:
print("Person correlation")
print("wup_sim: ",pearson_correlation(sim_df['human_sim'], sim_df['wup_sim']))
print("path_sim: ",pearson_correlation(sim_df['human_sim'], sim_df['path_sim']))
print("lc_sim: ",pearson_correlation(sim_df['human_sim'], sim_df['lc_sim']))

print("\nSpearmann correlation")
print("wup_sim: ",spearman_correlation(sim_df['human_sim'], sim_df['wup_sim']))
print("path_sim: ",spearman_correlation(sim_df['human_sim'], sim_df['path_sim']))
print("lc_sim: ",spearman_correlation(sim_df['human_sim'], sim_df['lc_sim']))

Person correlation
wup_sim:  0.2865121430303598
path_sim:  0.16606040518129
lc_sim:  0.31346970268590574

Spearmann correlation
wup_sim:  0.3443780960403464
path_sim:  0.2846033239430566
lc_sim:  0.2846033239430566
