# Es 1a - Conceptual Similarity

In questo esercizio andiamo ad esplorare varie tecniche per calcolare la similarità semantica tra 
due parole. Per farlo utilizzeremo il dataset WordSim353.csv, che contiene una serie di coppie di
 parole e il loro punteggio di similarità. Il punteggio è un valore reale compreso tra 0 e 10.

Le misure di similarità che utilizzeremo sono:

- Wu and Palmer
- Shortest Path
- Leakcock & Chodorow

### Imports

In [3]:
from nltk.corpus import stopwords
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
import random
import numpy as np
from iteration_utilities import deepflatten
from scipy import stats

### Data preparation

In [4]:
def extract_triple(path):
    '''
    Read a file and return the first 2 word in each row, as a tuple.
    '''
    tuple_list = []
    with open (path, 'r') as f:
        for row in f:
            # Remove \n
            row = row.strip()
            # Organize in a triple the VALUES
            tuple_list.append(tuple(row.split(",")[:3]))
            
        # Remove first value of the tuple
        tuple_list.pop(0)
    f.close()
    return tuple_list

In [5]:
VALUES = extract_triple('../data/WordSim353.csv')

## Conceptual similarity methods

### WordNet Methods

Nota: Normalizziamo tutti i valori nell'intervallo [0;10] per facilitare il confronto
con il file di partenza

In [6]:
def max_similarity(syns1, syns2, sim_fuct):
    '''
    The method comprare all the synsets of the 2 term and return the
    synsets with the highest similarity with the relative score.
    
    Args:
        syns1: list of synsets of the first term
        syns2: list of synsets of the second term
    Returns:
        the higher value for similarity and the relative synsets
    '''
    sim_max = ("", "", -100)
    
    for syn1 in syns1:
        for syn2 in syns2:
            sim = sim_fuct(syn1, syn2)
            if sim >= sim_max[2]:
                sim_max = (syn1, syn2, sim)      
                        
    return sim_max

def sim_res_list(sim_funct):
    res = []
    VALUES = extract_triple('../data/WordSim353.csv')
    
    for val in VALUES:
        syns1 = get_synsets(val[0])
        syns2 = get_synsets(val[1])
    
        if(syns1 is not None and syns2 is not None):
            sim = max_similarity(syns1, syns2, sim_funct)
            res.append(sim[2])
        else:
            res.append(0)
            
    return res

def print_result(sim_funct):
    for val in VALUES:
        syns1 = get_synsets(val[0])
        syns2 = get_synsets(val[1])
    
        if(syns1 is not None and syns2 is not None):
            sim = max_similarity(syns1, syns2, sim_funct)
            s = sim[2]
            so = val[2]
            diff = abs(round((s - float(so)), 2))
            
            print(f'{sim_funct.__name__} = {s}  \tgiven: {so} \tDiffernce --> {diff}')

def get_synsets(term):
    '''
    Retrurn the synsets of a term.
    '''
    if(len(wn.synsets(term)) > 0):
        return wn.synsets(term)
    return None
    
def lch(syn1, syn2): #? WordNet function
    '''
    Return the lowest common hypernyms of two synsets.
    '''
    if(syn1 is None or syn2 is None):
        return None
    return syn1.lowest_common_hypernyms(syn2)

def lowest_common_subsumer(synset1, synset2): #? My function that simulate the WordNet's function
    '''
    Args:
        synset1: first synset to take LCS from
        synset2: second synset to take LCS from
    Returns:
        the first common LCS
    '''
    if synset2 == synset1:
        return synset2

    commonsArr = []
    for hyper1 in synset1.hypernym_paths():
        for hyper2 in synset2.hypernym_paths():
            zipped = list(zip(hyper1, hyper2))  # merges 2 list in one list of tuples
            common = None
            for i in range(len(zipped)):
                if(zipped[i][0] != zipped[i][1]):
                    break
                common = (zipped[i][0], i)

            if common is not None and common not in commonsArr:
                commonsArr.append(common)
    
    if len(commonsArr) <= 0:
        return None

    commonsArr.sort(key=lambda x: x[1], reverse=True)
    return commonsArr[0][0]

def wu_pal_sim(syn1, syn2): #? WordNet function
    '''
    Return the Wu-Palmer similarity of two synsets.
    '''
    if(syn1 is None or syn2 is None):
        return 0
    return round(syn1.wup_similarity(syn2) * 10, 2)

def my_wu_pal_sim(syn1, syn2): #? My function that simulate the WordNet's function
    '''
    Implementation of Wu and Palm similarity metrics
    '''
    lcs = (lowest_common_subsumer(syn1, syn2))
    if lcs is None:
        return 0

    depth_lcs = depth_path(lcs, lcs)
    depth_s1 = depth_path(syn1, lcs)
    depth_s2 = depth_path(syn2, lcs)

    result = (2 * depth_lcs) / (depth_s1 + depth_s2)
    return round(result * 10, 2)

def depth_path(synset, lcs):
    """It measures the distance (depth) between the given Synset and the WordNet's root.
    Args:
        synset: synset to reach from the root
        lcs: Lowest Common Subsumer - the first common sense or most specific ancestor node
    Returns:
        the minimum path which contains LCS
    """
    paths = synset.hypernym_paths()
    paths = list(filter(lambda x: lcs in x, paths))  # all path containing LCS
    return min(len(path) for path in paths)

## Wu and Palmer

### Print the result

In [7]:
print_result(my_wu_pal_sim)

my_wu_pal_sim = 9.23  	given: 6.77 	Differnce --> 2.46
my_wu_pal_sim = 9.66  	given: 7.35 	Differnce --> 2.31
my_wu_pal_sim = 10.0  	given: 10.00 	Differnce --> 0.0
my_wu_pal_sim = 8.75  	given: 7.46 	Differnce --> 1.29
my_wu_pal_sim = 8.24  	given: 7.62 	Differnce --> 0.62
my_wu_pal_sim = 6.32  	given: 7.58 	Differnce --> 1.26
my_wu_pal_sim = 7.27  	given: 5.77 	Differnce --> 1.5
my_wu_pal_sim = 7.37  	given: 6.31 	Differnce --> 1.06
my_wu_pal_sim = 1.67  	given: 7.50 	Differnce --> 5.83
my_wu_pal_sim = 9.09  	given: 6.77 	Differnce --> 2.32
my_wu_pal_sim = 8.24  	given: 7.42 	Differnce --> 0.82
my_wu_pal_sim = 1.82  	given: 6.85 	Differnce --> 5.03
my_wu_pal_sim = 7.5  	given: 6.19 	Differnce --> 1.31
my_wu_pal_sim = 8.24  	given: 5.92 	Differnce --> 2.32
my_wu_pal_sim = 8.24  	given: 7.00 	Differnce --> 1.24
my_wu_pal_sim = 6.67  	given: 6.62 	Differnce --> 0.05
my_wu_pal_sim = 5.33  	given: 6.81 	Differnce --> 1.48
my_wu_pal_sim = 1.33  	given: 4.62 	Differnce --> 3.29
my_wu_pal_si

### Save result in a list

In [8]:
wu_result = sim_res_list(wu_pal_sim)

### Print more result

In [9]:
for val in VALUES:
    syns1 = get_synsets(val[0])
    syns2 = get_synsets(val[1])
    
    #? Print the VALUES whose synsets I could not find or that returned similarity of 0
    if(syns1 is not None and syns2 is not None):
        sim = max_similarity(syns1, syns2, my_wu_pal_sim)
        # wn_sim = wu_pal_sim(syns1, syns2)
        print(f'Similarity between {sim[0]} and {sim[1]}:')
        print(f'my_WUP = {sim[2]}, the given similarity is: {val[2]}')
        
        # print(f'WN original WUP Similarity between {wn_sim[0]} and {wn_sim[1]}: {wn_sim[2]}\n')
                
        if(sim[2] == 0):
            print("SIMILARiTY = 0. Real value by WUP of ", sim[0], ";", sim[1], ":", wu_pal_sim(sim[0], sim[1]))
    else:
        print("-- Empty Synsets:", val[0], val[1])

Similarity between Synset('sexual_love.n.02') and Synset('sexual_activity.n.01'):
my_WUP = 9.23, the given similarity is: 6.77
Similarity between Synset('tiger.n.02') and Synset('big_cat.n.01'):
my_WUP = 9.66, the given similarity is: 7.35
Similarity between Synset('tiger.n.02') and Synset('tiger.n.02'):
my_WUP = 10.0, the given similarity is: 10.00
Similarity between Synset('book.n.11') and Synset('newspaper.n.03'):
my_WUP = 8.75, the given similarity is: 7.46
Similarity between Synset('computer.n.01') and Synset('keyboard.n.01'):
my_WUP = 8.24, the given similarity is: 7.62
Similarity between Synset('computer.n.01') and Synset('internet.n.01'):
my_WUP = 6.32, the given similarity is: 7.58
Similarity between Synset('airplane.n.01') and Synset('car.n.02'):
my_WUP = 7.27, the given similarity is: 5.77
Similarity between Synset('train.n.01') and Synset('car.n.02'):
my_WUP = 7.37, the given similarity is: 6.31
Similarity between Synset('telephone.n.02') and Synset('communication.n.02'):
m

## Shortest Path

Define the maximum distance in WordNet tree

In [10]:
def max_path():
    """
    Returns:
        The max depth of WordNet tree (20)
    """
    max_path = 0
    for synset in wn.all_synsets():
        if synset.max_depth() > max_path:
            max_path = synset.max_depth()
    return max_path

def max_path_2(): #19
    return max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())

# max_depth = max_path() #! take too much time to compute
max_depth = 20

Define the distence method between two words in WordNet, for doing that we define a
function that evaluate the lowest_common_subsumer, but without using the built-in function
*lowest_common_hypernyms* of wordnet.

We also use *hypenym_paths* function that return all the tree paths between the root and
the given synset.

In [11]:
def syn_distance(synset1, synset2):
    '''
    Args:
        synset1: first synset to calculate distance
        synset2: second synset to calculate
    Returns:
        distance between the two synset
    '''
    lcs = lowest_common_subsumer(synset1, synset2)
    if lcs is None:
        return None

    hypernym1 = synset1.hypernym_paths()
    hypernym2 = synset2.hypernym_paths()

    # paths from LCS to root
    hypernym_lcs = lcs.hypernym_paths()

    # create a set of unique items flattening the nested list
    set_lcs = set(deepflatten(hypernym_lcs))

    # remove root
    set_lcs.remove(lcs)

    # path from synset to LCS
    hypernym1 = list(map(lambda x: [y for y in x if y not in set_lcs], hypernym1))
    hypernym2 = list(map(lambda x: [y for y in x if y not in set_lcs], hypernym2))

    # path containing LCS
    hypernym1 = list(filter(lambda x: lcs in x, hypernym1))
    hypernym2 = list(filter(lambda x: lcs in x, hypernym2))

    return min(list(map(lambda x: len(x), hypernym1))) + min(list(map(lambda x: len(x), hypernym2))) - 2

def sim_path(syn1, syn2):
    '''
    Args:
        synset1: first synset
        synset2: second synset
    Returns:
        Shortest Path Similarity between the two synset 
        with normalization from [0, 2 * max_depth] to [0,10]:
        new_val = (val - lower_bound)/(upper_bound - lower_bound)
        = (x - 0)/(2 * max_depth - 0) = x / 40
        --> Divide by 40 (max_depth * 2) wich is the max possible value
    '''
    len = syn_distance(syn1, syn2)
    if(len is None):
        return 0
    return round(((2 * max_depth - len) / 40)* 10, 2)
            

### Print result

In [12]:
print_result(sim_path)

sim_path = 9.75  	given: 6.77 	Differnce --> 2.98
sim_path = 9.75  	given: 7.35 	Differnce --> 2.4
sim_path = 10.0  	given: 10.00 	Differnce --> 0.0
sim_path = 9.5  	given: 7.46 	Differnce --> 2.04
sim_path = 9.25  	given: 7.62 	Differnce --> 1.63
sim_path = 8.25  	given: 7.58 	Differnce --> 0.67
sim_path = 8.5  	given: 5.77 	Differnce --> 2.73
sim_path = 8.75  	given: 6.31 	Differnce --> 2.44
sim_path = 7.5  	given: 7.50 	Differnce --> 0.0
sim_path = 9.5  	given: 6.77 	Differnce --> 2.73
sim_path = 9.25  	given: 7.42 	Differnce --> 1.83
sim_path = 7.75  	given: 6.85 	Differnce --> 0.9
sim_path = 9.5  	given: 6.19 	Differnce --> 3.31
sim_path = 9.25  	given: 5.92 	Differnce --> 3.33
sim_path = 9.5  	given: 7.00 	Differnce --> 2.5
sim_path = 8.5  	given: 6.62 	Differnce --> 1.88
sim_path = 8.25  	given: 6.81 	Differnce --> 1.44
sim_path = 6.75  	given: 4.62 	Differnce --> 2.13
sim_path = 6.75  	given: 5.81 	Differnce --> 0.94
sim_path = 8.75  	given: 7.08 	Differnce --> 1.67
sim_path = 

### Save result in a list

In [13]:
sp_result = sim_res_list(sim_path)

## Leacock & Chodorow

In [14]:
def lec(syn1, syn2):
    '''
    Args:
        synset1: first synset
        synset2: second synset
    Returns:
        Leakcock & Chodorow Similarity between the two synset 
        To avoid log(0) we add 1 to numerator and denominator
    '''
    len = syn_distance(syn1, syn2)
    if len is None:
        return 0
    
    # return round(np.log(len + 1 / 2 * max_depth + 1), 2)
    return round(np.log((2 * max_depth + 1) / (len  + 1)), 2)

### Print result

In [15]:
print_result(lec)

lec = 3.02  	given: 6.77 	Differnce --> 3.75
lec = 3.02  	given: 7.35 	Differnce --> 4.33
lec = 3.71  	given: 10.00 	Differnce --> 6.29
lec = 2.61  	given: 7.46 	Differnce --> 4.85
lec = 2.33  	given: 7.62 	Differnce --> 5.29
lec = 1.63  	given: 7.58 	Differnce --> 5.95
lec = 1.77  	given: 5.77 	Differnce --> 4.0
lec = 1.92  	given: 6.31 	Differnce --> 4.39
lec = 1.32  	given: 7.50 	Differnce --> 6.18
lec = 2.61  	given: 6.77 	Differnce --> 4.16
lec = 2.33  	given: 7.42 	Differnce --> 5.09
lec = 1.41  	given: 6.85 	Differnce --> 5.44
lec = 2.61  	given: 6.19 	Differnce --> 3.58
lec = 2.33  	given: 5.92 	Differnce --> 3.59
lec = 2.61  	given: 7.00 	Differnce --> 4.39
lec = 1.77  	given: 6.62 	Differnce --> 4.85
lec = 1.63  	given: 6.81 	Differnce --> 5.18
lec = 1.07  	given: 4.62 	Differnce --> 3.55
lec = 1.07  	given: 5.81 	Differnce --> 4.74
lec = 1.92  	given: 7.08 	Differnce --> 5.16
lec = 1.77  	given: 8.08 	Differnce --> 6.31
lec = 1.77  	given: 1.62 	Differnce --> 0.15
lec = 1.92

### Save result in a list

In [16]:
lec_result = sim_res_list(lec)

## Indici di Correlazione

This module contains the correlation indices implementations. Alternatively one could you use scypy implementation

In [17]:
def pearson_index(x, y):
    '''
    Implementation of the Pearson index.
    Args:
         x: golden value
         y: similarity list
    Returns:
        Pearson correlation index = [Covariance / (Standard deviation of x * Standard deviation of y)]
    '''
    mu_x = np.mean(x)
    mu_y = np.mean(y)
    
    # and the standard deviation of both
    std_dev_x = np.std(x)
    std_dev_y = np.std(y)

    # subtract mean from each element of x
    modified__x = [elem - mu_x for elem in x] 
    modified__y = [elem - mu_y for elem in y]
    
    num = np.mean(np.multiply(modified__x, modified__y)) # Covariance
    numeric = std_dev_x * std_dev_y

    return num / numeric

def pearson_index_scipy(x, y):
    '''
    Implementation of the Pearson index using scipy
    Args:
         x: golden value
         y: similarity list
    Returns:
        Pearson correlation index = [Covariance / (Standard deviation of x * Standard deviation of y)]
    '''
    return stats.pearsonr(x, y)[0]


def pearson_index_np(x, y):
    '''
    Implementation of the Pearson index using numpy
    Args:
         x: golden value
         y: similarity list
    Returns:
        Pearson correlation index = [Covariance / (Standard deviation of x * Standard deviation of y)]
    '''
    mu_x = np.mean(x)
    mu_y = np.mean(y)
    
    # and the standard deviation of both
    std_dev_x = np.std(x)
    std_dev_y = np.std(y)
    
    numeric = std_dev_x * std_dev_y

    num = np.cov(x, y)[0][1] # Covariance    

    return num / numeric

def spearman_index_scipy(x, y):
    '''
    Implementation of the Spearman index using scipy
    '''
    return stats.spearmanr(x, y)[0]
    

def spearman_index(x, y):
    '''
    Implementation of the Spearman index.
    Args:
        x: golden value
        y: similarity list
    Returns:
         Spearman correlation index
    '''
    rank__x = define_rank(x)
    rank__y = define_rank(y)

    return pearson_index(rank__x, rank__y)


def define_rank(vector):
    '''
    Args:
        vector: numeric vector
    Returns:
        ranks list, sorted as the input order
    '''
    x_couple = [(vector[i], i) for i in range(len(vector))]
    x_couple_sorted = sorted(x_couple, key=lambda x: x[0])

    return [y for (x, y) in x_couple_sorted]

## Print corelation index result - scipy

In [21]:
g_VALUES = [item[2] for item in VALUES] # golden value, the value we want to compare with the similarity list
golden = []
for el in g_VALUES:
    golden.append(float(el))
    
# List of similarity functions to be used in the experiment
sim_algo_list = [my_wu_pal_sim, sim_path, lec]

algo_result = []
i = 0

for algo in sim_algo_list:
    algo_result.append(sim_res_list(algo))

for sim_algo in algo_result:
    print("Pearson index for ", sim_algo_list[i].__name__, "is: ", pearson_index_scipy(golden, sim_algo))
    print("Spearman index for ", sim_algo_list[i].__name__, "is: ",  spearman_index_scipy(golden, sim_algo), "\n")
    i += 1



Pearson index for  my_wu_pal_sim is:  0.28723889591339413
Spearman index for  my_wu_pal_sim is:  0.3362876896519317 

Pearson index for  sim_path is:  0.1641193890897277
Spearman index for  sim_path is:  0.2883010732370922 

Pearson index for  lec is:  0.3127965610951786
Spearman index for  lec is:  0.2883010732370922 



### Print corelation index results - Mio

In [19]:
g_VALUES = [item[2] for item in VALUES] # golden value, the value we want to compare with the similarity list
golden = []
for el in g_VALUES:
    golden.append(float(el))
    
# List of similarity functions to be used in the experiment
sim_algo_list = [my_wu_pal_sim, sim_path, lec]

algo_result = []
i = 0

for algo in sim_algo_list:
    algo_result.append(sim_res_list(algo))

for sim_algo in algo_result:
    print("Pearson index for ", sim_algo_list[i].__name__, "is: ", pearson_index(golden, sim_algo))
    print("Spearman index for ", sim_algo_list[i].__name__, "is: ",  spearman_index(golden, sim_algo), "\n")
    i += 1



Pearson index for  my_wu_pal_sim is:  0.2872388959133944
Spearman index for  my_wu_pal_sim is:  0.0734025871137553 

Pearson index for  sim_path is:  0.16411938908972756
Spearman index for  sim_path is:  0.007227015194437291 

Pearson index for  lec is:  0.31279656109517906
Spearman index for  lec is:  0.007227015194437291 

