# Es.1 - Conceptual Similarity

In questo esercizio andiamo ad esplorare varie tecniche per calcolare la similarità semantica tra 
due parole. Per farlo utilizzeremo il dataset WordSim353.csv, che contiene una serie di coppie di
 parole e il loro punteggio di similarità. Il punteggio è un valore reale compreso tra 0 e 10.

Le misure di similarità che utilizzeremo sono:

- Wu and Palmer
- Shortest Path
- Leacock & Chodorow

### Imports

In [2]:
from nltk.corpus import stopwords
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
import random
from iteration_utilities import deepflatten

### Data preparation

In [3]:
def extract_triple(path):
    '''
    Read a comma-separated file and return its data rows as tuples.

    The first line of the file is treated as a header and skipped. Each
    remaining non-blank line is split on "," and its first three fields
    are kept (fewer if the row has fewer fields). Fields are returned as
    strings; no numeric conversion is performed.

    Args:
        path: path of the file to read
    Returns:
        a list of tuples, one per data row; an empty list if the file is
        empty or contains only the header
    '''
    tuple_list = []
    # The context manager closes the file; no explicit close() is needed.
    with open(path, 'r') as f:
        # Skip the header line; the default avoids an error on an empty file.
        next(f, None)
        for row in f:
            # Remove the trailing newline and surrounding whitespace
            row = row.strip()
            # Ignore blank lines so they do not produce malformed tuples
            if not row:
                continue
            # Keep at most the first three comma-separated values
            tuple_list.append(tuple(row.split(",")[:3]))
    return tuple_list

In [4]:
# Load the rows of the WordSim353 dataset; each entry holds up to the first
# three comma-separated fields of a row, as strings.
values = extract_triple('../data/WordSim353.csv')

### WordNet Methods

In [26]:
def get_synsets(term):
    '''
    Return the WordNet synsets of a term.

    Args:
        term: the word to look up
    Returns:
        the list returned by wn.synsets(term), or None when that list
        is empty
    '''
    # Query WordNet once and reuse the result instead of looking it up twice.
    synsets = wn.synsets(term)
    return synsets if synsets else None

def wn_max_depth(syn):
    '''
    Return the value of max_depth() for a synset.

    Args:
        syn: a synset, a list whose first element is a synset, or None
    Returns:
        syn.max_depth() (or the max_depth() of the first list element);
        0 when syn is None or an empty list
    '''
    if syn is None:
        return 0
    # Handle the case where the synset is passed wrapped in a list:
    # only the first element is considered.
    if isinstance(syn, list):
        if not syn:
            # An empty list carries no synset; treat it like None.
            return 0
        syn = syn[0]
    return syn.max_depth()
    

def lch(syn1, syn2):  #? WordNet function
    """
    Thin wrapper around the synset method lowest_common_hypernyms.

    Args:
        syn1: first synset (may be None)
        syn2: second synset (may be None)
    Returns:
        syn1.lowest_common_hypernyms(syn2), or None if either argument
        is None
    """
    if syn1 is not None and syn2 is not None:
        return syn1.lowest_common_hypernyms(syn2)
    return None

def lowest_common_subsumer(synset1, synset2):  #? Hand-written counterpart of the WordNet function
    """
    Find the deepest node shared by the hypernym paths of two synsets.

    Every hypernym path of synset1 is compared with every hypernym path of
    synset2; for each pair the longest common prefix is taken and its last
    node is a candidate. The candidate at the greatest prefix index wins;
    on ties the first one encountered is kept.

    Args:
        synset1: first synset to take LCS from
        synset2: second synset to take LCS from
    Returns:
        synset2 if the two synsets are equal, otherwise the selected common
        node, or None when no pair of paths shares a first node
    """
    if synset2 == synset1:
        return synset2

    paths1 = synset1.hypernym_paths()
    paths2 = synset2.hypernym_paths()

    best_node = None
    best_index = -1
    for path1 in paths1:
        for path2 in paths2:
            # Walk both paths in lockstep while the nodes still agree.
            shared_node = None
            shared_index = -1
            for index, (node1, node2) in enumerate(zip(path1, path2)):
                if node1 != node2:
                    break
                shared_node, shared_index = node1, index

            # Strict comparison keeps the earliest candidate on ties.
            if shared_index > best_index:
                best_node, best_index = shared_node, shared_index

    if best_index < 0:
        return None
    return best_node

def wu_pal_sim(syn1, syn2):  #? WordNet function
    """
    Thin wrapper around the synset method wup_similarity.

    Args:
        syn1: first synset (may be None)
        syn2: second synset (may be None)
    Returns:
        syn1.wup_similarity(syn2), or 0 if either argument is None
    """
    if syn1 is not None and syn2 is not None:
        return syn1.wup_similarity(syn2)
    return 0

def my_wu_pal_sim(syn1, syn2):  #? Hand-written counterpart of the WordNet function
    """
    Implementation of the Wu and Palmer similarity metric.

    Computes 2 * depth(LCS) / (depth(syn1) + depth(syn2)), where the LCS
    comes from lowest_common_subsumer and every depth is measured with
    depth_path along paths that contain the LCS. The ratio is multiplied
    by 10.

    Args:
        syn1: first synset
        syn2: second synset
    Returns:
        the scaled similarity, or 0 when the two synsets have no common
        subsumer
    """
    subsumer = lowest_common_subsumer(syn1, syn2)
    if subsumer is None:
        return 0

    numerator = 2 * depth_path(subsumer, subsumer)
    denominator = depth_path(syn1, subsumer) + depth_path(syn2, subsumer)

    return (numerator / denominator) * 10

    
def max_similarity(syns1, syns2):
    """
    Compare every synset of the first term with every synset of the second
    term and keep the pair with the highest my_wu_pal_sim score.

    Because the comparison is ">=", a later pair replaces an earlier one
    with the same score.

    Args:
        syns1: list of synsets of the first term
        syns2: list of synsets of the second term
    Returns:
        a tuple (synset1, synset2, score); ("", "", 0) when no pair is
        evaluated
    """
    best_first, best_second, best_score = "", "", 0

    for candidate1 in syns1:
        for candidate2 in syns2:
            score = my_wu_pal_sim(candidate1, candidate2)
            if score >= best_score:
                best_first, best_second, best_score = candidate1, candidate2, score

    return (best_first, best_second, best_score)

def depth_path(synset, lcs):
    """Return the length of the shortest hypernym path of a synset that
    passes through the given LCS.

    Args:
        synset: synset whose hypernym paths are inspected
        lcs: Lowest Common Subsumer - only paths containing it are considered
    Returns:
        the number of nodes in the shortest hypernym path containing LCS
    Raises:
        ValueError: if no hypernym path of the synset contains LCS
    """
    return min(
        len(path)
        for path in synset.hypernym_paths()
        if lcs in path  # keep only the paths that go through the LCS
    )

### Wu and Palmer

In [27]:
# For every word pair of the dataset, find the best-scoring synset pair and
# report the pairs whose hand-written Wu & Palmer score is 0, together with
# the score returned by the WordNet implementation for comparison.
for val in values:
    syns1 = get_synsets(val[0])
    syns2 = get_synsets(val[1])

    # At least one of the two words has no synsets: nothing to compare.
    if syns1 is None or syns2 is None:
        print("-- Empty Synsets:", val[0], val[1])
        continue

    sim = max_similarity(syns1, syns2)
    if sim[2] == 0:
        print("SIMILARiTY = 0. Real value by WUP of ", sim[0], ";", sim[1], ":", wu_pal_sim(sim[0], sim[1]))

-- Empty Synsets: Maradona football
SIMILARiTY = 0. Real value by WUP of  Synset('investor.n.01') ; Synset('earn.v.02') : 0.18181818181818182


### Shortest Path

Define the maximum distance in WordNet tree

In [None]:
def max_path():
    """
    Scan every synset returned by wn.all_synsets() and return the largest
    max_depth() value found.

    Returns:
        The largest max_depth() among all synsets, or 0 if there are none
        (the original author reports 20 as the result)
    """
    deepest = 0
    for synset in wn.all_synsets():
        # Evaluate max_depth() once per synset; it was previously computed
        # twice (once for the comparison and once for the assignment).
        depth = synset.max_depth()
        if depth > deepest:
            deepest = depth
    return deepest

def max_path_2():  # 19
    """
    Return the length of the longest hypernym path over all synsets
    returned by wn.all_synsets().
    """
    def longest_path_length(ss):
        # Number of nodes in the longest hypernym path of a single synset
        return max(map(len, ss.hypernym_paths()))

    return max(map(longest_path_length, wn.all_synsets()))

# Hard-coded instead of calling max_path(), which walks over every WordNet
# synset and takes too much time to compute.
max_depth = 20

Define the distance method between two words in WordNet. To do that, we use a
function that evaluates the lowest common subsumer without relying on the built-in
*lowest_common_hypernyms* function of WordNet.

We also use the *hypernym_paths* function, which returns all the tree paths between the root and
the given synset.

In [None]:
def distance(synset1, synset2):
    """
    Count the steps separating two synsets when going through their
    lowest common subsumer (LCS).

    Args:
        synset1: first synset to calculate distance
        synset2: second synset to calculate
    Returns:
        the distance between the two synsets, or None when they have no
        common subsumer
    """
    lcs = lowest_common_subsumer(synset1, synset2)
    if lcs is None:
        return None

    # Every node found on the hypernym paths of the LCS, flattened into a
    # set of unique items. The LCS itself is taken out of the set so that
    # it survives the trimming below.
    above_lcs = set(deepflatten(lcs.hypernym_paths()))
    above_lcs.remove(lcs)

    def nodes_down_from_lcs(synset):
        # Drop the nodes above the LCS from every hypernym path, keep only
        # the paths that still contain the LCS, and take the shortest one.
        trimmed_paths = (
            [node for node in path if node not in above_lcs]
            for path in synset.hypernym_paths()
        )
        return min(len(path) for path in trimmed_paths if lcs in path)

    # Each trimmed path counts the LCS as a node, hence the "- 2".
    return nodes_down_from_lcs(synset1) + nodes_down_from_lcs(synset2) - 2
