In [593]:
import pandas as pd
import csv
from nltk.wsd import lesk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.metrics import jaccard_distance
from scipy.stats import pearsonr
from nltk.corpus import wordnet as wn
from nltk import pos_tag

In [594]:
trial_path = 'trial/STS.input.txt'
trial_gs_path = 'trial/STS.gs.txt'
trial_df = pd.read_csv(trial_path, sep='\t', lineterminator='\n', names=['sentence0','sentence1'], header=None, quoting=csv.QUOTE_NONE)
trial_gs = pd.read_csv(trial_gs_path, sep='\t', lineterminator='\n', names=['labels'], header=None, quoting=csv.QUOTE_NONE)
print(trial_df.shape, trial_gs.shape)

(6, 2) (6, 1)


In [595]:
def morphy_tag(nltk_tag):

    if nltk_tag.startswith('J'):
        return wn.ADJ
    elif nltk_tag.startswith('V'):
        return wn.VERB
    elif nltk_tag.startswith('N'):
        return wn.NOUN
    elif nltk_tag.startswith('R'):
        return wn.ADV
    else:
        return None

def apply_lesk_to_text(text):
    tokenized_text = word_tokenize(text)
    list_tags = pos_tag(tokenized_text)
    tags = {w:morphy_tag(tag) for w,tag in list_tags}
    lesk_text = [lesk(tokenized_text, word,tags[word]) if tags[word] else word for word in tokenized_text]
    lesk_text_str = []
    for word in lesk_text:
        try: 
            w =  word.name().split('.')[0]
        except: 
            w = word
        lesk_text_str.append(w)
    
    return lesk_text_str

def preprocessing(data):
    # To see the effect of the desambiguation we do a preprocess only with it. Generaly we will use a more complete 
    # preprocessing function. 
    data = data.fillna('')
    for column in data.columns:
        # words to lower
        data[column] = data[column].str.lower()
        # desambiguate 
        data[column] = data[column].apply(apply_lesk_to_text)
    return data

In [596]:
trial_df = preprocessing(trial_df)
trial_df


Unnamed: 0,sentence0,sentence1
id1,"[the, bird, be, bathe, in, the, sinkhole, .]","[shuttlecock, be, wash, itself, in, the, body_..."
id2,"[in, may, 2010, ,, the, troop, undertake, to, ...","[the, us, None, None, kabul, on, may, 7th, las..."
id3,"[whoremaster, suppose, he, embody, view, a, wi...","[``, he, embody, not, a, defendant, anymore, ...."
id4,"[they, fly, out, of, the, None, in, group, .]","[they, fly, into, the, None, together, .]"
id5,"[the, woman, be, play, the, violin, .]","[the, young, lady, love, heed, to, the, guitar..."
id6,"[toilet, plump, knight, back, ride, at, dawn, ...","[sunrise, at, dawn, be, a, None, view, to, tak..."


In [599]:
def lexical_simmilarity(df):
    guess = pd.DataFrame()
    for i in df.index:
        guess.loc[i,'labels'] = 1 - jaccard_distance(set(df.loc[i,'sentence0']), set(df.loc[i,'sentence1']))
    return guess

guess_lex = lexical_simmilarity(trial_df)
guess_lex.head()

Unnamed: 0,labels
id1,0.666667
id2,0.647059
id3,0.428571
id4,0.545455
id5,0.833333


In [600]:
print(pearsonr(trial_gs['labels'], guess_lex['labels'])[0])

0.5323734100639426


We obtain a value lower than expected. 
On session 2, we compared the sentences practically as given and we obtained a correlation of 0.51. 
On session 3, we performed a lemmatization of the sentences and obtained a correlation of 0.57. 
On this session we performed a desambiguation and we have obtained a coefficient of 0.53. This value is a little better than the one on session 2 but worse than the one on the lemmatization. 

As commented on previous sessions, 

> This time we haven't processed the gold standar values, so we are comparing a distance array with a simmilarity one, so we obtain a negative correlation. 

> This value is a little bigger than 0.5, this means that there is little correlation between the two arrays, so probably the Jaccard distance isn't the best way to measure the semantic similarity between this sentences. 

> These results are due to the definition of Jaccard distance. This definition is fully based on set theory and does not take into account the semantic relationship between words (like synonymity).


This facts explain why the current results are better than the ones on session 2; The desambiguation values returned by lesk should be related to the sense of the words, this implies than sometimes this value will be the same on the two sentences, making greater the simmilarity between them. 

On the other hand we had better results on session 3, this is because:
1. The Jaccard distance deffinition; With Jaccard, have more simmilarity means have more common words between the two sentences, this implies than Jaccard doesn't treats two morphological variation of a word as the same word. If we use a lemmatization of the word we avoid this problem, but no with the desambiguation. That's the first reason why the results on session 3 where better. 
2. Use "corpus" too little to desambiguate; The lesk function compares the corpus given with the deffinition of the word. If the corpus is too little is probably that it won't find the correct sense of the word. One example of this is that it changes "Birdie" to "shuttlecock" instead of returning "bird". 


