### Approccio

- Prendo il termine più frequente nelle definizioni, sarà il genus
- Stopwords removing e lemming delle definizioni
- Prelevo tutto il sottoalbero di hyponimi del genus
- Prendo le definizioni (glossa) dei synset di cui ho trovato i hyponimi
- Faccio confronto tra definizioni di wordnet e lista di definizioni
- Restituisco il synset che ha definizioni più simile a quella della lista

In [5]:
from string import punctuation
from nltk import word_tokenize, pos_tag
from gensim.test.utils import simple_preprocess
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from collections import Counter
from nltk.wsd import lesk

## Utilities

In [6]:
def word_sense_disambiguation(list_words, word):
    right_synset = lesk(list_words, word)
    return right_synset

# print(word_sense_disambiguation(['i', 'went', 'to', 'the', 'bank', 'to', 'give', 'money'], 'bank').definition())

def remove_stop_words(row):
    stop_words = set(stopwords.words('english'))
    punctuation = [',', '.', ';', '!', '?', "'", "''", '"', "’", "’’", "`","``"]
    filtered_sentence = [w for w in row if not w.lower() in stop_words and w not in punctuation]
    return filtered_sentence

def calculate_frequency(row):
    freq_dict = {}
    row = remove_stop_words(word_tokenize(row))
    for word in row[1:]:
        if word.lower() not in freq_dict:
            freq_dict[word.lower()] = 1
        else:
            freq_dict[word.lower()] += 1
    return freq_dict

def get_text_from_file(path):
    file = []
    stop_words = set(stopwords.words('english'))
    with open (path, 'r') as f:
        for row in f:
            filtered_s = [w for w in word_tokenize(row) if not w.lower() in stop_words]
            file.append(simple_preprocess(str(filtered_s), deacc=True))
    f.close()
    return file

def get_hypos(word):
    syn = get_synset(word)
    hypo_list = list(set([w for s in syn.closure(lambda s:s.hyponyms()) for w in s.lemma_names()]))
    return hypo_list


def get_synset(word):
    if(len(wn.synsets(word)) > 0):
        return wn.synsets(word)[0]
    return None

# Program to measure the similarity between 
# two sentences using cosine similarity.
def cos_similarity(sen1, sen2):
    # tokenization
    X_list = wn.word_tokenize(sen1) 
    Y_list = wn.word_tokenize(sen2)
    
    # sw contains the list of stopwords
    sw = stopwords.words('english') 
    l1 =[];l2 =[]
    
    # remove stop words from the string
    X_set = {w for w in X_list if not w in sw} 
    Y_set = {w for w in Y_list if not w in sw}
    
    # form a set containing keywords of both strings 
    rvector = X_set.union(Y_set) 
    for w in rvector:
        if w in X_set: l1.append(1) # create a vector
        else: l1.append(0)
        if w in Y_set: l2.append(1)
        else: l2.append(0)
    c = 0
    
    # cosine formula 
    for i in range(len(rvector)):
            c+= l1[i]*l2[i]
    cosine = c / float((sum(l1)*sum(l2))**0.5)
                       
    return cosine

In [7]:
def penn_to_wn(tag):
    """ Convert between a Penn Treebank tag to a simplified Wordnet tag """
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None
 
def tagged_to_synset(word, tag):
    wn_tag = penn_to_wn(tag)
    if wn_tag is None:
        return None
 
    try:
        return wn.synsets(word, wn_tag)[0]
    except:
        return None
 
def sentence_similarity(sentence1, sentence2):
    """ compute the sentence similarity using Wordnet """
    # Tokenize and tag
    sentence1 = pos_tag(word_tokenize(sentence1))
    sentence2 = pos_tag(word_tokenize(sentence2))
 
    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
 
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]
 
    score, count = 0.0, 0
 
    # For each word in the first sentence
    for synset in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        best_score = max([synset.path_similarity(ss) for ss in synsets2])
 
        # Check that the similarity could have been computed
        if best_score is not None:
            score += best_score
            count += 1
 
    # Average the values
    score /= count
    return score

In [8]:
sentences = [
    "Dogs are awesome.",
    "Some gorgeous creatures are felines.",
    "Dolphins are swimming mammals.",
    "Cats are beautiful animals.",
]
 
focus_sentence = "Cats are beautiful animals."
 
for sentence in sentences:
    print "Similarity(\"%s\", \"%s\") = %s" % (focus_sentence, sentence, sentence_similarity(focus_sentence, sentence))
    print "Similarity(\"%s\", \"%s\") = %s" % (sentence, focus_sentence, sentence_similarity(sentence, focus_sentence))
    print 
 
# Similarity("Cats are beautiful animals.", "Dogs are awesome.") = 0.511111111111
# Similarity("Dogs are awesome.", "Cats are beautiful animals.") = 0.666666666667
 
# Similarity("Cats are beautiful animals.", "Some gorgeous creatures are felines.") = 0.833333333333
# Similarity("Some gorgeous creatures are felines.", "Cats are beautiful animals.") = 0.833333333333
 
# Similarity("Cats are beautiful animals.", "Dolphins are swimming mammals.") = 0.483333333333
# Similarity("Dolphins are swimming mammals.", "Cats are beautiful animals.") = 0.4
 
# Similarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1.0
# Similarity("Cats are beautiful animals.", "Cats are beautiful animals.") = 1.0

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(...)? (653542922.py, line 11)

## Execution

In [14]:
with open ('../res/def.csv', 'r') as f:
    for row in f:
        dict = calculate_frequency(row)
        c = Counter(dict)
        
        most_common = c.most_common(5)
        
        # The genus is the most used word in the definitions
        genus = c.most_common(1)
        
        # Compongo lista parole più usate nelle definizioni da confrontare con definizioni wordnet
        my_key_word = []
        for el in most_common:
            my_key_word.append(el[0])
        
        # List of hyponyms of the genus and their wordnet definition
        hypo_list = get_hypos(genus[0][0])
        hypo_def = []
        for hypo in hypo_list:
            hypo_def.append((hypo, get_synset(hypo).definition()))
        
        best_score = (0, "", "") # (score, word, definition)
            
        for wndef in hypo_list:
            for mydef in row[1:]:
                score = sentence_similarity(mydef, wndef[1])
                if score > best_score[0]:
                    best_score = (score, wndef[0], wndef[1])
                    
        print(f'best_score is {best_score} and the real word is {row[0]}')
             
f.close()

  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - '/Users/paolobonicco/nltk_data'
    - '/Users/paolobonicco/virtual-envs/tln-2022-third-part-lab/venvdicaro/nltk_data'
    - '/Users/paolobonicco/virtual-envs/tln-2022-third-part-lab/venvdicaro/share/nltk_data'
    - '/Users/paolobonicco/virtual-envs/tln-2022-third-part-lab/venvdicaro/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
