# Es2 - Word Sense Disambiguation

In questo esercizio andremo ad estrarre 50 frasi causuali dal corpus `SemCor` e proveremo a disambiguare un sostantivo per ogni frase, anche
quest'ultimo estratto casualmente dalla frase.

1. Estrazione casuale delle frasi dal corpus `SemCor`
2. Pulizia delle frasi:
   1. Rimozione stopwords, punteggiatura e lemming
3. Estrazione di un sostantivo casuale dalla frase
4. Estrazione dei synset del sostantivo **?? (domanda, estraggo solo i sysnet che sono etichettati come *NN*)**
5. Costruzione della `Bag of Words` per la frase e del sostantivo 

## Preparazione dei dati

### Imports and dataset downlaod

In [81]:
from nltk.corpus import semcor
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import nltk
import random
from pprint import pprint
from nltk.tree import *

# nltk.download('semcor') # download the semcor corpus

### Estrazione frasi da corpus

In [292]:
sents = semcor.sents()

sents_full = semcor.tagged_sents(tag="both")

print(len(sents) == len(sents_full)) # True

True


### Metodi utili per gestione Corpus `SemCor` e struttura `Tree` di `nltk`

In [202]:
def get_lemma(word):
    '''
    Args:
        word: term as as nltk.Tree
    Returns:
        lemma of the word as a string.
        If there isn't a lemma, return the PoS tag or None.
    '''
    return word.label()

def get_word(word):
    '''
    Args:
        word: term as as nltk.tree.tree.Tree
    Returns:
        the term as as a list of strings.
        Return a list because a term may consist of several words
    '''
    if(isinstance(word, nltk.Tree)):
        return word.leaves()
    return None

def get_pos(word):
    '''
    Args:
        word: term as as nltk.Tree
    Returns:
        The PoS tag of a word or None if there is no PoS for the term 
        (es. '!' has tag None)
    '''
    return word.pos()[0][1]

def get_synset(lemma):
    '''
    Args:
        lemma: Lemma of a word
    Returns:
        The synset associated to the lemma
    '''
    if(isinstance(lemma, nltk.corpus.reader.wordnet.Lemma)):
        return lemma.synset()
    return None

def get_sents(semcor):
    '''
    Args:
        semcor: Semcor corpus
    Return:
        a list of list of words. Each list of words is a sentence.
    '''
    return semcor.sents()

def get_term(lemma):
    '''
    Args:
        lemma: Lemma of a word
    Returns:
        The term associated to the lemma
    '''
    if(isinstance(lemma, nltk.corpus.reader.wordnet.Lemma)):
        return lemma.name()
    return None

def get_synsets(term):
    '''
    Retrurn the synsets of a term.
    '''
    if(len(wn.synsets(term)) > 0):
        return wn.synsets(term)
    return None

In [33]:
lemma = "primary_election.n.01.primary"
wn.lemma(lemma).synset()

Synset('primary.n.01')

### Selezione delle frasi casuali

Estraiamo le frasi come stringhe e come oggetti `Tree` per poter ottenere anche il pos e il 
lemma associato ad un termine.

In [316]:
def check_sent(sent):
    '''
        Check if there is a NN with his lemma in the sentence and that as more than
        1 synset, so that the term is ambiguous
    '''
    for el in sent:
        if(get_pos(el) == 'NN'):
            lemma = get_lemma(el)
            if(isinstance(lemma, nltk.corpus.reader.wordnet.Lemma)):
                term = get_term(lemma)
                syns = wn.synsets(term)
                if(len(syns) > 1):
                    return True
    return False

def pick_sents(s, sfull, num):
    rand_sents, rand_num, rand_full_sent = [], [], []
    l = len(s) - 1
    n = random.randint(0, l)
    
    while (len(rand_sents) < num):
        while(n in rand_num):
            n = random.randint(0, l)
            
        rand_num.append(n)
        
        if(check_sent(sfull[n])):
            rand_sents.append(s[n])
            rand_full_sent.append(sfull[n])
    
    return rand_sents, rand_full_sent

Estraggo 50 frasi casuali dal corpus `SemCor`

Ottengo una lista di oggetti *semcor* -> `nltk.corpus.reader.semcor.SemcorSentence`



In [317]:
sent_list, sent_full_list = pick_sents(sents, sents_full, 50)

# print(len(sent_list) == len(sent_full_list)) # 50

50

## Implementazione

Contex

In [524]:
from distutils.log import error


def find_noun(sent):
    '''
    Take a sentence and return a random Noun in the phrase with his right synset associated
    '''
    noun_list = []
    for el in sent:
        if(get_pos(el) == 'NN'):
            lemma = get_lemma(el)
            if(isinstance(lemma, nltk.corpus.reader.wordnet.Lemma)):
                term = get_term(lemma)
                syns = wn.synsets(term)
                if(len(syns) > 1):
                    noun_list.append(el)
    return noun_list[random.randint(0, len(noun_list) - 1)]


def bag_of_word(sent):
    '''
    Auxiliary function for the Lesk algorithm. Transforms the given sentence
    according to the bag of words approach, apply lemmatization, stop words
    and punctuation removal.
    Args:
        sent: sentence
    Returns:
        bag of words
    '''
    stop_words = set(stopwords.words('english'))
    punctuation = {',', ';', '(', ')', '{', '}', ':', '?', '!', '.', '``', '*', '-'}
    # Returns the input word unchanged if it cannot be found in WordNet.
    wnl = nltk.WordNetLemmatizer()
    # Return a tokenized copy of text, using NLTK’s recommended word tokenizer (Treebank + PunkSentence)
    tokens = nltk.word_tokenize(sent)
    tokens = list(filter(lambda x: x not in stop_words and x not in punctuation, tokens))
    return list(wnl.lemmatize(t.lower()) for t in tokens)

def get_context(sent):
    '''
    Auxiliary function for the Lesk algorithm. Returns the context of the given word in the given sentence.
    Args:
        sent: sentence
    Returns:
        set of words in the sentence after stop words and punctuation removal and lemming
    '''
    context = []
    merged_sent = ' '.join(word for word in sent)
    context.append(bag_of_word(merged_sent))
    return context[0]

In [438]:
target_word = find_noun(sent_full_list[0])

lemma = get_lemma(target_word)

target_synset = lemma.synset()

In [446]:
context[0]

['Sweat', 'bubbled', 'Jess', "'s", 'swarthy', 'face']

## Sandbox - 1 frase

In [534]:
sent_0 = sent_list[0]
sent_full_0 = sent_full_list[0]

target_word = find_noun(sent_full_list[0])
lemma = get_lemma(target_word)
target_word_str = get_term(lemma)
target_synset = lemma.synset()

word_synsets = wn.synsets(target_word_str)

context_0 = get_context(sent_0)
signature_0 = get_signature(target_synset)

print(word_synsets)
print(context_0)
print(signature_0)

TypeError: unhashable type: 'SemcorSentence'

### `Lesk sandbox`

In [527]:
print(signature(word_synsets[0]))

['salty', 'fluid', 'secreted', 'sweat', 'gland', 'sweat', 'poured', 'brow']


In [532]:
from multiprocessing import get_context


def get_signature(syn):
    '''
    Args:
        synset: a synset of a word
    Returns:
        A list of word formed by examples and gloss of the synset
    '''
    bof = bag_of_word(syn.definition())
    for el in syn.examples():
        bof.extend(bag_of_word(el))
    return bof

def get_overlap(s1, s2):
    '''
    Args:
        s1: list of words
        s2: list of words
    Returns:
        The number of words in s1 that are also in s2
    '''
    return len(set(s1).intersection(set(s2)))


def lesky(word, sentence):
    max_overlap = 0
    context = get_context(sentence)
    for syn in word_synsets:
        signature = get_signature(syn)
        overlap = get_overlap(signature, context)
        if(overlap > max_overlap):
            max_overlap = overlap
            best_synset = syn
        print(syn)
    return best_synset

In [533]:
lesky(target_word_str, sent_0)

TypeError: unhashable type: 'SemcorSentence'

### Lesk Algorithm

1. `Contesto` - Insieme delle parole presenti nella frase
2. `Signature` - Insieme della parole presenti nella definizione e negli esempi dei synset del termine da disambiguare

# Approccio alternativo 

###  !!!! Non usare Venv perchè non riesco ad installare lxml

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

import re
import sys
import xml.etree.ElementTree as eT

from tqdm import tqdm

In [None]:
from lxml import etree as exml

### Data preparation

In [None]:
path = "../data/semcor3.0/brown1/tagfiles/br-a01"

In [None]:
def parse_xml(path):
    '''
    It parses the SemCor corpus, which has been annotated by hand on WordNet synsets by Rada Mihalcea and her team.
    In order:
        1) Load XML file
        2) Took all the tags "s"
        3) Extract the sentence
        4) Select the words to disambiguate (select only the needed ones) with total number of senses >= 2
        5) Extract Golden annotated sense from WSN
    Args:
        path: the path to the XML file (Brown Corpus)
    Returns:
        [(sentence, [(word, gold)])]
    '''

    with open(path, 'r') as fileXML:
        data = fileXML.read()

        # fixing XML's bad formatting
        data = data.replace('\n', '')
        replacer = re.compile("=([\w|:|\-|$|(|)|']*)")
        data = replacer.sub(r'="\1"', data)

        result = []
        try:
            root = exml.XML(data)
            paragraphs = root.findall("./context/p")
            sentences = []
            for p in paragraphs:
                sentences.extend(p.findall("./s"))
            for sentence in sentences:
                words = sentence.findall('wf')
                sent = ""
                tuple_list = []
                for word in words:
                    w = word.text
                    pos = word.attrib['pos']
                    sent = sent + w + ' '
                    if pos_validity(pos=pos, text=w, word=word):
                        sense = word.attrib['wnsn']
                        t = (w, sense)
                        tuple_list.append(t)
                result.append((sent, tuple_list))
        except Exception as e:
            raise NameError('xml: ' + str(e))
    return result

def pos_validity(pos, text, word):
    """Auxiliary function for the parse_xml
    Args:
        pos:
        text:
        word: ambiguous word (with more that 1 sense)
    Returns:
        boolean: True if the word is valid and false otherwise
    """
    return pos == 'NN' and '_' not in text and len(wn.synsets(text)) > 1 and 'wnsn' in word.attrib

In [None]:
parse_xml(path)

[("The Fulton_County_Grand_Jury said Friday an investigation of Atlanta 's recent primary_election produced no evidence that any irregularities took_place ",
  [('investigation', '1'),
   ('Atlanta', '1'),
   ('evidence', '1'),
   ('irregularities', '1')]),
 ('The jury further said in term end presentments that the City_Executive_Committee which had over-all charge of the election deserves the praise and thanks of the City_of_Atlanta for the manner in which the election was conducted ',
  [('jury', '1'),
   ('term', '2'),
   ('end', '2'),
   ('presentments', '1'),
   ('charge', '6'),
   ('election', '1'),
   ('praise', '1'),
   ('thanks', '1'),
   ('manner', '1'),
   ('election', '1')]),
 ('The September October term jury had been charged by Fulton Superior_Court_Judge_Durwood_Pye to investigate reports of possible irregularities in the hard-fought primary which was won by Mayor-nominate_Ivan_Allen_Jr. ',
  [('term', '2'),
   ('jury', '1'),
   ('reports', '3'),
   ('irregularities', '1

In [None]:
def lesk(word, sentence):
    '''
    Lesk algorithm implementation. Given a word and a sentence in which it appears,
    it returns the best sense of the word.
    Args:
        word: word to disambiguate
        sentence: sentence in wich the word occour
    Returns:
        best sense of word
    '''

### Suggestion

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn


def lesk(word, sentence):
    """Lesk algorithm implementation. Given a word and a sentence in which it appears,
    it returns the best sense of the word.
    Args:
        word: word to disambiguate
        sentence: sentence to compare
    Returns:
        best sense of word
    """

    # Calculating the synset of the given word inside WN
    word_senses = wn.synsets(word)
    best_sense = word_senses[0]
    max_overlap = 0

    # I choose the bag of words approach
    context = bag_of_word(sentence)

    for sense in word_senses:
        # set of words in the gloss
        signature = bag_of_word(sense.definition())

        # and examples of the given sense
        examples = sense.examples()
        for ex in examples:
            # after this line, signature will contain for all the words, their
            # bag of words definition and their examples
            signature = signature.union(bag_of_word(ex))

        overlap = compute_overlap(signature, context)
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense

    return best_sense


def bag_of_word(sent):
    """Auxiliary function for the Lesk algorithm. Transforms the given sentence
    according to the bag of words approach, apply lemmatization, stop words
    and punctuation removal.
    Args:
        sent: sentence
    Returns:
        bag of words
    """

    stop_words = set(stopwords.words('english'))
    punctuation = {',', ';', '(', ')', '{', '}', ':', '?', '!'}
    # Returns the input word unchanged if it cannot be found in WordNet.
    wnl = nltk.WordNetLemmatizer()
    # Return a tokenized copy of text, using NLTK’s recommended word tokenizer (Treebank + PunkSentence)
    tokens = nltk.word_tokenize(sent)
    tokens = list(filter(lambda x: x not in stop_words and x not in punctuation, tokens))
    return set(wnl.lemmatize(t) for t in tokens)


def compute_overlap(signature, context):
    """Auxiliary function for the Lesk algorithm. Computes the number of words in
    common between signature and context.
    Args: 
        signature: bag of words of the signature (e.g. definitions + examples)
        context: bag of words of the context (e.g. sentence)
    Returns:
        number of elements in commons
    """

    return len(signature & context)


def get_sense_index(word, sense):
    """Given a ambiguous word and a sense of that word, it returns the
    corresponding index of the sense in the synsets list associated with the
    word indices starts with 1.
    Args: 
        word: ambiguous word (with more that 1 sense)
        sense: sense of the word
    Returns:
        index of the sense in the synsets list of the word
    """

    senses = wn.synsets(word)
    return senses.index(sense) + 1


def pos_validity(pos, text, word):
    """Auxiliary function for the parse_xml
    Args:
        pos:
        text:
        word: ambiguous word (with more that 1 sense)
    Returns:
        boolean: True if the word is valid and false otherwise
    """
    return pos == 'NN' and '_' not in text and len(wn.synsets(text)) > 1 and 'wnsn' in word.attrib


def max_freq(word):
    """
    Helper method for lesk_demaria
    :param word of interest
    :return: frequency of the word
    """
    synsets = wn.synsets(word)
    sense2freq = None
    freq_max = 0

    for s in synsets:
        freq = 0
        for lemma in s.lemmas():
            freq += lemma.count()
            if freq > freq_max:
                freq_max = freq
                sense2freq = s
    return sense2freq


def lesk_demaria(word, sentence):
    """
    Given a word and a sentence in which it appears, it returns the best sense of the word.
    DeMaria Implementation more precise than simpler lesk above thanks to max_freq
    Args:
        word: word to disambiguate
        sentence: sentence to compare
    Returns:
        best sense of word
    """
    # inizializzazione
    max_overlap = 0
    best_sense = max_freq(word)

    # If I choose the bag of words approach
    context = bag_of_word(sentence)
    signature = []

    for ss in wn.synsets(word):
        signature += ss.definition().split()
        signature += ss.lemma_names()

        overlap = set(signature).intersection(context)
        signature.clear()

        if len(overlap) > max_overlap:
            best_sense = ss
            max_overlap = len(overlap)

    return best_sense