# Esercitazione 3: Teoria di Hanks


## TODO
* Recuperare da un corpus n istanze in cui esso viene usato XXX

* Effettuare parsing e disambiguazione

* Usare i super sensi di WordNet sugli argomenti (subj e obj) del verbo scelto

* Aggregare i risultati, calcolare le frequenze, stampare i cluster semantici ottenuti

In [1]:
import nltk
from nltk.corpus import brown

nltk.download('brown')

[nltk_data] Downloading package brown to /home/prf/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
INPUT_VERB = 'eat'

VERB_POS_TAGS = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# stanford parser dependencies types
SUBJ_DEP_TYPES = ['nsubj', 'nsubjpass']
OBJ_DEP_TYPES = ['dobj', 'iobj']

VALENCE = 2

### Step 1: estrazione frasi contenti verbo

In [3]:
from nltk.stem import WordNetLemmatizer

def corpus_extraction(verb, corpus,  verb_pos_tags = VERB_POS_TAGS):
    lemmatizer = WordNetLemmatizer()
    sentences = corpus.sents()

    selected_sentences = []
    for sent in sentences:
        tags = dict(nltk.pos_tag(sent))
        for word in sent:
            if tags[word] in VERB_POS_TAGS: # extract only verbs
                word = lemmatizer.lemmatize(word, 'v')
                if word == verb: # extract only sentences with the given verb
                    selected_sentences.append(sent)

    return selected_sentences

In [4]:
# step 1 extract sentences
selected_sents = corpus_extraction(INPUT_VERB, brown)

# Step 2: Estrazione Fillers

In [5]:
import spacy
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import namedtuple

nlp = spacy.load("en_core_web_lg")



In [11]:
#dep_allowed_types = OBJ_DEP_TYPES + SUBJ_DEP_TYPES
def find_target_verb(doc, verb, verb_pos_tags = VERB_POS_TAGS):
    targets = []
    for token in doc:
        # search for verbs first and also in the lemmatized form
        if token.tag_ in verb_pos_tags and (token.text == verb or 
                                            token.lemma_ == verb):
            targets.append(token)
    return targets


def get_hanks_verb(verb_token, dep_allowed_types=OBJ_DEP_TYPES + SUBJ_DEP_TYPES):
    slot_values = []
    filler_values = []
    
    for children in verb_token.children:
        if children.dep_ in dep_allowed_types:
            slot_values.append(children.dep_) 
            filler_values.append(children.text)

    # dynamically create a named tuple based on number of slot/fillers found
    slot_names = [f"slot{i}" for i,_ in enumerate(slot_values, start=1)]
    filler_names = [f"filler{i}" for i,_ in enumerate(filler_values, start=1)]
    
    attrib_names = ['verb', 'nargs'] + slot_names + filler_names
    attrib_values = [verb_token.text, len(slot_names)] + slot_values + filler_values

    HanksVerb = namedtuple("HanksVerb", attrib_names)

    return HanksVerb(*attrib_values)

def find_verb_fillers(sentences, verb, valence=VALENCE):
    fillers = []

    # this tokenization step is necessary for Spacy since the pipeline takes a string as input
    detokenizer = TreebankWordDetokenizer()
    joined_sents = [detokenizer.detokenize(sent) for sent in sentences]

    for sentence, joined_sent in zip(sentences,joined_sents):
        doc = nlp(joined_sent)
        
        # find target verb occurences in the sentence
        target_verbs = find_target_verb(doc, verb)
        
        # loop trough target verb occurences
        for target_verb in target_verbs:
            
            # retrieve verb fillers from syntactic dependencies
            hank_verb = get_hanks_verb(target_verb)
            # check for valence
            if hank_verb.nargs == valence:
                fillers.append((sentence, hank_verb))
    return fillers

In [12]:
# step 2 find fillers
fillers = find_verb_fillers(selected_sents, INPUT_VERB, valence=2)

### Step 3: WSD & sense clustering

In [23]:
import nltk.wsd as wsd
from collections import Counter


def find_filler_senses(fillers, wsd_func):
    filler_senses = []
    
    for sentence, hanks_verb in fillers:
        filler1_sense = wsd_func(sentence, hanks_verb.filler1)
        filler2_sense = wsd_func(sentence, hanks_verb.filler2)

        filler_senses.append((filler1_sense, filler2_sense))
    
    return filler_senses

def semantic_clustering(filler_senses):
    semantic_types = Counter()

    for filler1_sense, filler2_sense in filler_senses:
        if filler1_sense is not None and filler2_sense is not None:
                # implicit clustering with wn supersenses
                semantic_type = (filler1_sense.lexname(), filler2_sense.lexname())
                # keep track of semantic_type occurrences
                semantic_types.update([semantic_type])
        else:
            semantic_types.update([(None, None)]) # just to keep track of invalid semantic type senses
        
    return semantic_types


In [24]:
filler_senses = find_filler_senses(fillers, wsd.lesk)
semantic_types = semantic_clustering(filler_senses)

### Valutazione

In [27]:
def show_results(verb_name, fillers, filler_senses, semantic_types):
    print(f"Applying Hanks Theory for verb: {verb_name}")
    
    for  (_, hanks_verb), (filler1_sense, filler2_sense) in zip(fillers, filler_senses):
        print(f"filler: {hanks_verb.filler1} - {hanks_verb.filler2}")
        print(f"senses: {filler1_sense} - {filler2_sense}")
        print('-------------------------------------------------------------')
    print("")
    print(f"Semantic Types Clustering Informations for {verb_name}")
    tot = sum([freq for semantic_type, freq in semantic_types.most_common()])
    for semantic_type, freq in semantic_types.most_common():
        relative_freq = freq / tot * 100
        print(f"semantic type: {semantic_type}, abs freq:{freq}, rel freq: {relative_freq:0.2f}%")

show_results(INPUT_VERB, fillers, filler_senses, semantic_types)

Applying Hanks Theory for verb: eat
filler: what - people
senses: None - Synset('citizenry.n.01')
-------------------------------------------------------------
filler: Americans - fat
senses: Synset('american_english.n.01') - Synset('fatty.a.01')
-------------------------------------------------------------
filler: one - skin
senses: Synset('one.s.03') - Synset('skin.v.02')
-------------------------------------------------------------
filler: what - they
senses: None - None
-------------------------------------------------------------
filler: students - cereal
senses: Synset('scholar.n.01') - Synset('grain.n.02')
-------------------------------------------------------------
filler: they - it
senses: None - Synset('information_technology.n.01')
-------------------------------------------------------------
filler: students - cereal
senses: Synset('scholar.n.01') - Synset('grain.n.02')
-------------------------------------------------------------
filler: they - it
senses: None - Synset('i

In [201]:
list(semantic_types.elements())

[(None, None)]

### Risultati

* spesso i filler corrispondenti all'argomento subject del verbo corrispondono a pronomi (I, she, they, ecc...) i quali non trovano una corrispondente lexical category in wordnet dunque lesk ritorna NONE e non è possibile associare un semantic_type valido

* cose si può notare l'invalid semantic type rappresenta il 68%, indicativo di come sia lo step di WSD che il coverage del sense repository (WN) sia fondamentale per il risultato finale