In [None]:
# Start writing code here...
# Genus and differentia are not formally
# defined in WordNet. Notice that the synset and its genus should belong
# to the same grammatical category. This is not required for differentia. For
# example, ricotta and its genus (cheese) in the following synset are nouns,
# while the differentia contains two adjectives, "soft" and "Italian".

Note su genus-differentia:

[Genus and differentia] are not formally
defined in WordNet. Notice that the synset and its genus should belong
to the same grammatical category. This is not required for the differentia. For
example, ricotta and its genus (cheese) in the following synset are nouns,
while the differentia contains two adjectives, "soft" and "Italian".

from http://eprints-phd.biblio.unitn.it/1265/1/phd-thesis.pdf

In [None]:
import pandas as pd
from pathlib import Path
import nltk

import src.word_sense_disambiguation as wsd
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn

# Fetch the NLTK data packages needed by this notebook. Each call downloads the
# package into the local nltk_data directory and returns True on success; the
# value of the last call is what the cell displays.
nltk.download('wordnet')
nltk.download('punkt')
# NOTE(review): 'stopwords' is not referenced directly in the visible cells —
# presumably it is needed by src.word_sense_disambiguation; confirm.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

#### Algorithm approach:
1. POS tagging
2. Genus extraction heuristic -> we search for a noun (NN or NNS) genus, so every other POS is filtered out
3. Note that the extracted genus lemmas are ambiguous, so we use Lesk-based WSD to find the right genus sense (the context of the ambiguous word is the definition from which the genus has been extracted)
4. Rank genus senses based on frequency heuristic
5. Expand genus hyponyms rooted subtree and compute their signatures
6. Search for the hyponym that maximizes the average similarity with the concept definitions
    1. first compute the document-term matrix from the concept definitions (here each definition is a document)
    2. then compute the document-term matrix from the hyponym signatures (definition + examples)
    3. compute the pairwise similarity and then average over the rows (i.e. over all concept definitions)
    4. compute the argmax over the average hyponym similarities

In [None]:
# Load the tab-separated definitions table; the code below treats each column
# as one concept and each cell as one definition of it.
concepts = pd.read_csv(Path("data/definitions.tsv"), sep="\t")

# Definitions of the first concept, kept as a module-level variable for
# interactive experiments (the batch loop further down rebinds this name).
concept_signature = concepts['Concetto 1']

import nltk.stem
from collections import Counter
from textblob import TextBlob

def generate_genus_candidates(definitions):
    """Rank the candidate genus synsets found in a set of definitions.

    For every definition the nouns are extracted, lemmatized, spell-corrected
    and disambiguated with Lesk, using the definition itself as context. The
    resulting synsets are counted across all definitions.

    Definitions are processed one at a time instead of being used as
    dictionary keys, so identical definitions each contribute to the
    occurrence counts rather than collapsing into a single entry.

    Args:
        definitions: iterable of definition strings for one concept.

    Returns:
        List of WordNet noun synsets, most frequent first. Empty when no noun
        could be disambiguated.
    """
    singularizer = nltk.stem.WordNetLemmatizer()
    # multiset of disambiguated genus senses -> number of occurrences
    genus_candidates = Counter()

    for definition in definitions:
        # Step 1: POS tagging
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(definition))

        # Step 2: genus extraction -> keep nouns only, singularizing NNS plurals
        nouns = [singularizer.lemmatize(token)
                 for token, tag in tagged_tokens
                 if tag in ('NN', 'NNS')]
        # fix misspellings in the extracted lemmas
        ambiguous_genera = [TextBlob(noun).correct().raw for noun in nouns]
        if not ambiguous_genera:
            continue

        # Step 3: candidate genus identification through WSD.
        # The context is built once per definition.
        # NOTE(review): assumes wsd.bow_model depends only on the definition text.
        context = wsd.bow_model(definition)
        for genus in ambiguous_genera:
            best_sense = lesk(context, genus, pos=wn.NOUN)
            if best_sense:  # lesk returns None when no sense is found
                genus_candidates[best_sense] += 1

    # Step 4: ranking heuristic based on occurrence frequencies
    return [sense for sense, _ in genus_candidates.most_common()]



In [None]:
def hyponyms_signatures(genus_synset, max_search_depth):
    """Collect the genus, its hyponym subtree, and a textual signature for each.

    The signature of a synset is its definition joined with its usage
    examples. The genus is built the same way as its hyponyms, so a synset
    gets the same signature whether it is reached as a genus or as a hyponym.

    Args:
        genus_synset: WordNet synset used as the root of the search.
        max_search_depth: maximum depth of the hyponym closure.

    Returns:
        A pair ``(synsets, signatures)`` of equal length. ``synsets[0]`` is
        always the genus itself, so the result is never empty even when the
        genus has no hyponyms.
    """
    def signature(synset):
        # join the definition with the examples to add contextual information
        return " ".join([synset.definition()] + list(synset.examples()))

    synsets = [genus_synset]
    synsets.extend(genus_synset.closure(lambda syn: syn.hyponyms(),
                                        depth=max_search_depth))
    return synsets, [signature(synset) for synset in synsets]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np

def compute_similarity_matrix(concept_signatures, hyponyms_signatures):
    """Cosine similarity between concept definitions and hyponym signatures.

    The bag-of-words vocabulary is learned from the concept definitions only
    (English stop words removed); hyponym signatures are projected onto it.

    Returns:
        Matrix with one row per concept definition and one column per
        hyponym signature.
    """
    bag_of_words = CountVectorizer(stop_words='english')

    # rows: concept definitions, columns: vocabulary terms
    concept_vectors = bag_of_words.fit_transform(concept_signatures)
    # same vocabulary, applied to the hyponym signatures
    hyponym_vectors = bag_of_words.transform(hyponyms_signatures)

    return cosine_similarity(concept_vectors, hyponym_vectors)

def find_best_sense(similarity_matrix, genus_hyponyms):
    """Pick the hyponym whose column has the highest mean similarity.

    Args:
        similarity_matrix: array with one row per concept definition and one
            column per entry of ``genus_hyponyms``.
        genus_hyponyms: sequence of candidates, aligned with the columns.

    Returns:
        Tuple ``(best candidate, its mean similarity)``.
    """
    # average over the rows -> one score per column
    column_scores = similarity_matrix.mean(axis=0)
    winner = np.argmax(column_scores)
    best_candidate = genus_hyponyms[winner]
    best_score = column_scores[winner]
    return best_candidate, best_score

### Batch Processing

In [None]:


def content_to_form(concept_signature, top_k, true_name, max_search_depth):
    """Guess the WordNet synset described by a set of definitions.

    For every candidate genus, the hyponym subtree is searched for the synset
    whose signature is most similar, on average, to the definitions. The
    winners are ranked by score and the best ``top_k`` are printed.

    Each synset appears at most once in the ranking: when several genus
    candidates lead to the same synset, only its highest score is kept, so
    duplicates do not use up ranking positions.

    Args:
        concept_signature: list of definition strings for the concept.
        top_k: number of ranked senses to report.
        true_name: label of the concept, used only in the printed header.
        max_search_depth: maximum depth of the hyponym search below each genus.

    Returns:
        List of ``(synset, score)`` pairs, best first, at most ``top_k`` long.
    """
    print(f"Concept: {true_name}")
    # search for genus candidates
    genus_candidates = generate_genus_candidates(concept_signature)

    # best score seen so far for each candidate sense; insertion order follows
    # the genus ranking, which the stable sort below uses to break ties
    best_score_by_sense = {}
    for genus_candidate in genus_candidates:
        hyponyms, hyp_signatures = hyponyms_signatures(genus_candidate, max_search_depth)
        sim_mat = compute_similarity_matrix(concept_signature, hyp_signatures)
        sense, score = find_best_sense(sim_mat, hyponyms)
        if sense not in best_score_by_sense or score > best_score_by_sense[sense]:
            best_score_by_sense[sense] = score

    # create a ranking by semantic relatedness (similarity)
    ranking = sorted(best_score_by_sense.items(), key=lambda item: item[1], reverse=True)
    # take the first top_k in the ranking
    top_senses = ranking[:top_k]
    for i, (sense, score) in enumerate(top_senses, start=1):
        print(f"{i}. {sense.name()}: {score:.4f}")

    return top_senses

MAX_SEARCH_DEPTH = 3  # how deep to explore the hyponym subtree of each genus
TOP_K = 5             # number of ranked senses reported per concept
concepts = pd.read_csv(Path("data/definitions.tsv"), sep="\t")

# Each column holds the definitions collected for one concept.
for concept in concepts.columns:
    # Drop missing cells with dropna() instead of an identity test against
    # np.NaN: that alias no longer exists in NumPy 2.0, and `is` only matches
    # the one np.nan object rather than every missing value.
    concept_signature = concepts[concept].dropna().tolist()
    content_to_form(concept_signature, TOP_K, concept, MAX_SEARCH_DEPTH)

Concept: Concetto 1
1. right.n.01: 0.1760
2. right.n.01: 0.1760
3. right_field.n.01: 0.1423
4. side.n.01: 0.1423
5. ownership.n.01: 0.1423
Concept: Concetto 2
1. resource.n.03: 0.2322
2. adaptability.n.01: 0.2092
3. pons_asinorum.n.01: 0.2092
4. difficulty.n.03: 0.1720
5. capacity.n.08: 0.1601
Concept: Concetto 3
1. hunger.n.02: 0.2114
2. thing.n.11: 0.2091
3. wish.n.02: 0.2091
4. pile.n.03: 0.2059
5. generosity.n.01: 0.2059
Concept: Concetto 4
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
1. clubroom.n.01: 0.1456
2. nonprofit_organization.n.01: 0.1324
3. state.n.03: 0.1099
4. citizenry.n.01: 0.1030
5. citizenry.n.01: 0.1011
Concept: Concetto 5
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
1. whitebait.n.02: 0.2673
2. hen-of-the-woods.n.01: 0.2673
3. blood_meal.n.01: 0.1479
4. withers.n.01: 0.1479
5. living.n.02: 0.1269
Concept: Concetto 6
  for synset

[(Synset('clubroom.n.01'), 0.14559428548331146),
 (Synset('nonprofit_organization.n.01'), 0.13241348911669912),
 (Synset('state.n.03'), 0.10991099108223519),
 (Synset('citizenry.n.01'), 0.10295070656725963),
 (Synset('citizenry.n.01'), 0.10106924535225541)]

In [None]:


    # Scratch cell: repeats steps 1-2 of generate_genus_candidates on the
    # module-level `concept_signature`, without the spelling correction, so the
    # extracted nouns can be inspected per definition.
    # NOTE(review): depends on whatever `concept_signature` currently holds in
    # the kernel — confirm the intended concept before relying on the output.
    def_pos = {}
    for definition in concept_signature:
        pos = nltk.pos_tag(nltk.word_tokenize(definition))
        def_pos[definition] =  pos
    
    # Step 2 genus extraction
    ambiguous_genera = {}
    singularizer = nltk.stem.WordNetLemmatizer()
    
    for defn in def_pos:
        # extract nouns (and also singularize NNS plural nouns)
        candidate_genera = list(map(lambda lemma_pos: singularizer.lemmatize(lemma_pos[0]), 
                            filter(lambda x: x[1] in ['NN','NNS'], def_pos[defn])))
        ambiguous_genera[defn] =  candidate_genera

In [None]:
# Display the definition -> extracted noun lemmas mapping.
ambiguous_genera

{'the concept of fairness, equality for all the parts involved': ['concept',
  'fairness',
  'equality',
  'part'],
 'condition of being morally correct': ['condition'],
 'abstract concept that refers to what is right': ['abstract', 'concept'],
 'moral principle determining what is right': ['principle'],
 'entity that allows compliance with the rules': ['entity',
  'compliance',
  'rule'],
 'abstract concept based on the idea of legal fairness, a front of a code of laws shared by a community': ['abstract',
  'concept',
  'idea',
  'fairness',
  'front',
  'code',
  'law',
  'community'],
 'To respect other people and their rights. Usually administrated in public courts': ['people',
  'right',
  'court'],
 'being right and just': [],
 'the quality of being fair and reasonable': ['quality'],
 'when most of points of view agree with': ['point', 'view'],
 'idea of fair application of laws': ['idea', 'application', 'law'],
 'recognition and respect of people rights': ['recognition',
  'resp

In [None]:
# NOTE(review): this install fails in the recorded output (mysqlclient build
# error) and `pattern` is not imported in any visible cell — consider removing
# this cell.
!pip install pattern

Collecting pattern
  Using cached Pattern-3.6.0.tar.gz (22.2 MB)
Collecting backports.csv
  Using cached backports.csv-1.0.7-py2.py3-none-any.whl (12 kB)
Collecting mysqlclient
  Using cached mysqlclient-2.0.3.tar.gz (88 kB)
[31m    ERROR: Command errored out with exit status 1:
     command: /root/venv/bin/python -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-7b41pj0r/mysqlclient_e133f8066d4a41ee96bb4ad10fd876f5/setup.py'"'"'; __file__='"'"'/tmp/pip-install-7b41pj0r/mysqlclient_e133f8066d4a41ee96bb4ad10fd876f5/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base /tmp/pip-pip-egg-info-hj01fgg2
         cwd: /tmp/pip-install-7b41pj0r/mysqlclient_e133f8066d4a41ee96bb4ad10fd876f5/
    Complete output (15 lines):
    /

In [None]:


# Run TextBlob spelling correction on the first definition and show the
# corrected text as a plain string.
TextBlob(concept_signature[0]).correct().raw

'the discipline that regulates territorial rules'

In [None]:
# Display the definitions currently bound to `concept_signature`.
# NOTE(review): the value depends on which cells were run last — confirm.
concept_signature

0       the discipline that regulates territorial rules
1      activities associated with covernance of an area
2     abstract concept of governance inside a social...
3     the activity of governing an entity and its af...
4     entity based on sets of principles that guide ...
5                  activities for the good of the state
6     To be able to reach common objectives and idea...
7            the science or art of political government
8     the activities associated with the governance ...
9            organization aimed to amministrate a state
10          sience of ensuring cohesion of a population
11    activities of the government or people who try...
Name: Concetto 4, dtype: object

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=afb22156-bb61-4d65-847d-18db79c0d4d2' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>