In [12]:
import csv
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.wsd import lesk

In [13]:
# IPERONIMI
syns = wn.synsets("equality.n.02")
names = []

for i, s in enumerate(syns, start=0):
    hyper = lambda s: s.hypernyms()  # SOPRA-NOME, categoria superiore della parola
    temp = list(s.closure(hyper, depth=3))
    names.extend([x.name().split(".")[0] for x in temp])
    #print("SYN: {} \t HYPER: {}".format(s,t))

print(names)
#print(len(names))
#print(set(names))
#print(len(set(names)))

[]


In [14]:
wn.synset("equality.n.02").hypernyms()

[Synset('status.n.01')]

In [15]:
# IPONIMI
syns = wn.synsets("equality")
names = []

for i, s in enumerate(syns, start=0):
    hyper = lambda s: s.hyponyms()  # SOTTONOME, significato semantico incluso in altra parola
    temp = list(s.closure(hyper, depth=1))
    names.extend([x.name().split(".")[0] for x in temp])
    #print("SYN: {} \t HYPER: {}".format(s,t))

print(names)
#print(len(names))
#print(set(names))
#print(len(set(names)))

['balance', 'equatability', 'equivalence', 'evenness', 'isometry', 'egality', 'tie']


In [16]:
wn.synset("justice.n.01").definition()

'the quality of being just or fair'

In [7]:
h = wn.synset("apple.n.01").hypernyms()
for v in h:
    print(v.name())

edible_fruit.n.01
pome.n.01


In [8]:
wn.synset("pome.n.01").definition()

'a fleshy fruit (apple or pear or related fruits) having seed chambers and an outer fleshy part'

In [9]:
def load_data():
    """
    It reads che definition's CSV
    :return: four list containing the read definitions.
    """
    with open(options["output"] + 'content-to-form.csv', "r", encoding="utf-8") as content:
        cnt = csv.reader(content, delimiter=';')

        dictionary = {}
        i = 0
        for line in cnt:
            dictionary[i] = line
            i += 1

        return dictionary


def preprocess(definition):
    """
    It does some preprocess: removes stopwords, punctuation and does the
    lemmatization of the tokens inside the sentence.
    :param definition: a string representing a definition
    :return: a set of string which contains the preprocessed string tokens.
    """

    # Removing stopwords
    definition = definition.lower()
    stop_words = set(stopwords.words('english'))
    punct = {',', ';', '(', ')', '{', '}', ':', '?', '!', '.'}
    wnl = nltk.WordNetLemmatizer()
    tokens = nltk.word_tokenize(definition)
    tokens = list(filter(lambda x: x not in stop_words and x not in punct, tokens))

    # Lemmatization
    lemmatized_tokens = set(wnl.lemmatize(t) for t in tokens)

    return lemmatized_tokens


def preprocess_synset(synset):
    """
    It does some preprocess: removes the stopword, punctuation and does the
    lemmatization of the tokens inside the sentence.
    :param definition: a string representing a definition
    :return: a set of string which contains the preprocessed string tokens.
    """
    pre_synset = synset.split(".")
    clean_synset = pre_synset[0]
    return clean_synset

In [10]:
options = {
        "output": "/Users/lorenzotabasso/Desktop/University/TLN/Progetto/19-20/tln-1920/part3/exercise2/input/",
    }

content = load_data()  # Loading the content-to-form.csv file

'''
1. prendo definzione, disambiguo con pos-tagging. il primo nome è il genus
2. come approccio personalizzato, teveno un dizionario di genus (dopo aver esplorato tutte le definizioni) e espandevo solo il genus più frequente
riducendo la ricerca
3. prendo da wordnet i synsets di quel sostantivo, e per ognuno di essi parto in basso con gli iponimi
4. calcolo l'iponimo dell'iponimo dell'iponimo..., per non sclerare utilizza la closure (chiusura trasitiva). Calcolo gli iponimi fino a un certo 
livello
5. calcola iponimo con più overlapping, stili classifica
'''

for index in content:
#for index in range(1):
    
    hyponyms_list = []
    
    for definition in content[index]:
    #for definition in content[0]:
        genus_dict = {}
        hyponyms = []
            
        def_tokens = word_tokenize(definition)
        results = nltk.pos_tag(def_tokens)
        
        possibles_genus = list(filter(lambda x: x[1] == "NN", results))
        # Es.: [('abstract', 'NN'), ('concept', 'NN'), ('idea', 'NN'), ('fairness', 'NN'), ('front', 'NN'), ('code', 'NN'), ('community', 'NN')]

        for g in possibles_genus:
            if not g[0] in genus_dict:
                genus_dict[g[0]] = 1
            else:
                genus_dict[g[0]] += 1
    
#         print(index, genus)
#         print("{} - {}\n".format(index, genus_dict))
        
        if len(genus_dict) > 0:
            genus = max(genus_dict, key=genus_dict.get)
#             print("GENUS: " + genus)
        
            syns = wn.synsets(genus)
        
            # Prendiamo tutti gli iponimi per il genus della singola definizione
            for i, s in enumerate(syns, start=0):
                hypon = lambda s: s.hyponyms()  # SOTTONOME, significato semantico incluso in altra parola
                all_hypon = list(s.closure(hypon, depth=1))  # TODO: aumentare a 2,3
                hyponyms.extend([x.name().split(".")[0] for x in all_hypon])
#                 print("SYN: {} \t HYPER: {}".format(s,t))

#             print(index, hyponyms, "\n")
#         else:
#             print("NADA")
            
        
        hyponyms_list.append(' '.join(hyponyms))
    
#     print(hyponyms_list)

        
    '''
    CountVectorizer will create k vectors in n-dimensional space, where:
    - k is the number of sentences,
    - n is the number of unique words in all sentences combined.
    If a sentence contains a certain word, the value will be 1 and 0 otherwise
    '''
    
    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(hyponyms_list)
    
    feature_list = vectorizer.get_feature_names()
    vectors = matrix.toarray()
    
    m = vectors.sum(axis=0).argmax()
    
    print(m)
    print(feature_list[m] + '\n')
#     print(feature_list)
    

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lorenzotabasso/Desktop/University/TLN/Progetto/19-20/tln-1920/part3/exercise2/input/content-to-form.csv'

In [11]:
# Da NOMI a IPERONIMI

options = {
        "output": "/Users/lorenzotabasso/Desktop/University/TLN/Progetto/19-20/tln-1920/part3/exercise2/input/",
    }

content = load_data()  # Loading the content-to-form.csv file

'''
1. prendo definzione, disambiguo con pos-tagging. il primo nome è il genus
2. come approccio personalizzato, teveno un dizionario di genus (dopo aver esplorato tutte le definizioni) e espandevo solo il genus più frequente
riducendo la ricerca
3. prendo da wordnet i synsets di quel sostantivo, e per ognuno di essi parto in basso con gli iponimi
4. calcolo l'iponimo dell'iponimo dell'iponimo..., per non sclerare utilizza la closure (chiusura trasitiva). Calcolo gli iponimi fino a un certo 
livello
5. calcola iponimo con più overlapping, stili classifica
'''

for index in content:
#for index in range(1):
    
    genus_dict = {}
    
    for definition in content[index]:
    #for definition in content[0]:

        hypernyms = []
        clean_tokens = preprocess(definition)
        
        all_synsets = []
        for word in clean_tokens:
            syn = [lesk(definition, word)] # TODO: disambiguare le parole della definizione con lesk e usare i loro synsets per trovare gli iperonimi!
            if len(syn) > 0:
                for s in syn:
                    if s:
                        hyper = lambda s: s.hypernyms()
                        all_hyper = list(s.closure(hyper, depth=2))  # TODO: aumentare a 2,3
                        hypernyms.extend([x.name().split(".")[0] for x in all_hyper])

                for g in hypernyms:
                    if not g in genus_dict:
                        genus_dict[g] = 1
                    else:
                        genus_dict[g] += 1
                
        # ------------------------------
        
#         print(genus_dict)
    
        if len(genus_dict) > 0:
            genus = max(genus_dict, key=genus_dict.get)
#             print("\n{}\n".format(genus))

            syns = wn.synsets(genus)

            # Prendiamo tutti gli iponimi per il genus della singola definizione
            for i, s in enumerate(syns, start=0):
                hypon = lambda s: s.hyponyms()  # SOTTONOME, significato semantico incluso in altra parola
                all_hypon = list(s.closure(hypon, depth=1))  # TODO: aumentare a 2,3
                hyponyms.extend([x.name().split(".")[0] for x in all_hypon])
#             print("SYN: {} \t HYPER: {}".format(s,t))
#             print(index, hyponyms, "\n")
#         else:
#             print("NADA")

        hyponyms_list.append(' '.join(hyponyms))

#         print(hyponyms_list)


    '''
    CountVectorizer will create k vectors in n-dimensional space, where:
    - k is the number of sentences,
    - n is the number of unique words in all sentences combined.
    If a sentence contains a certain word, the value will be 1 and 0 otherwise
    '''

    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(hyponyms_list)

    feature_list = vectorizer.get_feature_names()
    vectors = matrix.toarray()

    m = vectors.sum(axis=0).argmax()

    print(m)
    print(feature_list[m] + '\n')
#     print(feature_list)

    

FileNotFoundError: [Errno 2] No such file or directory: '/Users/lorenzotabasso/Desktop/University/TLN/Progetto/19-20/tln-1920/part3/exercise2/input/content-to-form.csv'

In [19]:
import time
time.sleep(3)

In [None]:
import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"\\nn_webserver")

from employee import motivation_to_work