# Imports and Constants

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!git clone "https://github.com/Omar659/verbatlas.git"

Cloning into 'verbatlas'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 27 (delta 1), reused 17 (delta 0), pack-reused 0[K
Unpacking objects: 100% (27/27), 611.37 KiB | 2.77 MiB/s, done.


In [None]:
# Nltk
from nltk.corpus import wordnet as wn
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Utils
import json
import random
from collections import defaultdict
from collections import Counter
import csv
import itertools
import requests

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Paths
# Root folder that contains the project files
# (presumably on the mounted Google Drive - confirm the working directory)
ROOT_PATH = "./drive/MyDrive/EAI/NLP/HW2/Project_NUANS/"
# Alternative root folder, kept for a different Drive layout:
# ROOT_PATH = "./drive/MyDrive/Project_NUANS/"
# Folder where the generated dataset files are written
DATASETS_PATH = ROOT_PATH + "Datasets/"
# Folder from which the results of the identification step are read back
OUTPUT_PATH = ROOT_PATH + "Outputs/"

# Functions and Classes

In [None]:
class VerbAtlas():
    '''
    In-memory view of the VerbAtlas 1.1.0 resource files.
    Attributes (all built by the constructor):
        bn2va, bn2wn, frame_info:
            raw rows (first line skipped) of the corresponding TSV files
        bn_va_dict:
            list of two dictionaries:
                [0] {babelnet_id: verbatlas_frame_id}
                [1] {verbatlas_frame_id: [babelnet_id, ...]}
        bn_wn_dict:
            {babelnet_id: [wordnet_offset, wordnet_pos]}
        frame_info_dict:
            {verbatlas_frame_id: [remaining fields of the frame row]}
    '''
    def __init__(self, folder_path):
        '''
        Reads the VerbAtlas files and prints the README.
        Args:
            folder_path: folder that contains "README.md" and the
                         "VerbAtlas-1.1.0" directory, with or without a trailing "/"
        '''
        # Normalise the folder once so that every path below is built the same way
        if folder_path and not folder_path.endswith("/"):
            folder_path += "/"
        data_path = folder_path + "VerbAtlas-1.1.0/"

        # VA_bn2va.tsv: lists the BabelNet synsets in each VerbAtlas frame
        self.bn2va = self.__read_tsv(data_path + "VA_bn2va.tsv")
        self.bn_va_dict = [{}, defaultdict(list)]
        for bn2va_i in self.bn2va:
            self.bn_va_dict[0][bn2va_i[0]] = bn2va_i[1]
            self.bn_va_dict[1][bn2va_i[1]].append(bn2va_i[0])

        # bn2wn.tsv: mapping from BabelNet to WordNet
        self.bn2wn = self.__read_tsv(data_path + "bn2wn.tsv")
        self.bn_wn_dict = {}
        for bn2wn_i in self.bn2wn:
            # The second field is split into the offset (first three characters
            # dropped, presumably a "wn:" prefix) and the trailing pos letter
            self.bn_wn_dict[bn2wn_i[0]] = [bn2wn_i[1][3:-1], bn2wn_i[1][-1]]

        # VA_frame_info.tsv: mapping from VerbAtlas ID to all the frame information,
        # including the frame name (second field)
        self.frame_info = self.__read_tsv(data_path + "VA_frame_info.tsv")
        self.frame_info_dict = {}
        for frame_info_i in self.frame_info:
            self.frame_info_dict[frame_info_i[0]] = frame_info_i[1:]

        # README of VerbAtlas: print the first tab-separated field of every non-empty line
        with open(folder_path + "README.md", encoding="utf-8") as file:
            my_file = csv.reader(file, delimiter="\t")
            readme = "\n".join([line[0] for line in my_file if line != []])
            print(readme)

    def __read_tsv(self, path):
        '''
        Reads a tab-separated file from VerbAtlas.
        Args:
            path: path of the file
        Return:
            list of the rows (each a list of fields), without the first line
            and without empty rows
        '''
        # newline="" is the way the csv module expects files to be opened
        with open(path, newline="", encoding="utf-8") as file:
            my_file = csv.reader(file, delimiter="\t")
            # The first line is skipped (treated as a header)
            next(my_file, None)
            # Blank lines produce empty rows that would break the indexing done by the caller
            return [line for line in my_file if line]

# Load the VerbAtlas files from the "./verbatlas/" folder (the constructor also prints the README)
vf = VerbAtlas("./verbatlas/")

                                VerbAtlas 1.1.0
                             http://verbatlas.org
               Andrea Di Fabio, Simone Conia and Roberto Navigli
               Sapienza NLP Group, Sapienza University of Rome
                             http://nlp.uniroma1.it
               Web site and resource maintenance by Babelscape
                             http://babelscape.com
VerbAtlas is a novel large-scale manually-crafted semantic resource for
wide-coverage, intelligible and scalable Semantic Role Labeling.
The goal of VerbAtlas is to manually cluster WordNet synsets that share similar
semantics into a set of semantically-coherent frames.
VerbAtlas is licensed under the CC BY-NC-SA 4.0 License.
PACKAGE CONTENTS
* README.txt (this file);
* LICENSES.txt (terms and conditions for the files provided in this package)
* VERBATLAS_LICENSE.txt (terms and conditions of the CC BY-NC-SA 4.0 License);
* WORDNET_LICENSE.txt (original license from WordNet 3.0);
* BABELNET_LICENSE.txt

In [None]:
def pprint(obj):
    '''
        Print an object as JSON text, indented by three spaces per level
    '''
    rendered = json.dumps(obj, indent=3)
    print(rendered)
    
def ambiguities(synset2frames):
    '''
        Splits the nominal synsets into ambiguous and unambiguous ones.
        A nominal synset is ambiguous when more than one VerbAtlas frame
        is associated with it.
        Input:
            synset2frames:
                dictionary with: {nominal_synset: VerbAtlas_frames_set}
        Output:
            ambiguous:
                {nominal_synset: VerbAtlas_frames_set} with the ambiguous synsets only
            not_ambiguous:
                {nominal_synset: VerbAtlas_frame} with the unambiguous synsets only;
                the value is the single frame, not a collection
    '''
    # Synsets with more than one frame keep their whole collection of frames
    ambiguous = {
        synset: frames
        for synset, frames in synset2frames.items()
        if len(frames) > 1
    }
    # The others are mapped to the first (and only) frame of their collection
    not_ambiguous = {
        synset: [*frames][0]
        for synset, frames in synset2frames.items()
        if len(frames) <= 1
    }
    return ambiguous, not_ambiguous

def _reaches_event(nominal_synset, event_synsets):
    '''
        Tells whether a nominal synset is one of the given synsets or has
        one of them among its direct or indirect nominal hypernyms.
        Input:
            nominal_synset: WordNet synset from which the hypernym tree is climbed
            event_synsets: set of the synsets to look for
        Output:
            True if one of event_synsets is met while climbing, False otherwise
    '''
    pending = [nominal_synset]
    visited = set()
    while pending:
        current = pending.pop()
        # A synset reachable through several paths is expanded only once
        if current in visited:
            continue
        if current in event_synsets:
            return True
        visited.add(current)
        for hypernym in current.hypernyms():
            if hypernym.pos() == "n":
                pending.append(hypernym)
    return False

def nominal_synset_derivation(vf):
    '''
        For each verb contained in BabelNet (associated with a VerbAtlas frame)
        the matching WordNet synset is retrieved. From the lemmas of that synset
        the derivationally related nominal synsets are collected and, for each of
        these, the hypernym tree is climbed to check whether one of the
        synsets of the word "event" is reached.
        Input:
            vf: VerbAtlas object (all files that are needed in the form of dictionaries)
        Output:
            event_ambiguous_nominal_synset:
                {nominal_synset: VerbAtlas_frames_set} dictionary where nominal_synsets are:
                    - Those that have reached an "Event" synset by going up the hypernym tree.
                    - They are ambiguous.
            event_not_ambiguous_nominal_synset:
                {nominal_synset: VerbAtlas_frame} dictionary where nominal_synsets are:
                    - Those that have reached an "Event" synset by going up the hypernym tree.
                    - They are not ambiguous.
            not_event_ambiguous_nominal_synset:
                {nominal_synset: VerbAtlas_frames_set} dictionary where nominal_synsets are:
                    - Those that didn't reach an "Event" synset by going up the hypernym tree.
                    - They are ambiguous.
            not_event_not_ambiguous_nominal_synset:
                {nominal_synset: VerbAtlas_frame} dictionary where nominal_synsets are:
                    - Those that didn't reach an "Event" synset by going up the hypernym tree.
                    - They are not ambiguous.
            final_identification_set:
                {nominal_synset: definition_pos_tag} dictionary where nominal_synsets are
                those that didn't reach an "Event" synset by going up the
                hypernym tree and definition_pos_tag is the universal pos tagging
                associated to synset's definition.
    '''
    # The target synsets do not change, so they are looked up only once
    event_synsets = set(wn.synsets("event"))
    # nominal_synset -> True/False; the same nominal synset is derived from many verbs
    # and its hypernym tree does not depend on the verb
    reaches_event = {}

    # event_nominal_synset -> frames
    event_nom_syn2va_frames = defaultdict(set)
    # not_event_nominal_synset -> frames
    not_event_nom_syn2va_frames = defaultdict(set)

    # For each "BabelNet to WordNet" verb.
    for babelnet_id, (wn_offset, wn_pos) in vf.bn_wn_dict.items():
        synset = wn.synset_from_pos_and_offset(wn_pos, int(wn_offset))
        # Nominal synsets derivationally related to the lemmas of the verb synset
        nominal_synsets = {
            drf.synset()
            for synset_lemma in synset.lemmas()
            for drf in synset_lemma.derivationally_related_forms()
            if drf.synset().pos() == "n"
        }
        if not nominal_synsets:
            continue
        # Frame name: first field stored for the VerbAtlas frame of this verb
        va_frame = vf.frame_info_dict[vf.bn_va_dict[0][babelnet_id]][0]
        for nominal_synset in nominal_synsets:
            if nominal_synset not in reaches_event:
                reaches_event[nominal_synset] = _reaches_event(nominal_synset, event_synsets)
            if reaches_event[nominal_synset]:
                event_nom_syn2va_frames[nominal_synset].add(va_frame)
            else:
                not_event_nom_syn2va_frames[nominal_synset].add(va_frame)

    # Pos tagging of the definition of every synset that did not reach "event"
    final_identification_set = {
        synset: pos_tagging(synset.definition())
        for synset in not_event_nom_syn2va_frames
    }

    # Divide the ambiguous from the unambiguous
    event_ambiguous_nominal_synset, event_not_ambiguous_nominal_synset = ambiguities(event_nom_syn2va_frames)
    not_event_ambiguous_nominal_synset, not_event_not_ambiguous_nominal_synset = ambiguities(not_event_nom_syn2va_frames)
    return (event_ambiguous_nominal_synset,
            event_not_ambiguous_nominal_synset,
            not_event_ambiguous_nominal_synset,
            not_event_not_ambiguous_nominal_synset,
            final_identification_set)

In [None]:
###########
# UTILITY #
###########
# Penn Treebank tag (as produced by nltk.pos_tag) -> universal pos tag.
# "IN" covers both prepositions and subordinating conjunctions in the Penn tagset,
# so it cannot be split into ADP and SCONJ here: it is always mapped to ADP.
_PENN_TO_UNIVERSAL = {
    "JJ": "ADJ", "JJR": "ADJ", "JJS": "ADJ",
    "IN": "ADP",
    "RB": "ADV", "RBR": "ADV", "RBS": "ADV",
    "MD": "AUX",
    "CC": "CCONJ",
    "DT": "DET", "PDT": "DET", "WDT": "DET",
    "UH": "INTJ",
    "NN": "NOUN", "NNS": "NOUN",
    "CD": "NUM",
    "POS": "PART",
    "PRP": "PRON", "PRP$": "PRON", "WP": "PRON", "WP$": "PRON",
    # Proper nouns have their own universal tag
    "NNP": "PROPN", "NNPS": "PROPN",
    ".": "PUNCT", ",": "PUNCT", ":": "PUNCT", "!": "PUNCT", "?": "PUNCT",
    "SYM": "SYM",
    "VB": "VERB", "VBD": "VERB", "VBG": "VERB", "VBN": "VERB", "VBP": "VERB", "VBZ": "VERB",
}

def pos_tagging(sentence, split = True):
    '''
    This function generates the universal pos tags of a sentence.
    Args:
        sentence: sentence to tag: a string when split is True,
                  a list of tokens when split is False
        split (default True): if True, the sentence is tokenized with nltk before
                              the tagging, otherwise it is considered already
                              tokenized and it is tagged as it is
    Return:
        universal_pos_tags: list of [word, tag], where "word" is the word in the sentence
                            and "tag" is the universal pos tag associated
                            ("X" when the tag has no mapping)
    '''
    tokens = nltk.word_tokenize(sentence) if split else sentence
    # Each item is a list (not a tuple) because the callers append further fields to it
    return [[word, _PENN_TO_UNIVERSAL.get(pos, "X")] for word, pos in nltk.pos_tag(tokens)]

def api_call_verbatlas(definition):
    '''
    This function sends a definition to the VerbAtlas API and collects,
    from the response, the tokens and the frame of each annotated token.
    Args:
        definition: definition of a synset
    Return:
        idxs_and_vfs: list of [token_idx, frame_name] where token_idx is the index
                      of the annotated token in the sentence and frame_name is its associated frame.
                      With a response code of 500 this value is ""
        va_split_sent: the tokenized sentence (list of strings) from the VerbAtlas response.
                       With a response code of 500 this value is ""
    '''
    # The payload is built as a Python object and serialized by requests, so that
    # any character of the definition (quotes, backslashes, control characters)
    # is escaped correctly
    payload = [{"text": definition, "lang": "EN"}]
    # We define the parameters of the HTTP request
    url = 'https://verbatlas.org/api/model'
    headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
    # We execute the HTTP request using the POST method and passing the parameters defined
    # NOTE(review): no timeout is set, so an unresponsive server blocks the call
    response = requests.post(url, headers=headers, json=payload)

    # Postprocessing of the response
    if response.status_code == 500:
        return "", ""
    # Only the first element of the response is used, since one sentence is sent
    annotated_sentence = json.loads(response.text)[0]
    va_split_sent = [token["rawText"] for token in annotated_sentence["tokens"]]
    idxs_and_vfs = [
        [annotation["tokenIndex"], annotation["verbatlas"]["frameName"]]
        for annotation in annotated_sentence["annotations"]
    ]
    return idxs_and_vfs, va_split_sent

################################
# EVENT IDENTIFICATION DATASET #
################################
def compute_event_set():
    '''
    This function is used to obtain all nominal event synsets, i.e. the synsets
    of the word "event" together with all their direct and indirect hyponyms.
    Return:
        - list of all event synsets to use for the datasets, sorted by synset name.
          The order is fixed because the callers draw random samples from this list,
          and the iteration order of a set is not guaranteed to be the same across runs
    '''
    event_set = set()
    visited = set()
    # Visit the hyponym tree iteratively and collect the nominal synsets
    pending = list(wn.synsets("event"))
    while pending:
        synset = pending.pop()
        # A synset reachable through several hypernyms is expanded only once
        if synset in visited:
            continue
        visited.add(synset)
        if synset.pos() == 'n':
            event_set.add(synset)
        pending.extend(synset.hyponyms())
    return sorted(event_set, key=lambda synset: synset.name())

def compute_not_event_set(event_set, 
                          not_ev):
    '''
    This function is used to obtain the nominal not event synsets generated 
    from specific synsets that we are sure are not event.
    Args:
        event_set: the synsets to be classified as event (only its length is used here,
                   to size the random samples)
        not_ev: set of the synsets that have not reached the "Event" synsets, ambiguous and not
    Return:
        not_event_set: list of lists of not event synsets to use for the datasets.
                       The first list contains the synsets of the hyponym trees that
                       also belong to not_ev, followed by all the synsets of the small trees;
                       each following list is a random sample taken from one large tree
    '''
    # Synsets as general as possible considered as not event through a manual analysis of the definition
    synset_list = ['thing.n.08', 'object.n.01', 'set.n.02', 'substance.n.04', 'matter.n.03', 'otherworld.n.01', 'measure.n.02', 'group.n.01', 'attribute.n.02']

    # Set of not event synsets derived from the synset of synset_list currently processed
    synset_set = set()

    def get_hyponyms(synset):
        '''
        Recursive function that adds a synset (if nominal) and all its hyponyms 
        to synset_set
        Args:
            synset: synset from which hyponyms will be obtained
        '''
        if synset.pos() == 'n':
            synset_set.add(synset)
        for hyp in synset.hyponyms():
            get_hyponyms(hyp)

    # Get not event synsets from the hyponym tree of the synsets in synset_list
    # NOTE(review): "before" is never read
    before = 0
    # {general_synset_name: set of the synsets of its hyponym tree}
    synset_dict = {}
    for synset_name in synset_list:
        synset = wn.synset(synset_name)
        get_hyponyms(synset)
        synset_dict[synset_name] = synset_set
        # A fresh set is bound for the next tree (get_hyponyms sees the new binding)
        synset_set = set()
        
    list_of_lists = [list(synsets) for synsets in list(synset_dict.values())]
    # First group: synsets of the trees that are also in not_ev
    not_event_set = [list(set(itertools.chain(*list_of_lists)) & not_ev)]
    # NOTE(review): rest_not_event and num_not_event_synsets are computed but never used
    rest_not_event = set(itertools.chain(*list_of_lists)) - set(not_event_set[0])
    num_not_event_synsets = len(rest_not_event)
    # Remove from every tree the synsets already placed in the first group
    for key, value in synset_dict.items():
        synset_dict[key] = value - (value & set(not_event_set[0]))
    
    # Number of not event synsets to select from the list for each general synset
    # (34 and 5 are hard-coded; presumably tuned by hand on these trees - confirm)
    division = int((len(event_set) - len(not_event_set[0]) - 34)/ 5)

    for i, (key, value) in enumerate(synset_dict.items()):
        # The value 26 was found manually looking at the frequencies. If the frequency is
        # under this value, then all the synsets will be added, otherwise only 
        # "division" synsets will be added
        if len(value) > 26: 
            # The order of value_list comes from the iteration order of a set
            value_list = list(value)
            r = random.sample(value_list, division)
            not_event_set.append(r) 
        else:
            not_event_set[0] += list(value)
    return not_event_set

def set_dict_identification(event_set, not_event_set):
    '''
    This function builds one identification dataset in the format that is saved.
    The two collections are walked in parallel, so the synsets beyond the length
    of the shorter one are left out.
    Args:
        event_set: event synsets
        not_event_set: not event synsets
    Return:
        dataset with the format: 
        {synset_name: [pos_tagging(synset_definition), label]}
        where label is "event" or "not_event"
    '''
    labels = ("event", "not_event")
    dataset = {}
    for paired_synsets in zip(event_set, not_event_set):
        # Within each pair the event synset is stored first, then the not event one
        for synset, label in zip(paired_synsets, labels):
            dataset[str(synset)] = [pos_tagging(synset.definition()), label]
    return dataset

def identification_dataset_creation(final_identification_set,
                                    not_event_not_ambiguous_nominal_synset,
                                    not_event_ambiguous_nominal_synset):
    '''
    This function generates, for the identification step, the datasets: 
    train, dev, test and final. The final dataset contains all the not event synsets
    that could be event. The four datasets are saved together in the file 
    "dataset_identification.json" inside DATASETS_PATH; nothing is returned.
    Args: 
        final_identification_set: dictionary whose keys are all the not event synsets
        not_event_not_ambiguous_nominal_synset: dictionary whose keys are all not event unambiguous synsets
        not_event_ambiguous_nominal_synset: dictionary whose keys are all not event ambiguous synsets
    '''
    # Datasets ~[70-15-15]%
    sets = {"train": [], "dev": [], "test": [], "final": []}

    # Event synsets
    event_set = compute_event_set()
    # Not event synsets (list of groups of synsets)
    not_ev_amb = set(list(not_event_ambiguous_nominal_synset.keys()))
    not_ev_not_amb = set(list(not_event_not_ambiguous_nominal_synset.keys()))
    not_event_set = compute_not_event_set(event_set, set(list(not_ev_amb) + list(not_ev_not_amb)))
    # Not event synset for the final dataset: those not already used as not event examples
    final_not_event_set = set(list(final_identification_set.keys()))
    support_list = []
    for not_events in not_event_set:
        support_list += not_events
    final_not_event_set = final_not_event_set - (final_not_event_set & set(support_list))

    # Training set: 70% of the event synsets and 70% of each group of not event synsets
    t_p = 0.7
    event_t = random.sample(list(event_set), int(len(event_set)*t_p))
    not_event_t = []
    # For each group, the synsets left out of the training set
    not_train_not_event = []
    for not_event in not_event_set:        
        not_event_t += random.sample(list(not_event), int(len(not_event)*t_p))
        not_train_not_event.append(list(set(not_event) - set(not_event_t)))
    sets["train"] = set_dict_identification(event_t, not_event_t)
    
    # Development set: half of what is left after the training set
    not_train_event = list(set(event_set) - set(event_t))
    event_d = random.sample(not_train_event, int(len(not_train_event)*0.5))
    not_event_d = []
    # For each group, the synsets left out of both training and development sets
    not_dev_not_event = []
    for not_event in not_train_not_event:        
        not_event_d += random.sample(list(not_event), int(len(not_event)*0.5))
        not_dev_not_event.append(list(set(not_event) - set(not_event_d)))
    sets["dev"] = set_dict_identification(event_d, not_event_d)
    
    # Test set: everything that is left
    event_te = list(set(not_train_event) - set(event_d))
    not_event_te = []
    for not_event in not_dev_not_event:
        not_event_te += not_event
    sets["test"] = set_dict_identification(event_te, not_event_te)

    # Final set: {synset_name: pos_tagging(synset_definition)}, without label
    mset = {}
    for final in final_not_event_set:
        mset[str(final)] = pos_tagging(final.definition())
    sets["final"] = mset

    # Save
    with open(DATASETS_PATH + "dataset_identification.json", "w") as f:
        json.dump(sets, f)

################################
# EVENT CLASSIFICATION DATASET #
################################
def set_dict_classification(synsets, all_synset):
    '''
    This function generates the dataset in the correct format.
    For every synset the VerbAtlas API is called on its definition.
    Args:
        synsets: all synsets for a specific dataset (train, dev, test or final)
        all_synset: it's a dictionary containing all ambiguous or unambiguous synsets, 
                    where the key is the synset and 
                    the value is the class list or class belonging (verbatlas frames).
    Return:
        mset: dataset with the format: 
              {synset_name: [tagged_definition, classes]}
              where tagged_definition is a list of [word, universal_pos_tag, frame]
              (frame is "~" for the words without a VerbAtlas frame) and
              classes is the value found in all_synset (a set is converted to a list)
    '''
    # Placeholder used for the words without a frame
    no_frame = "~"
    mset = {}
    for synset in list(synsets):
        definition = synset.definition()
        idx_and_frame, va_split_sent = api_call_verbatlas(definition)
        if idx_and_frame == "":
            # The API call failed: tag the definition with the local tokenization
            pos_tag = pos_tagging(definition)
        else:
            # Tag the tokens returned by VerbAtlas, so that the token indexes match
            pos_tag = pos_tagging(va_split_sent, split = False)
            for idx, frame_name in idx_and_frame:
                # "_" is replaced with the placeholder
                pos_tag[idx].append(no_frame if frame_name == "_" else frame_name)
        # Every word still without a third field gets the placeholder
        for word in pos_tag:
            if len(word) < 3:
                word.append(no_frame)
        classes = all_synset[synset]
        # A set cannot be serialized to JSON
        if isinstance(classes, set):
            classes = list(classes)
        mset[str(synset)] = [pos_tag, classes]
    return mset

def classification_dataset_creation(event_not_ambiguous_nominal_synset, 
                                    event_ambiguous_nominal_synset,
                                    extra):
    '''
    This function generates, for the classification step, the datasets: 
    train, dev, test and final. The final dataset contains all the ambiguous synsets
    that could be disambiguated.
    Nothing is returned: the results are saved inside DATASETS_PATH in the files
    "dataset_classification.json", "labels_classification.json" and 
    "other_labels_classification.json".
    Args: 
        event_not_ambiguous_nominal_synset: all the event unambiguous synsets, {synset: frame}
        event_ambiguous_nominal_synset: all the event ambiguous synsets, {synset: frames}
        extra: all event synsets, ambiguous and not, found in the identification step,
               as a dictionary with the keys "ambiguous" and "unambiguous"
    '''
    # 1. Datasets ~[80-10-10]%
    sets = {}

    # 2. Include the extra synsets
    # (for a synset present in both sources the value of the second one is kept)
    ambiguous = {}
    for synset, classes in list(extra["ambiguous"].items()) + list(event_ambiguous_nominal_synset.items()):
        ambiguous[synset] = classes
    unambiguous = {}
    for synset, classes in list(extra["unambiguous"].items()) + list(event_not_ambiguous_nominal_synset.items()):
        unambiguous[synset] = classes

    # 3. Generate the labels
    # Get frequencies for each frame of the unambiguous synsets
    statistics = Counter(list(unambiguous.values()))
    # Labels dictionary {VerbAtlas_frame: [frequency, synsets_list]}
    labels = {}
    # Special class "other" for all frames with low frequency
    # This class is a dictionary {VerbAtlas_frame: [frequency, synsets_list]}
    labels["other"] = {}
    # The other class has a special class "few_occurrences" that merges in a single
    # class all the frames with frequency less than 3 because they can't be split
    # in train, dev and test sets. 
    # This class has associated a triple [frequency, frames_list, synsets_list]
    labels["other"]["few_occurrences"] = [0, [], []]
    for frame, occurrences in statistics.items():
        # A frame is a label on its own only with at least 11 synsets
        if occurrences >= 11:
            labels[frame] = [occurrences, []]
        else:
            if occurrences < 3:
                labels["other"]["few_occurrences"][0] += occurrences
                labels["other"]["few_occurrences"][1].append(frame)
            labels["other"][frame] = [occurrences, []]

    # 4. Manipulating the sets by including the label "other" instead of low-frequency frames 
    # and adding the synsets to the corresponding label. 
    # 4.1. For unambiguous synsets, if the associated frame is present in the label list, 
    # it remains unchanged. On the other hand, if it is not present, the label "other" 
    # is associated with this synset.
    unambiguous_valid = {}
    for synset, frame in unambiguous.items():
        if frame in list(labels.keys()):
            labels[frame][1].append(synset)
            unambiguous_valid[synset] = frame
        else:
            if frame in labels["other"]["few_occurrences"][1]:
                labels["other"]["few_occurrences"][2].append(synset)
            labels["other"][frame][1].append(synset)
            unambiguous_valid[synset] = "other"
    # 4.2. For ambiguous synsets, if the list of frames associated with it has 
    # at least all frames minus one within the non-"other" labels, then the pair [synset, frames] is kept.
    # In this way, all ambiguous synsets that have at most one frame of ambiguity 
    # in the class "other" will be kept, so that at the disambiguation step it is possible to determine 
    # which is the correct frame to associate to that synset, and, in addition, in this way, 
    # it was possible to aggregate most of the less frequent frames 
    # into a single class managing to minimize the synsets to be discarded.
    ambiguous_valid = {}
    for synset, frames in ambiguous.items():
        if len(set(list(labels.keys())).intersection(set(frames))) >= len(frames) - 1:
            ambiguous_valid[synset] = frames

    # 5. Train set
    t_p = 0.8
    train_set_synset = []
    # True once the merged "few_occurrences" set has been sampled
    sampled = False
    # For each frame, the training set gets the t_p% of the synsets. If the label is "other", 
    # the training set takes the t_p% of all frames in the label "other". For the label "few_occurrences" 
    # in the label "other," where all frames with less than three synsets are merged into one set, 
    # the t_p% was taken once from this merged set. In this way, from all types of frames 
    # the t_p% of samples were taken so as to balance between train, dev and test set 
    # respecting the splitting percentages (in this case [80-10-10]%)
    for label, synsets in labels.items():
        if label == "other":
            for label_oth, synsets_oth in synsets.items():
                if label_oth == "few_occurrences":
                    continue
                if label_oth in labels["other"]["few_occurrences"][1]:
                    if sampled:
                        continue
                    population = labels["other"]["few_occurrences"][2]
                    sample = random.sample(population, int(len(population)*t_p))
                    sampled = True
                else:
                    sample = random.sample(synsets_oth[1], int(len(synsets_oth[1])*t_p))
                    # If fewer than 2 synsets would be left out of the train set, 
                    # the sample is reduced by the number of synsets left out
                    if synsets_oth[0] - len(sample) < 2:
                        sample = random.sample(sample, len(sample) - (synsets_oth[0] - len(sample)))
                train_set_synset += sample
        else:
            sample = random.sample(synsets[1], int(len(synsets[1])*t_p))
            train_set_synset += sample
    train_set = set_dict_classification(train_set_synset, unambiguous_valid)
    sets["train"] = train_set

    # 6. Development set
    dev_set_synset = []
    rest_set_synset = list(unambiguous_valid.keys() - set(train_set_synset))
    sampled = False
    # Same reasoning as Train set (see section 5.) with the difference that half of the synsets, 
    # for each type of frame, are taken from the remaining synsets by removing those included in the Train set
    for label, synsets in labels.items():
        if label == "other":
            for label_oth, synsets_oth in synsets.items():
                if label_oth == "few_occurrences":
                    continue
                if label_oth in labels["other"]["few_occurrences"][1]:
                    if sampled:
                        continue
                    population = labels["other"]["few_occurrences"][2]
                    # Synsets of the population that are not in the train set
                    dev_synsets = set(population) - (set(population) - set(rest_set_synset))
                    sample = random.sample(list(dev_synsets), int(len(dev_synsets)*0.5))
                    sampled = True
                else:
                    dev_synsets = set(synsets_oth[1]) - (set(synsets_oth[1]) - set(rest_set_synset))
                    sample = random.sample(list(dev_synsets), int(len(dev_synsets)*0.5))
                dev_set_synset += sample
        else:
            dev_synsets = set(synsets[1]) - (set(synsets[1]) - set(rest_set_synset))
            sample = random.sample(list(dev_synsets), int(len(dev_synsets)*0.5))
            dev_set_synset += sample
    # NOTE(review): the train set is labelled through unambiguous_valid (low-frequency
    # frames replaced by "other"), while dev and test sets are labelled through 
    # unambiguous (original frames) - confirm that this difference is intended
    dev_set = set_dict_classification(dev_set_synset, unambiguous)
    sets["dev"] = dev_set
    
    # 7. Test set
    # All remaining synsets after those inserted in Train and Dev sets (see sections 5. and 6.)
    test_set_synset = list(set(rest_set_synset) - set(dev_set_synset))
    test_set = set_dict_classification(test_set_synset, unambiguous)
    sets["test"] = test_set

    # 8. Final set
    # All ambiguous synsets that have passed the skimming of the class "other" (see section 4.2.)
    final_set_synset = list(ambiguous_valid.keys())
    final_set = set_dict_classification(final_set_synset, ambiguous)
    sets["final"] = final_set

    # 9. Save
    with open(DATASETS_PATH + "dataset_classification.json", "w") as f:
        json.dump(sets, f)
    # Labels: the frequent frames plus "other"
    with open(DATASETS_PATH + "labels_classification.json", "w") as f:
        json.dump(list(labels.keys()), f)
    # Frames merged into "other" (the helper key 'few_occurrences' is left out)
    with open(DATASETS_PATH + "other_labels_classification.json", "w") as f:
        json.dump(list(set(list(labels["other"].keys())) - set(['few_occurrences'])), f)

def extra_synsets_from_identification(not_event_ambiguous_nominal_synset, 
                                      not_event_not_ambiguous_nominal_synset):
    '''
    This function loads the results of the identification step from
    "event_identification_step_results.json" (inside OUTPUT_PATH) and keeps
    the synsets that were identified as "event".
    Args:
        not_event_ambiguous_nominal_synset: all not event ambiguous synsets, {synset: frames}
        not_event_not_ambiguous_nominal_synset: all not event unambiguous synsets, {synset: frame}
    Return:
        extra: dictionary with the keys "ambiguous" ({synset: frames_list}) and
               "unambiguous" ({synset: frame}) holding the synsets identified as event
    '''
    results_path = OUTPUT_PATH + "event_identification_step_results.json"
    with open(results_path, "r") as results_file:
        predictions = json.load(results_file)

    ambiguous_extra = {}
    unambiguous_extra = {}
    for synset_name, predicted_label in predictions.items():
        if predicted_label != "event":
            continue
        synset = wn.synset(synset_name)
        frames = not_event_ambiguous_nominal_synset.get(synset)
        if frames is None:
            # Not among the ambiguous synsets: take the frame of the unambiguous ones
            unambiguous_extra[synset] = not_event_not_ambiguous_nominal_synset.get(synset)
        else:
            ambiguous_extra[synset] = list(frames)
    return {"ambiguous": ambiguous_extra, "unambiguous": unambiguous_extra}

# Dataset creation

In [None]:
# Extracts all information about event, not event, ambiguous, and unambiguous synsets 
# through the link between WordNet and VerbAtlas
# The seed of the random module is fixed before the random sampling 
# performed by the dataset creation functions
random.seed(104)
event_ambiguous_nominal_synset,             \
event_not_ambiguous_nominal_synset,         \
not_event_ambiguous_nominal_synset,         \
not_event_not_ambiguous_nominal_synset,     \
final_identification_set                    = nominal_synset_derivation(vf)

In [None]:
# Creates the Dataset for the Identification step 
# (saved as "dataset_identification.json" inside DATASETS_PATH)
identification_dataset_creation(final_identification_set,
                                not_event_not_ambiguous_nominal_synset,
                                not_event_ambiguous_nominal_synset)

In [None]:
# Creates the Dataset for the Classification step
# The file "event_identification_step_results.json" must already exist inside OUTPUT_PATH, 
# since the extra synsets are read from it
extra = extra_synsets_from_identification(not_event_ambiguous_nominal_synset, 
                                          not_event_not_ambiguous_nominal_synset)
classification_dataset_creation(event_not_ambiguous_nominal_synset, 
                                event_ambiguous_nominal_synset,
                                extra)