In [22]:
from tqdm import tqdm, tnrange
import json
import pandas as pd
import os

import utils
import parsers

# Parse training XML file

In [2]:
# Load the SemCor training corpus (XML) through the project's TrainingParser.
Training = parsers.TrainingParser(
    '../resources/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.data.xml'
)

# Build the vocabularies from the parsed corpus.
# NOTE(review): presumably create_vocab writes the three JSON files named below
# and uses subsampling_rate / min_count to filter tokens -- confirm in parsers.py.
Training.create_vocab(
    input_vocab_path="../resources/semcor.input.vocab.json",
    pos_vocab_path="../resources/semcor.pos.vocab.json",
    left_out_vocab_path="../resources/semcor.leftout.vocab.json",
    subsampling_rate=1e-4,
    min_count=5,
)

# Convert evaluation gold keys to BabelNet, WordNet-domain and lexicographer labels

In [3]:
# Root folder that holds one sub-directory per evaluation dataset.
dir_ = "../resources/WSD_Evaluation_Framework/Evaluation_Datasets"

# Keep only the dataset *directories* whose name starts with "se".
# os.listdir also returns plain files, and a stray file with that prefix
# would make the export loop fail when it calls os.listdir on it.
eval_datasets = [
    entry for entry in os.listdir(dir_)
    if entry.startswith("se") and os.path.isdir(os.path.join(dir_, entry))
]
eval_datasets

['semeval2013', 'semeval2015', 'senseval3', 'semeval2007', 'senseval2']

In [21]:
# For every evaluation dataset: locate its gold key file, map the gold senses
# through utils.parse_evaluation, and write one "<sentence_idx> <label>" file
# per label inventory next to the original key file.
for name in eval_datasets:
    print("Dataset: {}".format(name))

    path = os.path.join(dir_, name)
    gold_candidates = [i for i in os.listdir(path) if i.endswith('gold.key.txt')]
    if not gold_candidates:
        # Fail with the offending folder instead of a bare IndexError.
        raise FileNotFoundError("no *gold.key.txt file found in {}".format(path))
    gold_file = os.path.join(path, gold_candidates[0])
    print("using {}".format(gold_file))

    # NOTE(review): parse_evaluation is expected to return a DataFrame with a
    # 'sentence_idx' column plus the three label columns exported below --
    # confirm against utils.py.
    df = utils.parse_evaluation(gold_file = gold_file,
                                babelnet2wordnet = '../resources/babelnet2wordnet.tsv',
                                babelnet2wndomains = '../resources/babelnet2wndomains.tsv',
                                babelnet2lexnames = '../resources/babelnet2lexnames.tsv')
    # Path prefix shared by all output files, e.g. ".../semeval2013/semeval2013".
    base = gold_file.split(".gold.key.txt")[0]

    # One space-separated, header-less file per label column; the file suffix
    # is the column name itself.
    for column in ('babelnet', 'wordnet_domains', 'lexicographer'):
        df[['sentence_idx', column]].to_csv(base + ".gold.{}.txt".format(column),
                                            header=False, index=False, sep=' ')
    
    

Sensekey to Wordnet 1:   3%|▎         | 50/1644 [00:00<00:03, 491.94it/s]

Dataset: semeval2013
using ../resources/WSD_Evaluation_Framework/Evaluation_Datasets/semeval2013/semeval2013.gold.key.txt


Sensekey to Wordnet 1: 100%|██████████| 1644/1644 [00:02<00:00, 652.83it/s]
Sensekey to Wordnet 1:  17%|█▋        | 169/1022 [00:00<00:00, 1663.22it/s]

NA is 2.3%
Dataset: semeval2015
using ../resources/WSD_Evaluation_Framework/Evaluation_Datasets/semeval2015/semeval2015.gold.key.txt


Sensekey to Wordnet 1: 100%|██████████| 1022/1022 [00:00<00:00, 2405.48it/s]
Sensekey to Wordnet 1:   0%|          | 0/1850 [00:00<?, ?it/s]

NA is 26.1%
Dataset: senseval3
using ../resources/WSD_Evaluation_Framework/Evaluation_Datasets/senseval3/senseval3.gold.key.txt


Sensekey to Wordnet 1: 100%|██████████| 1850/1850 [00:01<00:00, 1177.68it/s]
Sensekey to Wordnet 1:  18%|█▊        | 84/455 [00:00<00:00, 829.68it/s]

NA is 19.9%
Dataset: semeval2007
using ../resources/WSD_Evaluation_Framework/Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt


Sensekey to Wordnet 1: 100%|██████████| 455/455 [00:00<00:00, 1252.94it/s]
Sensekey to Wordnet 1:  10%|▉         | 222/2282 [00:00<00:00, 2219.11it/s]

NA is 1.8%
Dataset: senseval2
using ../resources/WSD_Evaluation_Framework/Evaluation_Datasets/senseval2/senseval2.gold.key.txt


Sensekey to Wordnet 1: 100%|██████████| 2282/2282 [00:01<00:00, 1619.62it/s]


NA is 31.4%


# Gold output vocab (training file semcor)

In [None]:
# NOTE: disabled cell, kept for reference. When enabled it parses the SemCor
# training gold key file with the same three mapping files used for the
# evaluation datasets.
# df = utils.parse_evaluation(gold_file = "../resources/WSD_Evaluation_Framework/Training_Corpora/SemCor/semcor.gold.key.txt",
#                             babelnet2wordnet = '../resources/babelnet2wordnet.tsv',
#                             babelnet2wndomains = '../resources/babelnet2wndomains.tsv',
#                             babelnet2lexnames = '../resources/babelnet2lexnames.tsv')

In [None]:
# NOTE: disabled cell, kept for reference. When enabled it dumps the unique
# non-null values of each listed column of `df` to
# ../resources/semcor.vocab.<column>.json as a JSON list.
# NOTE(review): the column names used here (WordNet, BabelNet, WordNetDomain,
# LexNames) differ from the lowercase ones indexed by the evaluation export
# loop (babelnet, wordnet_domains, lexicographer) -- confirm against
# utils.parse_evaluation before re-enabling.
# for net in ['WordNet', 'BabelNet', 'WordNetDomain', 'LexNames']:
#     output_vocab = df[net].dropna().unique()
#     output_path = "../resources/semcor.vocab.{}.json".format(net)
#     print(output_path)
# #     with open(output_path, 'w') as f:
# #         f.write('\n'.join(output_vocab))
#     with open(output_path, 'w') as f:
#         json.dump(list(output_vocab), f)

# Create a single mapping file between the synset inventories (BabelNet, WordNet, WordNet domains, lexicographer names)

In [None]:
def create_mapping(output_path = "../resources/mapping.csv",
                   babelnet2wordnet = '../resources/babelnet2wordnet.tsv',
                   babelnet2wndomains = '../resources/babelnet2wndomains.tsv',
                   babelnet2lexnames = '../resources/babelnet2lexnames.tsv'):
    """
    Merge the three BabelNet mapping files into one CSV.

    Each input is read as a header-less, tab-separated file whose first column
    is the BabelNet id. The WordNet mapping drives the result: the domain and
    lexicographer columns are left-joined onto it by BabelNet id.

    :param output_path: path of the CSV to write (columns: babelnet, WordNet,
        wordnet_domains, lexicographer)
    :param babelnet2wordnet: path of the BabelNet -> WordNet TSV
    :param babelnet2wndomains: path of the BabelNet -> WordNet domain TSV
    :param babelnet2lexnames: path of the BabelNet -> lexicographer name TSV
    :return None: saves output csv to output_path
    """
    wordnet = pd.read_csv(babelnet2wordnet, sep='\t', names=['babelnet', 'WordNet'])
    domains = pd.read_csv(babelnet2wndomains, sep='\t', names=['babelnet', 'wordnet_domains'])
    lexnames = pd.read_csv(babelnet2lexnames, sep='\t', names=['babelnet', 'lexicographer'])

    # DataFrame.join defaults to a left join, so every WordNet row is kept.
    mapping = wordnet.join(domains.set_index('babelnet'), on='babelnet')
    mapping = mapping.join(lexnames.set_index('babelnet'), on='babelnet')

    # Synsets without a domain / lexicographer entry get the catch-all labels.
    # Fill on the frame and reassign: `df.col.fillna(..., inplace=True)` is a
    # chained in-place assignment that does not update the frame under pandas
    # Copy-on-Write.
    mapping = mapping.fillna({"wordnet_domains": "factotum", "lexicographer": "misc"})

    mapping.to_csv(output_path, index=False)

In [None]:
# Build the mapping CSV using the function's default input and output paths.
create_mapping()
