In [41]:
import tarfile
import os
import xml.etree.ElementTree as etree
import pandas as pd
from nltk.corpus import wordnet as wn
from tqdm import tqdm, tnrange, tqdm_notebook
import csv
import json

In [2]:
# Relative paths to the SemCor training data used throughout the notebook.
# NOTE: despite its name, `archived_xml` points at a plain .xml file; it is fed
# directly to the XML parser further down, with no archive extraction step.
archived_xml = '../resources/training-data/WSD_Training_Corpora/SemCor/semcor.data.xml'
# Gold key file: read below as space-separated rows of an instance id followed by
# up to two sense keys.
mapping_file = '../resources/training-data/WSD_Training_Corpora/SemCor/semcor.gold.key.txt'

In [3]:
def sensekeyToSynsetConverter(sensekey: str):
    '''Map a WordNet sense key to a synset identifier string.

    The sense key is resolved to its lemma through nltk's WordNet reader and the
    lemma's synset is rendered as "wn:" + the synset offset left-padded with
    zeros to eight digits + the synset's part-of-speech tag.
    '''
    resolved = wn.lemma_from_key(sensekey).synset()
    return f"wn:{resolved.offset():08d}{resolved.pos()}"

In [4]:
# Load the gold keys. Column names are supplied here, so every line of the file is
# treated as data. `sentence_idx` holds the per-token instance id (e.g.
# d000.s000.t000); `sensekey2` stays empty for rows that carry a single sense key.
mapping = pd.read_table(
    mapping_file,
    sep=' ',
    names=['sentence_idx', 'sensekey1', 'sensekey2'],
)
mapping.head()

Unnamed: 0,sentence_idx,sensekey1,sensekey2
0,d000.s000.t000,long%3:00:02::,
1,d000.s000.t001,be%2:42:03::,
2,d000.s000.t002,review%2:31:00::,
3,d000.s000.t003,objective%1:09:00::,
4,d000.s000.t004,benefit%1:21:00::,


In [None]:
# Number of rows that actually carry a second sense key.
mapping['sensekey2'].dropna().shape[0]

# Convert from sensekey to synset ID

In [None]:
# Register `progress_apply` on pandas objects so the conversion reports progress.
tqdm.pandas(desc="my bar!")
# Replace every primary sense key with its synset id. Only `sensekey1` is handled
# here; `sensekey2` contains missing values and is converted in the next cell.
# NOTE: this overwrites the column in place, so the cell is meant to run once per
# freshly loaded `mapping`.
mapping['sensekey1'] = (
    mapping['sensekey1']
    .progress_apply(sensekeyToSynsetConverter)
)

In [None]:
# Register `progress_apply` on pandas objects so the conversion reports progress.
tqdm.pandas(desc="my bar!")
# Only some rows carry a second sense key: convert just those and leave the
# missing values untouched.
has_second_key = mapping['sensekey2'].notnull()
# Assign through a single .loc call. The chained form
# mapping['sensekey2'][mask] = ... writes into an intermediate object, so pandas
# emits SettingWithCopyWarning and, with copy-on-write enabled, `mapping` itself
# is never updated.
mapping.loc[has_second_key, 'sensekey2'] = (
    mapping.loc[has_second_key, 'sensekey2'].progress_apply(sensekeyToSynsetConverter)
)

# WordNet to BabelNet

In [None]:
# BabelNet id -> WordNet id table. Column names are supplied here, so every line
# of the TSV is read as data; `WordNet2` is for rows with a second WordNet id.
file = '../resources/babelnet2wordnet.tsv'
BabelNet = pd.read_table(
    file,
    sep='\t',
    names=['BabelNet', 'WordNet', 'WordNet2'],
)
BabelNet.head()

In [None]:
# BabelNet id -> WordNet-domain table.
# NOTE: despite its name, `WordNet` holds the babelnet2wndomains mapping, not
# WordNet itself.
file = '../resources/babelnet2wndomains.tsv'
WordNet = pd.read_table(
    file,
    sep='\t',
    names=['BabelNet', 'WordNetDomain'],
)
WordNet.head()

In [None]:
# BabelNet id -> lexicographer-name table.
file = '../resources/babelnet2lexnames.tsv'
LexicographerNet = pd.read_table(
    file,
    sep='\t',
    names=['BabelNet', 'LexNames'],
)
LexicographerNet.head()

In [58]:
# Build the per-sentence dataset: one CSV row per <sentence> holding its id, the
# token texts (X) and, aligned with them, the labels (y). A plain <wf> token is
# labelled with its own text; an annotated <instance> token is labelled with the
# list of its sense keys taken from `mapping`.

# Debugging cap on the number of parser events; set to None to process the whole
# corpus.
max_events = 1000

# Index the gold keys by instance id once, so each lookup inside the loop is an
# index access instead of a boolean scan over the entire table.
mapping_by_instance = mapping.set_index('sentence_idx', drop=False)

context = etree.iterparse(archived_xml, events=("start", "end"))
# newline='' lets the csv module control line endings itself.
with open('../resources/f.csv', 'w', encoding='utf-8', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(('id', 'X', 'y'))

    sentence_id, X, y = None, [], []
    for event_no, (event, elem) in enumerate(tqdm(context)):
        if event == 'start':
            # Only attributes are guaranteed to be populated on a 'start' event.
            if elem.tag == 'sentence':
                # Kept in its own variable: reusing the loop counter for the id
                # would let the next iteration overwrite it before the row is
                # written.
                sentence_id = elem.get("id")
                X, y = [], []
        else:
            # 'end' event: the element's text is now fully parsed.
            if elem.tag == "wf":
                word = elem.text
                X.append(word)
                y.append(word)
            elif elem.tag == "instance":
                # mapping row(s) for this instance id
                m = mapping_by_instance.loc[[elem.get("id")]]
                word = elem.text
                X.append(word)

                # sense keys of the first matching row
                l = [m['sensekey1'].iloc[0], m['sensekey2'].iloc[0]]
                # drop the missing entry when the instance has a single sense key
                cleanedList = [x for x in l if pd.notna(x)]
                y.append(cleanedList)
            elif elem.tag == 'sentence':
                csv_writer.writerow([sentence_id, X, y])
            # Release the element only once it is complete; clearing it on
            # 'start' would wipe the attributes and text still needed above.
            elem.clear()

        if max_events is not None and event_no >= max_events:
            break
del context


0it [00:00, ?it/s][A
32it [00:00, 295.16it/s][A
58it [00:00, 278.03it/s][A
88it [00:00, 273.77it/s][A
116it [00:00, 273.19it/s][A
158it [00:00, 293.47it/s][A
186it [00:00, 285.49it/s][A
220it [00:00, 289.54it/s][A
248it [00:00, 275.95it/s][A
274it [00:00, 274.54it/s][A
302it [00:01, 271.59it/s][A
330it [00:01, 271.75it/s][A
356it [00:01, 266.33it/s][A
388it [00:01, 269.19it/s][A
430it [00:01, 276.54it/s][A
460it [00:01, 270.18it/s][A
494it [00:01, 272.07it/s][A
530it [00:01, 275.04it/s][A
566it [00:02, 277.59it/s][A
597it [00:02, 278.98it/s][A
628it [00:02, 280.28it/s][A
659it [00:02, 278.65it/s][A
688it [00:02, 275.56it/s][A
715it [00:02, 273.61it/s][A
758it [00:02, 277.91it/s][A
996it [00:03, 270.66it/s]

In [60]:
# Read the per-sentence CSV written by the extraction cell back into a DataFrame.
# NOTE(review): the writer hands Python lists for X and y to csv.writer, so they are
# stored as their text representation; read_csv returns them as plain strings (see
# the output of df.iloc[0]['y'] below), not lists. Parse them before use.
df = pd.read_csv('../resources/f.csv')

In [65]:
# Inspect the label column of the first sentence as it comes back from the CSV.
df.iloc[0]['y']

"['How', ['long%3:00:02::'], 'has', 'it', ['be%2:42:03::'], 'since', 'you', ['review%2:31:00::'], 'the', ['objective%1:09:00::'], 'of', 'your', ['benefit%1:21:00::'], 'and', ['service%1:04:07::'], ['program%1:09:01::'], '?']"

In [33]:
# Show the in-memory label list `y` (presumably the one left over from the last
# sentence handled by the extraction loop; depends on kernel state).
y

['How',
 ['long%3:00:02::'],
 'has',
 'it',
 ['be%2:42:03::'],
 'since',
 'you',
 ['review%2:31:00::'],
 'the',
 ['objective%1:09:00::'],
 'of',
 'your',
 ['benefit%1:21:00::'],
 'and',
 ['service%1:04:07::'],
 ['program%1:09:01::'],
 '?']

# Predictions:
1. babelnet
2. wordnet_domains
3. lexicographer

In [31]:
# NOTE(review): `f` is not assigned in any cell visible here; this cell only works
# with leftover kernel state.
f

{'x': ['How',
  'long',
  'has',
  'it',
  'been',
  'since',
  'you',
  'reviewed',
  'the',
  'objectives',
  'of',
  'your',
  'benefit',
  'and',
  'service',
  'program',
  '?']}

In [None]:
# NOTE(review): `BNet` is only assigned in commented-out code in the loop cell at the
# end of the notebook, so this cell relies on leftover kernel state.
BNet.iloc[0]

In [None]:
# Is the first BabelNet id present in the WordNet-domain table?
# `value in series` tests the Series *index* (dict-like behaviour), not its values,
# so membership among the values has to go through isin().
# NOTE(review): `BNet` is only assigned in commented-out code in the loop cell at the
# end of the notebook, so this cell relies on leftover kernel state.
bool(WordNet['BabelNet'].isin([BNet.iloc[0]]).any())

In [None]:
# Sense keys of the first row of `m` (every column after the id column), with
# missing values dropped. `m` is the mapping lookup left behind by the last
# <instance> handled in the extraction loop, so this depends on kernel state.
m.iloc[0][1:].dropna()

In [None]:
# NOTE(review): `sensekeys` is only assigned in commented-out code in the loop cell
# below, so this cell relies on leftover kernel state.
sensekeys.iloc[0][0]

In [None]:
# Work-in-progress variant of the extraction loop: it walks the whole corpus and
# prints, per sentence, the token texts and their labels. The BabelNet / domain /
# lexname lookups are still commented out.
# NOTE(review): this opens ../resources/f.csv in 'w' mode and writes only a header
# row, so it truncates the file produced by the earlier extraction cell.

# Index the gold keys by instance id once, so each lookup inside the loop is an
# index access instead of a boolean scan over the entire table.
mapping_by_instance = mapping.set_index('sentence_idx', drop=False)

context = etree.iterparse(archived_xml, events=("start", "end"))

# newline='' lets the csv module control line endings itself.
with open('../resources/f.csv', 'w', encoding='utf-8', newline='') as file:

    csv_writer = csv.writer(file)
    csv_writer.writerow(('id', 'sensekey1', 'sensekey2', 'lemma', 'text'))#, 'BabelNet', 'WordNetDomain', 'LexNames'))

    sentence, y = [], []
    for event, elem in tqdm(context):
        if event == 'start':
            # Only attributes are guaranteed to be populated on a 'start' event.
            if elem.tag == 'sentence':
                # two separate lists (unpacking a single [] raises ValueError)
                sentence, y = [], []
            continue

        # 'end' event: the element's text is now fully parsed.
        if elem.tag == "wf":
            word = elem.text
            sentence.append(word)
            y.append(word)
        elif elem.tag == "instance":
            # mapping row(s) for this instance id
            m = mapping_by_instance.loc[[elem.get("id")]]
            word = elem.text
            sentence.append(word)
            #sensekeys = m.drop(columns=["sentence_idx"]).dropna(axis=1)
            l = [m['sensekey1'].iloc[0], m['sensekey2'].iloc[0]]
            # drop the missing entry when the instance has a single sense key
            cleanedList = [x for x in l if pd.notna(x)]
            y.append(cleanedList)
#             csv_writer.writerow([instance_id,
#                                 m['sensekey1'].iloc[0], m['sensekey2'].iloc[0], 
#                                 lemma, text])
        elif elem.tag == 'sentence':

            print(sentence)
            print(y)

            #get babelnet id from wordnet synset
#             BNet = BabelNet[BabelNet['WordNet'] == m['sensekey1'].iloc[0]]['BabelNet']
#             WordNetDomain = WordNet[WordNet['BabelNet'] == BNet.iloc[0]]['WordNetDomain']
#             LexNet = LexicographerNet[LexicographerNet['BabelNet'] == BNet.iloc[0]]['LexNames']
#             print(list(m.iloc[0]), lemma, text, BNet.iloc[0], WordNetDomain, LexNet.iloc[0])
#             csv_writer.writerow([instance_id,
#                                 m['sensekey1'].iloc[0], m['sensekey2'].iloc[0], 
#                                 lemma, text, 
#                                 BNet.iloc[0], WordNetDomain.iloc[0], LexNet.iloc[0]])
        # Release the element only once it is complete; clearing it on 'start'
        # would wipe the attributes and text still needed above.
        elem.clear()
del context