In [149]:
import tarfile
import os
import xml.etree.ElementTree as etree
import pandas as pd
from nltk.corpus import wordnet as wn
from tqdm import tqdm, tnrange, tqdm_notebook

In [150]:
# SemCor training corpus (XML) and its gold sense-key file live in the same directory.
semcor_dir = '../resources/training-data/WSD_Training_Corpora/SemCor/'
archived_xml = semcor_dir + 'semcor.data.xml'
mapping_file = semcor_dir + 'semcor.gold.key.txt'

In [151]:
def sensekeyToSynsetConverter(sensekey: str):
    '''Translate a WordNet sense key into a "wn:<offset><pos>" synset ID string.

    The key is resolved to its synset through nltk's ``wn.lemma_from_key``.
    The returned ID is the synset offset left-padded with zeros to 8 digits,
    followed by the synset's part-of-speech tag (for example 'wn:02604760v').
    '''
    resolved = wn.lemma_from_key(sensekey).synset()
    return "wn:{}{}".format(str(resolved.offset()).zfill(8), resolved.pos())

In [152]:
# Load the gold key file: space-separated rows holding an instance ID
# followed by a first and an optional second sense key.
mapping = pd.read_csv(
    mapping_file,
    sep=' ',
    names=['sentence_idx', 'sensekey1', 'sensekey2'],
)
mapping.head()

Unnamed: 0,sentence_idx,sensekey1,sensekey2
0,d000.s000.t000,long%3:00:02::,
1,d000.s000.t001,be%2:42:03::,
2,d000.s000.t002,review%2:31:00::,
3,d000.s000.t003,objective%1:09:00::,
4,d000.s000.t004,benefit%1:21:00::,


# convert from sensekey to synset ID

In [5]:
# Register tqdm's progress_apply on pandas objects; the desc string labels the progress bar.
tqdm.pandas(desc="my bar!")
# Convert the first sense-key column to synset IDs, overwriting it in place
# (the sensekey2 column is converted separately in the next cell).
# NOTE(review): because the column is overwritten, re-running this cell would pass
# already-converted "wn:..." IDs to sensekeyToSynsetConverter — presumably that fails; confirm.
mapping['sensekey1'] = mapping['sensekey1'].progress_apply(sensekeyToSynsetConverter)

my bar!: 100%|██████████| 226036/226036 [03:49<00:00, 983.39it/s] 


In [12]:
tqdm.pandas(desc="my bar!")
# Only rows that actually carry a second sense key are converted; NaN rows stay untouched.
# The write goes through a single .loc call on the boolean mask: chained indexing
# (frame[col][mask] = ...) can assign into a temporary copy instead of `mapping`
# and triggers pandas' SettingWithCopyWarning.
has_second_key = mapping['sensekey2'].notnull()
mapping.loc[has_second_key, 'sensekey2'] = (
    mapping.loc[has_second_key, 'sensekey2'].progress_apply(sensekeyToSynsetConverter)
)

In [13]:
# Spot-check a few converted rows (positions 70-74), including one with a second synset ID.
mapping.iloc[70:75]

Unnamed: 0,sentence_idx,sensekey1,sensekey2
70,d000.s010.t003,wn:01061489a,
71,d000.s010.t004,wn:00081572n,
72,d000.s010.t005,wn:08186047n,wn:01209576n
73,d000.s010.t006,wn:10053808n,
74,d000.s011.t000,wn:00248977n,


# wordnet to BabelNet

In [79]:
# List the files available in the shared resources directory.
!ls ../resources/

babelnet2lexnames.tsv    error_idx.txt            [34mtraining-data[m[m
babelnet2wndomains.tsv   parsed_corpora_final.txt
babelnet2wordnet.tsv     test.xml


In [80]:
# BabelNet -> WordNet synset ID table: tab-separated, no header row,
# with room for a second WordNet ID in the third column.
file = '../resources/babelnet2wordnet.tsv'
BabelNet = pd.read_csv(
    file,
    sep='\t',
    names=['BabelNet', 'WordNet', 'WordNet2'],
)
BabelNet.head()

Unnamed: 0,BabelNet,WordNet,WordNet2
0,bn:00000001n,wn:08641944n,
1,bn:00000002n,wn:08950407n,
2,bn:00000003n,wn:04502851n,
3,bn:00000004n,wn:13742358n,
4,bn:00000005n,wn:13742573n,


In [81]:
# BabelNet ID -> WordNet domain label table (tab-separated, no header row).
file = '../resources/babelnet2wndomains.tsv'
WordNet = pd.read_csv(
    file,
    sep='\t',
    names=['BabelNet', 'WordNetDomain'],
)
WordNet.head()

Unnamed: 0,BabelNet,WordNetDomain
0,bn:00000001n,factotum
1,bn:00000002n,geography
2,bn:00000003n,military
3,bn:00000004n,number
4,bn:00000005n,number


In [82]:
# BabelNet ID -> lexicographer name table (tab-separated, no header row).
file = '../resources/babelnet2lexnames.tsv'
LexicographerNet = pd.read_csv(
    file,
    sep='\t',
    names=['BabelNet', 'LexNames'],
)
LexicographerNet.head()

Unnamed: 0,BabelNet,LexNames
0,bn:00000001n,noun.location
1,bn:00000002n,noun.location
2,bn:00000003n,noun.artifact
3,bn:00000004n,noun.quantity
4,bn:00000005n,noun.quantity


In [135]:
import csv

In [148]:
# Stream the SemCor XML and, for every <instance>, write one CSV row that joins its
# gold synset ID(s) with the matching BabelNet ID, WordNet domain and lexicographer name.
# Instances without a gold entry, a BabelNet match or a WordNet domain are skipped.
context = etree.iterparse(archived_xml, events=("start", "end"))
# newline='' leaves line endings to the csv module (prevents blank rows on Windows).
with open('../resources/f.csv', 'w', encoding='utf-8', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(('id', 'Wordnet1', 'Wordnet2', 'lemma', 'text', 'BabelNet', 'WordNetDomain', 'LexNames'))

    for idx, (event, elem) in enumerate(tqdm(context)):

        if elem.tag == "instance" and event == 'end':

            instance_id = elem.get("id")
            lemma = elem.get("lemma")
            text = elem.text

            # gold annotation row(s) for this instance; empty when the ID is not in the key file
            m = mapping[mapping['sentence_idx'] == instance_id]

            if len(m) > 0:
                # get babelnet id from wordnet synset
                BNet = BabelNet[BabelNet['WordNet'] == m['sensekey1'].iloc[0]]['BabelNet']

                # guard every .iloc[0]: an empty lookup would otherwise raise IndexError
                if len(BNet) > 0:
                    WordNetDomain = WordNet[WordNet['BabelNet'] == BNet.iloc[0]]['WordNetDomain']
                    LexNet = LexicographerNet[LexicographerNet['BabelNet'] == BNet.iloc[0]]['LexNames']

                    if len(WordNetDomain) > 0:
                        # a missing lexicographer name is written as an empty field
                        lex_name = LexNet.iloc[0] if len(LexNet) > 0 else ''
                        csv_writer.writerow([instance_id,
                                             m['sensekey1'].iloc[0], m['sensekey2'].iloc[0],
                                             lemma, text,
                                             BNet.iloc[0], WordNetDomain.iloc[0], lex_name])

        # Free each element once it is complete. This must happen on 'end' only:
        # clear() also wipes attributes, so clearing on 'start' would erase the
        # id/lemma that are read when the matching 'end' event arrives.
        if event == 'end':
            elem.clear()

        # debugging cap: stop after the first 1000 parser events (events, not instances)
        if idx == 1000:
            break
del context


0it [00:00, ?it/s][A
7it [00:00, 63.84it/s][A
23it [00:00, 97.68it/s][A
35it [00:00, 99.07it/s][A
55it [00:00, 114.94it/s][A
67it [00:00, 111.11it/s][A
77it [00:00, 105.39it/s][A
87it [00:00, 103.34it/s][A
97it [00:00, 102.86it/s][A
109it [00:01, 102.24it/s][A
127it [00:01, 106.67it/s][A
141it [00:01, 107.67it/s][A
159it [00:01, 110.78it/s][A
175it [00:01, 112.31it/s][A
188it [00:01, 109.11it/s][A
207it [00:01, 112.25it/s][A
221it [00:01, 112.45it/s][A
237it [00:02, 113.70it/s][A
250it [00:02, 111.47it/s][A
265it [00:02, 111.71it/s][A
277it [00:02, 109.22it/s][A
293it [00:02, 110.00it/s][A
304it [00:02, 108.93it/s][A
317it [00:02, 108.48it/s][A
331it [00:03, 108.75it/s][A
342it [00:03, 106.58it/s][A
357it [00:03, 106.97it/s][A
371it [00:03, 107.01it/s][A
389it [00:03, 108.15it/s][A
407it [00:03, 109.29it/s][A
429it [00:03, 111.39it/s][A
443it [00:04, 109.25it/s][A
455it [00:04, 108.58it/s][A
471it [00:04, 108.84it/s][A
483it [00:04, 108.24it/s][A
495

In [144]:
# Debug: first synset ID in `m`, the gold-key rows matched for the last instance the export loop processed.
m['sensekey1'].iloc[0]

'wn:02604760v'

In [132]:
# Debug: first row of `m`, the gold-key rows matched for the last instance the export loop processed.
m.iloc[0]

sentence_idx    d000.s002.t000
sensekey1         wn:00786195n
sensekey2                  NaN
Name: 20, dtype: object

In [106]:
# Debug: WordNet-domain lookup result left over from the export loop.
WordNetDomain

62540    factotum
Name: WordNetDomain, dtype: object

In [None]:
# Debug: run the converter on a value taken from `m`.
# NOTE(review): `[0]` is a label lookup on the Series index, while neighbouring cells use
# the positional `.iloc[0]`; `m` is a filtered slice of `mapping`, so label 0 is presumably
# absent for most instances — confirm. The column may also already hold converted "wn:..." IDs.
sensekeyToSynsetConverter(m['sensekey1'][0])

In [None]:
# Debug: run the converter on the second sense key taken from `m`.
# NOTE(review): `[0]` is a label lookup on the Series index (neighbouring cells use `.iloc[0]`),
# and sensekey2 is NaN for rows without a second key — confirm this cell is still needed.
sensekeyToSynsetConverter(m['sensekey2'][0])

In [103]:
 # Debug: lexicographer name(s) for the BabelNet ID left over in `BNet` from the export loop.
 LexicographerNet[LexicographerNet['BabelNet'] == BNet.iloc[0]]['LexNames']

Unnamed: 0,BabelNet,LexNames
64641,bn:00064646n,noun.cognition


In [None]:
import csv
import xml.etree.ElementTree as ET

In [None]:
X_train = []
Y_train =  []
x = []
y = []

# Read the gold key file into an instance-id -> first sense key lookup.
dictionary = {}
with open(mapping_file) as fp:
    for line in fp:
        data = line.split()
        # skip blank or truncated lines instead of failing on data[1]
        if len(data) >= 2:
            dictionary[data[0]] = data[1]

# Append one (instance text, sense key) row per annotated instance to inputFile.csv.
# The `with` block guarantees the file is flushed and closed even if parsing fails.
with open("inputFile.csv", 'a', newline='') as input_file:
    csv_writer = csv.writer(input_file)
    # the file is opened in append mode: emit the header only while it is still empty,
    # so repeated runs do not scatter header rows through the data
    if input_file.tell() == 0:
        csv_writer.writerow(('Input', 'label'))

    # Only 'end' events are requested: an element's text is not guaranteed to be
    # available when its 'start' event is emitted.
    for event, element in ET.iterparse(archived_xml, events=("end",)):
        if element.tag == 'instance':
            insta_id = element.get("id")
            label = dictionary.get(insta_id)
            if label is not None:
                x.append(element.text)
                y.append(label)

        elif element.tag == 'sentence':
            # flush the rows collected for this sentence, then reset the buffers
            csv_writer.writerows(zip(x, y))
            x = []
            y = []

        # release the finished element's children, attributes and text
        element.clear()

# Predictions:
1. babelnet
2. wordnet_domains
3. lexicographer

In [None]:
#BabelNet synset ids to WordNet offset ids
synset_id = pd.read_csv(mapping_file, sep = "\t", error_bad_lines=False, header = None)
synset_id.columns = ['BabelNet', 'WordNet']
BabelNet_id = list(synset_id['BabelNet'])


N_annotations_present, textExists = False, False
bigrams, unigrams = [], []
sentence = ''
#total = 153763675
total = 290000000
#Write to file the last sentence written so in case of a crash, you can write and process starting from that sentence

f = open("../resources/parsed_corpora_RECOVERY_final.txt", "r+")
annotations_per_sentence = open("../resources/parsed_corpora_annotations_final.txt", "a")
data = f.readlines()
#checks for last iteration if exists
last_iteration, written_lines = [int(i) for i in data[-1].split(",")]
print("starting processing from iteration # {}\t Written lines so far: {}".format(last_iteration, written_lines))

############################
## main iteration start ###
###########################
with open('../resources/parsed_corpora_final.txt', 'a', encoding='utf-8') as file:
    for idx, (event, elem) in enumerate(tqdm(context)):

        #checks current idx so if preprocessing crashes, It start processing from this iteration
        if last_iteration < idx:
            #taking start of each sentence
            if elem.tag=='sentence' and event == 'start':
                sentence_idx = elem.get("id")

            #taking the sentence text (English only)
            if elem.tag == 'text' and elem.get("lang") == 'en':
                #checking if the text is not a None
                if elem.text!= None:
                    sentence += elem.text
                    textExists =  True
                else:
                    textExists = False

            if textExists:
                #taking the start of the sentence annotations (English only)
                if elem.tag == 'annotation' and elem.get("lang") == 'en' and event == 'start':
                    # checking if annotation is in the mapping from BabelNet to WordNet

                    current_synset_id = elem.text
                    if current_synset_id in BabelNet_id:
                        N_annotations_present=True
                        anchor = elem.get("anchor")
                        replace_by = "_".join(elem.get("lemma").split(" ")) + "_" + elem.text

                        #write N-grams and unigrams into memory
                        if len(anchor.split(" "))>1:
                            bigrams.append([anchor, replace_by])
                        elif len(anchor.split(" "))==1:
                            unigrams.append([anchor, replace_by])

                #after iterating through all annotations, write the transformed sentence
                if elem.tag=='sentence' and event == 'end':
                    if N_annotations_present:
                        annotations = 0

                        #ensure longest n-grams dominate, then replace
                        bigrams = sorted(bigrams, key = lambda k: len(k[0].split(" ")), reverse = True)

                        #replace n-grams
                        sentence = sentence.replace("-"," ")
                        for orig, replace in bigrams:
                            annotations+=1
                            wrap = lambda x: " "+x+" "
                            sentence = sentence.replace(wrap(orig), wrap(replace))

                        #split before unigrams so nothing gets replaced twice
                        sentence = sentence.split(" ")

                        #UNIGRAMS replacement
                        for index, (orig, replace) in enumerate(unigrams):
                            if orig in sentence:
                                annotations+=1
                                sentence[sentence.index(orig)] = replace

                        #join back to write to file
                        sentence = " ".join(sentence)

                        #write to file
                        file.write(sentence+"\n")
                        annotations_per_sentence.writelines(str(annotations)+"\n")
                        written_lines+=1

                        #reset
                        bigrams, unigrams, N_annotations_present, sentence = [], [], False, ''

                    else:
                        sentence = ''
                    f.writelines("{},{}\n".format(str(idx), str(written_lines)))
        #debugging
        if (idx+1)%5000000==0:
            print("Number of actually written lines: {}\n {:.3f}% done".format(written_lines, ((idx+last_iteration)/total)*100))
            #break
            #delete to ease memory
        elem.clear()
    del context

##########################
## main iteration end ###
#########################
print("_"*120)
print("Number of actually written lines: {}".format(written_lines))

f.close()
annotations_per_sentence.close()

In [None]:
# Scratch calculation: mean of three values (what they measure is not recorded in this notebook).
(23+26+30.5)/3

In [None]:
# Debug: display the `context` iterator object.
# NOTE(review): earlier cells end with `del context`, so on a fresh top-to-bottom run this
# presumably raises NameError — confirm whether the cell is still needed.
context