## We will implement a simple lesk-based WSD. We use SEMCOR WSD dataset for the purpose

In [4]:
import xml.etree.ElementTree as ET
from nltk.corpus import wordnet as wn
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\call2\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Let us define a word class containing the following attributes

text --> the actual word
pos --> POS tag of the word
lemma --> Lemma of the word
wnsn --> wordnet synset id of the sense used

In [5]:
class Word:
    def __init__(self, text, pos=None, lemma=None, wnsn=None, lexsn=None):
        self.text = text
        self.pos = pos
        self.lemma = lemma
        self.wnsn = wnsn
        self.lexsn = lexsn

## Lets define Simplified Lesk algorithm first

In [6]:
# let us read a sample xml file
tree = ET.parse('semcor/brown2/tagfiles/br-e22.xml')

# get the root element
root = tree.getroot()

documents = []
# let us read every sentence, one-by-one
# we are ignoring the paragraph structure
for sentence_tree in root.findall('context/p/'):
    sentence = []
#     for every word in that sentence
    for word_tree in sentence_tree:
#         get the word
        word = Word(word_tree.text)
        
#         if the word xml tag contains info about pos, lemma, wnsn, lexsn, then extract it
        if 'pos' in word_tree.attrib:
            word.pos = word_tree.attrib['pos']
            
        if 'lemma' in word_tree.attrib:
            word.lemma = word_tree.attrib['lemma']
            
        if 'wnsn' in word_tree.attrib:
            word.wnsn = word_tree.attrib['wnsn']
            
        if 'lexsn' in word_tree.attrib:
            word.lexsn = word_tree.attrib['lexsn']
        
        sentence.append( word )
    documents.append(sentence)

print('Read {0} number of documents '.format(len(documents)))

Read 84 number of documents 


In [7]:
# To calculate accuracy
correct = 0
total = 0

# window is chosen as 5, vary the window size
window = 5

# for every sentence
for every_sentence in documents:
#     let's get the word in the sentence as a list
    sentence = [x.text for x in every_sentence] 
#     print("Sentence is {0}".format(' '.join(sentence)))

# for every word in the sentence
    for word_index in range(len(every_sentence)):
#         not all words have sense info
        if every_sentence[word_index].wnsn is not None:
            context_bag = []
            every_word = every_sentence[word_index]
            
            for index in range( max(0, word_index - window), min( word_index + window, len(sentence) ) ):
                if index == word_index:
                    continue
                context_bag.append( sentence[index] )
            
#             we know the POS tag of the word in the sentence
# restrict ourselves to only the senses for that POS category
            if every_word.pos.startswith('V'):
                synsets = wn.synsets(every_word.text, pos=wn.VERB)
            elif every_word.pos.startswith('J'):
                synsets = wn.synsets(every_word.text, pos=wn.ADJ)
            elif every_word.pos.startswith('R'):
                synsets = wn.synsets(every_word.text, pos=wn.ADV)
            else:
                synsets = wn.synsets(every_word.text, pos=wn.NOUN)
                
#             all inflections of the word might not be present
# search based on lemma
            if len(synsets) == 0:
                if every_word.pos.startswith('V'):
                    synsets = wn.synsets(every_word.lemma, pos=wn.VERB)
                elif every_word.pos.startswith('J'):
                    synsets = wn.synsets(every_word.lemma, pos=wn.ADJ)
                elif every_word.pos.startswith('R'):
                    synsets = wn.synsets(every_word.lemma, pos=wn.ADV)
                else:
                    synsets = wn.synsets(every_word.lemma, pos=wn.NOUN)
                
            if len(synsets) == 0:
                continue
            
#             find the best synset based on simple word-overlap between word context and synset examples
            synset_score = -100
            synset_id = ""
            for every_synset in synsets:
                synset_bag = []
                for every_synset_example in every_synset.examples():
                    synset_bag.extend( every_synset_example.split(' ') )
                
                matching_words = list( set( context_bag ).intersection( set(synset_bag) ) )
                if len(matching_words) > synset_score:
                    synset_score = len(matching_words)
                    synset_id = every_synset.name().split('.')[-1]
                    
            if synset_id.startswith('0'):
                synset_id = synset_id[1:]

#             print('Best matching synset id is {0} with overlapping words {1}'.format( synset_id, synset_score ))
#             print('Actual synset id is {0}'.format( every_word.wnsn ))
            
            if synset_id == every_word.wnsn:
                correct = correct + 1
            total = total + 1

print('Accuracy is {0}'.format( (correct * 1.0)/ (total * 1.0) ))

Accuracy is 0.5718181818181818


## Now we will define Extended Lesk

In [None]:
import itertools
def lesk(context_sentence, ambiguous_word, pos=None, hyperhypo=False):
    max_overlaps = 0; lesk_sense = None
    context_sentence = context_sentence.split()
    for ss in wn.synsets(ambiguous_word):
        # If POS is specified.
        if pos and ss.pos is not pos:
            continue

        lesk_dictionary = []

        # Includes definition.
        lesk_dictionary+= ss.definition().split()
        # Includes lemma_names.
        lesk_dictionary+= ss.lemma_names()

        # Optional: includes lemma_names of hypernyms and hyponyms.
        if hyperhypo == True:
            lesk_dictionary+= list(chain(*[i.lemma_names() for i in ss.hypernyms()+ss.hyponyms()]))       


        overlaps = set(lesk_dictionary).intersection(context_sentence)

        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
    return lesk_sense

In [None]:
import nltk
nltk.download('stopwords')

In [10]:
import nltk
stop_words = nltk.corpus.stopwords.words('english')
print("number of stopwords:", len(stop_words))
print(stop_words)

number of stopwords: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own'

In [28]:
def accuracy(filename):
    tree = ET.parse('semcor/brown2/tagfiles/'+filename)

    # get the root element
    root = tree.getroot()

    documents = []
    # let us read every sentence, one-by-one
    # we are ignoring the paragraph structure
    for sentence_tree in root.findall('context/p/'):
        sentence = []
    #     for every word in that sentence
        for word_tree in sentence_tree:
    #         get the word
            word = Word(word_tree.text)

    #         if the word xml tag contains info about pos, lemma, wnsn, lexsn, then extract it
            if 'pos' in word_tree.attrib:
                word.pos = word_tree.attrib['pos']

            if 'lemma' in word_tree.attrib:
                word.lemma = word_tree.attrib['lemma']

            if 'wnsn' in word_tree.attrib:
                word.wnsn = word_tree.attrib['wnsn']

            if 'lexsn' in word_tree.attrib:
                word.lexsn = word_tree.attrib['lexsn']

            sentence.append( word )
        documents.append(sentence)
    correct = 0
    total = 0

    # window is chosen as 5, vary the window size
    window = 5

    # for every sentence
    for every_sentence in documents:
    #     let's get the word in the sentence as a list
        sentence = [x.text for x in every_sentence] 
    #     print("Sentence is {0}".format(' '.join(sentence)))

    # for every word in the sentence
        for word_index in range(len(every_sentence)):
    #         not all words have sense info
            if every_sentence[word_index].wnsn is not None and every_sentence[word_index].pos.startswith('N')==False:
                context_bag = ''
                every_word = every_sentence[word_index]

                for index in range( max(0, word_index - window), min( word_index + window, len(sentence) ) ):
                    if index == word_index:
                        continue
                    context_bag += sentence[index] +' '
                synsetname = lesk(context_bag, every_word.text)
                if synsetname is None:
                    continue
                synset_id = synsetname.name().split('.')[-1]
                if synset_id.startswith('0'):
                            synset_id = synset_id[1:]
                if synset_id == every_word.wnsn:
                    correct = correct + 1
                total = total + 1
    print('Accuracy is {0}'.format( (correct * 1.0)/ (total * 1.0) ))

### Here we will check for every file in brown2 folder

In [29]:
from os import walk

filenames = next(walk("semcor/brown2/tagfiles/"), (None, None, []))[2] 

for fileName in filenames:
    print("For the File Name :" , fileName)
    accuracy(fileName)

For the File Name : br-e22.xml
Accuracy is 0.42643391521197005
For the File Name : br-e23.xml
Accuracy is 0.4609571788413098
For the File Name : br-e25.xml
Accuracy is 0.4398976982097187
For the File Name : br-e26.xml
Accuracy is 0.40934065934065933
For the File Name : br-e27.xml
Accuracy is 0.5430711610486891
For the File Name : br-e28.xml
Accuracy is 0.40641711229946526
For the File Name : br-e30.xml
Accuracy is 0.4470588235294118
For the File Name : br-e31.xml
Accuracy is 0.42441860465116277
For the File Name : br-f08.xml
Accuracy is 0.450402144772118
For the File Name : br-f13.xml
Accuracy is 0.4461942257217848
For the File Name : br-f14.xml
Accuracy is 0.3961218836565097
For the File Name : br-f15.xml
Accuracy is 0.4077922077922078
For the File Name : br-f16.xml
Accuracy is 0.410958904109589
For the File Name : br-f17.xml
Accuracy is 0.42574257425742573
For the File Name : br-f18.xml
Accuracy is 0.47096774193548385
For the File Name : br-f20.xml
Accuracy is 0.4222222222222222
For 

## The task for you is:

#### 1. Use both gloss and example sentences for producing the context signature from synsets against which you aim to match the sentence context.
#### 2. Extend this approach over all the files present in the sense-tagged Brown corpus provided in the brown2 folder.
#### 3. Ensure stop word removal
#### 4. Try to improve this approach using the Extended Lesk algorithm. The algorithm was discussed in class.
#### HINT: Include hypernymy & hyponymy synsets' gloss and example sentences in the signature overlap.