In [2]:
import nltk

In [3]:
from nltk.corpus import wordnet as wn


In [5]:
 wn.lemmas('break', pos='n') # Retrieve all lexemes for the noun 'break'

[Lemma('interruption.n.02.break'),
 Lemma('break.n.02.break'),
 Lemma('fault.n.04.break'),
 Lemma('rupture.n.02.break'),
 Lemma('respite.n.02.break'),
 Lemma('breakage.n.03.break'),
 Lemma('pause.n.01.break'),
 Lemma('fracture.n.01.break'),
 Lemma('break.n.09.break'),
 Lemma('break.n.10.break'),
 Lemma('break.n.11.break'),
 Lemma('break.n.12.break'),
 Lemma('break.n.13.break'),
 Lemma('break.n.14.break'),
 Lemma('open_frame.n.01.break'),
 Lemma('break.n.16.break')]

In [6]:
l1 = wn.lemmas('break', pos='n')[0]

In [7]:
s1 = l1.synset() # get the synset for the first lexeme

In [8]:
s1

Synset('interruption.n.02')

In [9]:
s1.lemmas() # Get all lexemes in that synset


[Lemma('interruption.n.02.interruption'), Lemma('interruption.n.02.break')]

In [10]:
 s1.lemmas()[0].name() # Get the word of the first lexeme


'interruption'

In [11]:
s1.definition()

'some abrupt occurrence that interrupts an ongoing activity'

In [12]:
s1.examples()

['the telephone is an annoying interruption',
 'there was a break in the action when a player was hurt']

In [13]:
s1.hypernyms()

[Synset('happening.n.01')]

In [14]:
s1.hyponyms()

[Synset('dislocation.n.01'),
 Synset('eclipse.n.01'),
 Synset('punctuation.n.01'),
 Synset('suspension.n.04')]

In [15]:
l1.count()

3

#### Gensim is a vector space modeling package for Python

In [19]:
import gensim

In [23]:
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [25]:
# `model` is already a KeyedVectors object, so it is indexed directly;
# the `.wv` indirection is deprecated on KeyedVectors.
v1 = model['computer']

  """Entry point for launching an IPython kernel.


In [26]:
model.similarity('computer','calculator')

0.33398882

In [27]:
model.similarity('computer','toaster')

0.26003766

In [28]:
model.similarity('computer','dog')

0.12194334

In [29]:
model.similarity('computer','run')

0.09933449

In [32]:
# another way to compute similarity
import numpy as np

In [33]:
def cos(v1,v2):
    """Return the cosine similarity of the vectors `v1` and `v2`.

    The dot product of the two inputs is divided by the product of their
    Euclidean norms.
    """
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

In [34]:
# Index the KeyedVectors object directly; `.wv` is deprecated on KeyedVectors.
cos(model['computer'],model['calculator'])

  """Entry point for launching an IPython kernel.


0.33398882

In [None]:
# lexsub_xml.py 


In [36]:
#!/usr/bin/env python
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import tostring
import sys
import re
import codecs

class Context(object):
    """
    One target word together with the tokens surrounding it.

    The constructor arguments are stored unchanged as attributes of the
    same name: cid, word_form, lemma, pos, left_context and right_context.
    """

    def __init__(self, cid, word_form, lemma, pos, left_context, right_context):
        self.cid, self.word_form = cid, word_form
        self.lemma, self.pos = lemma, pos
        self.left_context, self.right_context = left_context, right_context

    def __repr__(self):
        # Both context token sequences are rendered space-separated, with the
        # word form starred between them.
        fields = dict(
            cid=self.cid,
            lemma=self.lemma,
            pos=self.pos,
            left=" ".join(self.left_context),
            word=self.word_form,
            right=" ".join(self.right_context),
        )
        return "<Context_{cid}/{lemma}.{pos} {left} *{word}* {right}>".format(**fields)

class LexsubData(object):
    """
    Parser that turns lexical substitution XML into Context instances.

    A single running counter (`total_count`, starting at 1) is kept on the
    object and used as the `cid` of every Context produced through it.
    """

    def __init__(self):
        # Id assigned to the next Context that is yielded.
        self.total_count = 1

    def process_context(self, context_s):
        """
        Split a context string around its ``<head>...</head>`` target.

        Returns a tuple ``(target, left_tokens, right_tokens)``: the text
        inside the head tags and the whitespace-split text before and after
        the tagged span.

        Raises ValueError if `context_s` contains no head element.
        """
        # Non-greedy so the capture stops at the first closing tag.
        head_re = re.compile("<head>(.*?)</head>")
        match = head_re.search(context_s)
        if match is None:
            raise ValueError("no <head>...</head> target in context: {!r}".format(context_s))
        target = match.group(1)
        context_left = context_s[:match.start()]
        context_right = context_s[match.end():]
        return target, context_left.split(), context_right.split()

    def _context_to_string(self, context):
        """Join a <context> element's leading text and serialized children into one string."""
        # `context.text` is None when the element starts directly with a child
        # tag; str(None) would put the literal token "None" into the left context.
        pieces = [context.text or ""]
        for child in context:
            # ET.tostring returns bytes here and includes the child's tail text.
            pieces.append(codecs.decode(ET.tostring(child), "UTF-8"))
        return "".join(pieces)

    def parse_lexelt(self, lexelt):
        """
        Yield one Context per child of a <lexelt> element.

        The element's `item` attribute is split on '.'; the lemma is the first
        part and the part of speech is the third part when there are exactly
        three parts, otherwise the second.
        """
        lex_item = lexelt.get('item')
        parts = lex_item.split('.')
        if len(parts) == 3:
            lemma, pos = parts[0], parts[2]
        else:
            lemma, pos = parts[0], parts[1]

        for instance in lexelt:
            assert instance.tag == "instance"
            context_s = self._context_to_string(instance.find("context"))
            word_form, left_context, right_context = self.process_context(context_s)
            yield Context(self.total_count, word_form, lemma, pos, left_context, right_context)
            # Incremented only after the consumer resumes the generator.
            self.total_count += 1

    def parse_et(self, et):
        """Yield the Contexts of every <lexelt> child of the <corpus> root element `et`."""
        assert et.tag == "corpus"
        for lexelt in et:
            assert lexelt.tag == "lexelt"
            for annotation in self.parse_lexelt(lexelt):
                yield annotation


def read_lexsub_xml(*sources):
    """
    Lazily parse each XML source in turn and yield the Context objects found.

    A single LexsubData parser is created and reused for all sources.
    """
    parser = LexsubData()
    for xml_source in sources:
        root = ET.parse(xml_source).getroot()
        for item in parser.parse_et(root):
            yield item
'''
if __name__=="__main__":

    for context in read_lexsub_xml(sys.argv[1]):
        print(context)

'''

'\nif __name__=="__main__":\n\n    for context in read_lexsub_xml(sys.argv[1]):\n        print(context)\n\n'

### Part 1: Candidate Synonyms from WordNet (10 pts)

In [None]:
# lexsub_main.py

In [39]:
#!/usr/bin/env python
import sys

from lexsub_xml import read_lexsub_xml

# suggested imports 
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
import gensim
import numpy as np

In [73]:
# Participate in the 4705 lexical substitution competition (optional): NO
# Alias: [please invent some name]

def tokenize(s):
    """
    Lowercase `s`, replace every ASCII punctuation character with a space,
    and return the whitespace-separated tokens as a list.
    """
    # Imported here because `string` is not imported by any visible cell;
    # without it the first call raised NameError.
    import string

    s = "".join(" " if x in string.punctuation else x for x in s.lower())
    return s.split()

def get_candidates(lemma, pos):
    """
    Part 1: collect substitution candidates for `lemma` from WordNet.

    Every synset that contains `lemma` with part of speech `pos` contributes
    the names of its other lemmas.

    Returns a sorted list of the distinct names; `lemma` itself is excluded.
    The list is empty when WordNet has no matching entry.
    """
    candidates = {
        synonym.name()
        for lexeme in wn.lemmas(lemma, pos)
        for synonym in lexeme.synset().lemmas()
        if synonym.name() != lemma
    }
    # Sorted so the order does not depend on set iteration order, which can
    # differ between interpreter runs and would change tie-breaking in callers
    # that take the first best-scoring candidate.
    return sorted(candidates)

def smurf_predictor(context):
    """
    Baseline predictor: ignores `context` and always proposes 'smurf'.
    """
    substitute = 'smurf'
    return substitute
    
'''
if __name__=="__main__":

    # At submission time, this program should run your best predictor (part 6).

    #W2VMODEL_FILENAME = 'GoogleNews-vectors-negative300.bin.gz'
    #predictor = Word2VecSubst(W2VMODEL_FILENAME)

    for context in read_lexsub_xml(sys.argv[1]):
        #print(context)  # useful for debugging
        prediction = smurf_predictor(context) 
        print("{}.{} {} :: {}".format(context.lemma, context.pos, context.cid, prediction))
'''

'\nif __name__=="__main__":\n\n    # At submission time, this program should run your best predictor (part 6).\n\n    #W2VMODEL_FILENAME = \'GoogleNews-vectors-negative300.bin.gz\'\n    #predictor = Word2VecSubst(W2VMODEL_FILENAME)\n\n    for context in read_lexsub_xml(sys.argv[1]):\n        #print(context)  # useful for debugging\n        prediction = smurf_predictor(context) \n        print("{}.{} {} :: {}".format(context.lemma, context.pos, context.cid, prediction))\n'

In [41]:
# get started testing
argv1 = "lexsub_trial.xml"
def testSmurf(argv1):
    """Print the smurf baseline's prediction line for every context read from `argv1`."""
    for ctx in read_lexsub_xml(argv1):
        guess = smurf_predictor(ctx)
        line = "{}.{} {} :: {}".format(ctx.lemma, ctx.pos, ctx.cid, guess)
        print(line)

In [74]:
# test part 1
get_candidates('slow','a')


{'boring',
 'deadening',
 'dense',
 'dim',
 'dull',
 'dumb',
 'ho-hum',
 'irksome',
 'obtuse',
 'sluggish',
 'tedious',
 'tiresome',
 'wearisome'}

### Part 2: WordNet Frequency Baseline (10 pts)

In [85]:
from collections import defaultdict
def wn_frequency_predictor(contex):
    """
    Part 2: return the synonym that shares the most synsets with the target.

    `contex` supplies the target through its `lemma` and `pos` attributes.
    Each WordNet synset containing the target adds one to the tally of every
    other lemma name in that synset. The name with the highest tally is
    returned; on a tie the name that comes first in the tally's iteration
    order wins.

    Returns " " (a single space) when there is no candidate.
    """
    # The parameter keeps its original spelling so existing callers are
    # unaffected. The body used to read an undefined name `context`, which
    # raised NameError (or silently picked up a leftover global).
    lemma = contex.lemma
    pos = contex.pos

    # NOTE(review): this counts shared synsets, not corpus occurrences. If
    # WordNet occurrence counts are what Part 2 wants, Lemma.count() is
    # presumably the value to accumulate instead of 1 - confirm.
    counter = defaultdict(int)
    for l in wn.lemmas(lemma, pos):
        for s_l in l.synset().lemmas():
            if s_l.name() != lemma:
                counter[s_l.name()] += 1

    if not counter:
        return " "
    # max() returns the first key among equal tallies.
    return max(counter, key=counter.get)

In [86]:
# test part 2: the predictor takes a single object with `lemma` and `pos`
# attributes, so the lemma/pos pair is wrapped in a Context with empty context.
wn_frequency_predictor(Context(0, 'slow', 'slow', 'a', [], []))

'dull'

### Part 3: Simple Lesk Algorithm 

In [152]:
def wn_simple_lesk_predictor(context):
    """
    Part 3: choose a substitute with a simplified Lesk overlap.

    For every WordNet synset containing the target (`context.lemma` with
    `context.pos`), compute_overlap() is applied to the synset itself and to
    each of its hypernyms, and the results are merged. The synset with the
    largest merged overlap is selected; the earliest synset wins ties, which
    includes the case where no synset overlaps at all.

    From the selected synset, the lemma other than the target with the
    highest WordNet count is returned (first lemma on ties).

    Returns " " (a single space) if WordNet has no entry for the target or
    the selected synset contains no other lemma.
    """
    lemma = context.lemma
    pos = context.pos

    best_synset = None
    largest_overlap_size = -1  # below zero so an empty overlap can still be selected
    for l in wn.lemmas(lemma, pos):
        candidate = l.synset()
        overlap = set(compute_overlap(context, candidate))
        for hypernym in candidate.hypernyms():
            # In-place union: set.union() alone would discard its result.
            overlap |= compute_overlap(context, hypernym)
        if len(overlap) > largest_overlap_size:
            largest_overlap_size = len(overlap)
            best_synset = candidate

    if best_synset is None:
        return " "

    substitutes = [s_l for s_l in best_synset.lemmas() if s_l.name() != lemma]
    if not substitutes:
        # NOTE(review): another sense might still offer a substitute; decide
        # whether such synsets should be skipped during selection instead.
        return " "

    # Tallying each name once inside a single synset made every candidate tie,
    # so the lemma's own count is used as the frequency; max() keeps the first
    # lemma among equal counts.
    return max(substitutes, key=lambda s_l: s_l.count()).name()

def compute_overlap(context, synset):
    """
    Return the set of non-stop-word tokens shared by a synset's gloss and the context.

    The gloss tokens are obtained by running tokenize() on the synset's
    definition and on each of its examples. A gloss token is part of the
    overlap when it is not an English stop word and it equals, ignoring
    case, a token of `context.left_context` or `context.right_context`.
    """
    stop_words = set(stopwords.words('english'))

    # tokenize() lowercases the gloss, so the context tokens are lowercased
    # too; otherwise capitalized context words could never match.
    context_words = {word.lower() for word in context.left_context}
    context_words.update(word.lower() for word in context.right_context)

    # definition() must be called, and examples() is a list of strings that
    # have to be tokenized one at a time.
    gloss_tokens = set(tokenize(synset.definition()))
    for example in synset.examples():
        gloss_tokens.update(tokenize(example))

    return (gloss_tokens & context_words) - stop_words

### Part 4: Most Similar Synonym
### Part 5: Context and Word Embedding

In [143]:
class Word2VecSubst(object):
    """
    Substitute predictors that rank WordNet candidates with word2vec vectors.

    The vectors are loaded in the constructor from a binary word2vec-format
    file and kept in `self.model`. The model is only used through
    `word in self.model` and `self.model[word]`.
    """

    def __init__(self, filename):
        self.model = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=True)

    @staticmethod
    def _cosine(u, v):
        """Return the cosine similarity of `u` and `v`, or 0.0 if either has zero norm."""
        denominator = np.linalg.norm(u) * np.linalg.norm(v)
        if denominator == 0:
            return 0.0
        return float(np.dot(u, v) / denominator)

    def _best_candidate(self, context, reference):
        """Return the in-vocabulary candidate most similar to the vector `reference` ("" if none)."""
        best_synonym = ""
        best_similarity = None  # None rather than 0.0 so negative similarities can still win
        for synonym in get_candidates(context.lemma, context.pos):
            # Candidates without a vector cannot be scored.
            if synonym not in self.model:
                continue
            similarity = self._cosine(reference, self.model[synonym])
            if best_similarity is None or similarity > best_similarity:
                best_similarity = similarity
                best_synonym = synonym
        return best_synonym

    def predict_nearest(self, context):
        """
        Part 4: return the candidate whose vector is most similar to the target's.

        Candidates come from get_candidates(context.lemma, context.pos); the
        first candidate wins ties. Returns "" when the target lemma or all
        candidates are missing from the model.
        """
        if context.lemma not in self.model:
            return ""
        return self._best_candidate(context, self.model[context.lemma])

    def predict_nearest_with_context(self, context, window=5):
        """
        Part 5: return the candidate most similar to a sentence vector.

        The sentence vector is the sum of the target lemma's vector and the
        vectors of the tokens within `window` positions to the left and right
        of the target. English stop words (matched ignoring case) and tokens
        without a vector are left out of the sum, but stop words still occupy
        a position in the window.

        Returns "" when no vector at all can be built or no candidate is in
        the model.
        """
        stop_words = set(stopwords.words('english'))

        # A slice of [-0:] would select the whole list, hence the guard.
        if window > 0:
            nearby = list(context.left_context[-window:]) + list(context.right_context[:window])
        else:
            nearby = []
        windows_words = [word for word in nearby if word.lower() not in stop_words]

        # The target's own vector seeds the sum, so the sentence vector is
        # usable even when no context word has a vector.
        vectors = [self.model[word]
                   for word in [context.lemma] + windows_words
                   if word in self.model]
        if not vectors:
            return ""
        vector_sentence = np.sum(vectors, axis=0)

        # Candidates are scored against the sentence vector, not the bare lemma.
        return self._best_candidate(context, vector_sentence)

### Part 6  Other ideas? (and competition)