## Baseline -- Retrofitting

In [1]:
from lexsub import *
import os
import numpy as np
import glob
import copy
import pdb

## We first need to write a function to compute all the neighbor words according to the lexicons.

In [17]:
## lexFiles: can be a list of files
## return: neighboring words for each word 
## add all pairwise related neighbors
def get_neighbor_ws(lexFiles):
    """Build a symmetric word -> neighbours map from lexicon files.

    Each line of a lexicon file is treated as a group of related words.
    After lower-casing and whitespace tokenisation, every pair of distinct
    words on the line is recorded as neighbours of one another.

    Args:
        lexFiles: sequence of lexicon file paths (``len`` is taken for the
            progress message).

    Returns:
        dict mapping a word to the list of its neighbours, without
        duplicates and in order of first appearance. A word that never
        shares a line with a different word does not appear as a key.
    """
    word2neigh = {}
    # Set mirror of word2neigh so the duplicate check is a hash lookup
    # instead of a scan over an ever-growing list.
    seenNeigh = {}

    def link(src, dst):
        # Record dst as a neighbour of src once, keeping first-seen order.
        known = seenNeigh.setdefault(src, set())
        if dst not in known:
            known.add(dst)
            word2neigh.setdefault(src, []).append(dst)

    for lexDex, lexF in enumerate(lexFiles):
        print("On [{}/{}]".format(lexDex, len(lexFiles)))
        with open(lexF, 'r') as f:
            for line in f:
                # split() without an argument drops the empty tokens that
                # split(' ') produces for repeated spaces or blank lines.
                cWords = line.lower().split()
                for i, w1 in enumerate(cWords):
                    for w2 in cWords[i + 1:]:
                        if w1 == w2:
                            # A repeated word must not become its own neighbour.
                            continue
                        link(w1, w2)
                        link(w2, w1)
    return word2neigh
                              

    


## A quick test shows that the `get_neighbor_ws` function is working.

In [3]:
# Collect the lexicon file(s). The pattern has no wildcard, so glob returns the
# path itself when the file exists and an empty list otherwise.
lexFiles = glob.glob('../data/lexicons/ppdb-xl.txt')
print(lexFiles)
# Build the pairwise neighbour map and spot-check a single entry.
word2neigh = get_neighbor_ws(lexFiles)
print(word2neigh['happy'])

['../data/lexicons/ppdb-xl.txt']
On [0/1]
['jolly', 'jolie', 'merry', 'julie', 'nice', 'beautiful', 'pretty', 'sweet', 'lovely', 'pleasant', 'good', 'cute', 'handsome', 'fun', 'cool', 'belle', 'fine', 'enjoyable', 'gentle', 'good-looking', 'bel', 'beau', 'charming', 'well', 'owl', 'neat', 'joli', 'great', 'sympathetic', 'agreeable', 'pleased', 'gentil', 'kind', 'chouette', 'jamil', 'delighted', '-nice', 'beautifui', 'nice-looking', 'ravi', 'super', 'wonderful', 'fancy', 'pleasure', 'gorgeous', 'glad', 'satisfied', 'gratified', 'welcome', 'proud', 'grateful', 'thrilled', 'commends', 'excited', 'congratulated', 'fortunate', 'welcomed', 'gratifying', 'content', 'cheery', 'charmed', '-merry', '-happy', 'bonheur', 'happiness', 'horny', 'psyched', 'enthusiastic', 'packaged', 'aroused', 'impatient', 'exciting', 'excite', 'febrile', 'agitated', 'nervous', 'eager', 'pumped', 'joyous', 'jubilant', 'joyful', 'festive', 'pleasurable', 'pleasing', 'congenial', 'comfortable', 'delightful', 'happily'

## We need to design the retrofit function.

In [4]:
## wVecs: word vectors
## word2Neigh: adjacent words according to lexicons
## beta, alpha: hyper-parameters
## numIters: number of iterations
def retrofit(wVecs, word2Neigh, beta = 1.0, alpha = 1.0, numIters = 10):
    """Retrofit word vectors towards their lexicon neighbours.

    Every word that has at least one in-vocabulary neighbour is repeatedly
    replaced by the weighted average

        (beta * sum(current neighbour vectors) + alpha * original vector)
        / (beta * number_of_neighbours + alpha)

    Updates are applied in place within an iteration, so a word sees the
    already-updated vectors of words processed before it.

    Args:
        wVecs: vector store. Iterating it must yield ``(word, vector)``
            pairs and ``wVecs.query(word)`` must return the original vector
            of ``word``.
        word2Neigh: dict mapping a word to an iterable of neighbour words.
            Neighbours missing from the vocabulary are ignored.
        beta: weight of each neighbour vector.
        alpha: weight of the word's original vector.
        numIters: number of passes over the vocabulary.

    Returns:
        dict mapping every word yielded by ``wVecs`` to its vector. Words
        without usable neighbours keep the vector they were yielded with.
    """
    # Work on a plain dict; entries are rebound, the stored vectors are
    # never written to.
    newEmbs = {}
    for word, emb in wVecs:
        newEmbs[word] = emb

    # Fixed (insertion) order. Iterating a set of strings would make the
    # in-place update order, and therefore the result, vary between runs.
    vocab = list(newEmbs)

    # Loop-invariant work done once: the in-vocabulary neighbours of each
    # word and its alpha-weighted original vector.
    plan = {}
    for gWord in vocab:
        if gWord not in word2Neigh:
            continue
        neighs = [word for word in word2Neigh[gWord] if word in newEmbs]
        if not neighs:
            # Nothing to pull towards; this also avoids 0/0 when alpha is 0.
            continue
        plan[gWord] = (neighs, alpha * wVecs.query(gWord))

    for itCount in range(numIters):
        for wCount, gWord in enumerate(vocab):
            if (wCount % 100000) == 0:
                print("Iter [{}/{}] Word [{}/{}]".format(itCount, numIters, wCount, len(vocab)))

            if gWord not in plan:
                continue
            neighs, anchor = plan[gWord]

            tmpEmb = np.zeros(newEmbs[gWord].shape)
            for word in neighs:
                tmpEmb += beta * newEmbs[word]

            # Normalise by the total weight. The neighbour part is
            # beta * count, not count, so beta != 1 stays a true average.
            newEmbs[gWord] = (tmpEmb + anchor) / (beta * len(neighs) + alpha)

    return newEmbs

## Now let us use retrofitting to train a new set of word vectors.

In [10]:
# LexSub comes from the project's lexsub module (star-imported at the top).
# Its wvecs attribute is the vector store that retrofit iterates and queries.
# NOTE(review): the file name suggests 100-d GloVe 6B vectors -- confirm.
senseMag = LexSub(os.path.join('../../','glove.6B.100d.magnitude'))
senseVecs = senseMag.wvecs
# Retrofit with the default hyper-parameters (beta=1.0, alpha=1.0, 10 iterations).
newWordVects = retrofit(senseVecs, word2neigh)

Iter [0/10] Word [0/400000]
Iter [0/10] Word [100000/400000]
Iter [0/10] Word [200000/400000]
Iter [0/10] Word [300000/400000]
Iter [1/10] Word [0/400000]
Iter [1/10] Word [100000/400000]
Iter [1/10] Word [200000/400000]
Iter [1/10] Word [300000/400000]
Iter [2/10] Word [0/400000]
Iter [2/10] Word [100000/400000]
Iter [2/10] Word [200000/400000]
Iter [2/10] Word [300000/400000]
Iter [3/10] Word [0/400000]
Iter [3/10] Word [100000/400000]
Iter [3/10] Word [200000/400000]
Iter [3/10] Word [300000/400000]
Iter [4/10] Word [0/400000]
Iter [4/10] Word [100000/400000]
Iter [4/10] Word [200000/400000]
Iter [4/10] Word [300000/400000]
Iter [5/10] Word [0/400000]
Iter [5/10] Word [100000/400000]
Iter [5/10] Word [200000/400000]
Iter [5/10] Word [300000/400000]
Iter [6/10] Word [0/400000]
Iter [6/10] Word [100000/400000]
Iter [6/10] Word [200000/400000]
Iter [6/10] Word [300000/400000]
Iter [7/10] Word [0/400000]
Iter [7/10] Word [100000/400000]
Iter [7/10] Word [200000/400000]
Iter [7/10] Word 

In [11]:
# NOTE(review): newWordVects is a dict, so np.save stores it as a pickled
# object array; reading it back needs np.load(..., allow_pickle=True).
np.save("newWordVects_wn.npy", newWordVects)

In [12]:
# Dump the retrofitted vectors as text, one "<word> <v1> <v2> ..." line per
# word, ready for the pymagnitude converter.
# The encoding is fixed to UTF-8 so the file does not depend on the platform
# default (the vocabulary may contain non-ASCII tokens).
with open("newVects_wn.txt", "w", encoding="utf-8") as f:
    vSize = len(newWordVects)
    for count, (word, emb) in enumerate(newWordVects.items()):
        if (count % 5000) == 0:
            print("Writing to Txt [{}/{}]".format(count, vSize))

        f.write(" ".join([word] + [str(num) for num in emb]))
        f.write("\n")

print("Written")

Writing to Txt [0/400000]
Writing to Txt [5000/400000]
Writing to Txt [10000/400000]
Writing to Txt [15000/400000]
Writing to Txt [20000/400000]
Writing to Txt [25000/400000]
Writing to Txt [30000/400000]
Writing to Txt [35000/400000]
Writing to Txt [40000/400000]
Writing to Txt [45000/400000]
Writing to Txt [50000/400000]
Writing to Txt [55000/400000]
Writing to Txt [60000/400000]
Writing to Txt [65000/400000]
Writing to Txt [70000/400000]
Writing to Txt [75000/400000]
Writing to Txt [80000/400000]
Writing to Txt [85000/400000]
Writing to Txt [90000/400000]
Writing to Txt [95000/400000]
Writing to Txt [100000/400000]
Writing to Txt [105000/400000]
Writing to Txt [110000/400000]
Writing to Txt [115000/400000]
Writing to Txt [120000/400000]
Writing to Txt [125000/400000]
Writing to Txt [130000/400000]
Writing to Txt [135000/400000]
Writing to Txt [140000/400000]
Writing to Txt [145000/400000]
Writing to Txt [150000/400000]
Writing to Txt [155000/400000]
Writing to Txt [160000/400000]
Wr

In [13]:
import subprocess
import sys

# Convert the text dump into a .magnitude file.
# sys.executable runs the converter with the kernel's own interpreter, so it
# uses the same environment as this notebook rather than whatever "python" is
# first on PATH.
# check_call raises CalledProcessError when the converter fails, so a broken
# conversion cannot be followed silently by an evaluation of a stale file.
# On success it evaluates to 0, as before.
subprocess.check_call([sys.executable, "-m", "pymagnitude.converter", "-i", "newVects_wn.txt", "-o", "newVects_wn.magnitude"])

0

## Evaluation on the new word vectors gives us a score of 46.80.

In [14]:
from lexsub_check import precision

# Score the retrofitted vectors on the dev set: predict substitutes for every
# dev line, then compare them with the reference file.
lexsub = LexSub('newVects_wn.magnitude')
dev_path = os.path.join('../data','input','dev.txt')
ref_path = os.path.join('../data','reference','dev.out')

output = []
with open(dev_path) as f:
    for line in f:
        # each line is "<target index>\t<sentence>"
        fields = line.strip().split('\t')
        target_index = int(fields[0].strip())
        tokens = fields[1].strip().split()
        predicted = lexsub.substitutes(target_index, tokens)
        output.append(" ".join(predicted))

with open(ref_path, 'rt') as refh:
    ref_data = [ref_line.strip() for ref_line in refh.read().splitlines()]

score = 100*precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=46.80


## We use a simple grid search to fine-tune the hyper-parameters and find that beta = 1.0, alpha = 2.0 gives us a slightly higher score of 47.45.

In [16]:
import subprocess
import sys

# Same pipeline as above with alpha=2.0: retrofit, dump, convert, evaluate.
newWordVects = retrofit(senseVecs, word2neigh, beta=1.0, alpha=2.0)
# NOTE(review): newWordVects is a dict, so np.save pickles it; loading needs
# np.load(..., allow_pickle=True).
np.save("newWordVects_wn2.npy", newWordVects)

# One "<word> <v1> <v2> ..." line per word. The encoding is fixed to UTF-8 so
# the dump does not depend on the platform default.
with open("newVects_wn2.txt", "w", encoding="utf-8") as f:
    vSize = len(newWordVects)
    for count, (word, emb) in enumerate(newWordVects.items()):
        if (count % 5000) == 0:
            print("Writing to Txt [{}/{}]".format(count, vSize))

        f.write(" ".join([word] + [str(num) for num in emb]))
        f.write("\n")

print("Written")


# Run the converter with the kernel's interpreter and fail loudly if it
# errors, so the evaluation below never scores a stale .magnitude file.
subprocess.check_call([sys.executable, "-m", "pymagnitude.converter", "-i", "newVects_wn2.txt", "-o", "newVects_wn2.magnitude"])

from lexsub_check import precision

lexsub = LexSub('newVects_wn2.magnitude')
output = []
with open(os.path.join('../data','input','dev.txt')) as f:
    for line in f:
        fields = line.strip().split('\t')
        # fields is [index, sentence]
        output.append(" ".join(lexsub.substitutes(int(fields[0].strip()), fields[1].strip().split())))

with open(os.path.join('../data','reference','dev.out'), 'rt') as refh:
    ref_data = [str(x).strip() for x in refh.read().splitlines()]
print("Score={:.2f}".format(100*precision(ref_data, output)))

Iter [0/10] Word [0/400000]
Iter [0/10] Word [100000/400000]
Iter [0/10] Word [200000/400000]
Iter [0/10] Word [300000/400000]
Iter [1/10] Word [0/400000]
Iter [1/10] Word [100000/400000]
Iter [1/10] Word [200000/400000]
Iter [1/10] Word [300000/400000]
Iter [2/10] Word [0/400000]
Iter [2/10] Word [100000/400000]
Iter [2/10] Word [200000/400000]
Iter [2/10] Word [300000/400000]
Iter [3/10] Word [0/400000]
Iter [3/10] Word [100000/400000]
Iter [3/10] Word [200000/400000]
Iter [3/10] Word [300000/400000]
Iter [4/10] Word [0/400000]
Iter [4/10] Word [100000/400000]
Iter [4/10] Word [200000/400000]
Iter [4/10] Word [300000/400000]
Iter [5/10] Word [0/400000]
Iter [5/10] Word [100000/400000]
Iter [5/10] Word [200000/400000]
Iter [5/10] Word [300000/400000]
Iter [6/10] Word [0/400000]
Iter [6/10] Word [100000/400000]
Iter [6/10] Word [200000/400000]
Iter [6/10] Word [300000/400000]
Iter [7/10] Word [0/400000]
Iter [7/10] Word [100000/400000]
Iter [7/10] Word [200000/400000]
Iter [7/10] Word 

## To improve the performance, we first modify the neighbor map so that the first word on each lexicon line is the anchor and the rest are its neighbors.

In [18]:
## lexFiles: can be a list of files
## return: neighboring words for each word 
## add all non-pairwise related neighbors
def get_oneside_neighbor_ws(lexFiles):
    """Build a one-directional anchor -> neighbours map from lexicon files.

    The first word of each line is the anchor; the remaining words on the
    line are its neighbours. The relation is not mirrored, so a neighbour
    does not get the anchor back. Lines sharing an anchor are merged.

    Args:
        lexFiles: sequence of lexicon file paths (``len`` is taken for the
            progress message).

    Returns:
        dict mapping each anchor to the list of its neighbours, without
        duplicates, without the anchor itself, in order of first
        appearance. An anchor that stands alone on its line maps to ``[]``.
    """
    word2neigh = {}
    for lexDex, lexF in enumerate(lexFiles):
        print("On [{}/{}]".format(lexDex, len(lexFiles)))
        with open(lexF, 'r') as f:
            for line in f:
                # split() without an argument ignores repeated spaces, so no
                # empty-string "words" are produced.
                cWords = line.lower().split()
                if not cWords:
                    # Blank line: there is no anchor to register.
                    continue
                anchor = cWords[0]
                neighs = word2neigh.setdefault(anchor, [])
                # Seeding with the anchor keeps it out of its own list.
                seen = set(neighs)
                seen.add(anchor)
                for word in cWords[1:]:
                    if word not in seen:
                        seen.add(word)
                        neighs.append(word)

    return word2neigh

## The evaluation score for this version is 53.20!

In [19]:
import subprocess
import sys

from lexsub_check import precision

# One-sided pipeline: build the anchor -> neighbours map, retrofit with
# alpha=2.0, dump, convert, evaluate.
word2neigh_oneside = get_oneside_neighbor_ws(lexFiles)
newWordVects_oneside = retrofit(senseVecs, word2neigh_oneside, beta=1.0, alpha=2.0)
# NOTE(review): the value is a dict, so np.save pickles it; loading needs
# np.load(..., allow_pickle=True).
np.save("newWordVects_oneside_wn.npy", newWordVects_oneside)

# One "<word> <v1> <v2> ..." line per word. The encoding is fixed to UTF-8 so
# the dump does not depend on the platform default.
with open("newVects_oneside_wn.txt", "w", encoding="utf-8") as f:
    vSize = len(newWordVects_oneside)
    for count, (word, emb) in enumerate(newWordVects_oneside.items()):
        if (count % 100000) == 0:
            print("Writing to Txt [{}/{}]".format(count, vSize))

        f.write(" ".join([word] + [str(num) for num in emb]))
        f.write("\n")

# create pymagnitude file
# Run the converter with the kernel's interpreter and fail loudly if it
# errors, so the evaluation below never scores a stale .magnitude file.
subprocess.check_call([sys.executable, "-m", "pymagnitude.converter", "-i", "newVects_oneside_wn.txt", "-o", "newVects_oneside_wn.magnitude"])


lexsub = LexSub('newVects_oneside_wn.magnitude')
output = []
with open(os.path.join('../data','input','dev.txt')) as f:
    for line in f:
        fields = line.strip().split('\t')
        # fields is [index, sentence]
        output.append(" ".join(lexsub.substitutes(int(fields[0].strip()), fields[1].strip().split())))

with open(os.path.join('../data','reference','dev.out'), 'rt') as refh:
    ref_data = [str(x).strip() for x in refh.read().splitlines()]
print("Score={:.2f}".format(100*precision(ref_data, output)))

On [0/1]
Iter [0/10] Word [0/400000]
Iter [0/10] Word [100000/400000]
Iter [0/10] Word [200000/400000]
Iter [0/10] Word [300000/400000]
Iter [1/10] Word [0/400000]
Iter [1/10] Word [100000/400000]
Iter [1/10] Word [200000/400000]
Iter [1/10] Word [300000/400000]
Iter [2/10] Word [0/400000]
Iter [2/10] Word [100000/400000]
Iter [2/10] Word [200000/400000]
Iter [2/10] Word [300000/400000]
Iter [3/10] Word [0/400000]
Iter [3/10] Word [100000/400000]
Iter [3/10] Word [200000/400000]
Iter [3/10] Word [300000/400000]
Iter [4/10] Word [0/400000]
Iter [4/10] Word [100000/400000]
Iter [4/10] Word [200000/400000]
Iter [4/10] Word [300000/400000]
Iter [5/10] Word [0/400000]
Iter [5/10] Word [100000/400000]
Iter [5/10] Word [200000/400000]
Iter [5/10] Word [300000/400000]
Iter [6/10] Word [0/400000]
Iter [6/10] Word [100000/400000]
Iter [6/10] Word [200000/400000]
Iter [6/10] Word [300000/400000]
Iter [7/10] Word [0/400000]
Iter [7/10] Word [100000/400000]
Iter [7/10] Word [200000/400000]
Iter [7/

## Second method with context words:

In [25]:
# Re-rank the substitutes proposed by lexsub using the words around the target.
# lexsub is the LexSub instance left behind by the previous evaluation cell.
output = []
ctx = 6  # context window: up to ctx words on each side of the target
vectors = lexsub.wvecs

# Ask for more candidates than the 10 that are finally kept.
lexsub.topn = 14
with open(os.path.join('../data','input','dev.txt')) as f:
    for line in f:
        fields = line.strip().split('\t')
        dex = int(fields[0])
        # Tokenise once, the same way for the context window and for
        # substitutes(), so dex points at the same token in both.
        sentence = fields[1].strip().split()
        st = max(dex - ctx, 0)
        end = min(dex + 1 + ctx, len(sentence))
        cCtx = sentence[st:dex] + sentence[dex + 1:end]

        sims = lexsub.substitutes(dex, list(sentence))

        avgScores = []
        balScores = []

        # Add method
        for sim in sims:
            tScore = vectors.similarity(sim, sentence[dex])
            if cCtx:
                ctxScore = sum(vectors.similarity(sim, cCtx))
                # Equals tScore + mean context similarity.
                balScore = (len(cCtx) * tScore + ctxScore) / (len(cCtx))
            else:
                # One-word sentence: no context to average over, so fall back
                # to the target similarity instead of dividing by zero.
                ctxScore = 0.0
                balScore = tScore
            avgScore = (tScore + ctxScore) / (ctx * 2.0 + 1)
            balScores.append(balScore)
            avgScores.append(avgScore)

        balScores = np.array(balScores)
        avgScores = np.array(avgScores)
        # argsort is ascending, so the kept indices run from the 10th best
        # up to the best.
        topWs = np.argsort(balScores)[-10:]

        res = []
        for topW in topWs:
            res.append(sims[topW])

        res = " ".join(res)
        output.append(res)


## We get a score of 49.79 after considering context words; this is much lower than our previous score without the post-processing...

In [26]:
from lexsub_check import precision

# Score the context re-ranked predictions against the dev reference.
ref_path = os.path.join('../data','reference','dev.out')
with open(ref_path, 'rt') as refh:
    ref_data = [ref_line.strip() for ref_line in refh.read().splitlines()]

score = 100*precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=49.79


## We also tried combining the results of the two methods, but the score is always lower... It seems that the best score is 53.20 without using context.

In [59]:

# Combine two predictions per dev line: the 8 substitutes lexsub returns
# directly, plus the 2 best context re-ranked candidates from lexsub_new.
output = []
ctx = 6  # context window: up to ctx words on each side of the target
lexsub.topn=8
lexsub_new = LexSub('newVects_oneside_wn.magnitude')
vectors = lexsub_new.wvecs
lexsub_new.topn = 14
with open(os.path.join('../data','input','dev.txt')) as f:
    for line in f:
        fields = line.strip().split('\t')
        dex = int(fields[0])
        # Tokenise once, the same way for the context window and for
        # substitutes(), so dex points at the same token in both.
        sentence = fields[1].strip().split()
        st = max(dex - ctx, 0)
        end = min(dex + 1 + ctx, len(sentence))
        cCtx = sentence[st:dex] + sentence[dex + 1:end]

        # NOTE(review): stopWords is not defined anywhere in the visible
        # notebook; presumably it comes from `from lexsub import *` or a
        # cell that was removed -- confirm it exists on a fresh kernel.
        cleanedCtx = []
        for word in cCtx:
            if word not in stopWords:
                cleanedCtx.append(word)

        cCtx = cleanedCtx

        sims = lexsub_new.substitutes(dex, list(sentence))

        avgScores = []
        balScores = []

        # Add method
        for sim in sims:
            tScore = vectors.similarity(sim, sentence[dex])
            if cCtx:
                ctxScore = sum(vectors.similarity(sim, cCtx))
                # Equals tScore + mean context similarity.
                balScore = (len(cCtx) * tScore + ctxScore) / (len(cCtx))
            else:
                # Stop-word filtering can leave no context at all: fall back
                # to the target similarity instead of dividing by zero.
                ctxScore = 0.0
                balScore = tScore
            avgScore = (tScore + ctxScore) / (ctx * 2.0 + 1)
            balScores.append(balScore)
            avgScores.append(avgScore)

        balScores = np.array(balScores)
        avgScores = np.array(avgScores)
        # argsort is ascending: the last two indices are the two best.
        topWs = np.argsort(balScores)[-2:]

        res = []
        res_new = []
        for topW in topWs:
            res_new.append(sims[topW])

        #res_new = " ".join(res_new)
        # NOTE(review): res_new may repeat words that are already in res_old,
        # in which case the 10 slots hold fewer than 10 distinct substitutes.
        res_old = lexsub.substitutes(dex, list(sentence))
        res_shared = res_old + res_new
        assert len(res_shared) == 10
        res = " ".join(res_shared)
        output.append(res)

In [60]:
from lexsub_check import precision

# Score the combined predictions against the dev reference.
ref_path = os.path.join('../data','reference','dev.out')
with open(ref_path, 'rt') as refh:
    ref_data = [ref_line.strip() for ref_line in refh.read().splitlines()]

score = 100*precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=50.68
