# Word Embeddings
This document contains the source code for the addition of word embeddings.


## Step 1: Get embeddings & setup for desired embedding.

In [1]:
##Harvest the embeddings from files.
##The number in each name is the window size the vectors were trained with
##(see the naming note below).
filew2 = "w2_vectors.txt"
filew5 = "w5_vectors.txt"
filew7 = "w7_vectors.txt"
#skip-gram embeddings files:
filew2sg = "w2_vectors_sg.txt"
filew5sg = "w5_vectors_sg.txt"
filew7sg = "w7_vectors_sg.txt"


##To modify which embeddings are used:
##change the dictFile to 'candidateDictw[WINDOW SIZE]' with suffix 'SG' for skip-gram
##change the outFile to 'EmbeddedOutput[WINDOW SIZE]' with suffix 'SG' for skip-gram
##change the input to the below 'with open(FILENAME)' to one of the files above.

##File that caches the candidates already computed for each query term.
dictFile = "candidateDictw2SG.txt"
##change this to prevent over-writing the wrong file for output!
##NOTE: mode "w" truncates the file as soon as this line runs; the handle stays open
##until the output cell at the end of the notebook closes it.
outFile = open("EmbeddedOutput2SG.txt","w")
##Read the whole embeddings file as bytes and decode it as UTF-8.
with open(filew2sg, "rb") as f:
    wText = f.read().decode('utf-8')
##One entry per line of the file.
Words = wText.split("\n")
##Drop the first line: it is a header rather than a word vector (the notebook echoes it
##as '93951 300' - presumably vocabulary size and vector dimension; confirm against the file).
Words.pop(0)


'93951 300'

##  Vectorize function:
Convert the lines read from the embeddings file into a dict that maps each word to its vector, so that the vector for any word can be looked up directly.

In [2]:
def vectorize(vectors):
    """Parse embedding text lines into a ``{word: [float, ...]}`` dict.

    Each line is split on single spaces: the first field is the word and the
    remaining non-blank fields are its vector components. Blank fields (for
    example the one produced by a trailing space, or a stray carriage return)
    are ignored, so lines with and without a trailing space both keep every
    component.

    Lines whose first field is empty, and lines that carry a word but no
    components, are skipped.

    Args:
        vectors: Sequence of text lines, one ``"word v1 v2 ... vn"`` entry
            per line, with any header line already removed.

    Returns:
        Dict mapping each word to its list of float components.

    Raises:
        ValueError: If a non-blank component cannot be converted to float.
    """
    ##the count is reported as len - 1, matching the trailing empty entry that
    ##splitting a newline-terminated file leaves at the end of the list.
    print("Vectorizing " + str(len(vectors) - 1) + " words")
    output = {}
    for lineNumber, line in enumerate(vectors, 1):
        fields = line.split(" ")
        thisWord = fields[0]
        if thisWord != "":
            ##keep every real component; only whitespace-only fields are dropped.
            components = [float(field) for field in fields[1:] if field.strip()]
            if components:
                output[thisWord] = components
        ##progress report every 10000 input lines.
        if lineNumber % 10000 == 0:
            print("Vectorizing word:" + str(lineNumber))
    return output
##Build the word -> vector lookup from the embedding lines read above.
wVec = vectorize(Words)


Vectorizing 93951 words
Vectorizing word:10000
Vectorizing word:20000
Vectorizing word:30000
Vectorizing word:40000
Vectorizing word:50000
Vectorizing word:60000
Vectorizing word:70000
Vectorizing word:80000
Vectorizing word:90000


## Getting Queries
As in the code used for Task 2 (Generating queries), we open the Topics files and reduce each topic title to a cleaned string of space-separated terms, one string per query.

In [3]:
##Load the raw topic text that the queries are extracted from
##(same approach as in the Query Processing file).
filename1 = "topics.51-100.txt"
filename2 = "topics.101-150.txt"
filename3 = "topics.151-200.txt"


def _readUtf8(path):
    """Return the full contents of the file at *path*, decoded as UTF-8."""
    with open(path, "rb") as handle:
        rawBytes = handle.read()
    return rawBytes.decode('utf-8')


##one content string per topic file, in the same order as the filenames.
content1, content2, content3 = (
    _readUtf8(topicPath) for topicPath in (filename1, filename2, filename3)
)

##grab individual queries
import re

def _cleanTopicTitle(rawTopic):
    """Reduce one raw ``<title>`` chunk of a topics file to a cleaned query string.

    Steps, in order:
      * keep only the text before ``<desc>`` and remove the ``Topic:`` label;
      * replace slashes, dashes, newlines and full stops with spaces so that
        conjoined terms are separated;
      * delete every character that is not an ASCII letter or a space;
      * collapse runs of spaces into one;
      * strip leading whitespace (a trailing space, if any, is left in place).

    Args:
        rawTopic: Text that followed a ``<title>`` tag in a topics file.

    Returns:
        The cleaned title as a single string of space-separated terms.
    """
    text = rawTopic.split("<desc>")[0].replace("Topic:","")
    for separator in ("/", "-", "\n", "."):
        text = text.replace(separator, " ")
    text = re.sub(r"[^a-zA-Z ]","", text)
    text = re.sub(r" +"," ", text)
    return text.lstrip()


##break each query block into separate queries.
q1 = content1.split("<title>")
q2 = content2.split("<title>")
q3 = content3.split("<title>")
##everything before the first <title> tag is not a query, so drop it.
q1.pop(0)
q2.pop(0)
q3.pop(0)

##clean every topic with the same routine; one filtered list per topics file.
filteredQ1 = [_cleanTopicTitle(q) for q in q1]
filteredQ2 = [_cleanTopicTitle(q) for q in q2]
filteredQ3 = [_cleanTopicTitle(q) for q in q3]


## Candidate Dictionary Processing & Use
This is how we store & retrieve our candidate lists from the saved file. 
To start, we open the file and get all the stored candidates from it.
Later, each query term is looked up in the dictionary we have read; if the term is not found, its candidates are computed and saved with addCandidate.
This ensures we only need to find candidates once per unique term.


In [4]:
##candidate dictionary processing
##Maps a query word to the list of candidate terms previously stored for it.
candidateDictionary = {}

##A dictionary file that does not exist yet simply means nothing has been cached so far,
##so start with an empty dictionary instead of failing.
try:
    with open(dictFile, "rb") as f:
        dictContent = f.read().decode('utf-8')
except FileNotFoundError:
    dictContent = ""

##Records may be separated by "\r\n" or "\n" depending on the platform that wrote the
##file, so normalise the line endings before splitting.
dictTerms = dictContent.replace("\r\n", "\n").split("\n")
for term in dictTerms:
    ##each record has the form 'word~cand1 cand2 ... '
    thisDict = term.split("~")
    ##skip blank or malformed lines (addCandidate starts every record with a newline,
    ##so blank lines are expected).
    if len(thisDict) < 2:
        continue
    word = thisDict[0]
    ##records end with a trailing space, so the last entry of 'terms' may be ''.
    terms = thisDict[1].split(" ")
    candidateDictionary[word] = terms

print("Dictionary Size: " +str(candidateDictionary.__len__()))

def addCandidate(word,candidates):
    """Append one ``word~cand1 cand2 ... `` record to the candidate dictionary file.

    The record starts with a newline and every candidate is followed by a
    single space. The file is written as UTF-8 because the loader decodes the
    dictionary file as UTF-8; relying on the platform default encoding could
    make the file unreadable once a non-ASCII term is stored.

    Args:
        word: The query word the candidates belong to.
        candidates: Iterable of candidate term strings for *word*.
    """
    with open(dictFile, "a", encoding="utf-8") as f:
        record = "\n"+word+"~"
        for term in candidates:
            ##echo each stored candidate to stdout, as before.
            print(term)
            record += term+" "
        f.write(record)

Dictionary Size: 478


## Query Expansion
#### Candidate Generation:
For the first section here, we process each query, looking at each word individually.
We first check our candidate Dictionary to make sure we don't re-do any terms we've already checked.
If we can find the word in there, we add its stored candidates to the list of potential terms - the QueryTermsCandidates.

If we can't find the word in there, we compare all terms in the vectors provided from word embeddings with this word's vector.
This allows us to find the most similar vectors in the entire collection - we store the k (five) highest-scoring terms for this word in the dictionary for later, and add these terms to the Candidate collection.

#### Query Expansion:
Once we're finished creating the candidate set, we parse through these candidates and find the ones that best match all the query terms. This is done by comparing the sum of vector comparisons with the query terms and this candidate, and selecting the best j (three) terms from these.
These new terms are added to the expandedQueries set to be output later.

In [5]:
from scipy import spatial
import operator

##combine our queries, and prepare the output set.
##combinedQueries holds one cleaned query string per topic, in file order.
combinedQueries = filteredQ1 + filteredQ2 + filteredQ3
##expandQ appends one list of terms per processed query to this list.
expandedQueries = []
##k and j are the two constants for expansion. k= number of candidates to produce per word,
##j = number of final best candidates from all terms to expand query with.
k=5
j=3


def _nearestTerms(vectors, word, count):
    """Return the *count* vocabulary words most cosine-similar to *word*, best first.

    *word* must be a key of *vectors*. Every vocabulary entry is scored,
    including *word* itself, so the word is typically part of the result.
    """
    wordVector = vectors[word]
    ##similarity = 1 - cosine distance, computed against every vocabulary entry.
    vectorComparisons = {}
    for candidate, candidateVector in vectors.items():
        vectorComparisons[candidate] = 1 - spatial.distance.cosine(candidateVector, wordVector)
    ##ascending sort followed by reverse() puts the highest scores first.
    sortedComparisons = sorted(vectorComparisons.items(), key=operator.itemgetter(1))
    sortedComparisons.reverse()
    ##keep only the words of the first (best) 'count' entries.
    return [entry[0] for entry in sortedComparisons[0:count]]


def _scoreCandidates(vectors, queryTerms, candidateTerms):
    """Return ``(term, score)`` pairs, best first, for candidates not already in the query.

    A candidate's score is the sum of its cosine similarities to every query
    word that has a vector. Each distinct candidate is scored once, so the
    same term cannot occupy several expansion slots.
    """
    loweredQuery = [queryWord.lower() for queryWord in queryTerms]
    queryVectors = [vectors[queryWord] for queryWord in loweredQuery if queryWord in vectors]
    termQscores = []
    alreadyScored = set()
    for term in candidateTerms:
        ##Only consider words that aren't in the query (self-translation is OK,
        ##but we don't want to expand with duplicates!)
        if term in alreadyScored or term in loweredQuery:
            continue
        alreadyScored.add(term)
        termScore = 0
        if queryVectors:
            termVector = vectors.get(term.lower(), vectors.get(term))
            if termVector is None:
                ##a cached candidate that the current embeddings do not contain
                ##cannot be scored, so it is left out.
                continue
            for queryVector in queryVectors:
                termScore += (1 - spatial.distance.cosine(termVector, queryVector))
        termQscores.append((term, termScore))
    ##sort the candidates by their combined similarity, highest first.
    termQscores = sorted(termQscores, key=lambda tup: tup[1])
    termQscores.reverse()
    return termQscores


def expandQ(vectors,start):
    """Expand each of the first 150 combined queries with its j best candidate terms.

    For every word of a query, up to k candidate terms are taken from
    ``candidateDictionary`` when the word is cached there; otherwise they are
    computed from *vectors*, cached in ``candidateDictionary`` as a list of
    words (the same shape as entries loaded from the dictionary file) and
    persisted with ``addCandidate``. The candidates are then ranked against
    the whole query and the best j are appended to the query's terms as
    ``(term, score)`` tuples. Empty strings produced by splitting the query
    are discarded.

    Args:
        vectors: Dict mapping each vocabulary word to its vector.
        start: Counter incremented once per query; it is not used for
            anything else (the call site describes it as a debugging aid).

    Returns:
        None. One list per query (original terms followed by the expansion
        tuples) is appended to the module-level ``expandedQueries``.
    """
    for query in combinedQueries[0:150]:
        ##Candidate Generation Begins Here:
        start+=1
        queryTerms = [queryWord for queryWord in query.split(" ") if queryWord != ""]
        ##define the list of candidate terms we'll be using:
        queryTermsCandidates = []
        ##for all words in each query:
        for word in queryTerms:
            word = word.lower()
            ##check if we've already processed this word.
            if word in candidateDictionary:
                candidateWordsOnly = list(candidateDictionary[word])
            ##if we haven't processed it, we need to find candidates:
            elif word in vectors:
                candidateWordsOnly = _nearestTerms(vectors, word, k)
                ##store plain words, not (word, score) pairs, so that cached
                ##entries look the same whether loaded from file or computed now.
                candidateDictionary[word] = candidateWordsOnly
                addCandidate(word,candidateWordsOnly)
            else:
                ##no cached candidates and no vector: nothing to add for this word.
                continue
            queryTermsCandidates.extend(candidateWordsOnly)
        ##Query Expansion Begins Here:
        ##remove blank entries from list
        candidateTerms = [term for term in queryTermsCandidates if term != '']
        termQscores = _scoreCandidates(vectors, queryTerms, candidateTerms)
        ##Add the j best candidate terms to the query.
        for t in termQscores[0:j]:
            queryTerms.append(t)
        expandedQueries.append(queryTerms)
            

    
##run the expansion - '51' starting number is for debugging!
##expandQ returns nothing; its results are appended to the expandedQueries list.
expandQ(wVec,51)

In [6]:
##Write every expanded query to the already-open output file as one <top> record.
##Query numbers start at 51 and increase by one per query.
qNo = 51
for qu in expandedQueries:
    ##expansion terms are (term, score) tuples - only the term text is written;
    ##every term is followed by a single space.
    titleText = "".join(
        (word[0] if isinstance(word, tuple) else word) + " " for word in qu
    )
    outFile.writelines([
        "<top>\n",
        "<num>" + str(qNo) + "</num><title>\n",
        titleText,
        "\n</title>\n",
        "</top>\n",
    ])
    qNo += 1
outFile.close()

150
['Airbus', 'Subsidies', ('industrie', 1.0435378713215067), ('subsidy', 0.905325539985559), ('aircraft', 0.8691413045539917)]
['South', 'African', 'Sanctions', ('africa', 1.5865439608153737), ('embargoes', 1.383595974375249), ('disinvestment', 1.3258873086935345)]
['Leveraged', 'Buyouts', ('buyout', 1.3675134911333136), ('restructurings', 1.3271809923368283), ('lbo', 1.3124796045044602)]
['Satellite', 'Launch', 'Contracts', ('tdrs', 1.4979813220714395), ('geostationary', 1.4708854069879176), ('navstar', 1.4490359552755456)]
['Insider', 'Trading', ('frauds', 0.9705184820700984), ('directionless', 0.942517960646976), ('merc', 0.9342479422430824)]
['Prime', 'Lending', 'Rate', 'Moves', 'Predictions', ('interst', 2.0784737063822765), ('iniatives', 2.0193693504352552), ('underemployment', 2.0020665229907086)]
['MCI', ('sprint', 0.6006605633561322), ('tci', 0.5691070243876946), ('mccaw', 0.5507547828965591)]
['Rail', 'Strikes', ('walkouts', 1.062361830274991), ('strike', 1.0534123178735837