# IR Assignment5 - Word2Vec
Tiffany Hoeung

In [82]:
from post_parser_record import PostParserRecord
post_reader = PostParserRecord("Posts_Coffee.xml")

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import re, string
import nltk
from nltk.corpus.reader.tagged import word_tokenize
import scipy
from scipy.spatial import distance

In [83]:
# Create a list for all question titles; form: [['word1', 'word2', 'word3'], ['word1', 'word2', 'word3']]
q_titles = []

# Collect every title; remove html tags and punctuations then tokenize to just words.
for id in post_reader.map_questions:
    title = re.sub("<.*?>|\\n|&quot;", " ", post_reader.map_questions[id].title.lower())
    q_titles.append(word_tokenize(title.translate(str.maketrans('', '', string.punctuation))))

# Initialize word2vec and pass in the titles.
model = Word2Vec(sentences=q_titles, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [84]:
# Train the model on the titles
model.train(q_titles, total_examples=len(q_titles), epochs=10)

(84774, 122710)

In [85]:
# Test vector and similarity retrievals
vector_coffee = model.wv['coffee']
print(vector_coffee.shape)
vector_espresso = model.wv['espresso']
print(vector_espresso.shape)
sim_coffee = model.wv.most_similar('coffee', topn=10)
print(sim_coffee)

(100,)
(100,)
[('beans', 0.9997697472572327), ('or', 0.9997586607933044), ('in', 0.9997496008872986), ('for', 0.9997355341911316), ('of', 0.9997085332870483), ('after', 0.9997034072875977), ('that', 0.9996819496154785), ('from', 0.9996774792671204), ('when', 0.9996718168258667), ('using', 0.9996609091758728)]


In [86]:
# Get vector representation of "When does coffee go off?"
question = "when does coffee go off"
question_vector = []

# For each word in the title, get the vector and average it with our current vector
for word in word_tokenize(question):
    if len(question_vector) == 0:
        question_vector = model.wv[word]
    else:
        temp_v = model.wv[word]
        question_vector = (question_vector + temp_v) / 2

question_vector

array([-0.11868864,  0.2266302 ,  0.02084659, -0.06473427,  0.0157832 ,
       -0.30569673,  0.12499233,  0.5164074 , -0.20170736, -0.19295172,
       -0.04602785, -0.3431693 , -0.01850331,  0.12144019,  0.07014248,
       -0.13679299,  0.08527677, -0.17394054, -0.07834335, -0.43059433,
        0.13129438,  0.01902593,  0.21266903, -0.07797459, -0.00861693,
       -0.03301916, -0.16221264, -0.10207799, -0.1731482 ,  0.02661534,
        0.20300892,  0.02324446,  0.09492109, -0.21272533, -0.10176508,
        0.22573161,  0.08369771, -0.09838966, -0.07028008, -0.31883806,
       -0.00269959, -0.17561169, -0.1558606 , -0.0011044 ,  0.14605427,
       -0.11525551, -0.17463887,  0.00443599,  0.08187588,  0.15162812,
        0.09642218, -0.18273005, -0.01624926, -0.06014058, -0.07306454,
        0.08204321,  0.11317929, -0.07266031, -0.18640903,  0.04403614,
       -0.01583135,  0.00140987,  0.1541054 , -0.02393781, -0.25282645,
        0.26129138,  0.04579557,  0.2283531 , -0.28406054,  0.24

In [87]:
# Collect vectors for every question title in the collection
title_vectors = {}
for id in post_reader.map_questions:
    title = re.sub("<.*?>|\\n|&quot;", " ", post_reader.map_questions[id].title.lower())
    title_tokens = (word_tokenize(title.translate(str.maketrans('', '', string.punctuation))))
    # For every word in this title, get the vector
    for word in title_tokens:
        # If vector exist already, get avg, else just set {id: vec} in dictionary
        if id in title_vectors:
            temp = model.wv[word]
            title_vectors[id] = (title_vectors[id] + temp) / 2
        else:
            title_vectors[id] = model.wv[word]
# Print out the vectors: should be like {docid: arr[vec], docid: arr[vec]}
title_vectors

{1: array([-0.24864945,  0.45493132,  0.03044859, -0.12945858,  0.03800869,
        -0.6200334 ,  0.25264242,  1.0583717 , -0.41806036, -0.4050075 ,
        -0.09372239, -0.709638  , -0.03516508,  0.26635617,  0.14084676,
        -0.28913638,  0.16907492, -0.3576761 , -0.16485322, -0.88637614,
         0.29062927,  0.04314749,  0.43729413, -0.1641171 , -0.00848097,
        -0.07149264, -0.3249207 , -0.21008363, -0.3629509 ,  0.0572073 ,
         0.432461  ,  0.05017675,  0.198552  , -0.44090468, -0.20682324,
         0.46200228,  0.16670434, -0.21154258, -0.1441358 , -0.6622558 ,
         0.00658343, -0.35301712, -0.3217578 , -0.01565921,  0.29630518,
        -0.22647129, -0.35327473,  0.01842504,  0.16476251,  0.32647228,
         0.20135333, -0.36492336, -0.0478727 , -0.12701644, -0.15621096,
         0.1813293 ,  0.24530149, -0.13564487, -0.38817835,  0.09359574,
        -0.01808565,  0.00697592,  0.32966524, -0.06101253, -0.53392726,
         0.542374  ,  0.09709424,  0.47752398, -

In [91]:
# Using the title vectors + current question vector, find best cosine similarities:
title_cosine = {}

for id in title_vectors:
    title_cosine[id] = (1 - distance.cosine(title_vectors[id], question_vector))

title_cosine = dict(sorted(title_cosine.items(), key=lambda item: item[1], reverse=True))
title_cosine

{123: 1,
 3205: 0.9997643232345581,
 5907: 0.9997629523277283,
 4863: 0.9997576475143433,
 2581: 0.9997561573982239,
 4268: 0.9997550845146179,
 2437: 0.9997541904449463,
 3326: 0.9997541904449463,
 2363: 0.999754011631012,
 5088: 0.9997535943984985,
 1552: 0.9997534155845642,
 104: 0.999752402305603,
 5027: 0.9997513890266418,
 1533: 0.9997491836547852,
 3706: 0.9997487664222717,
 1775: 0.9997475743293762,
 3837: 0.9997475743293762,
 4014: 0.9997473359107971,
 4258: 0.9997467994689941,
 4356: 0.999745786190033,
 3288: 0.99974524974823,
 5925: 0.9997451901435852,
 3873: 0.999743640422821,
 4611: 0.999743640422821,
 5923: 0.9997435212135315,
 3635: 0.9997434616088867,
 4542: 0.9997433423995972,
 4539: 0.9997432827949524,
 5840: 0.9997428059577942,
 3254: 0.9997427463531494,
 4479: 0.9997426867485046,
 3690: 0.9997420907020569,
 2936: 0.9997419118881226,
 3198: 0.9997418522834778,
 4206: 0.9997416138648987,
 1945: 0.9997411966323853,
 2432: 0.999740719795227,
 4242: 0.9997405409812927,
 

In [89]:
# Printing top 5 results plus their titles and ids
top5_cosine = dict(sorted(title_cosine.items(), key=lambda item: item[1], reverse=True)[:5])
print("Top 5 similar results to \'When does coffee go off\':")
for id in top5_cosine:
    title = post_reader.map_questions[id].title
    print("%d: %.10f %s" % (id, top5_cosine[id], title))

Top 5 similar results to 'When does coffee go off':
123: 1.0000000000 When does coffee go off?
3205: 0.9997643232 Are there any natural sweeteners that work well with coffee?
5907: 0.9997629523 Why are y'all so obsessed with coffee?
4863: 0.9997576475 Conversion of Krups Nespresso Machine to Ground Coffee?
2581: 0.9997561574 Is it good to have the "floating oil" on the surface of some coffee drinks?
