In [4]:
#Source: https://towardsdatascience.com/how-to-solve-analogies-with-word2vec-6ebaf2354009
import pandas as pd
import numpy as np
import os

import gensim
from gensim.models import KeyedVectors

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

In [8]:
# Load the Wikipedia GloVe vectors as gensim KeyedVectors (model_wiki).
path_glove_wiki = os.path.abspath('data/Wikipedia300.txt')
path_w2v_wiki = os.path.abspath('data/Wikipedia300_w2v.txt')

# Both paths are already absolute, so they are used directly. gensim's
# datapath()/get_tmpfile() are meant for gensim's bundled test data and the
# system temp directory; given an absolute path they returned it unchanged.
glove_file = path_glove_wiki
tmp_file = path_w2v_wiki

# The GloVe -> word2vec text conversion is slow, so reuse an existing converted
# file. Delete the *_w2v.txt file to force a fresh conversion.
if not os.path.exists(tmp_file):
    glove2word2vec(glove_file, tmp_file)

# Kept for any later cell that still reads `path`.
path = path_w2v_wiki

model_wiki = KeyedVectors.load_word2vec_format(path, binary=False)

In [30]:
# Load the CommonCrawl GloVe vectors as gensim KeyedVectors (model_cc).
path_glove_cc = os.path.abspath('data/CommonCrawl300.txt')
path_w2v_cc = os.path.abspath('data/CommonCrawl300_w2v.txt')

# Both paths are already absolute, so they are used directly. gensim's
# datapath()/get_tmpfile() are meant for gensim's bundled test data and the
# system temp directory; given an absolute path they returned it unchanged.
glove_file = path_glove_cc
tmp_file = path_w2v_cc

# The GloVe -> word2vec text conversion is slow, so reuse an existing converted
# file. Delete the *_w2v.txt file to force a fresh conversion.
if not os.path.exists(tmp_file):
    glove2word2vec(glove_file, tmp_file)

# Kept for any later cell that still reads `path`.
path = path_w2v_cc

model_cc = KeyedVectors.load_word2vec_format(path, binary=False)

In [31]:
# Load the Twitter GloVe vectors as gensim KeyedVectors (model_tw).
path_glove_tw = os.path.abspath('data/Twitter100.txt')
path_w2v_tw = os.path.abspath('data/Twitter100_w2v.txt')

# Both paths are already absolute, so they are used directly. gensim's
# datapath()/get_tmpfile() are meant for gensim's bundled test data and the
# system temp directory; given an absolute path they returned it unchanged.
glove_file = path_glove_tw
tmp_file = path_w2v_tw

# The GloVe -> word2vec text conversion is slow, so reuse an existing converted
# file. Delete the *_w2v.txt file to force a fresh conversion.
if not os.path.exists(tmp_file):
    glove2word2vec(glove_file, tmp_file)

# Kept for any later cell that still reads `path`.
path = path_w2v_tw

model_tw = KeyedVectors.load_word2vec_format(path, binary=False)

In [61]:
def analogy(model, worda, wordb, wordc):
    """Answer the analogy "worda is to wordb as wordc is to ?".

    Queries ``model.most_similar`` with ``wordb`` and ``wordc`` as positive
    terms and ``worda`` as the negative term.

    Args:
        model: object exposing ``most_similar(positive=..., negative=...)``.
        worda: word passed as the single negative term.
        wordb: first positive term.
        wordc: second positive term.

    Returns:
        The first element of the top-ranked entry returned by the model.
    """
    ranked = model.most_similar(
        positive=[wordb, wordc],
        negative=[worda],
    )
    top_hit = ranked[0]
    return top_hit[0]
    
def compare_models_analogy(worda, wordb, wordc):
    """Print the analogy answer from each of the three loaded models.

    Runs ``analogy`` against ``model_wiki``, ``model_cc`` and ``model_tw`` in
    that order and prints one labelled line per model.

    Args:
        worda: negative term of the analogy query.
        wordb: first positive term.
        wordc: second positive term.
    """
    wiki_answer = analogy(model_wiki, worda, wordb, wordc)
    print("Wikipedia: " + wiki_answer)

    cc_answer = analogy(model_cc, worda, wordb, wordc)
    print("CommonCrawl: " + cc_answer)

    tw_answer = analogy(model_tw, worda, wordb, wordc)
    print("Twitter: " + tw_answer)
    
def compare_models_similarity(word):
    """Print the words each model ranks as most similar to ``word``.

    For ``model_wiki``, ``model_cc`` and ``model_tw`` in turn, prints a
    heading, then the first element of every ``most_similar`` result on its
    own line, then a blank line.

    Args:
        word: query passed as ``positive`` to each model's ``most_similar``.
    """
    def show_neighbours(hits):
        # Only the first element of each result entry is printed.
        for hit in hits:
            print(hit[0])
        print()

    print("Wikipedia:") 
    show_neighbours(model_wiki.most_similar(positive=word))

    print("CommonCrawl:") 
    show_neighbours(model_cc.most_similar(positive=word))

    print("Twitter:") 
    show_neighbours(model_tw.most_similar(positive=word))
    
def compare_models_similarity_pairs(worda, wordb):
    """Print, per model, the neighbours of two words side by side.

    For ``model_wiki``, ``model_cc`` and ``model_tw`` in turn, the
    ``most_similar`` results for ``worda`` and ``wordb`` are placed next to
    each other in one DataFrame, which is printed under the model's heading.

    Args:
        worda: first query word; also the header of its neighbour column.
        wordb: second query word; also the header of its neighbour column.

    Returns:
        None. The tables are printed, not returned.
    """
    def neighbour_table(model):
        # Column labels are a flat list. The previous nested list
        # ([[word, 'cosine_sim_a']]) made pandas build a one-level MultiIndex,
        # which breaks ordinary label-based column access.
        left = pd.DataFrame(model.most_similar(positive=worda),
                            columns=[worda, 'cosine_sim_a'])
        right = pd.DataFrame(model.most_similar(positive=wordb),
                             columns=[wordb, 'cosine_sim_b'])
        # Align the two result lists row by row on their shared default index.
        return pd.concat([left, right], axis=1, join='inner')

    print("Wikipedia:")
    print(neighbour_table(model_wiki))
    print("CommonCrawl:")
    print(neighbour_table(model_cc))
    print("Twitter:")
    print(neighbour_table(model_tw))

In [108]:
# Nearest neighbours of "women" in the Wikipedia model, as (word, score) pairs.
model_wiki.most_similar(positive="women")

[('men', 0.7656920552253723),
 ('girls', 0.6347202062606812),
 ('female', 0.6183490753173828),
 ('woman', 0.604761004447937),
 ('male', 0.549830436706543),
 ('mothers', 0.5313720703125),
 ('athletes', 0.5268536806106567),
 ('she', 0.5137521028518677),
 ('young', 0.5133211016654968),
 ('children', 0.5079673528671265)]

In [109]:
# Same "women" query against all three models; only the neighbour words are printed.
compare_models_similarity("women")

Wikipedia:
men
girls
female
woman
male
mothers
athletes
she
young
children

CommonCrawl:
men
ladies
woman
girls
Women
females
female
womens
mothers
males

Twitter:
woman
girls
men
ladies
womens
female
females
other
they
people



In [94]:
# Side-by-side neighbour tables for "girl" and "boy" in each model.
compare_models_similarity_pairs("girl", "boy")

Wikipedia:
       girl cosine_sim_a       boy cosine_sim_b
0       boy     0.827289      girl     0.827289
1     woman     0.729642      boys     0.681233
2     girls     0.722729       kid     0.655293
3  teenager     0.650977       man     0.620828
4   teenage     0.649272   teenage     0.597385
5    mother     0.641797     child     0.595288
6      boys     0.628358  teenager     0.589650
7     child     0.622930    father     0.580069
8      teen     0.612524     girls     0.574547
9  daughter     0.605021       son     0.572696
CommonCrawl:
      girl cosine_sim_a       boy cosine_sim_b
0    girls     0.824546      girl     0.814832
1      boy     0.814832       kid     0.780285
2    woman     0.770079      boys     0.770499
3     lady     0.755798       man     0.704570
4     teen     0.740762  teenager     0.691475
5     sexy     0.724625     young     0.684768
6   blonde     0.695936       dad     0.679083
7  teenage     0.691481     daddy     0.678516
8    chick     0.688600  

In [37]:
# "man" is to "doctor" as "woman" is to ...?  (negative: man; positive: doctor, woman)
compare_models_analogy("man", "doctor", "woman")

Wikipedia: physician
CommonCrawl: nurse
Twitter: doctors


In [38]:
# "man" is to "programmer" as "woman" is to ...?  (negative: man; positive: programmer, woman)
compare_models_analogy("man", "programmer", "woman")

Wikipedia: programmers
CommonCrawl: programmers
Twitter: developer


In [39]:
# "man" is to "scientist" as "woman" is to ...?  (negative: man; positive: scientist, woman)
compare_models_analogy("man", "scientist", "woman")

Wikipedia: researcher
CommonCrawl: researcher
Twitter: researcher


In [41]:
# "man" is to "engineer" as "woman" is to ...?  (negative: man; positive: engineer, woman)
compare_models_analogy("man", "engineer", "woman")

Wikipedia: technician
CommonCrawl: technician
Twitter: technician


In [98]:
# Reversed direction: "woman" is to "smart" as "man" is to ...?  (negative: woman; positive: smart, man)
compare_models_analogy("woman", "smart", "man")

Wikipedia: intelligent
CommonCrawl: intelligent
Twitter: cool


In [102]:
# "man" is to "trans" as "woman" is to ...?  (negative: man; positive: trans, woman)
compare_models_analogy("man", "trans", "woman")

Wikipedia: transatlantic
CommonCrawl: Trans
Twitter: indonesian


In [44]:
# NOTE(review): repeats the earlier ("woman", "smart", "man") query with identical arguments.
compare_models_analogy("woman", "smart", "man")

Wikipedia: intelligent
CommonCrawl: intelligent
Twitter: cool
