In [4]:
#Source: https://towardsdatascience.com/how-to-solve-analogies-with-word2vec-6ebaf2354009
import pandas as pd
import numpy as np
import os

import gensim
from gensim.models import KeyedVectors

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

In [8]:
# Raw GloVe vectors and the word2vec-format copy that gensim can load.
path_glove_wiki = os.path.abspath('data/Wikipedia300.txt')
path_w2v_wiki = os.path.abspath('data/Wikipedia300_w2v.txt')

# Use the absolute paths as they are. gensim.test.utils.datapath/get_tmpfile
# are meant for gensim's bundled test data and the system temp directory; with
# an absolute argument they just hand the same path back, so they add nothing.
glove_file = path_glove_wiki
tmp_file = path_w2v_wiki

# The conversion rewrites the whole vector file, so only do it when the
# converted copy is missing. Delete that file to force a fresh conversion.
if not os.path.exists(tmp_file):
    glove2word2vec(glove_file, tmp_file)

path = path_w2v_wiki

# The converted copy is a text file, hence binary=False.
model_wiki = KeyedVectors.load_word2vec_format(path, binary=False)

In [30]:
# Raw GloVe vectors and the word2vec-format copy that gensim can load.
path_glove_cc = os.path.abspath('data/CommonCrawl300.txt')
path_w2v_cc = os.path.abspath('data/CommonCrawl300_w2v.txt')

# Use the absolute paths as they are. gensim.test.utils.datapath/get_tmpfile
# are meant for gensim's bundled test data and the system temp directory; with
# an absolute argument they just hand the same path back, so they add nothing.
glove_file = path_glove_cc
tmp_file = path_w2v_cc

# The conversion rewrites the whole vector file, so only do it when the
# converted copy is missing. Delete that file to force a fresh conversion.
if not os.path.exists(tmp_file):
    glove2word2vec(glove_file, tmp_file)

path = path_w2v_cc

# The converted copy is a text file, hence binary=False.
model_cc = KeyedVectors.load_word2vec_format(path, binary=False)

In [31]:
# Raw GloVe vectors and the word2vec-format copy that gensim can load.
path_glove_tw = os.path.abspath('data/Twitter100.txt')
path_w2v_tw = os.path.abspath('data/Twitter100_w2v.txt')

# Use the absolute paths as they are. gensim.test.utils.datapath/get_tmpfile
# are meant for gensim's bundled test data and the system temp directory; with
# an absolute argument they just hand the same path back, so they add nothing.
glove_file = path_glove_tw
tmp_file = path_w2v_tw

# The conversion rewrites the whole vector file, so only do it when the
# converted copy is missing. Delete that file to force a fresh conversion.
if not os.path.exists(tmp_file):
    glove2word2vec(glove_file, tmp_file)

path = path_w2v_tw

# The converted copy is a text file, hence binary=False.
model_tw = KeyedVectors.load_word2vec_format(path, binary=False)

In [123]:
def compare_models_analogy(worda, wordb, wordc, topn=10):
    """Print each embedding's answers to the analogy ``worda : wordb :: wordc : ?``.

    For the Wikipedia, CommonCrawl and Twitter models in turn, this calls
    ``most_similar(negative=[worda], positive=[wordb, wordc])`` and prints the
    result as a two-column table.

    Args:
        worda: Word passed as the negative term of the query.
        wordb: First positive term of the query.
        wordc: Second positive term; also used as the header of the word column.
        topn: Number of results requested from each model. Defaults to 10.

    Returns:
        None. The tables are written to standard output.
    """
    models = (
        ("Wikipedia:", model_wiki),
        ("CommonCrawl:", model_cc),
        ("Twitter:", model_tw),
    )
    for label, model in models:
        print(label)
        matches = model.most_similar(
            negative=[worda], positive=[wordb, wordc], topn=topn
        )
        # A flat list of labels gives ordinary columns; wrapping it in a
        # second list would build MultiIndex columns instead.
        print(pd.DataFrame(matches, columns=[wordc, 'cosine_sim']))
    
def compare_models_similarity(word, topn=10):
    """Print the words each embedding ranks as most similar to ``word``.

    For the Wikipedia, CommonCrawl and Twitter models in turn, this calls
    ``most_similar(positive=word)`` and prints the result as a two-column table.

    Args:
        word: Query word; also used as the header of the word column.
        topn: Number of results requested from each model. Defaults to 10.

    Returns:
        None. The tables are written to standard output.
    """
    models = (
        ("Wikipedia:", model_wiki),
        ("CommonCrawl:", model_cc),
        ("Twitter:", model_tw),
    )
    for label, model in models:
        print(label)
        matches = model.most_similar(positive=word, topn=topn)
        # A flat list of labels gives ordinary columns; wrapping it in a
        # second list would build MultiIndex columns instead.
        print(pd.DataFrame(matches, columns=[word, 'cosine_sim']))
    
def compare_models_similarity_pairs(worda, wordb, topn=10):
    """Print, per embedding, the neighbour lists of two words side by side.

    For the Wikipedia, CommonCrawl and Twitter models in turn, this calls
    ``most_similar`` once for ``worda`` and once for ``wordb``, then prints the
    two result tables joined column-wise on their row position.

    Args:
        worda: First query word; header of the first word column.
        wordb: Second query word; header of the second word column.
        topn: Number of results requested per word and model. Defaults to 10.

    Returns:
        None. The tables are written to standard output.
    """
    models = (
        ("Wikipedia:", model_wiki),
        ("CommonCrawl:", model_cc),
        ("Twitter:", model_tw),
    )
    for label, model in models:
        print(label)
        # Flat label lists give ordinary columns; wrapping them in a second
        # list would build MultiIndex columns instead.
        neighbours_a = pd.DataFrame(
            model.most_similar(positive=worda, topn=topn),
            columns=[worda, 'cosine_sim_a'],
        )
        neighbours_b = pd.DataFrame(
            model.most_similar(positive=wordb, topn=topn),
            columns=[wordb, 'cosine_sim_b'],
        )
        # join='inner' keeps only the row positions present in both tables.
        print(pd.concat([neighbours_a, neighbours_b], axis=1, join='inner'))

In [111]:
# Raw gensim query: the words closest to "strong" in the Wikipedia vectors.
model_wiki.most_similar(positive="strong")

[('stronger', 0.6895867586135864),
 ('despite', 0.6242086291313171),
 ('weak', 0.6232478618621826),
 ('strongest', 0.622769832611084),
 ('robust', 0.6030492782592773),
 ('strength', 0.5954651236534119),
 ('consistent', 0.570941150188446),
 ('solid', 0.5683797001838684),
 ('support', 0.5610021352767944),
 ('very', 0.5595080852508545)]

In [119]:
# Words most similar to "women" in each of the three embeddings.
compare_models_similarity("women")

Wikipedia:
      women cosine_sim
0       men   0.765692
1     girls   0.634720
2    female   0.618349
3     woman   0.604761
4      male   0.549830
5   mothers   0.531372
6  athletes   0.526854
7       she   0.513752
8     young   0.513321
9  children   0.507967
CommonCrawl:
     women cosine_sim
0      men   0.825095
1   ladies   0.724850
2    woman   0.717893
3    girls   0.705516
4    Women   0.690789
5  females   0.678750
6   female   0.655832
7   womens   0.648444
8  mothers   0.618875
9    males   0.616936
Twitter:
     women cosine_sim
0    woman   0.805092
1    girls   0.770198
2      men   0.747575
3   ladies   0.747394
4   womens   0.740094
5   female   0.736097
6  females   0.711881
7    other   0.704463
8     they   0.699757
9   people   0.696706


In [94]:
# Side-by-side lists of the words most similar to "girl" and to "boy", one table per embedding.
compare_models_similarity_pairs("girl", "boy")

Wikipedia:
       girl cosine_sim_a       boy cosine_sim_b
0       boy     0.827289      girl     0.827289
1     woman     0.729642      boys     0.681233
2     girls     0.722729       kid     0.655293
3  teenager     0.650977       man     0.620828
4   teenage     0.649272   teenage     0.597385
5    mother     0.641797     child     0.595288
6      boys     0.628358  teenager     0.589650
7     child     0.622930    father     0.580069
8      teen     0.612524     girls     0.574547
9  daughter     0.605021       son     0.572696
CommonCrawl:
      girl cosine_sim_a       boy cosine_sim_b
0    girls     0.824546      girl     0.814832
1      boy     0.814832       kid     0.780285
2    woman     0.770079      boys     0.770499
3     lady     0.755798       man     0.704570
4     teen     0.740762  teenager     0.691475
5     sexy     0.724625     young     0.684768
6   blonde     0.695936       dad     0.679083
7  teenage     0.691481     daddy     0.678516
8    chick     0.688600  

In [126]:
# Analogy "man : doctor :: woman : ?" (doctor - man + woman) in each embedding.
compare_models_analogy("man", "doctor", "woman")

Wikipedia:
        woman cosine_sim
0   physician   0.609857
1       nurse   0.605909
2     doctors   0.591393
3    pregnant   0.533370
4     dentist   0.524034
5     medical   0.511250
6  pharmacist   0.504334
7     surgeon   0.500094
8      nurses   0.498949
9  physicians   0.498566
CommonCrawl:
          woman cosine_sim
0         nurse   0.692357
1       doctors   0.667855
2     physician   0.662202
3      pregnant   0.650009
4  gynecologist   0.626058
5    pharmacist   0.620841
6       midwife   0.609780
7  pediatrician   0.598572
8     pregnancy   0.592360
9       medical   0.592073
Twitter:
         woman cosine_sim
0      doctors   0.649473
1       mother   0.609547
2      dentist   0.588896
3        birth   0.575608
4  grandmother   0.566490
5      midwife   0.566192
6        nurse   0.558148
7        child   0.551933
8     daughter   0.545252
9       father   0.541687


In [38]:
# Analogy "man : programmer :: woman : ?" (programmer - man + woman) in each embedding.
# NOTE(review): the saved output of this cell is a one-line-per-model summary, not the
# tables that compare_models_analogy prints in this file; it looks stale, re-run to refresh.
compare_models_analogy("man", "programmer", "woman")

Wikipedia: programmers
CommonCrawl: programmers
Twitter: developer


In [39]:
# Analogy "man : scientist :: woman : ?" (scientist - man + woman) in each embedding.
# NOTE(review): the saved output of this cell is a one-line-per-model summary, not the
# tables that compare_models_analogy prints in this file; it looks stale, re-run to refresh.
compare_models_analogy("man", "scientist", "woman")

Wikipedia: researcher
CommonCrawl: researcher
Twitter: researcher


In [41]:
# Analogy "man : engineer :: woman : ?" (engineer - man + woman) in each embedding.
# NOTE(review): the saved output of this cell is a one-line-per-model summary, not the
# tables that compare_models_analogy prints in this file; it looks stale, re-run to refresh.
compare_models_analogy("man", "engineer", "woman")

Wikipedia: technician
CommonCrawl: technician
Twitter: technician


In [102]:
# Analogy "man : trans :: woman : ?" (trans - man + woman) in each embedding.
# NOTE(review): the saved output of this cell is a one-line-per-model summary, not the
# tables that compare_models_analogy prints in this file; it looks stale, re-run to refresh.
compare_models_analogy("man", "trans", "woman")

Wikipedia: transatlantic
CommonCrawl: Trans
Twitter: indonesian


In [125]:
# Analogy "man : strong :: woman : ?" (strong - man + woman) in each embedding.
compare_models_analogy("man", "strong", "woman")

Wikipedia:
        woman cosine_sim
0    stronger   0.540588
1   strongest   0.496957
2       women   0.456852
3      robust   0.455113
4     despite   0.453090
5        weak   0.452220
6         her   0.449586
7  consistent   0.442298
8      female   0.440086
9     support   0.432728
CommonCrawl:
        woman cosine_sim
0    stronger   0.593500
1   strongest   0.535380
2       women   0.523203
3    strongly   0.522918
4   extremely   0.504470
5    strength   0.504092
6  attractive   0.502718
7    positive   0.502413
8      robust   0.495524
9        very   0.490195
Twitter:
         woman cosine_sim
0     powerful   0.632457
1       loving   0.607668
2  independent   0.599768
3        heart   0.596056
4        truly   0.592127
5        often   0.591227
6    beautiful   0.589892
7   passionate   0.585448
8         very   0.582792
9     strength   0.581564


In [124]:
# The same analogy in the opposite direction: "woman : strong :: man : ?" (strong - woman + man).
compare_models_analogy("woman", "strong", "man")

Wikipedia:
         man cosine_sim
0   stronger   0.549757
1    despite   0.532143
2       weak   0.531488
3       good   0.520228
4   strength   0.517391
5      solid   0.507273
6     robust   0.497557
7       well   0.489479
8  strongest   0.488160
9     enough   0.476605
CommonCrawl:
         man cosine_sim
0   stronger   0.612946
1       good   0.604684
2   strength   0.593914
3       weak   0.587595
4      solid   0.566812
5       well   0.558645
6  strongest   0.553546
7      tough   0.534405
8        but   0.533306
9      great   0.533095
Twitter:
    man cosine_sim
0   bro   0.661264
1    we   0.645014
2    so   0.639920
3  true   0.633815
4  hard   0.629617
5  keep   0.618759
6  haha   0.615891
7  well   0.612242
8  good   0.608737
9   too   0.608365


In [116]:
 # Raw gensim form of the query strong - woman + man, on the Wikipedia vectors only.
 model_wiki.most_similar(negative=["woman"], 
                                positive=["strong", "man"])

[('stronger', 0.5497574806213379),
 ('despite', 0.5321428179740906),
 ('weak', 0.5314878225326538),
 ('good', 0.52022784948349),
 ('strength', 0.5173912048339844),
 ('solid', 0.5072730779647827),
 ('robust', 0.49755674600601196),
 ('well', 0.4894790053367615),
 ('strongest', 0.48816049098968506),
 ('enough', 0.47660502791404724)]