In [1]:
# Turn on Auto-Complete
%config IPCompleter.greedy=True

In [2]:
# Start logging process at root level
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.root.setLevel(level=logging.INFO)

In [3]:
# Load model and dictionary
model_id_current = 99999
model_path_current = "models/enwiki-full-dict-"+str(model_id_current)+".model"

dictionary_full_wikien_lem_path = "dictionaries/enwiki-english-lemmatized.txt.bz2"

In [4]:
# Load word2vec model
from gensim.models import Word2Vec
model = Word2Vec.load(model_path_current, mmap='r')

2019-04-08 09:50:41,762 : INFO : 'pattern' package found; tag filters are available for English
2019-04-08 09:50:41,767 : INFO : loading Word2Vec object from models/enwiki-full-dict-99999.model
2019-04-08 09:50:45,989 : INFO : loading vocabulary recursively from models/enwiki-full-dict-99999.model.vocabulary.* with mmap=r
2019-04-08 09:50:45,997 : INFO : loading wv recursively from models/enwiki-full-dict-99999.model.wv.* with mmap=r
2019-04-08 09:50:46,004 : INFO : loading vectors from models/enwiki-full-dict-99999.model.wv.vectors.npy with mmap=r
2019-04-08 09:50:46,057 : INFO : setting ignored attribute vectors_norm to None
2019-04-08 09:50:46,059 : INFO : loading trainables recursively from models/enwiki-full-dict-99999.model.trainables.* with mmap=r
2019-04-08 09:50:46,060 : INFO : loading syn1neg from models/enwiki-full-dict-99999.model.trainables.syn1neg.npy with mmap=r
2019-04-08 09:50:46,063 : INFO : setting ignored attribute cum_table to None
2019-04-08 09:50:46,064 : INFO : 

In [5]:
# Custom lemmatizer function to play with word
from gensim.utils import lemmatize
#vocabulary = set(wv.index2word)
def lem(word):
    try:
        return lemmatize(word)[0].decode("utf-8")
    except:
        pass
        
print(lem("dog"))
print(lem("that"))

dog/NN
None


In [6]:
# Testing similarity
print("Most similar to",lem("woman"))
print(model.wv.most_similar(lem("woman")))
print("\nMost similar to","doctor/NN")
print(model.wv.most_similar("doctor/NN"))

2019-04-08 09:50:49,331 : INFO : precomputing L2-norms of word weight vectors


Most similar to woman/NN
[('lesbian/NN', 0.4600733518600464), ('man/NN', 0.4519272744655609), ('transwoman/NN', 0.45071160793304443), ('feminist/NN', 0.4451548457145691), ('feminist/VB', 0.4357849955558777), ('womanhood/NN', 0.4307554364204407), ('sztokman/NN', 0.3955826163291931), ('englishwoman/NN', 0.39512747526168823), ('feminist/JJ', 0.3936055898666382), ('antifeminist/NN', 0.39345383644104004)]

Most similar to doctor/NN
[('dentist/NN', 0.5012550354003906), ('physician/NN', 0.4942779242992401), ('dolittle/RB', 0.47275668382644653), ('veterinarian/NN', 0.4650288224220276), ('zhivago/VB', 0.4592878222465515), ('who/NN', 0.4559624195098877), ('zhivago/JJ', 0.4459368586540222), ('doolot/NN', 0.43288061022758484), ('pulmonologist/NN', 0.4282873570919037), ('internist/NN', 0.4256983995437622)]


In [7]:
# Saving some ram by using the KeyedVectors instance
wv = model.wv
del model

In [8]:
# Testing similarity with KeyedVectors
print("Most similar to",lem("woman"))
print(wv.most_similar(lem("woman")))
print("\nMost similar to",lem("man"))
print(wv.wv.most_similar(lem("man")))
print("\nMost similar to","doctor/NN")
print(wv.most_similar("doctor/NN"))
print("\nMost similar to","doctor/NN","cosmul")
print(wv.most_similar_cosmul(positive=["doctor/NN"]))

Most similar to woman/NN
[('lesbian/NN', 0.4600733518600464), ('man/NN', 0.4519272744655609), ('transwoman/NN', 0.45071160793304443), ('feminist/NN', 0.4451548457145691), ('feminist/VB', 0.4357849955558777), ('womanhood/NN', 0.4307554364204407), ('sztokman/NN', 0.3955826163291931), ('englishwoman/NN', 0.39512747526168823), ('feminist/JJ', 0.3936055898666382), ('antifeminist/NN', 0.39345383644104004)]

Most similar to man/NN
[('woman/NN', 0.4519272744655609), ('thug/NN', 0.40778350830078125), ('boy/NN', 0.3781570792198181), ('nudy/NN', 0.37579411268234253), ('roynell/VB', 0.3751066327095032), ('nowhere/NN', 0.3721664547920227), ('delmon/VB', 0.3710464537143707), ('wrightwood/VB', 0.3666095435619354), ('fogey/NN', 0.3656006157398224), ('taolu/RB', 0.3626784086227417)]

Most similar to doctor/NN


  """


[('dentist/NN', 0.5012550354003906), ('physician/NN', 0.4942779242992401), ('dolittle/RB', 0.47275668382644653), ('veterinarian/NN', 0.4650288224220276), ('zhivago/VB', 0.4592878222465515), ('who/NN', 0.4559624195098877), ('zhivago/JJ', 0.4459368586540222), ('doolot/NN', 0.43288061022758484), ('pulmonologist/NN', 0.4282873570919037), ('internist/NN', 0.4256983995437622)]

Most similar to doctor/NN cosmul
[('dentist/NN', 0.750626802444458), ('physician/NN', 0.747138261795044), ('dolittle/RB', 0.7363775968551636), ('veterinarian/NN', 0.7325136661529541), ('zhivago/VB', 0.7296431660652161), ('who/NN', 0.7279804944992065), ('zhivago/JJ', 0.7229676842689514), ('doolot/NN', 0.7164396643638611), ('pulmonologist/NN', 0.7141429781913757), ('internist/NN', 0.7128485441207886)]


In [9]:
print("similarity of doctor + woman - man")
wv.most_similar(positive=["doctor/NN","woman/NN"], negative=["man/NN"])

similarity of doctor + woman - man


[('midwife/NN', 0.4707856774330139),
 ('gynaecologist/NN', 0.4525271952152252),
 ('dentist/NN', 0.4475286900997162),
 ('nurse/NN', 0.4470705986022949),
 ('gynecologist/NN', 0.4430723786354065),
 ('obstetrics/JJ', 0.4371750056743622),
 ('anesthetist/NN', 0.4341369867324829),
 ('obstetrician/NN', 0.43188774585723877),
 ('pharmacist/NN', 0.4292064905166626),
 ('midwifery/NN', 0.4263818860054016)]

In [10]:
# Get cosmul of logic
print("cosmul of doctor + woman - man")
wv.most_similar_cosmul(positive=["doctor/NN","woman/NN"], negative=["man/NN"])

cosmul of doctor + woman - man


[('midwife/NN', 0.8986763954162598),
 ('gynaecologist/NN', 0.8874867558479309),
 ('midwifery/NN', 0.8800663352012634),
 ('anesthetist/NN', 0.8751558661460876),
 ('obstetrics/JJ', 0.8749769330024719),
 ('pediatrician/NN', 0.8707965612411499),
 ('obstetrician/NN', 0.8701597452163696),
 ('nurse/NN', 0.8692474365234375),
 ('gynecologist/NN', 0.8639448285102844),
 ('midwive/VB', 0.8639072179794312)]

In [11]:
# Ways to retrive word vector
print("Get item dog")
vec_dog = wv.__getitem__("dog/NN")
vec_dog = wv.get_vector("dog/NN")
vec_dog = wv.word_vec("dog/NN")

Get item dog


In [12]:
# Get similar words to vector
wv.similar_by_vector(vector=vec_dog, topn=10, restrict_vocab=None)
print("Similar to dog vector")
wv.most_similar(positive=[vec_dog])

Similar to dog vector


[('dog/NN', 1.0000001192092896),
 ('rottweiler/NN', 0.6149188280105591),
 ('poodle/NN', 0.5928210020065308),
 ('dogs/NN', 0.5638971328735352),
 ('puppy/NN', 0.556917667388916),
 ('dachshund/NN', 0.5536962747573853),
 ('pekingese/JJ', 0.5461680293083191),
 ('pet/VB', 0.5417574048042297),
 ('bullmastiff/NN', 0.5413403511047363),
 ('kennel/VB', 0.5349524021148682)]

In [13]:
# closer to __ than __
print("closer to dog than cat")
print(wv.words_closer_than("dog/NN", "cat/NN"))
print("\ncloser to cat than dog")
print(wv.words_closer_than("cat/NN", "dog/NN"))

closer to dog than cat
['pet/NN', 'hound/NN', 'kennel/NN', 'puppy/NN', 'kitten/NN', 'canine/JJ', 'beagle/NN', 'retriever/NN', 'dog/VB', 'dachshund/NN', 'feline/JJ', 'leash/NN', 'spaniel/NN', 'sheepdog/NN', 'poodle/NN', 'bichon/NN', 'komondor/NN', 'mastiff/NN', 'pug/NN', 'feline/NN', 'pet/VB', 'longhaired/JJ', 'dogs/JJ', 'rowlf/VB', 'pinscher/NN', 'rottweiler/NN', 'dogs/NN', 'coonhound/NN', 'sniffer/JJ', 'dachshund/VB', 'kennel/VB', 'rottweiler/JJ', 'mangy/JJ', 'euthanization/NN', 'leashed/JJ', 'pug/JJ', 'bullmastiff/NN', 'pekingese/JJ', 'akbash/NN', 'weimaraner/NN', 'pekingese/NN', 'pukin/NN', 'polydactyl/JJ']

closer to cat than dog
['anthropomorphic/JJ', 'kitten/NN', 'feline/JJ', 'tabby/JJ', 'feline/NN', 'fraggle/NN', 'dogs/JJ', 'tanuki/NN', 'mangy/JJ', 'meow/VB', 'wampus/NN', 'pekingese/JJ', 'nintendogs/JJ', 'scaredy/NN', 'zingano/NN']


In [14]:
# Normalized Vector
vec_king_norm = wv.word_vec("king/NN", use_norm=True)
# Not normalized vectore
vec_king_unnorm = wv.word_vec("king/NN", use_norm=False)

In [15]:
wv.most_similar(positive=[vec_king_norm], negative=[vec_king_unnorm])

[('martenetz/JJ', 0.25446975231170654),
 ('chinalink/VB', 0.23597364127635956),
 ('rosenstreich/JJ', 0.2354564666748047),
 ('kibbutznikiyot/NN', 0.23487138748168945),
 ('plastic/NN', 0.23192650079727173),
 ('unsightliest/JJ', 0.23074199259281158),
 ('gyoku/JJ', 0.2304389774799347),
 ('pjp/NN', 0.22816044092178345),
 ('молива/NN', 0.22780273854732513),
 ('oppotion/NN', 0.2250170111656189)]

In [16]:
# Get vector shape
vec_king_unnorm.shape

(300,)

In [17]:
# Generate random vector
import numpy as np
vec_random = np.random.rand(300,)
vec_random_norm = vec_random / vec_random.max(axis=0)
print("similar to random vector")
print(wv.most_similar(positive=[vec_random]))
print("\n similar to nomalized random vector")
print(wv.most_similar(positive=[vec_random_norm]))

similar to random vector
[('qasimov/JJ', 0.2712376117706299), ('schackel/NN', 0.2607859671115875), ('crenex/JJ', 0.25810354948043823), ('市川儀一/VB', 0.25635436177253723), ('格五/NN', 0.25499749183654785), ('ushū/NN', 0.2528385519981384), ('sneese/JJ', 0.24734818935394287), ('homonym/JJ', 0.2461884319782257), ('bodiford/JJ', 0.2461722195148468), ('bako/VB', 0.24406176805496216)]

 similar to nomalized random vector
[('qasimov/JJ', 0.2712376117706299), ('schackel/NN', 0.2607859671115875), ('crenex/JJ', 0.25810354948043823), ('市川儀一/VB', 0.25635436177253723), ('格五/NN', 0.25499749183654785), ('ushū/NN', 0.2528385519981384), ('sneese/JJ', 0.24734818935394287), ('homonym/JJ', 0.2461884319782257), ('bodiford/JJ', 0.2461722195148468), ('bako/VB', 0.24406176805496216)]


In [18]:
# Get similarity from a random vector and normilized king vector
print("similarity from a normalized random vector to normalized vector of king")
wv.most_similar(positive=[vec_random_norm,vec_king_norm])

similarity from a normalized random vector to normalized vector of king


[('qasimov/JJ', 0.26662397384643555),
 ('schackel/NN', 0.25880166888237),
 ('格五/NN', 0.2555546462535858),
 ('市川儀一/VB', 0.2543610632419586),
 ('miyatsuko/NN', 0.25367265939712524),
 ('crenex/JJ', 0.25238728523254395),
 ('sneese/JJ', 0.2518189549446106),
 ('tahsildar/NN', 0.2514416575431824),
 ('sopianae/NN', 0.2507050633430481),
 ('homonym/JJ', 0.25028181076049805)]

In [19]:
# Get similarity from a random vector and unormalized king vector
print("similarity from a random vector to unormalized vector of king")
wv.most_similar(positive=[vec_random,vec_king_unnorm])

similarity from a random vector to unormalized vector of king


[('king/NN', 0.8697292804718018),
 ('uṣur/NN', 0.4679512679576874),
 ('tukulti/NN', 0.457405686378479),
 ('neustrium/NN', 0.4550192356109619),
 ('cnut/NN', 0.4508383274078369),
 ('hengal/JJ', 0.44895803928375244),
 ('pileser/NN', 0.4454649090766907),
 ('shutruk/NN', 0.44528043270111084),
 ('ninurta/NN', 0.4444091320037842),
 ('harthacnut/NN', 0.4420572519302368)]

In [20]:
# Get cosine similarities from a vector to an array of vectors
print("cosine similarity from a random vector to unormalized vector of king")
wv.cosine_similarities(vec_random, [vec_king_unnorm])

cosine similarity from a random vector to unormalized vector of king


array([-0.04686037])

In [21]:
# Tests analogies based on a text file
from gensim.test.utils import datapath
analogy_scores = wv.evaluate_word_analogies('datasets/questions-words.txt',dummy4unknown=False)
#print(analogy_scores)

2019-04-08 09:50:56,994 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-04-08 09:50:56,995 : INFO : built Dictionary(12 unique tokens: ['time', 'eps', 'graph', 'minors', 'response']...) from 9 documents (total 29 corpus positions)
2019-04-08 09:50:58,110 : INFO : Evaluating word analogies for top 300000 words in the model on datasets/questions-words.txt
2019-04-08 09:50:58,160 : INFO : capital-common-countries: 100.0% (2/2)
2019-04-08 09:50:58,306 : INFO : Quadruplets with out-of-vocabulary words: 100.0%
2019-04-08 09:50:58,308 : INFO : NB: analogies containing OOV words were skipped from evaluation! To change this behavior, use "dummy4unknown=True"
2019-04-08 09:50:58,308 : INFO : Total accuracy: 100.0% (2/2)


In [22]:
# The the distance of two words
print("distance between dog and cat")
wv.distance("dog/NN","cat/NN")

distance between dog and cat


0.5276151299476624

In [23]:
# Get the distance of a word for the list of word
print("distance from dog to king and cat")
wv.distances("dog/NN",["king/NN","cat/NN"])

distance from dog to king and cat


array([1.002809 , 0.5276151], dtype=float32)

In [24]:
# Evaluate pairs of words
#wv.evaluate_word_pairs("datasets/SimLex-999.txt")

In [25]:
# Get sentence similarities

from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess    

def tokemmized(sentence, vocabulary):
    tokens = [lem(word) for word in simple_preprocess(sentence)]
    return [word for word in tokens if word in vocabulary]   

def compute_sentence_similarity(sentence_1, sentence_2, model_wv):
    vocabulary = set(model_wv.index2word)
    tokens_1 = tokemmized(sentence_1, vocabulary)
    tokens_2 = tokemmized(sentence_2, vocabulary)
    del vocabulary
    print(tokens_1, tokens_2)
    return model_wv.n_similarity(tokens_1, tokens_2)

similarity = compute_sentence_similarity('this is a sentence', 'this is also a sentence', wv)
print(similarity,"\n")

similarity = compute_sentence_similarity('the cat is a mammal', 'the bird is a aves', wv)
print(similarity,"\n")

similarity = compute_sentence_similarity('the cat is a mammal', 'the dog is a mammal', wv)
print(similarity)

['be/VB', 'sentence/NN'] ['be/VB', 'also/RB', 'sentence/NN']
0.8849491 

['cat/NN', 'be/VB', 'mammal/NN'] ['bird/NN', 'be/VB', 'ave/NN']
0.2748915 

['cat/NN', 'be/VB', 'mammal/NN'] ['dog/NN', 'be/VB', 'mammal/NN']
0.81017876


In [26]:
# Analogy with not normalized vectors
print("france is to france as berlin is to ?")
wv.most_similar([wv['france/NN'] - wv['paris/NN'] + wv['berlin/NN']])

france is to france as berlin is to ?


[('berlin/NN', 0.6598570346832275),
 ('germany/NN', 0.5397100448608398),
 ('france/NN', 0.49677813053131104),
 ('osnabrueck/NN', 0.4760308265686035),
 ('niedersachsen/RB', 0.45274922251701355),
 ('neukölln/RB', 0.4491945207118988),
 ('pliezhausen/NN', 0.4466242790222168),
 ('wuhlheide/VB', 0.4429861903190613),
 ('meerbusch/NN', 0.4415031671524048),
 ('filmtage/NN', 0.44115519523620605)]

In [27]:
# Analogy with normalized Vector
vec_france_norm = wv.word_vec(lem('France'), use_norm=True)
vec_paris_norm = wv.word_vec(lem('Paris'), use_norm=True)
vec_berlin_norm = wv.word_vec(lem('Berlin'), use_norm=True)
vec_germany_norm = wv.word_vec(lem('Germany'), use_norm=True)
vec_country_norm = wv.word_vec(lem('country'), use_norm=True)
print("france is to france as berlin is to ?")
wv.most_similar([vec_france_norm + vec_paris_norm - vec_country_norm])

france is to france as berlin is to ?


[('paris/NN', 0.7346838712692261),
 ('france/NN', 0.7047756910324097),
 ('issy/JJ', 0.5712026357650757),
 ('nimes/RB', 0.5693367719650269),
 ('beaubourg/NN', 0.56267249584198),
 ('chauny/NN', 0.5557336807250977),
 ('chatou/NN', 0.553068995475769),
 ('chaville/NN', 0.5530577898025513),
 ('melun/NN', 0.5504977703094482),
 ('antibe/NN', 0.5504209399223328)]

In [28]:
# Cosine Similarities
print("cosine_similarities of france and paris")
print(wv.cosine_similarities(vec_france_norm, [vec_paris_norm]))
print("cosine_similarities of france and berlin")
print(wv.cosine_similarities(vec_france_norm, [vec_berlin_norm]))
print("cosine_similarities of france and country")
print(wv.cosine_similarities(vec_france_norm, [vec_country_norm]))

cosine_similarities of france and paris
[0.46068862]
cosine_similarities of france and berlin
[0.09915119]
cosine_similarities of france and country
[0.17262796]


In [29]:
# Analogy
print("paris is to france as germany is to ?")
wv.most_similar([wv['france/NN'] + wv['germany/NN'] - wv['paris/NN']])

paris is to france as germany is to ?


[('germany/NN', 0.7783980369567871),
 ('france/NN', 0.5977498292922974),
 ('scandinavia/RB', 0.4345003366470337),
 ('geesthacht/NN', 0.4168975353240967),
 ('oberflacht/JJ', 0.39931991696357727),
 ('poland/NN', 0.39648324251174927),
 ('netherlands/NN', 0.3954917788505554),
 ('uedem/NN', 0.39445963501930237),
 ('czechoslovakia/NN', 0.3935864567756653),
 ('german/NN', 0.3916912376880646)]

In [30]:
# Analogy
print("cat is to mammal as sparrow is to ?")
wv.most_similar([wv['mammal/NN'] - wv['sparrow/NN'] + wv['cat/NN']])

cat is to mammal as sparrow is to ?


[('mammal/NN', 0.6350052356719971),
 ('cat/NN', 0.6204670667648315),
 ('carnivore/NN', 0.5320420861244202),
 ('rodent/NN', 0.5169624090194702),
 ('feline/NN', 0.5153012275695801),
 ('vertebrate/NN', 0.4844091832637787),
 ('human/NN', 0.48289990425109863),
 ('mustelid/NN', 0.47467949986457825),
 ('carnivorous/JJ', 0.46422073245048523),
 ('animal/NN', 0.461605966091156)]

In [31]:
# Analogy
print("grass is to green as sky is to ?")
wv.most_similar([wv['green/NN'] - wv['sky/NN'] + wv['grass/NN']])

grass is to green as sky is to ?


[('green/NN', 0.6720399856567383),
 ('grass/NN', 0.6147882342338562),
 ('forbs/JJ', 0.4486176669597626),
 ('bentgrass/NN', 0.4462357759475708),
 ('leafy/JJ', 0.4335836172103882),
 ('cocksfoot/NN', 0.430562287569046),
 ('herbage/NN', 0.42099571228027344),
 ('graminoid/NN', 0.4148740768432617),
 ('esparto/JJ', 0.4135362505912781),
 ('radish/VB', 0.4130854606628418)]

In [32]:
# Analogy
print("athens is to greece as baghdad is to ?")
wv.most_similar([wv['athens/NN'] - wv['greece/NN'] + wv['baghdad/NN']])

athens is to greece as baghdad is to ?


[('baghdad/NN', 0.7465616464614868),
 ('baghdad/JJ', 0.5089205503463745),
 ('basra/NN', 0.4829660654067993),
 ('najaf/NN', 0.4801582098007202),
 ('kadhimiya/NN', 0.4733431339263916),
 ('isfahan/NN', 0.47049379348754883),
 ('najaf/RB', 0.46331748366355896),
 ('kirkuk/NN', 0.46281489729881287),
 ('mashhad/NN', 0.4498487710952759),
 ('herat/NN', 0.44610485434532166)]