In [1]:
# Turn on Auto-Complete
%config IPCompleter.greedy=True

In [2]:
# Start logging process at root level
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.root.setLevel(level=logging.INFO)

In [3]:
# Load model and dictionary
#model_id_current = 99999
#model_path_current = "models/enwiki-full-dict-"+str(model_id_current)+".model"
model_path_current = "models/enwiki-20190319-lemmatized-99999.model"

dictionary_full_wikien_lem_path = "dictionaries/enwiki-20190409-dict-lemmatized.txt.bz2"

In [4]:
# Load word2vec model
from gensim.models import Word2Vec
model = Word2Vec.load(model_path_current, mmap='r')

2019-04-17 18:22:20,635 : INFO : 'pattern' package found; tag filters are available for English
2019-04-17 18:22:20,651 : INFO : loading Word2Vec object from models/enwiki-20190319-lemmatized-99999.model
2019-04-17 18:22:29,416 : INFO : loading vocabulary recursively from models/enwiki-20190319-lemmatized-99999.model.vocabulary.* with mmap=r
2019-04-17 18:22:29,420 : INFO : loading wv recursively from models/enwiki-20190319-lemmatized-99999.model.wv.* with mmap=r
2019-04-17 18:22:29,422 : INFO : loading vectors from models/enwiki-20190319-lemmatized-99999.model.wv.vectors.npy with mmap=r
2019-04-17 18:22:29,433 : INFO : setting ignored attribute vectors_norm to None
2019-04-17 18:22:29,435 : INFO : loading trainables recursively from models/enwiki-20190319-lemmatized-99999.model.trainables.* with mmap=r
2019-04-17 18:22:29,437 : INFO : loading syn1neg from models/enwiki-20190319-lemmatized-99999.model.trainables.syn1neg.npy with mmap=r
2019-04-17 18:22:29,439 : INFO : setting ignored a

In [5]:
# Custom lemmatizer function to play with word
from gensim.utils import lemmatize
#vocabulary = set(wv.index2word)
def lem(word):
    try:
        return lemmatize(word)[0].decode("utf-8")
    except:
        pass
        
print(lem("dog"))
print(lem("that"))

dog/NN
None


In [6]:
# Testing similarity
print("Most similar to",lem("woman"))
print(model.wv.most_similar(lem("woman")))
print("\nMost similar to","doctor/NN")
print(model.wv.most_similar("doctor/NN"))

2019-04-16 16:50:29,358 : INFO : precomputing L2-norms of word weight vectors


Most similar to woman/NN
[('lesbian/NN', 0.46007344126701355), ('man/NN', 0.4519272744655609), ('transwoman/NN', 0.45071157813072205), ('feminist/NN', 0.4451548755168915), ('feminist/VB', 0.4357849955558777), ('womanhood/NN', 0.4307554364204407), ('sztokman/NN', 0.3955826163291931), ('englishwoman/NN', 0.395127534866333), ('feminist/JJ', 0.3936055898666382), ('antifeminist/NN', 0.39345377683639526)]

Most similar to doctor/NN
[('dentist/NN', 0.5012549161911011), ('physician/NN', 0.4942778944969177), ('dolittle/RB', 0.4727567732334137), ('veterinarian/NN', 0.4650288224220276), ('zhivago/VB', 0.4592878222465515), ('who/NN', 0.4559624195098877), ('zhivago/JJ', 0.4459368586540222), ('doolot/NN', 0.4328805208206177), ('pulmonologist/NN', 0.4282873868942261), ('internist/NN', 0.4256983995437622)]


In [7]:
# Saving some ram by using the KeyedVectors instance
wv = model.wv
del model

In [8]:
# Testing similarity with KeyedVectors
print("Most similar to",lem("woman"))
print(wv.most_similar(lem("woman")))
print("\nMost similar to",lem("man"))
print(wv.wv.most_similar(lem("man")))
print("\nMost similar to","doctor/NN")
print(wv.most_similar("doctor/NN"))
print("\nMost similar to","doctor/NN","cosmul")
print(wv.most_similar_cosmul(positive=["doctor/NN"]))

Most similar to woman/NN
[('lesbian/NN', 0.46007344126701355), ('man/NN', 0.4519272744655609), ('transwoman/NN', 0.45071157813072205), ('feminist/NN', 0.4451548755168915), ('feminist/VB', 0.4357849955558777), ('womanhood/NN', 0.4307554364204407), ('sztokman/NN', 0.3955826163291931), ('englishwoman/NN', 0.395127534866333), ('feminist/JJ', 0.3936055898666382), ('antifeminist/NN', 0.39345377683639526)]

Most similar to man/NN


  """


[('woman/NN', 0.4519272446632385), ('thug/NN', 0.40778353810310364), ('boy/NN', 0.3781570494174957), ('nudy/NN', 0.37579405307769775), ('roynell/VB', 0.3751065731048584), ('nowhere/NN', 0.3721665143966675), ('delmon/VB', 0.3710464835166931), ('wrightwood/VB', 0.3666095733642578), ('fogey/NN', 0.3656005859375), ('taolu/RB', 0.3626784682273865)]

Most similar to doctor/NN
[('dentist/NN', 0.5012549161911011), ('physician/NN', 0.4942778944969177), ('dolittle/RB', 0.4727567732334137), ('veterinarian/NN', 0.4650288224220276), ('zhivago/VB', 0.4592878222465515), ('who/NN', 0.4559624195098877), ('zhivago/JJ', 0.4459368586540222), ('doolot/NN', 0.4328805208206177), ('pulmonologist/NN', 0.4282873868942261), ('internist/NN', 0.4256983995437622)]

Most similar to doctor/NN cosmul
[('dentist/NN', 0.750626802444458), ('physician/NN', 0.747138261795044), ('dolittle/RB', 0.7363777160644531), ('veterinarian/NN', 0.7325137257575989), ('zhivago/VB', 0.7296432256698608), ('who/NN', 0.7279804944992065), ('

In [9]:
print("similarity of doctor + woman - man")
wv.most_similar(positive=["doctor/NN","woman/NN"], negative=["man/NN"])

similarity of doctor + woman - man


[('midwife/NN', 0.4707856774330139),
 ('gynaecologist/NN', 0.45252716541290283),
 ('dentist/NN', 0.44752874970436096),
 ('nurse/NN', 0.4470706284046173),
 ('gynecologist/NN', 0.4430723786354065),
 ('obstetrics/JJ', 0.4371750056743622),
 ('anesthetist/NN', 0.4341370463371277),
 ('obstetrician/NN', 0.4318876564502716),
 ('pharmacist/NN', 0.429206520318985),
 ('midwifery/NN', 0.426381915807724)]

In [10]:
# Get cosmul of logic
print("cosmul of doctor + woman - man")
wv.most_similar_cosmul(positive=["doctor/NN","woman/NN"], negative=["man/NN"])

cosmul of doctor + woman - man


[('midwife/NN', 0.8986764550209045),
 ('gynaecologist/NN', 0.8874868750572205),
 ('midwifery/NN', 0.8800663948059082),
 ('anesthetist/NN', 0.8751559853553772),
 ('obstetrics/JJ', 0.8749768733978271),
 ('pediatrician/NN', 0.8707965612411499),
 ('obstetrician/NN', 0.8701598048210144),
 ('nurse/NN', 0.8692474961280823),
 ('gynecologist/NN', 0.8639448881149292),
 ('midwive/VB', 0.8639071583747864)]

In [11]:
# Ways to retrive word vector
print("Get item dog")
vec_dog = wv.__getitem__("dog/NN")
vec_dog = wv.get_vector("dog/NN")
vec_dog = wv.word_vec("dog/NN")

Get item dog


In [12]:
# Get similar words to vector
wv.similar_by_vector(vector=vec_dog, topn=10, restrict_vocab=None)
print("Similar to dog vector")
wv.most_similar(positive=[vec_dog])

Similar to dog vector


[('dog/NN', 0.9999999403953552),
 ('rottweiler/NN', 0.6149188876152039),
 ('poodle/NN', 0.5928210020065308),
 ('dogs/NN', 0.5638971924781799),
 ('puppy/NN', 0.556917667388916),
 ('dachshund/NN', 0.55369633436203),
 ('pekingese/JJ', 0.5461680889129639),
 ('pet/VB', 0.5417574644088745),
 ('bullmastiff/NN', 0.5413404107093811),
 ('kennel/VB', 0.5349524021148682)]

In [13]:
# closer to __ than __
print("closer to dog than cat")
print(wv.words_closer_than("dog/NN", "cat/NN"))
print("\ncloser to cat than dog")
print(wv.words_closer_than("cat/NN", "dog/NN"))

closer to dog than cat
['pet/NN', 'hound/NN', 'kennel/NN', 'puppy/NN', 'kitten/NN', 'canine/JJ', 'beagle/NN', 'retriever/NN', 'dog/VB', 'dachshund/NN', 'feline/JJ', 'leash/NN', 'spaniel/NN', 'sheepdog/NN', 'poodle/NN', 'bichon/NN', 'komondor/NN', 'mastiff/NN', 'pug/NN', 'feline/NN', 'pet/VB', 'longhaired/JJ', 'dogs/JJ', 'rowlf/VB', 'pinscher/NN', 'rottweiler/NN', 'dogs/NN', 'coonhound/NN', 'sniffer/JJ', 'dachshund/VB', 'kennel/VB', 'rottweiler/JJ', 'mangy/JJ', 'euthanization/NN', 'leashed/JJ', 'pug/JJ', 'bullmastiff/NN', 'pekingese/JJ', 'akbash/NN', 'weimaraner/NN', 'pekingese/NN', 'pukin/NN', 'polydactyl/JJ']

closer to cat than dog
['anthropomorphic/JJ', 'kitten/NN', 'feline/JJ', 'tabby/JJ', 'feline/NN', 'fraggle/NN', 'dogs/JJ', 'tanuki/NN', 'mangy/JJ', 'meow/VB', 'wampus/NN', 'pekingese/JJ', 'nintendogs/JJ', 'scaredy/NN', 'zingano/NN']


In [14]:
# Normalized Vector
vec_king_norm = wv.word_vec("king/NN", use_norm=True)
# Not normalized vectore
vec_king_unnorm = wv.word_vec("king/NN", use_norm=False)

In [15]:
wv.most_similar(positive=[vec_king_norm], negative=[vec_king_unnorm])

[('martenetz/JJ', 0.25446975231170654),
 ('chinalink/VB', 0.23597362637519836),
 ('rosenstreich/JJ', 0.23545649647712708),
 ('kibbutznikiyot/NN', 0.23487138748168945),
 ('plastic/NN', 0.23192651569843292),
 ('unsightliest/JJ', 0.23074200749397278),
 ('gyoku/JJ', 0.23043902218341827),
 ('pjp/NN', 0.22816041111946106),
 ('молива/NN', 0.22780275344848633),
 ('oppotion/NN', 0.2250169962644577)]

In [16]:
# Get vector shape
vec_king_unnorm.shape

(300,)

In [17]:
# Generate random vector
import numpy as np
vec_random = np.random.rand(300,)
vec_random_norm = vec_random / vec_random.max(axis=0)
print("similar to random vector")
print(wv.most_similar(positive=[vec_random]))
print("\n similar to nomalized random vector")
print(wv.most_similar(positive=[vec_random_norm]))

similar to random vector
[('dagana/NN', 0.310502290725708), ('boeny/NN', 0.30554085969924927), ('fitovinany/NN', 0.30136212706565857), ('qasimov/JJ', 0.2948823571205139), ('mangoro/NN', 0.29376664757728577), ('melaky/JJ', 0.293718159198761), ('dalila/NN', 0.28584954142570496), ('atsinanana/RB', 0.28495872020721436), ('brčko/JJ', 0.2835693955421448), ('interexport/NN', 0.2830585539340973)]

 similar to nomalized random vector
[('dagana/NN', 0.310502290725708), ('boeny/NN', 0.30554085969924927), ('fitovinany/NN', 0.30136212706565857), ('qasimov/JJ', 0.2948823571205139), ('mangoro/NN', 0.29376664757728577), ('melaky/JJ', 0.293718159198761), ('dalila/NN', 0.28584954142570496), ('atsinanana/RB', 0.28495872020721436), ('brčko/JJ', 0.2835693955421448), ('interexport/NN', 0.2830585539340973)]


In [18]:
# Get similarity from a random vector and normilized king vector
print("similarity from a normalized random vector to normalized vector of king")
wv.most_similar(positive=[vec_random_norm,vec_king_norm])

similarity from a normalized random vector to normalized vector of king


[('dagana/NN', 0.31837376952171326),
 ('boeny/NN', 0.3109041750431061),
 ('fitovinany/NN', 0.30090072751045227),
 ('mangoro/NN', 0.2952515184879303),
 ('melaky/JJ', 0.2938075065612793),
 ('qasimov/JJ', 0.29035305976867676),
 ('atsinanana/RB', 0.2888091504573822),
 ('taoudénit/NN', 0.28652405738830566),
 ('interexport/NN', 0.2856239676475525),
 ('dalila/NN', 0.28082406520843506)]

In [19]:
# Get similarity from a random vector and unormalized king vector
print("similarity from a random vector to unormalized vector of king")
wv.most_similar(positive=[vec_random,vec_king_unnorm])

similarity from a random vector to unormalized vector of king


[('king/NN', 0.8547921776771545),
 ('hattusili/VB', 0.44188767671585083),
 ('inshushinak/VB', 0.4376552402973175),
 ('neustrium/NN', 0.43171319365501404),
 ('usurper/NN', 0.4287143349647522),
 ('hengal/JJ', 0.42845696210861206),
 ('uṣur/NN', 0.426206111907959),
 ('chlothar/VB', 0.424801230430603),
 ('indravarman/NN', 0.42459505796432495),
 ('bhuvanaikabahu/NN', 0.42417895793914795)]

In [20]:
# Get cosine similarities from a vector to an array of vectors
print("cosine similarity from a random vector to unormalized vector of king")
wv.cosine_similarities(vec_random, [vec_king_unnorm])

cosine similarity from a random vector to unormalized vector of king


array([-0.03458707])

In [5]:
# Tests analogies based on a text file

analogy_scores = wv.accuracy('datasets/questions-words.txt',dummy4unknown=False)
#print(analogy_scores)

NameError: name 'wv' is not defined

In [None]:
# The the distance of two words
print("distance between dog and cat")
wv.distance("dog/NN","cat/NN")

In [None]:
# Get the distance of a word for the list of word
print("distance from dog to king and cat")
wv.distances("dog/NN",["king/NN","cat/NN"])

In [None]:
# Evaluate pairs of words
#wv.evaluate_word_pairs("datasets/SimLex-999.txt")

In [None]:
# Get sentence similarities

from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess    

def tokemmized(sentence, vocabulary):
    tokens = [lem(word) for word in simple_preprocess(sentence)]
    return [word for word in tokens if word in vocabulary]   

def compute_sentence_similarity(sentence_1, sentence_2, model_wv):
    vocabulary = set(model_wv.index2word)
    tokens_1 = tokemmized(sentence_1, vocabulary)
    tokens_2 = tokemmized(sentence_2, vocabulary)
    del vocabulary
    print(tokens_1, tokens_2)
    return model_wv.n_similarity(tokens_1, tokens_2)

similarity = compute_sentence_similarity('this is a sentence', 'this is also a sentence', wv)
print(similarity,"\n")

similarity = compute_sentence_similarity('the cat is a mammal', 'the bird is a aves', wv)
print(similarity,"\n")

similarity = compute_sentence_similarity('the cat is a mammal', 'the dog is a mammal', wv)
print(similarity)

In [None]:
# Analogy with not normalized vectors
print("france is to france as berlin is to ?")
wv.most_similar([wv['france/NN'] - wv['paris/NN'] + wv['berlin/NN']])

In [None]:
# Analogy with normalized Vector
vec_france_norm = wv.word_vec(lem('France'), use_norm=True)
vec_paris_norm = wv.word_vec(lem('Paris'), use_norm=True)
vec_berlin_norm = wv.word_vec(lem('Berlin'), use_norm=True)
vec_germany_norm = wv.word_vec(lem('Germany'), use_norm=True)
vec_country_norm = wv.word_vec(lem('country'), use_norm=True)
print("france is to france as berlin is to ?")
wv.most_similar([vec_france_norm + vec_paris_norm - vec_country_norm])

In [None]:
# Cosine Similarities
print("cosine_similarities of france and paris")
print(wv.cosine_similarities(vec_france_norm, [vec_paris_norm]))
print("cosine_similarities of france and berlin")
print(wv.cosine_similarities(vec_france_norm, [vec_berlin_norm]))
print("cosine_similarities of france and country")
print(wv.cosine_similarities(vec_france_norm, [vec_country_norm]))

In [None]:
# Analogy
print("paris is to france as germany is to ?")
wv.most_similar([wv['france/NN'] + wv['germany/NN'] - wv['paris/NN']])

In [None]:
# Analogy
print("cat is to mammal as sparrow is to ?")
wv.most_similar([wv['mammal/NN'] - wv['sparrow/NN'] + wv['cat/NN']])

In [None]:
# Analogy
print("grass is to green as sky is to ?")
wv.most_similar([wv['green/NN'] - wv['sky/NN'] + wv['grass/NN']])

In [None]:
# Analogy
print("athens is to greece as baghdad is to ?")
wv.most_similar([wv['athens/NN'] - wv['greece/NN'] + wv['baghdad/NN']])