In [11]:
# From Chapter 10 of "Blueprints for Text Analytics Using Python"
# Jens Albrecht, Sidharth Ramachandran, Christian Winkler
# ML / NLP analytics
# 
# This is a modified version of the code in the chapter (chapter 10).
# This version was modified:
# Nov 20, 2022.
# Sila. 

In [3]:
import pandas as pd

# pandas number format: render floats in DataFrame output without decimals
pd.set_option('display.float_format', '{:.0f}'.format)

In [4]:
# pandas number format (same zero-decimal setting as the cell above, applied again)
pd.set_option('display.float_format', '{:.0f}'.format)

In [5]:
import gensim.downloader as api

# Build a table of the models listed by the gensim downloader:
# one row per model name, one column per metadata field.
info_df = pd.DataFrame.from_dict(
    api.info()['models'],
    orient='index',
)

# Preview size, training corpus and parameters for the first five models.
info_df.loc[:, ['file_size', 'base_dataset', 'parameters']].head(5)

Unnamed: 0,file_size,base_dataset,parameters
fasttext-wiki-news-subwords-300,1005007116,"Wikipedia 2017, UMBC webbase corpus and statmt...",{'dimension': 300}
conceptnet-numberbatch-17-06-300,1225497562,"ConceptNet, word2vec, GloVe, and OpenSubtitles...",{'dimension': 300}
word2vec-ruscorpora-300,208427381,Russian National Corpus (about 250M words),"{'dimension': 300, 'window_size': 10}"
word2vec-google-news-300,1743563840,Google News (about 100 billion words),{'dimension': 300}
glove-wiki-gigaword-50,69182535,"Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)",{'dimension': 50}


In [6]:
# Show every metadata column for the first three models.
info_df.head(n=3)

Unnamed: 0,num_records,file_size,base_dataset,reader_code,license,parameters,description,read_more,checksum,file_name,parts,preprocessing
fasttext-wiki-news-subwords-300,999999,1005007116,"Wikipedia 2017, UMBC webbase corpus and statmt...",https://github.com/RaRe-Technologies/gensim-da...,https://creativecommons.org/licenses/by-sa/3.0/,{'dimension': 300},1 million word vectors trained on Wikipedia 20...,[https://fasttext.cc/docs/en/english-vectors.h...,de2bb3a20c46ce65c9c131e1ad9a77af,fasttext-wiki-news-subwords-300.gz,1,
conceptnet-numberbatch-17-06-300,1917247,1225497562,"ConceptNet, word2vec, GloVe, and OpenSubtitles...",https://github.com/RaRe-Technologies/gensim-da...,https://github.com/commonsense/conceptnet-numb...,{'dimension': 300},ConceptNet Numberbatch consists of state-of-th...,[http://aaai.org/ocs/index.php/AAAI/AAAI17/pap...,fd642d457adcd0ea94da0cd21b150847,conceptnet-numberbatch-17-06-300.gz,1,
word2vec-ruscorpora-300,184973,208427381,Russian National Corpus (about 250M words),https://github.com/RaRe-Technologies/gensim-da...,https://creativecommons.org/licenses/by/4.0/de...,"{'dimension': 300, 'window_size': 10}",Word2vec Continuous Skipgram vectors trained o...,[https://www.academia.edu/24306935/WebVectors_...,9bdebdc8ae6d17d20839dd9b5af10bc4,word2vec-ruscorpora-300.gz,1,The corpus was lemmatized and tagged with Univ...


In [7]:
# From here on, render floats in DataFrame output with two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)

In [8]:
# Load the 50-dimensional GloVe vectors (Wikipedia 2014 + Gigaword 5)
# through the gensim downloader; `model` is used by all following cells.
model = api.load(name="glove-wiki-gigaword-50")



In [9]:
# See e.g.
# https://resources.wolframcloud.com/NeuralNetRepository/resources/GloVe-300-Dimensional-Word-Vectors-Trained-on-Wikipedia-and-Gigaword-5-Data

In [10]:
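# Represent words as vectors
# Released in 2014 by the computer science department at Stanford University, this representation is trained using an original method called Global Vectors (GloVe). It encodes 400,000 tokens as unique vectors,
# with all tokens outside the vocabulary encoded as the zero-vector. Token case is ignored.
# Note: this describes the packaged model linked above. With gensim's KeyedVectors (the object used in this notebook),
# looking up an out-of-vocabulary word raises a KeyError rather than returning a zero vector.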

In [12]:
# Show floating-point results with two decimal places in cell output.
%precision 2

'%.2f'

In [13]:
# Three nearest neighbours of 'king' in the embedding space.
model.most_similar(positive=['king'], topn=3)

[('prince', 0.82), ('queen', 0.78), ('ii', 0.77)]

In [14]:
# Look up the embedding vectors of the two words compared below.
v_king, v_queen = model['king'], model['queen']

# Report the embedding dimensionality, the first ten components of each
# vector, and the similarity score the model assigns to the word pair.
print("Vector size:", model.vector_size)
print("v_king  =", v_king[0:10])
print("v_queen =", v_queen[0:10])
print("similarity:", model.similarity('king', 'queen'))

Vector size: 50
v_king  = [ 0.5   0.69 -0.6  -0.02  0.6  -0.13 -0.09  0.47 -0.62 -0.31]
v_queen = [ 0.38  1.82 -1.26 -0.1   0.36  0.6  -0.18  0.84 -0.06 -0.76]
similarity: 0.7839043


In [15]:
# Two further vectors to compare against 'king'.
v_lion, v_nano = model['lion'], model['nanotechnology']

# Cosine similarity of 'king' to 'queen', 'lion' and 'nanotechnology',
# returned in that order.
model.cosine_similarities(
    vector_1=v_king,
    vectors_all=[v_queen, v_lion, v_nano],
)

array([ 0.78,  0.48, -0.25], dtype=float32)

In [16]:
# Analogy query: 'king' - 'man' + 'woman' -> top three candidates.
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=3,
)

[('queen', 0.85), ('throne', 0.77), ('prince', 0.76)]

In [17]:
# Analogy query: 'paris' - 'france' + 'germany' -> top three candidates.
model.most_similar(
    positive=['paris', 'germany'],
    negative=['france'],
    topn=3,
)

[('berlin', 0.92), ('frankfurt', 0.82), ('vienna', 0.82)]

In [18]:
# No negative term here: ask for the single word closest to
# 'france' and 'capital' combined.
model.most_similar(
    positive=['france', 'capital'],
    topn=1,
)

[('paris', 0.78)]

In [19]:
# Same kind of query for 'greece' and 'capital', top three candidates.
# Note: in the recorded output the results are generic geographic terms
# ('central', 'western', 'region'), not a city name.
model.most_similar(
    positive=['greece', 'capital'],
    topn=3,
)

[('central', 0.80), ('western', 0.76), ('region', 0.75)]