In [1]:
import logging
# Configure root logging at INFO level with a timestamped format so that
# gensim's progress and lifecycle messages are shown in the cell outputs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import os
import gensim

In [6]:
import math
from random import random
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Text8Corpus

# Open the gzipped text8 corpus; iterating over it yields one token list per chunk.
data: Text8Corpus = Text8Corpus("../data/text8/text8.gz")

# Wrap every chunk in a TaggedDocument whose single tag is its position in the corpus.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(data)
]
print(f'Identified {len(documents)} documents in data.')

# Sanity check: pick one document at random and preview its first 100 tokens.
scaled_draw = random() * len(documents)
random_selection = math.floor(scaled_draw)
sample_document = documents[random_selection]
print(f'First few tokens in random document {random_selection}: {sample_document.words[:100]}')

Identified 1701 documents in data.
First few tokens in random document 1067: ['home', 'of', 'dr', 'willett', 'in', 'the', 'case', 'of', 'charles', 'dexter', 'ward', 'the', 'period', 'after', 'his', 'return', 'to', 'providence', 'the', 'last', 'decade', 'of', 'his', 'life', 'was', 'lovecraft', 's', 'most', 'prolific', 'during', 'this', 'time', 'period', 'he', 'produced', 'almost', 'all', 'of', 'his', 'best', 'known', 'short', 'stories', 'for', 'the', 'leading', 'pulp', 'publications', 'of', 'the', 'day', 'primarily', 'weird', 'tales', 'as', 'well', 'as', 'longer', 'efforts', 'like', 'the', 'case', 'of', 'charles', 'dexter', 'ward', 'and', 'at', 'the', 'mountains', 'of', 'madness', 'he', 'frequently', 'revised', 'work', 'for', 'other', 'authors', 'and', 'did', 'a', 'large', 'amount', 'of', 'ghost', 'writing', 'despite', 'his', 'best', 'writing', 'efforts', 'however', 'he', 'grew', 'ever', 'poorer', 'he', 'was', 'forced']


In [8]:
# Hyperparameters for the (still untrained) Doc2Vec model.
doc2vec_params = {
    "vector_size": 50,  # dimensionality of each document embedding
    "min_count": 3,     # drop words that occur fewer than three times
    "epochs": 40,       # epoch count stored on the model; reused when training
    "workers": 4,       # number of worker threads
}
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)

2023-03-31 18:22:06,930 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w5,mc3,s0.001,t4>', 'datetime': '2023-03-31T18:22:06.930860', 'gensim': '4.3.1', 'python': '3.11.2 (main, Feb 16 2023, 03:15:23) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.4-x86_64-i386-64bit', 'event': 'created'}


In [10]:
# Scan the tagged documents once to collect word counts and build the
# model's vocabulary (no training happens in this step).
model.build_vocab(corpus_iterable=documents)

2023-03-31 18:27:23,084 : INFO : collecting all words and their counts
2023-03-31 18:27:23,085 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-03-31 18:27:26,665 : INFO : collected 253854 word types and 1701 unique tags from a corpus of 1701 examples and 17005207 words
2023-03-31 18:27:26,666 : INFO : Creating a fresh vocabulary
2023-03-31 18:27:27,000 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=3 retains 100038 unique words (39.41% of original 253854, drops 153816)', 'datetime': '2023-03-31T18:27:27.000856', 'gensim': '4.3.1', 'python': '3.11.2 (main, Feb 16 2023, 03:15:23) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.4-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2023-03-31 18:27:27,001 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=3 leaves 16816094 word corpus (98.89% of original 17005207, drops 189113)', 'datetime': '2023-03-31T18:27:27.001520', 'gensim': '4.3.1', 'python': '3.11.2 (main, 

In [12]:
# Fit the embeddings on the tagged documents.
# Expect a few minutes of run time on the text8 dataset.
model.train(
    corpus_iterable=documents,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2023-03-31 18:28:23,527 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 4 workers on 100038 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-03-31T18:28:23.527126', 'gensim': '4.3.1', 'python': '3.11.2 (main, Feb 16 2023, 03:15:23) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.4-x86_64-i386-64bit', 'event': 'train'}
2023-03-31 18:28:24,533 : INFO : EPOCH 0 - PROGRESS: at 11.70% examples, 1462320 words/s, in_qsize 8, out_qsize 0
2023-03-31 18:28:25,534 : INFO : EPOCH 0 - PROGRESS: at 24.63% examples, 1547082 words/s, in_qsize 7, out_qsize 0
2023-03-31 18:28:26,537 : INFO : EPOCH 0 - PROGRESS: at 37.10% examples, 1560020 words/s, in_qsize 7, out_qsize 0
2023-03-31 18:28:27,539 : INFO : EPOCH 0 - PROGRESS: at 50.97% examples, 1608143 words/s, in_qsize 7, out_qsize 0
2023-03-31 18:28:28,544 : INFO : EPOCH 0 - PROGRESS: at 65.20% examples, 1644703 words/s, in_qsize 7, out_qsize 0
2023-03-31 18:2

In [14]:
# Import simple_preprocess from its public home, gensim.utils (the same
# location the similarity helper in this notebook uses), rather than from
# the test-support module gensim.test.utils.
from gensim.utils import simple_preprocess

# Smoke test: infer an embedding for a short string the model has not seen.
test_string = "Hello, world!"
test_tokens = simple_preprocess(test_string)
print(model.infer_vector(test_tokens))

[ 0.0011171  -0.14825673 -0.17609155 -0.13101754 -0.07584271  0.1981532
  0.1185814   0.27367732  0.03935472  0.13345967  0.09743626  0.10941651
  0.02519654 -0.15672484 -0.08884095 -0.35283133  0.08157589  0.07298915
 -0.39617378 -0.12477299 -0.28801444  0.17198338 -0.17194895  0.08744618
 -0.07567008 -0.27765927 -0.55538946 -0.47052908 -0.15491503 -0.43271708
  0.30307657  0.2855574  -0.2410872   0.09419218 -0.02951643  0.20422597
  0.0939188  -0.17469913 -0.04301504  0.16965272  0.06579589 -0.2362432
 -0.08117436 -0.37476528  0.0098258  -0.00713632  0.09533946  0.0440481
  0.2598772   0.01723556]


In [49]:
from gensim.test.utils import get_tmpfile

# Persist the trained model to disk.
# To load it again, use Doc2Vec.load(model_path).
model_path = "../models/text8_model"
# Make sure the target directory exists so the save does not fail with a
# missing-directory error on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

2023-03-31 19:06:43,541 : INFO : Doc2Vec lifecycle event {'fname_or_handle': '../models/text8_model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-03-31T19:06:43.541931', 'gensim': '4.3.1', 'python': '3.11.2 (main, Feb 16 2023, 03:15:23) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.4-x86_64-i386-64bit', 'event': 'saving'}
2023-03-31 19:06:43,544 : INFO : not storing attribute cum_table
2023-03-31 19:06:43,732 : INFO : saved ../models/text8_model


In [18]:
from numpy import dot
from numpy.linalg import norm

def get_similarity(text1: str, text2: str) -> float:
    """Return the cosine similarity between the inferred embeddings of two texts.

    Each text is tokenized with ``gensim.utils.simple_preprocess`` and embedded
    with the notebook-level ``model`` through ``infer_vector``.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        The cosine similarity as a built-in ``float``. ``0.0`` is returned when
        either embedding has zero length, since the cosine is undefined there.
    """
    embed1 = model.infer_vector(gensim.utils.simple_preprocess(text1))
    embed2 = model.infer_vector(gensim.utils.simple_preprocess(text2))
    norm1 = norm(embed1)
    norm2 = norm(embed2)
    # Guard against dividing by zero, which would otherwise yield NaN.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    # Cosine similarity; cast the numpy scalar so the result matches the
    # annotated return type.
    return float(dot(embed1, embed2) / norm1 / norm2)

In [48]:
# Try the similarity helper: rank a handful of phrases against one keyword.
keyword = "This is a tourist attraction."
test_phrases = [
    "The amazon rainforest",
    "Horses are noble animals.",
    "Zoos are family-friendly.",
    "This is the worst programming language.",
    "An absorbing book.",
    "Chemical-free apple juice."
]
# Pair every phrase with its score, ordered from most to least similar.
similarities = sorted(
    ((phrase, get_similarity(phrase, keyword)) for phrase in test_phrases),
    key=lambda scored: scored[1],
    reverse=True,
)
print(similarities)

[('Horses are noble animals.', 0.94261616), ('The amazon rainforest', 0.92553276), ('Zoos are family-friendly.', 0.87959206), ('This is the worst programming language.', 0.6979928), ('Chemical-free apple juice.', 0.5572638), ('An absorbing book.', 0.48015532)]
