In [1]:
# Following the tutorial at:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import os
import gensim

In [24]:
# directory that includes small corpuses for use in testing like this
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
print(f'test_data_dir is: \n{test_data_dir}')
print(f'test corpuses include: ')
for file in os.listdir(test_data_dir):
    if os.path.isfile(os.path.join(test_data_dir, file)):
        if file.endswith('.cor'):
            print(file)

test_data_dir is: 
/Users/minimal/detour-ai-ml/lib/python3.11/site-packages/gensim/test/test_data
test corpuses include: 
head500.noblanks.cor
varembed_lee_subcorpus.cor
lee.cor
pang_lee_polarity.cor
miIslita.cor
lee_background.cor


In [25]:
# The lee corpus contains a few hundred documents from an Australian news service
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
# A much smaller corpus from the same source
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [26]:
import smart_open

def read_corpus(filename: str, token_only: bool=False):
    # standard ASCII encoding
    with smart_open.open(filename, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            # tokenize each word, remove punctuation, remove cases
            tokens = gensim.utils.simple_preprocess(line)
            if token_only:
                yield tokens
            else:
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [27]:
# Load data
train_data = list(read_corpus(lee_train_file))
test_data = list(read_corpus(lee_test_file))

In [28]:
print(train_data[0])

TaggedDocument<['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', 'caused'

In [29]:
# create model
doc2vec_model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,     # size of the vector embedding
    min_count=2,        # only consider words that occur at least twice
    epochs=40
)

2023-03-30 17:48:03,765 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>', 'datetime': '2023-03-30T17:48:03.764939', 'gensim': '4.3.1', 'python': '3.11.2 (main, Feb 16 2023, 03:15:23) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.4-x86_64-i386-64bit', 'event': 'created'}


In [31]:
doc2vec_model.build_vocab(train_data)

2023-03-30 17:53:57,926 : INFO : collecting all words and their counts
2023-03-30 17:53:57,928 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-03-30 17:53:57,940 : INFO : collected 6981 word types and 300 unique tags from a corpus of 300 examples and 58152 words
2023-03-30 17:53:57,941 : INFO : Creating a fresh vocabulary
2023-03-30 17:53:57,954 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 3955 unique words (56.65% of original 6981, drops 3026)', 'datetime': '2023-03-30T17:53:57.954239', 'gensim': '4.3.1', 'python': '3.11.2 (main, Feb 16 2023, 03:15:23) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.4-x86_64-i386-64bit', 'event': 'prepare_vocab'}
2023-03-30 17:53:57,955 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 55126 word corpus (94.80% of original 58152, drops 3026)', 'datetime': '2023-03-30T17:53:57.955099', 'gensim': '4.3.1', 'python': '3.11.2 (main, Feb 16 2023, 03:15:23

300


In [37]:
# .wv property contains the corpus
print(len(doc2vec_model.wv)) # number of words in the corpus

3955


In [38]:
# Train the model
doc2vec_model.train(train_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

2023-03-30 17:56:51,439 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 3955 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-03-30T17:56:51.439966', 'gensim': '4.3.1', 'python': '3.11.2 (main, Feb 16 2023, 03:15:23) [Clang 14.0.0 (clang-1400.0.29.202)]', 'platform': 'macOS-12.4-x86_64-i386-64bit', 'event': 'train'}
2023-03-30 17:56:51,493 : INFO : EPOCH 0: training on 58152 raw words (42693 effective words) took 0.0s, 903267 effective words/s
2023-03-30 17:56:51,540 : INFO : EPOCH 1: training on 58152 raw words (42696 effective words) took 0.0s, 953061 effective words/s
2023-03-30 17:56:51,590 : INFO : EPOCH 2: training on 58152 raw words (42743 effective words) took 0.0s, 913237 effective words/s
2023-03-30 17:56:51,641 : INFO : EPOCH 3: training on 58152 raw words (42668 effective words) took 0.0s, 868123 effective words/s
2023-03-30 17:56:51,693 : INFO : EPOCH 4: training on 58152 raw wor

In [40]:
# test the embedding of some text:
def get_embedding(model: gensim.models.doc2vec.Doc2Vec, text: str):
    tokens = gensim.utils.simple_preprocess(text)
    return model.infer_vector(tokens)

print(get_embedding(doc2vec_model, 'Hello, world!'))

[-0.37315595 -0.1340146  -0.14571968 -0.01221686 -0.04597435 -0.06007713
  0.06483229  0.1097223  -0.07305732 -0.07222687 -0.15381745 -0.04771632
 -0.16542223 -0.17817795 -0.28254053 -0.01174737  0.0885735  -0.04683914
  0.0599751  -0.08511851  0.0641684   0.06057968  0.18171096  0.06763642
  0.22661273 -0.1925833   0.02115663 -0.06181027  0.05923361 -0.03388556
  0.12880793  0.2043309  -0.2736799   0.16304226 -0.0885709   0.1681882
  0.14373718 -0.08547237  0.13269953 -0.03191527  0.05970052  0.12853006
 -0.03631823 -0.25596973  0.23304014  0.00889588  0.1405064  -0.09080721
  0.02382964 -0.08414119]


In [55]:
# Assessing the model
ranks = []
second_ranks = []
for doc_id in range(len(train_data)):
    embedding = doc2vec_model.infer_vector(train_data[doc_id].words)
    sims = doc2vec_model.dv.most_similar([embedding], topn=len(doc2vec_model.dv))

    rank = [doc_id for docid, sim in sims].index(doc_id) # find the rank of the original doc
    ranks.append(rank)

    second_ranks.append(sims[1])

import collections
counter = collections.Counter(ranks)
print(counter)

Counter({0: 300})


In [42]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % doc2vec_model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_data[sims[index][0]].words)))

Document (299): «australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

In [45]:
# Training data
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(train_data) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_data[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_data[sim_id[0]].words)))

Train Document (51): «russian authorities have sentenced chechen warlord salman raduyev to life in prison for hostage siege in which more than people died salman raduyev is probably the most important chechen fighter russian authorities have ever caught relative of the first chechen president he was at the forefront of the insurgency leading raids against federal troops he was jealous of the achievements of his fellow commanders he resolved to outperform his rival and in january masterminded hostage taking in the neighbouring republic of dagestan apparently the aim was to destabilise dagestan and spread the war to the rest of the caucuses he ran out of luck as russian solders were not prepared to negotiate and cornered raduyev on the chechen border»

Similar Document (141, 0.7087212800979614): «united states air strikes on al qaeda fighters have intensified following the collapse of surrender talks with the northern alliance the battle for tora bora appears to be heading towards bloody

In [56]:
# Testing data
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_data) - 1)
inferred_vector = doc2vec_model.infer_vector(test_data[doc_id].words)
sims = doc2vec_model.dv.most_similar([inferred_vector], topn=len(doc2vec_model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER doc2vec_model %s:\n' % doc2vec_model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_data[sims[index][0]].words)))

Test Document (17): «the united nations world food program estimates that up to million people in seven countries malawi mozambique zambia angola swaziland lesotho and zimbabwe face death by starvation unless there is massive international response in malawi as many as people may have already died the signs of malnutrition swollen stomachs stick thin arms light coloured hair are everywhere»

SIMILAR/DISSIMILAR DOCS PER doc2vec_model Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (296, 0.7907843589782715): «today is world aids day and the latest figures show that million people are living with hiv world wide the latest united nations report on the aids epidemic has found eastern europe and the republics of the former soviet union are becoming the new battleground in the fight against the disease un officials say in russia the number of people carrying hiv doubles almost annually while ukraine has become the first nation in europe to report per cent of its adult population is hiv positive 

In [57]:
# Test the model trained with the small test corpus, on some examples
from numpy import dot
from numpy.linalg import norm

def get_similarity(text1: str, text2: str)-> float:
    embed1 = doc2vec_model.infer_vector(gensim.utils.simple_preprocess(text1))
    embed2 = doc2vec_model.infer_vector(gensim.utils.simple_preprocess(text2))
    # Cosine similarity
    return dot(embed1, embed2)/(norm(embed1))/(norm(embed2))

In [74]:
places={
            "NYC":
                "The largest city in the United States.",
            "Central Park":
                "Central Park is an urban park in New York City. It is the fifth-largest park in the city. The park has natural-looking plantings and landforms, having been almost entirely landscaped when built in the 1850s and 1860s. It has eight lakes and ponds that were created artificially by damming natural seeps and flows.",
            "Los Angeles Library":
                "The Los Angeles Public Library provides free and easy access to information, ideas, books and technology that enrich, educate and empower every individual in our city's diverse communities."
        }
keywords = ["warm", "knowledge", "busy", "urban"]
for keyword in keywords:
    similarity = {place: get_similarity(places[place], keyword) for place in places.keys()}
    best = max(places.keys(), key=lambda key: similarity[key])
    print(f'best for "{keyword}" is "{best}"')

best for "warm" is "Los Angeles Library"
best for "knowledge" is "Los Angeles Library"
best for "busy" is "NYC"
best for "urban" is "Central Park"
