### Examples from lab

In [1]:
import gensim

assert gensim.models.doc2vec.FAST_VERSION > -1

In [2]:
import gensim
import gensim.test.utils

# Set file names for train and test data
lee_train_file = gensim.test.utils.datapath('lee_background.cor')
lee_test_file = gensim.test.utils.datapath('lee.cor')

In [3]:
import smart_open

def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(lee_train_file))
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

print(train_corpus[2])
print(test_corpus[2])

TaggedDocument<['the', 'national', 'road', 'toll', 'for', 'the', 'christmas', 'new', 'year', 'holiday', 'period', 'stands', 'at', 'eight', 'fewer', 'than', 'for', 'the', 'same', 'time', 'last', 'year', 'people', 'have', 'died', 'on', 'new', 'south', 'wales', 'roads', 'with', 'eight', 'fatalities', 'in', 'both', 'queensland', 'and', 'victoria', 'western', 'australia', 'the', 'northern', 'territory', 'and', 'south', 'australia', 'have', 'each', 'recorded', 'three', 'deaths', 'while', 'the', 'act', 'and', 'tasmania', 'remain', 'fatality', 'free'], [2]>
['the', 'united', 'states', 'government', 'has', 'said', 'it', 'wants', 'to', 'see', 'president', 'robert', 'mugabe', 'removed', 'from', 'power', 'and', 'that', 'it', 'is', 'working', 'with', 'the', 'zimbabwean', 'opposition', 'to', 'bring', 'about', 'change', 'of', 'administration', 'as', 'scores', 'of', 'white', 'farmers', 'went', 'into', 'hiding', 'to', 'escape', 'round', 'up', 'by', 'zimbabwean', 'police', 'senior', 'bush', 'administrat

In [4]:
import gensim.models

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [5]:
model.build_vocab(train_corpus)

In [6]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [7]:
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x20ee611ba60>

In [8]:
vector = model.infer_vector(['Only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.21520542 -0.2505608  -0.02741721  0.24606045 -0.03637378  0.11938438
  0.13880064  0.00824329 -0.17791064 -0.12373569  0.07246169  0.02520719
 -0.06957895 -0.00248159 -0.1750319  -0.04833046  0.02146124  0.11537363
  0.19292526 -0.11860263 -0.06782593 -0.02835242  0.19102825  0.07415181
  0.02622746 -0.07513429 -0.11854329 -0.02253022 -0.14705893 -0.00239915
  0.2794211   0.15299292  0.24348325  0.07356468  0.1999081   0.20292024
 -0.14333631 -0.25024962 -0.03168344 -0.04608949  0.12515488 -0.03963595
 -0.02629471 -0.23895192  0.14269777  0.08968136 -0.0955759  -0.13183363
  0.16588934  0.09215467]


In [9]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 291, 1: 9})


In [10]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (11): «intelligence cannot say conclusively that saddam hussein has weapons of mass destruction an information gap that is complicating white house efforts to build support for an attack on saddam iraqi regime the cia has advised top administration officials to assume that iraq has some weapons of mass destruction but the agency has not given president bush smoking gun according to intelligence and administration officials»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (75, 0.7830148339271545): «us president george bush has marked the th day of the campaign against terrorism by calling on his allies to freeze the assets of two non us organisations suspected of supporting terrorism one of the groups is based in kashmir the other is alleged to have helped al qaeda develop nuclear weapons president bush says former scientist at pakistan atomic program had established group called utn after assisting osama bin laden network develop nuclear bo

### Task 0:
 Train your own doc2vec model on a test dataset. 

In [11]:
!pip install fastparquet



In [13]:
import pandas as pd

df = pd.read_csv("hf://datasets/CShorten/ML-ArXiv-Papers/ML-Arxiv-Papers.csv")
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,abstract
0,0,0.0,Learning from compressed observations,The problem of statistical learning is to co...
1,1,1.0,Sensor Networks with Random Links: Topology De...,"In a sensor network, in practice, the commun..."
2,2,2.0,The on-line shortest path problem under partia...,The on-line shortest path problem is conside...
3,3,3.0,A neural network approach to ordinal regression,Ordinal regression is an important type of l...
4,4,4.0,Parametric Learning and Monte Carlo Optimization,This paper uncovers and explores the close r...
...,...,...,...,...
117587,4995,,Detecting COVID-19 Conspiracy Theories with Tr...,The sharing of fake news and conspiracy theori...
117588,4996,,Fair Feature Subset Selection using Multiobjec...,The feature subset selection problem aims at s...
117589,4997,,A Simple Duality Proof for Wasserstein Distrib...,We present a short and elementary proof of the...
117590,4998,,Combined Learning of Neural Network Weights fo...,"We introduce CoLN, Combined Learning of Neural..."


In [15]:
df["text"] = (df["title"].fillna('') + " " + df["abstract"].fillna('')).str.strip()


texts = df["text"].astype(str).tolist()
def preprocess_parquet(texts, tokens_only=False):
    for i, text in enumerate(texts):
        tokens = gensim.utils.simple_preprocess(text)
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])


train_corpus = list(preprocess_parquet(texts))
test_corpus = list(preprocess_parquet(texts, tokens_only=True))  

print(train_corpus[2])
print(test_corpus[2])

TaggedDocument<['the', 'on', 'line', 'shortest', 'path', 'problem', 'under', 'partial', 'monitoring', 'the', 'on', 'line', 'shortest', 'path', 'problem', 'is', 'considered', 'under', 'various', 'models', 'of', 'partial', 'monitoring', 'given', 'weighted', 'directed', 'acyclic', 'graph', 'whose', 'edge', 'weights', 'can', 'change', 'in', 'an', 'arbitrary', 'adversarial', 'way', 'decision', 'maker', 'has', 'to', 'choose', 'in', 'each', 'round', 'of', 'game', 'path', 'between', 'two', 'distinguished', 'vertices', 'such', 'that', 'the', 'loss', 'of', 'the', 'chosen', 'path', 'defined', 'as', 'the', 'sum', 'of', 'the', 'weights', 'of', 'its', 'composing', 'edges', 'be', 'as', 'small', 'as', 'possible', 'in', 'setting', 'generalizing', 'the', 'multi', 'armed', 'bandit', 'problem', 'after', 'choosing', 'path', 'the', 'decision', 'maker', 'learns', 'only', 'the', 'weights', 'of', 'those', 'edges', 'that', 'belong', 'to', 'the', 'chosen', 'path', 'for', 'this', 'problem', 'an', 'algorithm', 'is

In [16]:
import gensim.models

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [17]:
model.build_vocab(train_corpus)

In [18]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x20f245ba080>

### Task 1:
Assess validity of the model

In [20]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])
print(vector)

[-0.16021617 -0.01150709  0.27132425  0.38358036 -0.3544724  -0.0689206
  0.4238491  -0.4716654  -0.17326868  0.05555303  0.54163843  0.00113569
  0.00160924 -0.00656376 -0.74199665 -0.25595453  0.07924189 -0.43915245
  0.02996937  0.17150386  0.1060473   0.04494313  0.16087134 -0.719976
 -0.15444705  0.27541524  0.46607977 -0.4355618  -0.42564014 -0.05880697
 -0.08311263  0.4945154   0.5161619   0.10167328 -0.44751242 -0.11406519
 -0.17334495 -0.35973993  0.49473974 -0.8351739   0.0500171  -0.05214887
 -0.10714909 -0.12761602  0.02966735 -0.09892797  0.28122318  0.35298225
 -0.24528234 -0.20091583]


In [21]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

import collections

counter = collections.Counter(ranks)
print(counter)

KeyboardInterrupt: 

![Screenshot 2025-06-15 222947.png](attachment:169fdbcc-7d04-43d7-b629-e5fec8b532d5.png)

In [22]:
import random

# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (114874): «resact reinforcing long term engagement in sequential recommendation with residual actor long term engagement is preferred over immediate engagement in sequential recommendation as it directly affects product operational metrics such as daily active users daus and dwell time meanwhile reinforcement learning rl is widely regarded as promising framework for optimizing long term engagement in sequential recommendation however due to expensive online interactions it is very difficult for rl algorithms to perform state action value estimation exploration and feature extraction when optimizing long term engagement in this paper we propose resact which seeks policy that is close to but better than the online serving policy in this way we can collect sufficient data near the learned policy so that state action values can be properly estimated and there is no need to perform online exploration directly optimizing this policy is difficult due to the huge policy space res