In [49]:
import os
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import random

import logging
# Configure root logging at INFO level with a "timestamp : level : message"
# line format, so library log records emitted at INFO show up in cell output.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [50]:
# Path to the preprocessed dataset, relative to this notebook.
data_file = '../data/preprocessed_data.csv'

# The first CSV column is used as the DataFrame index.
data = pd.read_csv(data_file, index_col=0)

# Work on the first 100 entries of the 'plot' column only.
X = data['plot'].values[:100]

In [64]:
def read_corpus(X, tokens_only=False):
    """Lazily tokenize every entry of ``X``.

    Each entry is passed through ``gensim.utils.simple_preprocess``.

    Args:
        X: Iterable of documents to tokenize.
        tokens_only: When true, yield the bare token list for each entry;
            otherwise yield a ``TaggedDocument`` whose single tag is the
            entry's position in ``X``.

    Yields:
        One token list or one ``TaggedDocument`` per entry, in input order.
    """
    for doc_id, text in enumerate(X):
        words = gensim.utils.simple_preprocess(text)
        yield words if tokens_only else TaggedDocument(words, [doc_id])
            
# Materialise the generator twice: once as tagged documents for training,
# once as plain token lists for inference later on.
train_corpus = [*read_corpus(X)]
X_tokens = [*read_corpus(X, tokens_only=True)]

In [55]:
# Create the (still untrained) Doc2Vec model, then scan the tagged corpus
# once to build its vocabulary.
model = Doc2Vec(
    vector_size=300,
    min_count=3,
    epochs=40,
)
model.build_vocab(corpus_iterable=train_corpus)

# Sanity check: report the corpus frequency recorded for the word 'life'.
print(
    f"Word 'life' appeared {model.wv.get_vecattr('life', 'count')} times in the training corpus."
)

2023-11-20 16:36:34,757 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d300,n5,w5,mc3,s0.001,t3>', 'datetime': '2023-11-20T16:36:34.757011', 'gensim': '4.3.2', 'python': '3.10.13 | packaged by conda-forge | (main, Oct 26 2023, 18:07:37) [GCC 12.3.0]', 'platform': 'Linux-4.15.0-210-generic-x86_64-with-glibc2.27', 'event': 'created'}
2023-11-20 16:36:34,759 : INFO : collecting all words and their counts
2023-11-20 16:36:34,760 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-11-20 16:36:34,763 : INFO : collected 2828 word types and 100 unique tags from a corpus of 100 examples and 5194 words
2023-11-20 16:36:34,764 : INFO : Creating a fresh vocabulary
2023-11-20 16:36:34,767 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=3 retains 471 unique words (16.65% of original 2828, drops 2357)', 'datetime': '2023-11-20T16:36:34.767497', 'gensim': '4.3.2', 'python': '3.10.13 | packaged by conda-forge | (main, Oct 26 2023, 18:07:37

Word 'life' appeared 25 times in the training corpus.


In [56]:
# Train on the tagged corpus, reusing the document count gathered by
# build_vocab and the epoch count the model was constructed with.
model.train(
    corpus_iterable=train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2023-11-20 16:36:35,051 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 471 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-11-20T16:36:35.051599', 'gensim': '4.3.2', 'python': '3.10.13 | packaged by conda-forge | (main, Oct 26 2023, 18:07:37) [GCC 12.3.0]', 'platform': 'Linux-4.15.0-210-generic-x86_64-with-glibc2.27', 'event': 'train'}
2023-11-20 16:36:35,076 : INFO : EPOCH 0: training on 5194 raw words (2159 effective words) took 0.0s, 113819 effective words/s
2023-11-20 16:36:35,096 : INFO : EPOCH 1: training on 5194 raw words (2153 effective words) took 0.0s, 133555 effective words/s
2023-11-20 16:36:35,120 : INFO : EPOCH 2: training on 5194 raw words (2147 effective words) took 0.0s, 104948 effective words/s
2023-11-20 16:36:35,141 : INFO : EPOCH 3: training on 5194 raw words (2146 effective words) took 0.0s, 124424 effective words/s
2023-11-20 16:36:35,161 : INFO : EPOCH 4: training o

In [66]:
# Infer one embedding per tokenized document with the trained model.
# The iterable is consumed directly rather than through `tqdm`: that name is
# not imported by this notebook, so using it raises NameError on a fresh
# kernel (Restart & Run All).
embeddings = [model.infer_vector(tokens) for tokens in X_tokens]

# Report how many vectors were produced and the shape of the first one.
print(len(embeddings), embeddings[0].shape)

100 (300,)
