# Doc2Vec Training Notebook

In this notebook we train a Doc2Vec model on the prepared documents, save the trained model, and save the document vectors inferred from it.

In [None]:
# Imports for data loading, progress reporting, logging and Doc2Vec training.
import pandas as pd
from tqdm import tqdm
import logging
# Register tqdm's pandas integration; this provides the `progress_apply`
# method used later when inferring document vectors.
tqdm.pandas()
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from datetime import datetime

# Timestamp taken at the top of the notebook; the final cell subtracts it
# from the end time to report the total run time.
start = datetime.now()

In [None]:
# Configure logging at INFO level with a "timestamp : level : message" layout.
# NOTE(review): presumably here to surface gensim's training progress logs — confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Load the prepared dataset from parquet.
data = pd.read_parquet('prepared-data.pq')

# Fail fast on a malformed input file: 'toks' feeds training and inference,
# while the two ID columns are only read after those expensive steps have run.
required_columns = {'toks', 'SUBJECT_ID', 'HADM_ID'}
missing_columns = required_columns - set(data.columns)
assert not missing_columns, f'prepared-data.pq is missing columns: {sorted(missing_columns)}'

data.head()

Tag each document with its row position in `data`.

In [None]:
# Wrap every token sequence in a TaggedDocument whose single tag is the
# row's position within data['toks']; tqdm reports progress over the rows.
tagged_rows = tqdm(enumerate(data['toks']), total=len(data))
documents = [TaggedDocument(list(tokens), [position]) for position, tokens in tagged_rows]
len(documents)

In [None]:
# Hyperparameters gathered in one place, then passed unchanged to Doc2Vec
# together with the tagged documents.
doc2vec_params = dict(
    vector_size=128,
    window=5,
    min_count=50,
    workers=6,
    epochs=15,
)
model = Doc2Vec(documents, **doc2vec_params)

In [None]:
# Persist the model to disk; the next code cell loads it back from this same path.
model.save('doc2vec.model')

Compute the document vectors ahead of time, so that the DeepLabeler model does not need to infer them while collating the data.

In [None]:
# Load the model back from the file written by the save cell.
model = Doc2Vec.load('doc2vec.model')


def infer_doc_vector(tokens):
    """Infer a Doc2Vec vector for one tokenized document.

    The tokens are converted to a plain list first, mirroring how the
    training documents were built (`list(sentence)`), so inference receives
    the same input type as training regardless of how the parquet reader
    materialises the 'toks' cells.
    """
    return model.infer_vector(list(tokens))


# Store one inferred vector per row in a new 'doc' column, with a progress bar.
data['doc'] = data['toks'].progress_apply(infer_doc_vector)

In [None]:
# Expand the per-row vectors into a wide frame: one row per document and one
# 'vec.<i>' column per embedding dimension. The column count comes from the
# model rather than a repeated literal, so it cannot drift from the
# vector_size the model was trained with.
vector_rows = [list(vector) for vector in data['doc']]
vector_columns = [f'vec.{i}' for i in range(model.vector_size)]
docs = pd.DataFrame(vector_rows, columns=vector_columns)
docs.head()

In [None]:
# Attach the identifiers by position, not by index label. `docs` was built
# from a plain list and therefore has a fresh 0..n-1 index; assigning a Series
# aligns on labels, so if `data` carries any other index the IDs would land on
# the wrong rows or become NaN. Resetting the index makes the rows line up
# one-to-one while keeping the columns' dtypes.
docs['SUBJECT_ID'] = data['SUBJECT_ID'].reset_index(drop=True)
docs['HADM_ID'] = data['HADM_ID'].reset_index(drop=True)

In [None]:
# Write the vectors and ID columns to parquet; index=False leaves the
# DataFrame index out of the file.
docs.to_parquet('prepared-doc2vec.pq', index=False)

In [None]:
# Elapsed time since `start` was recorded in the first code cell; the bare
# last expression displays the resulting timedelta as the cell output.
end = datetime.now()
total_time = end - start
total_time