# Doc2Vec Training Notebook

In this notebook we train a Doc2Vec model on the tokenized documents, save the trained model, and export an inferred vector for every document.

In [1]:
import pandas as pd
from tqdm import tqdm
import logging
tqdm.pandas()
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from datetime import datetime

# Record the wall-clock time at notebook start; the last cell subtracts it
# from a second timestamp to report the total run time.
start = datetime.now()

In [2]:
# Configure the root logger to emit records at INFO level and above, each
# prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Load the prepared dataset. The preview below shows the columns
# SUBJECT_ID, HADM_ID, TEXT, toks (token sequence) and ICD9_CODE (code list).
data = pd.read_parquet('prepared-data.pq')
# Preview the first rows as a quick sanity check.
data.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,toks,ICD9_CODE
0,3,145834.0,sex m service medicine chief complaint admitt...,"[sex, m, service, medicine, chief, complaint, ...","[0389, 78559, 5849, 4275, 41071, 4280, 6826, 4..."
1,4,185777.0,sex f service chief complaint shortness of br...,"[sex, f, service, chief, complaint, shortness,...","[042, 1363, 7994, 2763, 7907, 5715, 04111, V090]"
2,6,107064.0,sex f service admission diagnosis end stage r...,"[sex, f, service, admission, diagnosis, end, s...","[40391, 4440, 9972, 2766, 2767, 2859, 2753, V1..."
3,9,150750.0,sex m service neurology chief complaint weakn...,"[sex, m, service, neurology, chief, complaint,...","[431, 5070, 4280, 5849, 2765, 4019]"
4,10,184167.0,sex f service history of present illness baby...,"[sex, f, service, history, of, present, illnes...","[V3000, 7742, 76525, 76515, V290]"


Tag each document with its row position so that Doc2Vec learns one vector per document.

In [4]:
# Wrap every token sequence in a TaggedDocument whose only tag is the
# sequence's zero-based position in data['toks'].
documents = [
    TaggedDocument(list(tokens), [position])
    for position, tokens in tqdm(enumerate(data['toks']), total=data.shape[0])
]
len(documents)

100%|██████████| 52622/52622 [00:01<00:00, 30205.80it/s]


52622

In [5]:
# Passing the corpus to the constructor builds the vocabulary and trains the
# model in one call (see the log output below this cell).
model = Doc2Vec(
    documents,
    vector_size=128,  # dimensionality of every document vector
    window=5,
    min_count=50,     # words occurring fewer than 50 times are ignored
    workers=6,        # number of training threads
    epochs=15,        # passes over the corpus
)

2023-04-15 13:28:52,735 : INFO : collecting all words and their counts
2023-04-15 13:28:52,735 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-04-15 13:28:53,480 : INFO : PROGRESS: at example #10000, processed 12521549 words (16809733 words/s), 51674 word types, 0 tags
2023-04-15 13:28:54,268 : INFO : PROGRESS: at example #20000, processed 25066550 words (15935441 words/s), 71785 word types, 0 tags
2023-04-15 13:28:55,080 : INFO : PROGRESS: at example #30000, processed 38126453 words (16102383 words/s), 89122 word types, 0 tags
2023-04-15 13:28:56,053 : INFO : PROGRESS: at example #40000, processed 54015622 words (16332809 words/s), 110264 word types, 0 tags
2023-04-15 13:28:57,110 : INFO : PROGRESS: at example #50000, processed 70828953 words (15928319 words/s), 129348 word types, 0 tags
2023-04-15 13:28:57,389 : INFO : collected 133808 word types and 52622 unique tags from a corpus of 52622 examples and 75259763 words
2023-04-15 13:28:57,390

In [6]:
# Persist the trained model to disk; the next code cell reloads it from this file.
model.save('doc2vec.model')

2023-04-15 13:33:27,269 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-04-15T13:33:27.269152', 'gensim': '4.3.1', 'python': '3.11.2 (main, Mar 27 2023, 23:42:44) [GCC 11.2.0]', 'platform': 'Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'saving'}
2023-04-15 13:33:27,270 : INFO : not storing attribute cum_table
2023-04-15 13:33:27,320 : INFO : saved doc2vec.model


Compute the document vectors ahead of time. This speeds up our DeepLabeler model, which would otherwise have to infer them while collating the data.

In [7]:
# Reload the model that was just written to disk, then infer one vector per
# token sequence in data['toks']; progress_apply reports progress via tqdm.
model = Doc2Vec.load('doc2vec.model')
doc_vectors = data['toks'].progress_apply(model.infer_vector)
data['doc'] = doc_vectors

2023-04-15 13:33:27,352 : INFO : loading Doc2Vec object from doc2vec.model
2023-04-15 13:33:27,369 : INFO : loading dv recursively from doc2vec.model.dv.* with mmap=None
2023-04-15 13:33:27,369 : INFO : loading wv recursively from doc2vec.model.wv.* with mmap=None
2023-04-15 13:33:27,370 : INFO : setting ignored attribute cum_table to None
2023-04-15 13:33:27,428 : INFO : Doc2Vec lifecycle event {'fname': 'doc2vec.model', 'datetime': '2023-04-15T13:33:27.428340', 'gensim': '4.3.1', 'python': '3.11.2 (main, Mar 27 2023, 23:42:44) [GCC 11.2.0]', 'platform': 'Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'loaded'}
100%|██████████| 52622/52622 [11:09<00:00, 78.64it/s] 


In [8]:
# Expand the per-document vectors into one column per dimension
# (vec.0, vec.1, ...). The number of columns is taken from the model rather
# than hard-coded as 128, so this cell keeps working if `vector_size` is
# changed where the model is trained.
vector_columns = [f'vec.{i}' for i in range(model.vector_size)]
docs = pd.DataFrame(
    data['doc'].apply(lambda x: list(x)).to_list(),
    columns=vector_columns,
)
docs.head()

Unnamed: 0,vec.0,vec.1,vec.2,vec.3,vec.4,vec.5,vec.6,vec.7,vec.8,vec.9,...,vec.118,vec.119,vec.120,vec.121,vec.122,vec.123,vec.124,vec.125,vec.126,vec.127
0,-1.168114,-0.364408,-0.351587,0.042022,-0.216914,-1.940896,0.732966,0.029827,-1.218427,1.241066,...,-0.278072,-1.457268,0.717602,0.417174,1.228172,2.783473,-0.348849,-0.746716,-0.254826,-0.162575
1,-1.172263,1.296322,0.629791,2.0727,1.218928,-0.833741,0.648946,-1.671474,0.172852,1.269101,...,0.47846,-0.168765,1.570506,-0.722183,-0.057752,1.903284,2.839895,0.01853,0.39388,1.606591
2,-1.003505,0.81726,0.758062,0.308259,1.973406,-0.495774,0.429993,-0.705302,0.584977,-0.025737,...,0.304864,-0.620745,-0.982737,1.033692,1.223478,1.897892,2.041448,-2.15416,1.752321,-0.315013
3,-2.658166,1.205859,0.58838,-0.86818,0.052036,-0.845896,-0.28672,0.319456,-1.260307,1.001276,...,1.147267,-0.893167,1.185009,-0.650903,0.230766,1.178955,-1.222234,-0.486649,-0.897554,0.288846
4,-0.528551,1.135079,1.261992,0.542736,1.472284,-0.440664,0.915686,-0.205491,-0.306012,1.124715,...,0.504748,0.275608,0.184678,-0.164261,0.018767,1.694578,0.353929,-1.015501,-0.642076,-0.294704


In [9]:
# Attach the identifier columns by position rather than by index label.
# `docs` has a fresh 0..n-1 index, so assigning a Series would align on labels
# and silently produce NaN or mismatched ids if `data` ever carries a
# different index. Assigning the raw values matches rows in order instead.
docs['SUBJECT_ID'] = data['SUBJECT_ID'].to_numpy()
docs['HADM_ID'] = data['HADM_ID'].to_numpy()

In [10]:
# Write the vector columns plus the identifier columns to parquet;
# index=False leaves the DataFrame index out of the file.
docs.to_parquet('prepared-doc2vec.pq', index=False)

In [11]:
# Elapsed wall-clock time since `start` was recorded in the first code cell.
end = datetime.now()
total_time = end - start
# Display the resulting timedelta as the cell output.
total_time

datetime.timedelta(seconds=951, microseconds=687008)