# Word2Vec Model Training Notebook

In this notebook, we will train the word2vec model and save its output.

In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import logging
from gensim.models import Word2Vec
from datetime import datetime

# Record the wall-clock start time; the last cell subtracts it from the
# end time to report how long the whole notebook took to run.
start = datetime.now()

In [2]:
# Configure the root logger to emit INFO-level records prefixed with a
# timestamp and severity, so progress messages logged during training and
# saving appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Load the prepared dataset from Parquet and preview its first rows.
# The 'toks' column is the one used below to build the training corpus.
data = pd.read_parquet('prepared-data.pq')
data.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,toks,ICD9_CODE
0,3,145834.0,sex m service medicine chief complaint admitt...,"[sex, m, service, medicine, chief, complaint, ...","[0389, 78559, 5849, 4275, 41071, 4280, 6826, 4..."
1,4,185777.0,sex f service chief complaint shortness of br...,"[sex, f, service, chief, complaint, shortness,...","[042, 1363, 7994, 2763, 7907, 5715, 04111, V090]"
2,6,107064.0,sex f service admission diagnosis end stage r...,"[sex, f, service, admission, diagnosis, end, s...","[40391, 4440, 9972, 2766, 2767, 2859, 2753, V1..."
3,9,150750.0,sex m service neurology chief complaint weakn...,"[sex, m, service, neurology, chief, complaint,...","[431, 5070, 4280, 5849, 2765, 4019]"
4,10,184167.0,sex f service history of present illness baby...,"[sex, f, service, history, of, present, illnes...","[V3000, 7742, 76525, 76515, V290]"


In [4]:
# Build the training corpus: convert each row of the 'toks' column into a plain
# Python list (showing a tqdm progress bar), then gather the rows into one
# list of token lists to hand to Word2Vec.
sentences = (
    data['toks']
    .progress_apply(list)
    .to_list()
)

100%|██████████| 52622/52622 [00:01<00:00, 44950.99it/s]


In [5]:
# Train a Word2Vec model on the tokenized corpus. Passing the corpus to the
# constructor builds the vocabulary and runs training immediately; progress is
# reported through the INFO-level logging configured earlier in the notebook.
model = Word2Vec(
    sentences,
    vector_size=100,  # dimensionality of the learned word vectors
    window=5,         # max distance between the current and the predicted word
    min_count=50,     # ignore words whose total frequency is below this
    sg=1,             # 1 selects skip-gram (0 would select CBOW)
    epochs=15,        # number of passes over the corpus
    workers=6,        # number of worker threads used for training
)

2023-04-15 13:18:00,804 : INFO : collecting all words and their counts
2023-04-15 13:18:00,805 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-15 13:18:01,495 : INFO : PROGRESS: at sentence #10000, processed 12521549 words, keeping 51674 word types
2023-04-15 13:18:02,226 : INFO : PROGRESS: at sentence #20000, processed 25066550 words, keeping 71785 word types
2023-04-15 13:18:03,003 : INFO : PROGRESS: at sentence #30000, processed 38126453 words, keeping 89122 word types
2023-04-15 13:18:04,019 : INFO : PROGRESS: at sentence #40000, processed 54015622 words, keeping 110264 word types
2023-04-15 13:18:05,126 : INFO : PROGRESS: at sentence #50000, processed 70828953 words, keeping 129348 word types
2023-04-15 13:18:05,404 : INFO : collected 133808 word types from a corpus of 75259763 raw words and 52622 sentences
2023-04-15 13:18:05,405 : INFO : Creating a fresh vocabulary
2023-04-15 13:18:05,436 : INFO : Word2Vec lifecycle event {'msg': 'effective_min

In [6]:
# Persist the trained model to 'word2vec.model' in the current working directory.
model.save('word2vec.model')

2023-04-15 13:28:37,559 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-04-15T13:28:37.559485', 'gensim': '4.3.1', 'python': '3.11.2 (main, Mar 27 2023, 23:42:44) [GCC 11.2.0]', 'platform': 'Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'saving'}
2023-04-15 13:28:37,560 : INFO : not storing attribute cum_table
2023-04-15 13:28:37,578 : INFO : saved word2vec.model


In [7]:
# Report the total wall-clock time elapsed since `start` was recorded in the
# first cell; the bare last expression displays the resulting timedelta.
end = datetime.now()
total_time = end - start
total_time

datetime.timedelta(seconds=642, microseconds=75618)