### Imports

In [1]:
import pandas as pd
from gensim.models import Word2Vec
import ast

## Dental word2vec model

In [29]:
# Load the pre-tokenised dental corpus. Each non-empty line of the file holds
# the Python-literal repr of one sentence, parsed safely with ast.literal_eval
# (expected to be a list of tokens, since the result is fed to Word2Vec).
#
# The encoding is pinned so that non-ASCII tokens (e.g. 'migräne') are decoded
# identically on every platform instead of depending on the locale default.
# NOTE(review): assumes the file was written as UTF-8 — confirm against the
# preprocessing step that produced it.
corpus_dental = []
with open("data/sentences_text_clean_data.txt", encoding="utf-8") as input_file:
    for line in input_file:
        literal = line.strip()
        if not literal:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no sentence and would make literal_eval raise.
            continue
        corpus_dental.append(ast.literal_eval(literal))

In [30]:
# Number of training epochs. Shared by both Word2Vec models below and embedded
# in the names of the exported .tsv files.
epoch_count = 20

In [31]:
# Train the dental-domain Word2Vec model on the tokenised sentences:
# 256-dimensional vectors, no frequency cut-off (min_count=1 keeps every
# token), trained for `epoch_count` epochs.
model = Word2Vec(
    sentences=corpus_dental,
    vector_size=256,
    min_count=1,
    epochs=epoch_count,
)

In [32]:
# Export the dental model's word vectors as a tab-separated file for
# visualization. No header row and no index column are written, so the file
# contains only the vector components.
pd.DataFrame(data=model.wv.vectors).to_csv(
    path_or_buf=f'data/model_word2vec{epoch_count}_data_dental_vectors.tsv',
    sep="\t",
    index=False,
    header=False,
)

In [33]:
# Export the dental model's vocabulary (one word per line, in the model's
# index order) as a tab-separated file for visualization. No header row and
# no index column are written.
# NOTE(review): to_csv applies its default CSV quoting, so a token containing
# a quote or tab character would be written quoted — confirm the visualization
# tool accepts that.
pd.DataFrame(data=model.wv.index_to_key).to_csv(
    path_or_buf=f'data/model_word2vec{epoch_count}_data_dental_words.tsv',
    sep="\t",
    index=False,
    header=False,
)

In [47]:
# Sanity check of the dental model: list the words it considers most similar
# to 'kopfschmerzen' (German: headache), each with its similarity score.
model.wv.similar_by_word('kopfschmerzen')

[('nackenschmerzen', 0.9185449481010437),
 ('migräne', 0.9095128178596497),
 ('kieferschmerzen', 0.8946459889411926),
 ('kopf-', 0.8896020650863647),
 ('rückenschmerzen', 0.8845061659812927),
 ('kopfschmerze', 0.877302885055542),
 ('ohrenschmerzen', 0.8770471811294556),
 ('verspannungen', 0.8768141865730286),
 ('gesichtsschmerzen', 0.867470383644104),
 ('èbungen', 0.8664426207542419)]

In [35]:
# Load the pre-tokenised STS corpus. Each non-empty line of the file holds the
# Python-literal repr of one sentence, parsed safely with ast.literal_eval
# (expected to be a list of tokens, since the result is fed to Word2Vec).
#
# The encoding is pinned so the file is decoded identically on every platform
# instead of depending on the locale default.
# NOTE(review): assumes the file was written as UTF-8 — confirm against the
# preprocessing step that produced it.
corpus_sts = []
with open("data/sts_sentences_text_clean_data_1.txt", encoding="utf-8") as input_file:
    for line in input_file:
        literal = line.strip()
        if not literal:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no sentence and would make literal_eval raise.
            continue
        corpus_sts.append(ast.literal_eval(literal))

In [44]:
# Train a second Word2Vec model on the STS sentences with the same
# hyper-parameters as the dental model: 256-dimensional vectors, no frequency
# cut-off (min_count=1 keeps every token), `epoch_count` epochs.
model_sts = Word2Vec(
    sentences=corpus_sts,
    vector_size=256,
    min_count=1,
    epochs=epoch_count,
)

In [45]:
# Inspect the STS model's vocabulary: a mapping from each token to its integer
# index in the model.
# NOTE(review): this displays the entire mapping; consider showing only a
# slice to keep the notebook output short.
model_sts.wv.key_to_index

{"'s": 0,
 'woman': 1,
 'playing': 2,
 'white': 3,
 'black': 4,
 'killed': 5,
 'percent': 6,
 'syria': 7,
 'china': 8,
 "''": 9,
 '``': 10,
 'running': 11,
 'president': 12,
 'police': 13,
 'water': 14,
 'person': 15,
 'red': 16,
 'girl': 17,
 'riding': 18,
 'nuclear': 19,
 'boy': 20,
 'u.s.': 21,
 'standing': 22,
 'sitting': 23,
 'brown': 24,
 "n't": 25,
 'iran': 26,
 'horse': 27,
 'pakistan': 28,
 'guitar': 29,
 'russia': 30,
 'train': 31,
 'dead': 32,
 'kills': 33,
 'dogs': 34,
 'talks': 35,
 'car': 36,
 'egypt': 37,
 'young': 38,
 'slicing': 39,
 'death': 40,
 'group': 41,
 'field': 42,
 'attack': 43,
 'government': 44,
 'walking': 45,
 'million': 46,
 'south': 47,
 'grass': 48,
 'north': 49,
 'obama': 50,
 'world': 51,
 'weapons': 52,
 'snow': 53,
 'korea': 54,
 'syrian': 55,
 'israel': 56,
 'women': 57,
 'small': 58,
 'state': 59,
 'blue': 60,
 'kill': 61,
 'stocks': 62,
 'military': 63,
 'cutting': 64,
 'close': 65,
 'afghanistan': 66,
 'bus': 67,
 'table': 68,
 'found': 69,
 'i

In [58]:
# Sanity check of the STS model: list the words it considers most similar to
# 'water', each with its similarity score.
# NOTE(review): every neighbour in the recorded output scores ~0.99, which may
# mean the vectors are poorly separated (small corpus?) — verify before
# relying on these similarities.
model_sts.wv.similar_by_word('water')

[('animal', 0.9903883337974548),
 ('dock', 0.9895626902580261),
 ('swimming', 0.9879536032676697),
 ('backyard', 0.9878522753715515),
 ('pool', 0.9871654510498047),
 ('woods', 0.9866950511932373),
 ('beach', 0.9866737723350525),
 ('duck', 0.9852656722068787),
 ('sweater', 0.985163152217865),
 ('rope', 0.9850854277610779)]