In [None]:
import os
import csv
from IPython.display import display
import sys
import time
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import gensim


import random

import psycopg2

from gensim.models import Word2Vec
import multiprocessing
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from sklearn.manifold import TSNE

from sklearn.decomposition import PCA


import seaborn as sns

def tsnescatterplot(model, word, list_names, color_list):
    """Plot a 2-D t-SNE map of a query word, its nearest neighbours and extra words.

    Points are plotted in this order: ``word``, every word returned by
    ``model.wv.most_similar([word])``, then every word in ``list_names``.

    Args:
        model: Model exposing gensim-style keyed vectors as ``model.wv``.
        word: Query word (string); it is also used in the plot title.
        list_names: Iterable of additional words to plot.
        color_list: Colours for the points, one per plotted word in the order
            given above. A single colour string is applied to every point.

    Raises:
        ValueError: If ``color_list`` does not have exactly one entry per
            plotted word, or if fewer than two words end up being plotted.
    """
    # Labels in plotting order: query word, its neighbours, then the extras.
    word_labels = [word]
    word_labels.extend(wrd_score[0] for wrd_score in model.wv.most_similar([word]))
    word_labels.extend(list_names)
    n_points = len(word_labels)

    # Fail before the slow projection instead of when the DataFrame is built.
    if not isinstance(color_list, str) and len(color_list) != n_points:
        raise ValueError(
            'color_list has {} entries but {} words will be plotted'.format(
                len(color_list), n_points))
    if n_points < 2:
        raise ValueError('at least two words are needed for a t-SNE plot')

    # Look every vector up in a single call. The width of the resulting
    # (n_points, dim) array comes from the model itself, so the function is
    # not tied to one particular embedding size.
    arrays = model.wv[word_labels]

    # Compress with PCA before t-SNE. Both PCA's component count and t-SNE's
    # perplexity are bounded by the number of points, so clamp the preferred
    # value of 15 when only a few words are plotted.
    n_components = min(15, arrays.shape[0], arrays.shape[1])
    reduc = PCA(n_components=n_components).fit_transform(arrays)

    # Kept from the original implementation: this changes numpy's global
    # print options (no scientific notation) as a side effect.
    np.set_printoptions(suppress=True)

    # Finds t-SNE coordinates for 2 dimensions
    perplexity = min(15, n_points - 1)
    Y = TSNE(n_components=2, random_state=0, perplexity=perplexity).fit_transform(reduc)

    # Sets everything up to plot
    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(9, 9)

    # Basic plot (scatter only, no regression line)
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']
                                 },
                     ax=ax
                    )

    # Label every point with its word, drawn in the point's own colour.
    for x_pos, y_pos, label, colour in zip(df['x'], df['y'], df['words'], df['color']):
        p1.text(x_pos,
                y_pos,
                '  ' + label.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=colour,
                weight='normal'
               )

    # Leave a margin around the outermost points so labels are not clipped.
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    plt.title('t-SNE visualization for {}'.format(word.title()))

In [None]:
def fetch_db_rows(batch_size=5):
    """Stream shuffled code "sentences" from the database.

    Runs the ``codes`` query through a server-side cursor and yields one list
    of tokens per row. Each row's string is split on single spaces and the
    resulting tokens are returned in random order.

    Args:
        batch_size: Number of rows fetched from the server per round trip.
            Defaults to 5, the value previously hard-coded here.

    Yields:
        list[str]: The tokens of one row, randomly permuted.
    """
    query = "SELECT codes FROM concept_embeddings.codes_by_site_pasc_78"
    # NOTE(review): user, host and database are hard-coded; consider moving
    # them to environment variables or a config file.
    conn = psycopg2.connect(user="lormanv",
                            password="",
                            host="reslnpedsndb06.research.chop.edu",
                            port="5432",
                            database="dcc_covid_wk124_220818")
    try:
        # Named cursor => server side cursor. Without this, _all_ data from the
        # query will be fetched into memory, which defeats the purpose of streaming
        with conn, conn.cursor(name='get_rows') as cur:
            cur.itersize = batch_size
            cur.arraysize = batch_size
            cur.execute(query)

            while True:
                batch = cur.fetchmany()
                if not batch:
                    break
                for record in batch:
                    codes = record[0]
                    if codes is None:
                        # A NULL row carries no tokens; skip it rather than
                        # crash in the middle of a long streaming pass.
                        continue
                    tokens = codes.split(" ")
                    # random.sample over the full length returns a shuffled copy.
                    yield random.sample(tokens, len(tokens))
    finally:
        # psycopg2's `with conn` only ends the transaction; it does not close
        # the connection. This generator is re-run for every corpus pass, so
        # close explicitly to avoid leaking one connection per pass (this also
        # runs when the generator is abandoned before exhaustion).
        conn.close()

In [None]:
class SentencesIterator():
    """Restartable iterator built on top of a generator factory.

    ``generator_function`` is called with no arguments to obtain a fresh
    generator: once on construction and again every time ``iter()`` is called
    on the instance, so the same object can be looped over repeatedly.
    """

    def __init__(self, generator_function):
        self.generator_function = generator_function
        self._restart()

    def _restart(self):
        # Replace the current generator with a brand-new one from the factory.
        self.generator = self.generator_function()

    def __iter__(self):
        # Every new loop starts again from the beginning of the stream.
        self._restart()
        return self

    def __next__(self):
        # An exhausted generator raises StopIteration by itself; a yielded
        # None is treated as an end-of-stream marker as well.
        item = next(self.generator)
        if item is not None:
            return item
        raise StopIteration

In [None]:
# Restartable corpus: every iter() over `sentences` calls fetch_db_rows again,
# so each pass over the corpus re-runs the database query.
sentences = SentencesIterator(fetch_db_rows)

In [None]:
# Create an empty model only: no corpus is passed here, so the vocabulary is
# built and the model is trained in the cells below.
# 200-dimensional skip-gram (sg=1) vectors, context window 5, minimum token
# count 3, 12 worker threads.
# NOTE(review): hs=1 and negative=5 are both set, which enables hierarchical
# softmax and negative sampling together - confirm that is intended.
model = Word2Vec(vector_size=200, window=5, min_count=3, workers=12, sg=1, hs=1, negative=5)

In [None]:
# Vocabulary pass: iterates the whole corpus once (one full database query).
# progress_per sets how often gensim logs progress during the scan.
model.build_vocab(sentences, progress_per=100000)

In [None]:
# Checkpoint after the vocabulary build. The model has not been trained yet at
# this point; a later cell saves to this same path again after training.
# It can be reloaded with Word2Vec.load('./model_all_sites_full').
model.save('./model_all_sites_full')

In [None]:
# Show the number of sentences counted by build_vocab; the training cells pass
# this value as total_examples.
model.corpus_count

In [None]:
# NOTE(review): re-creating the wrapper looks redundant -
# SentencesIterator.__iter__ already restarts the generator on every pass.
sentences = SentencesIterator(fetch_db_rows)

In [None]:
# Train for 15 epochs. Each epoch iterates `sentences` again, i.e. re-runs the
# database query, and rows come back with their tokens freshly shuffled.
model.train(sentences,total_examples=model.corpus_count, epochs=15)

In [None]:
# Persist the trained model, overwriting the vocabulary-only checkpoint that an
# earlier cell wrote to the same path.
model.save('./model_all_sites_full')

In [None]:
from gensim.models.callbacks import CallbackAny2Vec


class LossLogger(CallbackAny2Vec):
    '''Print and record the training loss of each epoch.

    gensim's ``get_latest_training_loss()`` reports a running total that keeps
    growing across the epochs of one ``train()`` call, so the loss of a single
    epoch is the difference between two consecutive readings.

    Attributes:
        epoch: 1-based number of the next epoch to be reported.
        losses: Loss of each finished epoch (difference of running totals).
        cumulative_losses: Raw running totals as returned by gensim.
    '''
    def __init__(self):
        self.epoch = 1
        self.losses = []
        self.cumulative_losses = []
        self._previous_cumulative = 0.0

    def on_train_begin(self, model):
        # The running total starts from zero again for every train() call, so
        # the baseline must be reset when this logger is reused.
        self._previous_cumulative = 0.0

    def on_epoch_begin(self, model):
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        cumulative = model.get_latest_training_loss()
        loss = cumulative - self._previous_cumulative
        self._previous_cumulative = cumulative
        self.cumulative_losses.append(cumulative)
        self.losses.append(loss)
        print(f'  Loss: {loss}')
        self.epoch += 1

loss_logger = LossLogger()

In [None]:
# Train with loss tracking: compute_loss=True makes gensim record the training
# loss, and loss_logger prints it after every epoch.
# NOTE(review): `model` was already trained for 15 epochs in an earlier cell,
# so this adds 15 more epochs instead of starting over, and no visible cell
# saves the result afterwards - confirm that is intended.
model.train(sentences,total_examples=model.corpus_count, epochs=15, callbacks=[loss_logger],
                                      compute_loss=True)

In [None]:
# Replace the in-memory model with one loaded from disk.
# NOTE(review): './mymodel' is not written by any visible cell (the saves above
# go to './model_all_sites_full'), so this fails on a fresh run unless the file
# already exists - confirm the intended path.
model=Word2Vec.load('./mymodel')