In [2]:
import pandas as pd
#from top2vec import Top2Vec
import os
import collections
import csv
import logging
import numpy as np
import datetime as datetime
import types

from tensorflow.keras.layers import Input
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Reshape, Embedding, Concatenate, dot
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.losses import cosine_similarity
from tensorflow.keras.callbacks import TensorBoard
from tensorboard.plugins import projector

In [4]:
!which jupyter

/home/ubuntu/thesis_env2/bin/jupyter


In [5]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("GPUs: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  2
GPUs:  2


In [6]:
df = pd.read_pickle('./Data/df_processed.pickle')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367947 entries, 0 to 367946
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   author             181781 non-null  object        
 1   date               367947 non-null  datetime64[ns]
 2   domain             367947 non-null  object        
 3   title              367862 non-null  object        
 4   url                367947 non-null  object        
 5   content            367947 non-null  object        
 6   topic_area         367947 non-null  object        
 7   content_processed  367947 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 22.5+ MB


In [8]:
df.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,"[end, year, corner, past time, think, position..."


In [9]:
# Note to do - need to add time element

def log_newline(self, how_many_lines=1):
    file_handler = None
    if self.handlers:
        file_handler = self.handlers[0]

    # Switch formatter, output a blank line
    file_handler.setFormatter(self.blank_formatter)
    for i in range(how_many_lines):
        self.info('')

    # Switch back
    file_handler.setFormatter(self.default_formatter)

def logger_w2v():
    
    log_file = os.path.join('./Data', 'word2vec.log')
    print('log file location: ', log_file)
    
    log_format= '%(asctime)s - %(levelname)s - [%(module)s]\t%(message)s'
    formatter = logging.Formatter(fmt=(log_format))
    
    fhandler = logging.FileHandler(log_file)
    fhandler.setFormatter(formatter)
    
    logger = logging.getLogger('word2vec')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(fhandler)
    logger.default_formatter = formatter
    logger.blank_formatter = logging.Formatter(fmt="")
    logger.newline = types.MethodType(log_newline, logger)
    
    return logger
    

In [60]:
class Word2Vec:
    """
    apply word2vec to text
    """

    def __init__(self, logger, vocab_size, vector_dim, input_target, input_context,
                 load_pretrained_weights, weights_file_name, train_model_flag, checkpoint_file):
        """
        Args:
            vocab size: integer of number of words to form vocabulary from
            vector_dim: integer of number of dimensions per word
            input_target: tensor representing target word
            input_context: tensor representing context word
        """
        self.logger = logger        
        self.vocab_size = vocab_size
        self.vector_dim = vector_dim
        self.input_target = input_target
        self.input_context = input_context
        self.load_pretrained_weights = load_pretrained_weights
        self.weights_file_name = weights_file_name
        self.checkpoint_file = checkpoint_file
        self.train_model_flag = train_model_flag
        self.model = self.create_model()
        
    def build_dataset(self, words):
        """
        :process raw inputs into a dataset

        Args:
            words: list of strings

        Returns:
            tuple:
                data: list of integers representing words in words
                count: list of count of most frequent words with size n_words
                dictionary: dictionary of word to unique integer
                reverse dictionary: dictionary of unique integer to word
        """
        self.logger.info("Building dataset")

        count = [['UNK', -1]]
        words = [item for sublist in words for item in sublist]
        count.extend(collections.Counter(words).most_common(self.vocab_size - 1))
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        unk_count = 0
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        self.dictionary = dictionary

        # Save dictionary
        dict_path = './Data'
        dict_file = 'dictionary.csv'
        dict_file = os.path.join(dict_path,dict_file)
        
        with open(dict_file, 'w') as f:
            for key in dictionary.keys():
                f.write("%s,%s\n"%(key,dictionary[key]))

        return data, count, dictionary, reversed_dictionary
    
    def get_training_data(self, data, window_size):
        """
        :create text and label pairs for model training

        Args:
            data: list of integers representing words in words
            window_size: integer of number of words around the target word that
                         will be used to draw the context words from.

        Returns:
            tuple:
                word_target: list of arrays representing target word 
                word_context: list of arrays representing context word in 
                              relation to target word
                labels: list containing 1 for true context, 0 for false context
        """
        sampling_table = sequence.make_sampling_table(self.vocab_size)
        couples, labels = skipgrams(data, self.vocab_size, window_size=window_size, 
                                    sampling_table=sampling_table)

        word_target, word_context = zip(*couples)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        return word_target, word_context, labels

    def create_model(self):
        """
        :keras functional API and embedding layers

        Returns:
            model: untrained word2vec model
        """

        # embedding layer
        embedding = Embedding(self.vocab_size, self.vector_dim, input_length=1, name='embedding')

        # embedding vectors
        target = embedding(self.input_target)
        target = Reshape((self.vector_dim, 1))(target)
        context = embedding(self.input_context)
        context = Reshape((self.vector_dim, 1))(context)

        # dot product operation to get a similarity measure
        dot_product = dot([target, context], axes=1, normalize=False)
        dot_product = Reshape((1,))(dot_product)

        # add the sigmoid output layer
        output = Dense(1, activation='sigmoid')(dot_product)

        # create the training model
        self.model = Model(inputs=[self.input_target, self.input_context], outputs=output)

        return self.model

    def train_model(self, epochs, batch_size, word_target, word_context, labels):
        """
        :trains word2vec model

        Args:
            model: word2vec model
            epochs: integer of number of iterations to train model on
            batch_size: integer of number of words to pass to epoch
            word_target: list of arrays representing target word 
            word_context: list of arrays representing context word in relation 
                          to target word
            labels: list containing 1 for true context, 0 for false context

        Returns:
            model: trained word2vec model
        """
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        #loss = tf.keras.losses.BinaryCrossentropy()
        self.model.compile(loss='binary_crossentropy', optimizer=optimizer)

        # tensorboard callback
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir='tensorboard_log/' + current_time
        summary_writer = tf.summary.create_file_writer(log_dir)

        if self.load_pretrained_weights:
            self.load_prior_weights()
            if not self.train_model_flag:
                return self.model

        arr_1 = np.zeros((batch_size,))
        arr_2 = np.zeros((batch_size,))
        arr_3 = np.zeros((batch_size,))
        for i in range(epochs):
            idx = np.random.choice(list(range(len(labels))), size=batch_size, replace=False)
            arr_1[:] = np.array([word_target[i] for i in idx])
            arr_2[:] = np.array([word_context[i] for i in idx])
            arr_3[:] = np.array([labels[i] for i in idx])
            loss = self.model.train_on_batch([arr_1, arr_2], arr_3)
            with summary_writer.as_default():
                tf.summary.scalar('loss', loss, step=i)
            if (i+1) % 500 == 0:
                print("Iteration {}, loss={}".format(i+1, loss))
            if (i+1) % 1000 == 0:
                checkpoint_dir = './model/model_weights'
                checkpoint_file = f"cp-epoch-{i+1:010d}.h5"
                checkpoint_path = os.path.join(checkpoint_dir,checkpoint_file)
                self.model.save_weights(checkpoint_path)
                self.embedding_projector(log_dir)
            #if (i+1) % 10 == 0:
                # Embeddings Projector
                #logger.info("prepare data for visualization")
                #visualization_prep.tsv_file(self.model, dictionary, visualization_dir)
                #visualization_prep.tb_projector(self.model, dictionary, log_dir)

        return self.model
    
    def embedding_projector(self, log_dir):
        """
        :visualise embeddings in tensorboard
        """
        # Save Labels separately on a line-by-line manner.
        with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
            for subwords in self.dictionary.keys():
                f.write("{}\n".format(subwords))
            # Fill in the rest of the labels with "unknown"
            for unknown in range(1, self.vocab_size - len(self.dictionary.keys())):
                f.write("unknown #{}\n".format(unknown))

        # Save the weights we want to analyse as a variable. 
        weights = tf.Variable(self.model.layers[2].get_weights()[0])
        checkpoint_w = tf.train.Checkpoint(embedding=weights)
        checkpoint_w.save(os.path.join(log_dir, "embedding.ckpt"))

        # Set up config
        config_tb = projector.ProjectorConfig()
        embedding_tb = config_tb.embeddings.add()
        embedding_tb.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
        embedding_tb.metadata_path = 'metadata.tsv'
        projector.visualize_embeddings(log_dir, config_tb)
        
        
    def load_prior_weights(self):
        """
        :load prior weights if load_pretrained_weights = True in main file
        """ 
        #abs_path = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))
        #checkpoint_dir = os.path.join(abs_path, self.config['model']['model_dir'], self.config['model']['model_weights'])
        #checkpoint_path = os.path.join(checkpoint_dir,self.checkpoint_file)
        checkpoint_dir = './model/model_weights'
        checkpoint_file = self.weights_file_name
        checkpoint_path = os.path.join(checkpoint_dir,checkpoint_file)
        self.model.load_weights(checkpoint_path)
        self.logger.info('Loaded pre trained wweights from {}'.format(str(checkpoint_path)))

In [54]:
def get_word_vectors(model):
    
    embedding_weights = model.layers[2].get_weights()[0]
    #word_embeddings = {w:embedding_weights[idx] for w, idx in dictionary.items()}
    
    return embedding_weights

In [55]:
words = df['content_processed'][:50000]
sorted(words[4][:10])

['awards shows',
 'celebrities',
 'coronavirus pandemic',
 'life returns',
 'like',
 'normal',
 'parties',
 'premieres',
 'things',
 'walk red carpets']

In [56]:
logger = logger_w2v()

vocab_size = 10000
vector_dim = 250
input_target = Input((1,))
input_context = Input((1,))
load_pretrained_weights = True
weights_file_name = f"cp-epoch-0000010000-B1000.h5"
checkpoint_file = None
train_model_flag = False

word2vec = Word2Vec(logger, vocab_size, vector_dim, input_target, input_context,
                    load_pretrained_weights, weights_file_name, train_model_flag, checkpoint_file)

data, count, dictionary, reversed_dictionary = word2vec.build_dataset(words)

log file location:  ./Data/word2vec.log


In [57]:
print(len(dictionary))
print(len(data))
#count
#reversed_dictionary
#dictionary

10000
12155341


In [58]:
window_size = 3

word_target, word_context, labels = word2vec.get_training_data(data, window_size)

### Parameter Notes

BATCH SIZE ADJ  
20210414-082710 - shows training improvement from 0.69 to 0.43 loss  
aritcles 50,000  
batch_size 1000  
epochs 5000  

ARTICLES PROCESSED ADJ   
20210413-204616 - shows training improvement from 0.69 to 0.52 loss  
aritcles 20,000  
batch_size 100  
epochs 5000 

LEARNING RATE ADJ  
20210413-161618 - shows training improvement from 0.69 to 0.68 loss  
aritcles 50,000  
batch_size 100  
epochs 5000
learning rate = 1e-4 (normally 1e-3)

In [61]:
### epochs = 1
batch_size = 1000

logger.info("Training model with {} epochs".format(epochs))

model = word2vec.train_model(epochs, batch_size, word_target, word_context, labels)


point 3


# Document Vectors - WIP - INCOMPLETE

In [77]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import umap
import hdbscan

In [78]:
word_embeddings = get_word_vectors(model)
print(len(word_embeddings))
word_embeddings

10000


array([[ 0.01714278,  0.00172679,  0.00864159, ...,  0.02745277,
        -0.04311557,  0.03495124],
       [-0.02374163, -0.00595867,  0.08397059, ..., -0.20137618,
         0.02719574,  0.2900733 ],
       [ 0.13762866, -0.28491428,  0.31918922, ..., -0.34580296,
        -0.17177935,  0.2265929 ],
       ...,
       [-0.11215585, -0.01939139,  0.1825676 , ..., -0.11114957,
         0.08726616, -0.01546698],
       [ 0.04149646,  0.12326611,  0.29882836, ..., -0.1440421 ,
         0.1885879 , -0.11382382],
       [ 0.23639232,  0.07771707,  0.01684266, ..., -0.13491702,
        -0.234442  , -0.25808415]], dtype=float32)

In [79]:
documents = df['content_processed'][:50000]

In [85]:
class DocVec:
    """
    apply doc2vec to text
    """

    def __init__(self, logger, documents, reversed_dictionary, word_embeddings):
        """
        Args:
            vocab size: integer of number of words to form vocabulary from
            vector_dim: integer of number of dimensions per word
            input_target: tensor representing target word
            input_context: tensor representing context word
        """
        self.logger = logger        
        self.documents = documents
        self.index_to_word_dict = reversed_dictionary
        self.word_embeddings = word_embeddings
        
        doc2vec_args = {"vector_size": 250,
                "min_count": 50,
                "window": 15,
                "sample": 1e-5,
                "negative": 0,
                "hs": 1,
                "epochs": 50,
                "dm": 0,
                "dbow_words": 1}

        logger.info('Pre-processing documents for training')

        train_corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
        doc2vec_args["documents"] = train_corpus
        
        logger.info('Creating joint document/word embedding')
        #self.model = Doc2Vec(**doc2vec_args)
        self.model = Doc2Vec(vector_size = 250,
                min_count = 50,
                window = 15,
                sample = 1e-5,
                negative = 0,
                hs = 1,
                epochs = 50,
                dm = 0,
                dbow_words = 1,
                documents = train_corpus)
        
        print('point 1')
        
        
        # create 5D embeddings of documents
        logger.info('Creating lower dimension embedding of documents')
        
        umap_args = {'n_neighbors': 15,
                     'n_components': 5,
                     'metric': 'cosine'}
        
        umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))

        # find dense areas of document vectors
        logger.info('Finding dense areas of documents')
        
        hdbscan_args = {'min_cluster_size': 15,
                         'metric': 'euclidean',
                         'cluster_selection_method': 'eom'}

        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
        
        # calculate topic vectors from dense areas of documents
        logger.info('Finding topics')

        # create topic vectors
        self._create_topic_vectors(cluster.labels_)

        # deduplicate topics
        self._deduplicate_topics()

        # find topic words and scores
        self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
        
        
        
    @staticmethod
    def _l2_normalize(vectors):

        if vectors.ndim == 2:
            return normalize(vectors)
        else:
            return normalize(vectors.reshape(1, -1))[0]
    
    def _get_document_vectors(self, norm=True):

        if norm:
            self.model.docvecs.init_sims()
            return self.model.docvecs.vectors_docs_norm
        else:
            return self.model.docvecs.vectors_docs
    
    def _create_topic_vectors(self, cluster_labels):

        unique_labels = set(cluster_labels)
        if -1 in unique_labels:
            unique_labels.remove(-1)
        self.topic_vectors = self._l2_normalize(
            np.vstack([self._get_document_vectors(norm=False)[np.where(cluster_labels == label)[0]]
                      .mean(axis=0) for label in unique_labels]))

    def _deduplicate_topics(self):
        core_samples, labels = dbscan(X=self.topic_vectors,
                                      eps=0.1,
                                      min_samples=2,
                                      metric="cosine")

        duplicate_clusters = set(labels)

        if len(duplicate_clusters) > 1 or -1 not in duplicate_clusters:

            # unique topics
            unique_topics = self.topic_vectors[np.where(labels == -1)[0]]

            if -1 in duplicate_clusters:
                duplicate_clusters.remove(-1)

            # merge duplicate topics
            for unique_label in duplicate_clusters:
                unique_topics = np.vstack(
                    [unique_topics, self._l2_normalize(self.topic_vectors[np.where(labels == unique_label)[0]]
                                                       .mean(axis=0))])

            self.topic_vectors = unique_topics
            
    def _index2word(self, index):
        return self.index_to_word_dict[index]

    def _get_word_vectors(self):
        return self.word_embeddings
            
    def _find_topic_words_and_scores(self, topic_vectors):
        topic_words = []
        topic_word_scores = []

        res = np.inner(topic_vectors, self._get_word_vectors())
        top_words = np.flip(np.argsort(res, axis=1), axis=1)
        top_scores = np.flip(np.sort(res, axis=1), axis=1)

        for words, scores in zip(top_words, top_scores):
            topic_words.append([self._index2word(i) for i in words[0:50]])
            topic_word_scores.append(scores[0:50])

        topic_words = np.array(topic_words)
        topic_word_scores = np.array(topic_word_scores)

        return topic_words, topic_word_scores

In [86]:
doc2vec = Doc2Vec(logger, documents, reversed_dictionary, word_embeddings)

TypeError: 'dict' object cannot be interpreted as an integer

In [None]:
train_corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]

test = Doc2Vec(vector_size = 250,
                min_count = 50,
                window = 15,
                sample = 1e-5,
                negative = 0,
                hs = 1,
                epochs = 50,
                dm = 0,
                dbow_words = 1,
                documents = train_corpus)