In [1]:
import pandas as pd
#from top2vec import Top2Vec
import os
import collections
import csv
import logging
import numpy as np
import datetime as datetime
import types
import pickle
from tqdm import tqdm

from tensorflow.keras.layers import Input
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Reshape, Embedding, Concatenate, dot
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.losses import cosine_similarity
from tensorflow.keras.callbacks import TensorBoard
from tensorboard.plugins import projector


In [2]:
!which jupyter

/home/ubuntu/thesis_env2/bin/jupyter


In [3]:
import os
#os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

#tf.debugging.set_log_device_placement(True)

physical_devices = tf.config.list_physical_devices('GPU')
for gpu_instance in physical_devices: 
    tf.config.experimental.set_memory_growth(gpu_instance, True)
print(physical_devices)
#tf.config.set_visible_devices(physical_devices[0],'GPU')
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("GPUs: ", len(tf.config.experimental.list_physical_devices('GPU')))
os.getcwd()


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
Num GPUs Available:  2
GPUs:  2


'/home/ubuntu/thesis/Thesis'

In [4]:
df = pd.read_pickle('./data/df_processed_bigrams.pickle')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365200 entries, 0 to 369046
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   author             181507 non-null  object        
 1   date               365200 non-null  datetime64[ns]
 2   domain             365200 non-null  object        
 3   title              365115 non-null  object        
 4   url                365200 non-null  object        
 5   content            365200 non-null  object        
 6   topic_area         365200 non-null  object        
 7   content_processed  365200 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 25.1+ MB


In [6]:
df.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...


In [7]:
# Note to do - need to add time element

def log_newline(self, how_many_lines=1):
    file_handler = None
    if self.handlers:
        file_handler = self.handlers[0]

    # Switch formatter, output a blank line
    file_handler.setFormatter(self.blank_formatter)
    for i in range(how_many_lines):
        self.info('')

    # Switch back
    file_handler.setFormatter(self.default_formatter)

def logger_w2v():
    
    log_file = os.path.join('./data', 'word2vec.log')
    print('log file location: ', log_file)
    
    log_format= '%(asctime)s - %(levelname)s - [%(module)s]\t%(message)s'
    formatter = logging.Formatter(fmt=(log_format))
    
    fhandler = logging.FileHandler(log_file)
    fhandler.setFormatter(formatter)
    
    logger = logging.getLogger('word2vec')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(fhandler)
    logger.default_formatter = formatter
    logger.blank_formatter = logging.Formatter(fmt="")
    logger.newline = types.MethodType(log_newline, logger)
    
    return logger
    

In [8]:
class Word2Vec:
    """
    apply word2vec to text
    """

    def __init__(self, logger, vocab_size, vector_dim, input_target, input_context,
                 load_pretrained_weights, weights_file_name, train_model_flag, checkpoint_file):
        """
        Args:
            vocab size: integer of number of words to form vocabulary from
            vector_dim: integer of number of dimensions per word
            input_target: tensor representing target word
            input_context: tensor representing context word
        """
        self.logger = logger        
        self.vocab_size = vocab_size
        self.vector_dim = vector_dim
        self.input_target = input_target
        self.input_context = input_context
        self.load_pretrained_weights = load_pretrained_weights
        self.weights_file_name = weights_file_name
        self.checkpoint_file = checkpoint_file
        self.train_model_flag = train_model_flag
        self.model = self.create_model()
        
    def build_dataset(self, words):
        """
        :process raw inputs into a dataset

        Args:
            words: list of strings

        Returns:
            tuple:
                data: list of integers representing words in words
                count: list of count of most frequent words with size n_words
                dictionary: dictionary of word to unique integer
                reverse dictionary: dictionary of unique integer to word
        """
        self.logger.info("Building dataset")

        count = [['UNK', -1]]
        words = [item for sublist in words for item in sublist]
        print(len(words))
        count.extend(collections.Counter(words).most_common(self.vocab_size - 1))
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        unk_count = 0        
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        self.dictionary = dictionary

        # Save dictionary
        dict_path = './data'
        dict_file = 'dictionary.csv'
        dict_file = os.path.join(dict_path,dict_file)
        
        with open(dict_file, 'w') as f:
            for key in dictionary.keys():
                f.write("%s,%s\n"%(key,dictionary[key]))

        return data, count, dictionary, reversed_dictionary
    
    def get_training_data(self, data, window_size):
        """
        :create text and label pairs for model training

        Args:
            data: list of integers representing words in words
            window_size: integer of number of words around the target word that
                         will be used to draw the context words from.

        Returns:
            tuple:
                word_target: list of arrays representing target word in integer form
                word_context: list of arrays representing context word in 
                              relation to target word in integer form
                labels: list containing 1 for true context, 0 for false context
                couples: list of pairs of word indexes aligned with labels
        """
        # the probability of sampling the word i-th most common word 
        sampling_table = sequence.make_sampling_table(self.vocab_size)
        
        self.logger.info("finding training data with labels")
        couples, labels = skipgrams(data, self.vocab_size, window_size=window_size, 
                                    sampling_table=sampling_table)

        print('length of couples', len(couples))
        self.logger.info(f"number of training samples: {len(couples)}")
        self.logger.info("converting tuple of training samples to target and context variables")
        #word_target, word_context = zip(*couples) cannot handle long lists
        word_target = [c[0] for c in couples]
        word_context = [c[1] for c in couples]
        self.logger.info("converting to numpy arrays")
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")
        labels = np.array(labels, dtype="int32")
        
        self.logger.info("training data acquired")

        return word_target, word_context, labels
    

    def create_model(self):
        """
        :keras functional API and embedding layers

        Returns:
            model: untrained word2vec model
        """
        
        # embedding layer
        self.embedding = Embedding(self.vocab_size, self.vector_dim, input_length=1, name='embedding')

        # embedding vectors
        target = self.embedding(self.input_target)
        target = Reshape((self.vector_dim, 1))(target)
        context = self.embedding(self.input_context)
        context = Reshape((self.vector_dim, 1))(context)

        # dot product operation to get a similarity measure
        dot_product = dot([target, context], axes=1, normalize=False)
        dot_product = Reshape((1,))(dot_product)

        # add the sigmoid output layer
        output = Dense(1, activation='sigmoid')(dot_product)

        # create the training model
        self.model = Model(inputs=[self.input_target, self.input_context], outputs=output)

        return self.model

    def train_model(self, epochs, batch_size, word_target, word_context, labels):
        """
        :trains word2vec model

        Args:
            model: word2vec model
            epochs: integer of number of iterations to train model on
            batch_size: integer of number of words to pass to epoch
            word_target: list of arrays representing target word 
            word_context: list of arrays representing context word in relation 
                          to target word
            labels: list containing 1 for true context, 0 for false context

        Returns:
            model: trained word2vec model
        """
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        #loss = tf.keras.losses.BinaryCrossentropy()
        self.model.compile(loss='binary_crossentropy', optimizer=optimizer)

        # tensorboard callback
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir='tensorboard_log/' + current_time
        summary_writer = tf.summary.create_file_writer(log_dir)

        if self.load_pretrained_weights:
            self.load_prior_weights()
            if not self.train_model_flag:
                return self.model

        arr_1 = np.zeros((batch_size,))
        arr_2 = np.zeros((batch_size,))
        arr_3 = np.zeros((batch_size,))
        
        for i in range(epochs):
            idx = np.random.choice(list(range(len(labels))), size=batch_size, replace=False)
            arr_1[:] = np.array([word_target[i] for i in idx])
            arr_2[:] = np.array([word_context[i] for i in idx])
            arr_3[:] = np.array([labels[i] for i in idx])
            loss = self.model.train_on_batch([arr_1, arr_2], arr_3)
            with summary_writer.as_default():
                tf.summary.scalar('loss', loss, step=i)
            if (i+1) % 500 == 0:
                print("Iteration {}, loss={}".format(i+1, loss))
            if (i+1) % 2 == 0:
                checkpoint_dir = './model/model_weights'
                checkpoint_file = f"cp-epoch-{i+1:010d}.h5"
                checkpoint_path = os.path.join(checkpoint_dir,checkpoint_file)
                self.model.save_weights(checkpoint_path)
                self.embedding_projector(log_dir)

        return self.model
    
    def train_model_v2(self, epochs, dataset):
        """
        :trains word2vec model

        Args:
            model: word2vec model
            epochs: integer of number of iterations to train model on
            batch_size: integer of number of words to pass to epoch
            word_target: list of arrays representing target word 
            word_context: list of arrays representing context word in relation 
                          to target word
            labels: list containing 1 for true context, 0 for false context

        Returns:
            model: trained word2vec model
        """
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        #loss = tf.keras.losses.BinaryCrossentropy()
        self.model.compile(loss='binary_crossentropy', optimizer=optimizer)

        # tensorboard callback
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir='tensorboard_log/' + current_time
        summary_writer = tf.summary.create_file_writer(log_dir)

        if self.load_pretrained_weights:
            self.load_prior_weights()
            if not self.train_model_flag:
                return self.model

        i = 0
        for idx_e in range(epochs):
            for (arr_1, arr_2), arr_3 in dataset:
            #for batch in dataset:
                loss = self.model.train_on_batch([arr_1, arr_2], arr_3)    
                #loss = self.model.train_on_batch(batch) 
                i += 1
                with summary_writer.as_default():
                    tf.summary.scalar('loss', loss, step=i)
                #if (i+1) % 500 == 0:
                    #print("Iteration {}, loss={}".format(i+1, loss))
                if i % 10000 == 0:
                    checkpoint_dir = './model/model_weights'
                    checkpoint_file = f"cp-epoch-{i:010d}.h5"
                    checkpoint_path = os.path.join(checkpoint_dir,checkpoint_file)
                    self.model.save_weights(checkpoint_path)
                    self.embedding_projector(log_dir)

        return self.model
    
    def embedding_projector(self, log_dir):
        """
        :visualise embeddings in tensorboard
        """
        # Save Labels separately on a line-by-line manner.
        with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
            for subwords in self.dictionary.keys():
                f.write("{}\n".format(subwords))
            # Fill in the rest of the labels with "unknown"
            for unknown in range(1, self.vocab_size - len(self.dictionary.keys())):
                f.write("unknown #{}\n".format(unknown))

        # Save the weights we want to analyse as a variable. 
        weights = tf.Variable(self.model.layers[2].get_weights()[0])
        checkpoint_w = tf.train.Checkpoint(embedding=weights)
        checkpoint_w.save(os.path.join(log_dir, "embedding.ckpt"))

        # Set up config
        config_tb = projector.ProjectorConfig()
        embedding_tb = config_tb.embeddings.add()
        embedding_tb.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
        embedding_tb.metadata_path = 'metadata.tsv'
        projector.visualize_embeddings(log_dir, config_tb)
        
        
    def load_prior_weights(self):
        """
        :load prior weights if load_pretrained_weights = True in main file
        """ 
        #abs_path = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))
        #checkpoint_dir = os.path.join(abs_path, self.config['model']['model_dir'], self.config['model']['model_weights'])
        #checkpoint_path = os.path.join(checkpoint_dir,self.checkpoint_file)
        checkpoint_dir = './model/model_weights'
        checkpoint_file = self.weights_file_name
        checkpoint_path = os.path.join(checkpoint_dir,checkpoint_file)
        self.model.load_weights(checkpoint_path)
        self.logger.info('Loaded pre trained wweights from {}'.format(str(checkpoint_path)))

In [9]:
def get_word_vectors(model):
    
    embedding_weights = model.layers[2].get_weights()[0]
    #word_embeddings = {w:embedding_weights[idx] for w, idx in dictionary.items()}
    
    return embedding_weights

In [10]:
def tokenise_dataset(df):

    tokens = df['content_processed'].str.split(" ")

    return tokens

In [11]:
#words = df['content_processed'][:50000]
words = tokenise_dataset(df)
sorted(words[4][:10])

['carpets',
 'celebrities',
 'coronavirus',
 'life',
 'normal',
 'pandemic',
 'premieres',
 'red',
 'returns',
 'walk']

In [12]:
logger = logger_w2v()

vocab_size = 10000
vector_dim = 300
input_target = Input((1,))
input_context = Input((1,))
load_pretrained_weights = True
weights_file_name = f"cp-epoch-0000560000_210817.h5"
checkpoint_file = None
train_model_flag = False

word2vec = Word2Vec(logger, vocab_size, vector_dim, input_target, input_context,
                    load_pretrained_weights, weights_file_name, train_model_flag, checkpoint_file)

data, count, dictionary, reversed_dictionary = word2vec.build_dataset(words)

log file location:  ./data/word2vec.log
146530982


In [13]:
print(len(dictionary))
print(len(data))
#count
#reversed_dictionary
#dictionary

10000
146530982


In [14]:
data[:5]

[67, 9, 4267, 191, 12]

In [15]:
#dictionary

In [16]:
#dictionary.keys()

In [17]:
dictionary['supply_chain']

1138

In [18]:
process = False
window_size = 3

if process:
    # ~1 hour to run
    word_target, word_context, labels = word2vec.get_training_data(data, window_size)
    np.save('word_target', word_target)
    np.save('word_context', word_context)
    np.save('labels', labels)
else:
    word_target = np.load('word_target.npy')
    word_context = np.load('word_context.npy')
    labels = np.load('labels.npy')
    
print(len(word_target))
print(len(word_context))
print(len(labels))

383241934
383241934
383241934


In [19]:
print(word_target[:5])
print(word_context[:5])
print(labels[:5])

[8074 2238  631  653 4389]
[7255 1438 9400 8458 5593]
[0 1 0 0 0]


### Parameter Notes

BATCH SIZE ADJ  
20210414-082710 - shows training improvement from 0.69 to 0.43 loss  
aritcles 50,000  
batch_size 1000  
epochs 5000  

ARTICLES PROCESSED ADJ   
20210413-204616 - shows training improvement from 0.69 to 0.52 loss  
aritcles 20,000  
batch_size 100  
epochs 5000 

LEARNING RATE ADJ  
20210413-161618 - shows training improvement from 0.69 to 0.68 loss  
aritcles 50,000  
batch_size 100  
epochs 5000
learning rate = 1e-4 (normally 1e-3)

In [20]:
epochs = 10
batch_size = 1024*2
buffer_size = 5000
AUTOTUNE = tf.data.AUTOTUNE


# Configure dataset as tf dataset to improve performance
dataset = tf.data.Dataset.from_tensor_slices(((word_target, word_context), labels))

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
#dataset = dataset.repeat(epochs).shuffle(buffer_size).batch(batch_size, drop_remainder=True)
#dataset = dataset.shuffle(buffer_size, reshuffle_each_iteration=True).batch(batch_size, drop_remainder=True).repeat(epochs)

print(dataset)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<BatchDataset shapes: (((2048,), (2048,)), (2048,)), types: ((tf.int32, tf.int32), tf.int32)>
<PrefetchDataset shapes: (((2048,), (2048,)), (2048,)), types: ((tf.int32, tf.int32), tf.int32)>


In [21]:
len(dataset)

187129

In [22]:
logger.info(f"Training model with {epochs} epochs, batch size: {batch_size}")

#model = word2vec.train_model(epochs, batch_size, word_target, word_context, labels)
model = word2vec.train_model_v2(epochs, dataset)


In [23]:
#assert False

# Document Vectors and Topic Modelling

In [24]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#import umap
import umap.umap_ as umap
import hdbscan
from sklearn.preprocessing import normalize
from sklearn.cluster import dbscan

In [25]:
word_embeddings = get_word_vectors(model)
print(len(word_embeddings))
word_embeddings

10000


array([[ 0.03451507,  0.01146441,  0.0072502 , ..., -0.03005952,
        -0.0086208 ,  0.03863366],
       [-1.8858466 , -0.5918929 ,  1.8012348 , ..., -0.52105606,
        -0.44268593, -1.3113848 ],
       [-2.0880358 ,  0.93517476,  1.3186673 , ..., -0.0911454 ,
         0.11058746, -0.5393993 ],
       ...,
       [ 0.1791399 , -0.94644576,  1.405138  , ...,  0.14882308,
        -0.300119  ,  0.09040979],
       [-1.968081  , -1.4943357 ,  1.6757054 , ...,  0.9575353 ,
        -0.8117295 , -0.80107796],
       [ 0.18074216, -0.01231587,  0.784399  , ...,  0.23737086,
        -0.39951843, -0.61023384]], dtype=float32)

In [26]:
df.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...


In [27]:
documents = df['content_processed'][:50000]

In [28]:
print(type(word_embeddings))
print(type(documents))
print(type(reversed_dictionary))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
<class 'dict'>


In [47]:
class Doc2VecCustom:
    """
    apply doc2vec to text
    """

    def __init__(self, logger, documents, reversed_dictionary, word_embeddings):
        """
        Args:
        """
        self.logger = logger        
        self.documents = documents
        self.index_to_word_dict = reversed_dictionary
        self.word_embeddings = word_embeddings

        logger.info('Pre-processing documents for training')

        train_corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
        
        logger.info('Creating joint document/word embedding')
        #self.model = Doc2Vec(**doc2vec_args)
        self.model = Doc2Vec(vector_size = 300,
                min_count = 50, # ignores words with total frequency lower than this
                window = 3, # maximum distance between the current and predicted word within a sentence
                sample = 1e-5, # threshold for configuring which higher frequency words are randomly downsampled
                workers = 8, # CPU's to use
                negative = 0, # 0 = no negative sampling
                hs = 1, # 1 = hierarchical softmax, 0 + neg non-zero = negative sampling
                epochs = 200,
                dm = 1, # 0 = Distributed bag of words (PV_DBOW), 1 = Distributed memory (PV-DM)
                dbow_words = 1, # 1 = train word-vecctors, 0 = only train doc-vectors
                documents = train_corpus)
        
        print('point 1')
        
        
        # create 5D embeddings of documents
        logger.info('Creating lower dimension embedding of documents')
        
        umap_args = {'n_neighbors': 15,
                     'n_components': 5,
                     'metric': 'cosine'}
        
        umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))

        # find dense areas of document vectors
        logger.info('Finding dense areas of documents')
        
        hdbscan_args = {'min_cluster_size': 15,
                         'metric': 'euclidean',
                         'cluster_selection_method': 'eom'}

        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
        
        # calculate topic vectors from dense areas of documents
        logger.info('Finding topics')

        # create topic vectors
        self._create_topic_vectors(cluster.labels_)

        # deduplicate topics
        self._deduplicate_topics()

        # find topic words and scores
        self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
        
        # assign documents to topic
        self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
                                                                      self._get_document_vectors())

        # calculate topic sizes
        self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)

        # re-order topics
        self._reorder_topics(hierarchy=False)
        
        logger.info('Topic modelling completed')
        
    @staticmethod
    def _l2_normalize(vectors):

        if vectors.ndim == 2:
            return normalize(vectors)
        else:
            return normalize(vectors.reshape(1, -1))[0]
    
    def _get_document_vectors(self, norm=True):

        if norm:
            #self.model.dv.init_sims()
            #return self.model.dv.vectors_docs_norm
            return self.model.dv.get_normed_vectors()
        else:
            return self.model.dv.vectors
    
    def _create_topic_vectors(self, cluster_labels):

        unique_labels = set(cluster_labels)
        if -1 in unique_labels:
            unique_labels.remove(-1)
        self.topic_vectors = self._l2_normalize(
            np.vstack([self._get_document_vectors(norm=False)[np.where(cluster_labels == label)[0]]
                      .mean(axis=0) for label in unique_labels]))

    def _deduplicate_topics(self):
        core_samples, labels = dbscan(X=self.topic_vectors,
                                      eps=0.1,
                                      min_samples=2,
                                      metric="cosine")

        duplicate_clusters = set(labels)

        if len(duplicate_clusters) > 1 or -1 not in duplicate_clusters:

            # unique topics
            unique_topics = self.topic_vectors[np.where(labels == -1)[0]]

            if -1 in duplicate_clusters:
                duplicate_clusters.remove(-1)

            # merge duplicate topics
            for unique_label in duplicate_clusters:
                unique_topics = np.vstack(
                    [unique_topics, self._l2_normalize(self.topic_vectors[np.where(labels == unique_label)[0]]
                                                       .mean(axis=0))])

            self.topic_vectors = unique_topics
            
    def _index2word(self, index):
        return self.index_to_word_dict[index]

    def _get_word_vectors(self):
        return self.word_embeddings
            
    def _find_topic_words_and_scores(self, topic_vectors):
        topic_words = []
        topic_word_scores = []

        res = np.inner(topic_vectors, self._get_word_vectors())
        top_words = np.flip(np.argsort(res, axis=1), axis=1)
        top_scores = np.flip(np.sort(res, axis=1), axis=1)

        for words, scores in zip(top_words, top_scores):
            topic_words.append([self._index2word(i) for i in words[0:50]])
            topic_word_scores.append(scores[0:50])

        topic_words = np.array(topic_words)
        topic_word_scores = np.array(topic_word_scores)

        return topic_words, topic_word_scores
    
    @staticmethod
    def _calculate_documents_topic(topic_vectors, document_vectors, dist=True):
        batch_size = 10000
        doc_top = []
        if dist:
            doc_dist = []

        if document_vectors.shape[0] > batch_size:
            current = 0
            batches = int(document_vectors.shape[0] / batch_size)
            extra = document_vectors.shape[0] % batch_size

            for ind in range(0, batches):
                res = np.inner(document_vectors[current:current + batch_size], topic_vectors)
                doc_top.extend(np.argmax(res, axis=1))
                if dist:
                    doc_dist.extend(np.max(res, axis=1))
                current += batch_size

            if extra > 0:
                res = np.inner(document_vectors[current:current + extra], topic_vectors)
                doc_top.extend(np.argmax(res, axis=1))
                if dist:
                    doc_dist.extend(np.max(res, axis=1))
            if dist:
                doc_dist = np.array(doc_dist)
        else:
            res = np.inner(document_vectors, topic_vectors)
            doc_top = np.argmax(res, axis=1)
            if dist:
                doc_dist = np.max(res, axis=1)

        if dist:
            return doc_top, doc_dist
        else:
            return doc_top
        
    def _calculate_topic_sizes(self, hierarchy=False):
        if hierarchy:
            topic_sizes = pd.Series(self.doc_top_reduced).value_counts()
        else:
            topic_sizes = pd.Series(self.doc_top).value_counts()

        return topic_sizes
    
    def _reorder_topics(self, hierarchy=False):

        if hierarchy:
            self.topic_vectors_reduced = self.topic_vectors_reduced[self.topic_sizes_reduced.index]
            self.topic_words_reduced = self.topic_words_reduced[self.topic_sizes_reduced.index]
            self.topic_word_scores_reduced = self.topic_word_scores_reduced[self.topic_sizes_reduced.index]
            old2new = dict(zip(self.topic_sizes_reduced.index, range(self.topic_sizes_reduced.index.shape[0])))
            self.doc_top_reduced = np.array([old2new[i] for i in self.doc_top_reduced])
            self.hierarchy = [self.hierarchy[i] for i in self.topic_sizes_reduced.index]
            self.topic_sizes_reduced.reset_index(drop=True, inplace=True)
        else:
            self.topic_vectors = self.topic_vectors[self.topic_sizes.index]
            self.topic_words = self.topic_words[self.topic_sizes.index]
            self.topic_word_scores = self.topic_word_scores[self.topic_sizes.index]
            old2new = dict(zip(self.topic_sizes.index, range(self.topic_sizes.index.shape[0])))
            self.doc_top = np.array([old2new[i] for i in self.doc_top])
            self.topic_sizes.reset_index(drop=True, inplace=True)
            


In [48]:
# 50,000 Start 11:05 was still going 2 hours later, but had to restart computer
# 50,000 Start 1:25 finished 3:40 not training word embed
# 50,000 Start 3:50 still going at 5
doc2vec = Doc2VecCustom(logger, documents, reversed_dictionary, word_embeddings)

point 1


In [70]:
from gensim.test.utils import get_tmpfile

#fname = get_tmpfile("doc2vec_model")
fname = "doc2vec_model"
doc2vec.save(fname)


AttributeError: 'Doc2VecCustom' object has no attribute 'save'

In [29]:
#model = Doc2Vec.load("doc2vec_model/saved_model.pb")

In [69]:
print(len(doc2vec.topic_words))
doc2vec.topic_words

47


array([['unturned_looking', 'dividing', 'technavio_indepth', ...,
        'reserves', 'mixing', 'floors'],
       ['considered_isolation', 'microbes_shown', 'hard_nonporous', ...,
        'outbreaks_prompting', 'growing_client', 'term_defined'],
       ['zacks', 'ads', 'forwardlooking_statements', ..., 'receives',
        'moody', 'forwardlooking'],
       ...,
       ['trends_vendor', 'landscape_vendor', 'assess_competitive', ...,
        'lng', 'waves', 'update_revise'],
       ['complications_cdc', 'mind_chance', 'reason_preserve', ...,
        'finds_organizes', 'pioneering', 'navarro'],
       ['identify_advance', 'experience_formatting', 'hedge_fund', ...,
        'photos_novel', 'disclosure', 'severely_ill']], dtype='<U29')

In [50]:
doc2vec.topic_words[0]

array(['unturned_looking', 'dividing', 'technavio_indepth', 'wipes_clean',
       'prohibited', 'smell_symptoms', 'surfaces_seat', 'archie', 'walls',
       'trapped', 'hard_surfaces', 'pools', 'libraries', 'territory',
       'ways_figure', 'makeshift', 'nose', 'privacy_notice',
       'insights_identify', 'performing', 'significance_investors',
       'unsafe', 'insider_monkey', 'closing', 'hands_clean',
       'thousand_enduse', 'gardens', 'barred', 'vii', 'bit_bit',
       'common_stockholders', 'chargeoffs', 'independently', 'acres',
       'cleaned', 'companyfollowemail_disclosure', 'consulate',
       'potentially_sick', 'sizing_forces', 'banned', 'economists_polled',
       'trips_outside', 'junk', 'complications_cdc', 'evacuated',
       'use_disinfecting', 'overnight', 'reserves', 'mixing', 'floors'],
      dtype='<U29')

In [51]:
doc2vec.topic_words[1]

array(['considered_isolation', 'microbes_shown', 'hard_nonporous',
       'leaving_decision', 'tracked_insider', 'dgap', 'spreads_happen',
       'studies_flu', 'unlawful', 'germs_killing', 'largely_leaving',
       'unpaid', 'cdc_recommends', 'expects_anticipates',
       'surfaces_particularly', 'implied_forwardlooking',
       'companyfollowemail_disclosure', 'seat_spreading',
       'germs_typically', 'moody', 'binding', 'days_onset',
       'jurisdiction_offer', 'placebocontrolled', 'upholstered_seats',
       'particles', 'buyer', 'paycheck_protection', 'inspector_general',
       'engagements', 'surfaces', 'customary', 'cloth', 'construed',
       'substitute', 'respiratory_illnesses', 'consists_enterprises',
       'jacksonville', 'accordance_gaap', 'penalties', 'recipient',
       'season_safest', 'raising_specter', 'rhinovirus', 'fines',
       'contemplated', 'equipment_ppe', 'outbreaks_prompting',
       'growing_client', 'term_defined'], dtype='<U29')

In [52]:
doc2vec.topic_words[10]

array(['dgap', 'technavio_suggests', 'leaving_decision',
       'positions_slowgrowing', 'regioncountry_vs', 'tankers', 'loss',
       'forecast_scenarios', 'leaders_affirmed', 'unlawful',
       'beliefs_expectations', 'study_identifies', 'privacy_notice',
       'reasonable_assumptions', 'historical_facts', 'plans_objectives',
       'penalty', 'addressable', 'graphic', 'oil_gas', 'sealed', 'resins',
       'thousand_regioncountry', 'wire', 'boats', 'earnings_esp',
       'ethanol', 'beverages', 'historical_fact', 'disposal',
       'competitive_landscape', 'merger', 'kospi', 'guarantees',
       'differ_materially', 'click_q1', 'strikes', 'taxes',
       'modification', 'separation', 'alcohol', 'positions_changing',
       'commonly', 'regioncountry_table', 'hereof', 'taxable',
       'explosion', 'unpaid', 'compounds', 'accident'], dtype='<U29')

In [55]:
doc2vec.topic_words[46]

array(['identify_advance', 'experience_formatting', 'hedge_fund',
       'oxygen_ventilator', 'trends_drivers', 'mineral_reserves',
       'mineral_resources', 'scientists_document', 'sen',
       'solution_reportlinker', 'assess_competitive', 'mind_chance',
       'privacy_notice', '13f_filings', 'fears_decide',
       'investor_letters', 'running_mate', 'guidance_reflecting',
       'supplementary', 'kt', 'significance_investors', 'readily',
       'finds_organizes', 'reason_preserve', 'andor', 'study_synthesis',
       'exposing_sick', 'adverse_reactions', 'microbes_shown',
       'progressing_cagr', 'meur', 'insights_identify', 'defaults',
       'procurement', 'stock_pitches', 'returned_outperformed',
       'bbc_radio', 'landscape_vendor', 'positions_changing',
       'latest_recommendations', 'summation_data', 'cognitive',
       'explain_symptoms', 'instruments', 'si', 'regulation', 'ifrs',
       'photos_novel', 'disclosure', 'severely_ill'], dtype='<U29')

In [45]:
doc2vec.topic_vectors

array([[ 0.03508409,  0.05171107,  0.10154462, ..., -0.01006028,
        -0.07205673, -0.09094811],
       [-0.04935476,  0.07906763,  0.04383451, ...,  0.01413208,
        -0.14334318, -0.05271035],
       [-0.02902463,  0.02421883,  0.03241279, ..., -0.03645328,
        -0.06429061, -0.07020091],
       ...,
       [-0.01402772,  0.1053759 ,  0.00595971, ...,  0.07351311,
        -0.03523719, -0.01995409],
       [-0.01313634, -0.02805134,  0.07992914, ...,  0.02176862,
        -0.04151111, -0.06642555],
       [-0.01305523,  0.16479506,  0.03820763, ...,  0.0457513 ,
        -0.08577923, -0.06591271]], dtype=float32)

# Top2Vec - OOB

In [8]:
import pandas as pd
#from top2vec import Top2Vec
import os
import collections
import csv
import logging
import numpy as np
import datetime as datetime
import types
import pickle

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from top2vec import Top2Vec

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [9]:
df = pd.read_pickle('./data/df_processed_bigrams.pickle')
df.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...


In [10]:
find_topics = False
min_count = 1000 # ignore words with total frequency less than this
speed = 'learn' # can try 'deep-learn' for possible better embeddings but will take longer

if find_topics:
    # import lemmatised data
    with open('data/data_lemmatized.pickle', 'rb') as f:
        data_lemmatized = pickle.load(f)
    
    data_lemmatized_str = [' '.join(article) for article in data_lemmatized]
    print(len(data_lemmatized))
    print(len(data_lemmatized_str))
    
    # Find topics
    # ~ 12.5 hours to run on lemmatised data
    #documents = df['content_processed'][:50000].values
    documents = data_lemmatized_str
    model = Top2Vec(documents, workers=4, min_count=min_count, speed=speed)
    model.save('top2vec_new.model')
else:
    #model = Top2Vec.load('top2vec.model')
    model = Top2Vec.load('top2vec_vocab_limit.model')

print(len(model.topic_words))
print(model._get_word_vectors().shape)

1773
(9453, 300)


In [11]:
model.topic_words

array([['barrels_day', 'bpd', 'crude', ..., 'gallon', 'gregorio',
        'slash'],
       ['touching_face', 'hands_clean', 'sick', ..., 'workout', 'gov',
        'africanamerican'],
       ['nongaap', 'gaap', 'ebitdare', ..., 'audio_webcast',
        'study_identifie', 'cloudbase'],
       ...,
       ['gift', 'card', 'debit', ..., 'crossborder', 'curbside_pickup',
        'biometric'],
       ['dare', 'commercialize', 'nda', ..., 'gel', 'milestone', 'drug'],
       ['nda', 'tolerability', 'openlabel', ..., 'brent', 'toxicity',
        'treasury_yield']], dtype='<U15')

In [12]:
model.topic_words[0]

array(['barrels_day', 'bpd', 'crude', 'opec', 'glut', 'oil', 'barrel',
       'refiner', 'brent_crude', 'wti', 'eia', 'brent', 'crude_future',
       'gasoline', 'shale', 'crude_oil', 'refinery', 'output', 'libya',
       'exxon', 'aramco', 'refining', 'permian', 'producer', 'chevron',
       'petroleum', 'saudi', 'reuters_poll', 'importer', 'oil_ga',
       'upstream', 'rig', 'hydrocarbon', 'rout', 'taper', 'natural_ga',
       'gulf', 'petrochemical', 'oilfield', 'refine', 'oil_gas',
       'curtailment', 'iraq', 'royal_dutch', 'diesel', 'chesapeake',
       'lowest_level', 'gallon', 'gregorio', 'slash'], dtype='<U15')

In [13]:
model.topic_words[1]

array(['touching_face', 'hands_clean', 'sick', 'rate_dippe',
       'illness_cause', 'afterward', 'whitmer', 'sicken', 'air_setting',
       'yorker', 'held_outdoor', 'overwhelm', 'quarantined',
       'surfaces_seat', 'sinuses_common', 'seats_contact', 'sweat',
       'caring_sick', 'wipes_clean', 'cdc', 'disinfect_hard',
       'experts_warn', 'subway', 'birx', 'couch', 'hernandez', 'sidewalk',
       'breathe', 'screen_seat', 'hiring_rebounde', 'neighbor',
       'cellphone', 'countless', 'swath', 'caseload', 'girlfriend',
       'breathing', 'bryant', 'sitting_window', 'gov_andrew',
       'school_district', 'plane_window', 'evidence_widely', 'shout',
       'coauthor', 'epicenter', 'flu', 'workout', 'gov',
       'africanamerican'], dtype='<U15')

In [14]:
model.topic_words[2]

array(['nongaap', 'gaap', 'ebitdare', 'divestiture', 'teekay',
       'longlived_asset', 'item_', 'isg', 'chegg', 'variability',
       'ability_attract', 'sec_filing', 'passcode', 'nareit',
       'live_webcast', 'dialing', 'shortterm_phase', 'gross_margin',
       'affo', 'gotomarket', 'diluted', 'replay', 'technavio',
       'free_sample', 'extinguishment', 'section_entitle', 'reach_revise',
       'dell', 'a_securitie', 'actual_result', 'safe_harbor',
       'trends_driver', 'trailing_cagr', 'act_amended', 'offers_uptodate',
       'webcast', 'gartner', 'onpremise', 'dialin', 'netback',
       'usa_canada', 'highperformance', 'remain_unscathe',
       'periodic_report', 'comparability', 'gain_instant', 'reform_act',
       'audio_webcast', 'study_identifie', 'cloudbase'], dtype='<U15')

### Get topic sizes

Number of documents most similar to each topic. Topics are in decreasing order of size.  
topic_sizes: The number of documents most similar to each topic.  
topic_nums: The unique index of every topic will be returned.  

In [15]:
topic_sizes, topic_ids = model.get_topic_sizes()
df_topic_sizes = pd.DataFrame(data=zip(topic_ids, topic_sizes), columns=['topic_id', 'num_docs'])

In [16]:
df_topic_sizes

Unnamed: 0,topic_id,num_docs
0,0,3882
1,1,2972
2,2,2739
3,3,2512
4,4,2155
...,...,...
1768,1768,19
1769,1769,19
1770,1770,18
1771,1771,18


### Get Topics
topic_words: For each topic the top 50 words are returned, in order of semantic similarity to topic.  
word_scores: For each topic the cosine similarity scores of the top 50 words to the topic are returned.  
topic_nums: The unique index of every topic will be returned.  

In [63]:
topic_words, word_scores, topic_ids = model.get_topics(model.get_num_topics())
topic_sizes, topic_ids = model.get_topic_sizes()
df_topics = pd.DataFrame(data=zip(topic_ids, topic_sizes, topic_words, word_scores), columns=['topic_id', 'topic_sizes', 'topic_words', 'word_scores'])

In [64]:
df_topics

Unnamed: 0,topic_id,topic_sizes,topic_words,word_scores
0,0,3882,"[barrels_day, bpd, crude, opec, glut, oil, bar...","[0.7554733, 0.7397255, 0.7277012, 0.7076352, 0..."
1,1,2972,"[touching_face, hands_clean, sick, rate_dippe,...","[0.31247112, 0.20629022, 0.20435627, 0.2030590..."
2,2,2739,"[nongaap, gaap, ebitdare, divestiture, teekay,...","[0.26054233, 0.23778984, 0.22579505, 0.1927944..."
3,3,2512,"[vaccine, pfizer_biontech, pfizerbiontech, pfi...","[0.80440575, 0.7686322, 0.7623384, 0.7583906, ..."
4,4,2155,"[hedge_fund, insider_monkey, hedgie, similarly...","[0.7718193, 0.57161814, 0.5356776, 0.51193756,..."
...,...,...,...,...
1768,1768,19,"[safehaven, crude_future, greenback, japanese_...","[0.55273753, 0.49876994, 0.49868113, 0.4946041..."
1769,1769,19,"[strategist, choppy, treasury_yield, selloff, ...","[0.44933143, 0.42233846, 0.41821185, 0.3859547..."
1770,1770,18,"[gift, card, debit, wallet, credit_card, press...","[0.5080503, 0.49814865, 0.40992847, 0.384071, ..."
1771,1771,18,"[dare, commercialize, nda, bioscience, investi...","[0.50853056, 0.4186489, 0.3713347, 0.3697553, ..."


In [65]:
df_topics.loc[1084]

topic_id                                                    1084
topic_sizes                                                   95
topic_words    [ghost, spinoff, episode, spoiler, tease, stor...
word_scores    [0.67582524, 0.6156854, 0.5654878, 0.55437475,...
Name: 1084, dtype: object

### Search for topics than contain keywords
topic_words: For each topic the top 50 words are returned, in order of semantic similarity to topic.  
word_scores: For each topic the cosine similarity scores of the top 50 words to the topic are returned.  
topic_scores: For each topic the cosine similarity to the search keywords will be returned.  
topic_nums: The unique index of every topic will be returned.  

In [67]:
keywords = ["supply_chain"]
#keywords = ["digital_transformation"]
topic_words, word_scores, topic_scores, topic_ids = model.search_topics(keywords=keywords, num_topics=5)
df_topic_kw = pd.DataFrame(data=zip(topic_ids, topic_words, word_scores, topic_scores), columns=['topic_id', 'topic_words', 'word_scores', 'topic_scores'])

In [68]:
df_topic_kw

Unnamed: 0,topic_id,topic_words,word_scores,topic_scores
0,859,"[generic, pharmaceutical, drug, novartis, phar...","[0.5529777, 0.41644132, 0.4071582, 0.37119797,...",0.332119
1,914,"[garment, bangladesh, boohoo, clothing, clothe...","[0.7322744, 0.50282276, 0.46981946, 0.46685526...",0.287537
2,93,"[tools_checklist, reinvent, transformation, ag...","[0.5508231, 0.49446157, 0.48091435, 0.477627, ...",0.252878
3,825,"[wto, multilateral, directorgeneral, bilateral...","[0.78679776, 0.5122057, 0.4868957, 0.46711993,...",0.246379
4,1132,"[respirator, fema, surgical_mask, protective_g...","[0.5727843, 0.45496583, 0.4493539, 0.43521297,...",0.242761


In [70]:
df_topic_kw['topic_words'][1]

array(['garment', 'bangladesh', 'boohoo', 'clothing', 'clothe', 'apparel',
       'textile', 'fashion', 'factory', 'cambodia', 'migrant_worker',
       'cotton', 'myanmar', 'bof', 'footwear', 'adida', 'designer',
       'gucci', 'levi', 'leather', 'burberry', 'exporter', 'supply_chain',
       'nepal', 'shoe', 'tshirt', 'retailer', 'minimum_wage', 'malaysian',
       'malaysia', 'gown', 'ngo', 'nike', 'exploitation', 'jc_penney',
       'casual', 'wage', 'jacket', 'vietnam', 'dress', 'fabric',
       'store_closure', 'informal', 'ethiopia', 'livelihood', 'worker',
       'precarious', 'remittance', 'leicester', 'shirt'], dtype='<U15')

### Search articles by topic

After finding the relevant topic number can then search by this  
documents: The documents in a list, the most similar are first.  
doc_scores: Semantic similarity of document to topic. The cosine similarity of the document and topic vector.  
doc_ids: Unique ids of documents. If ids were not given, the index of document in the original corpus.  

In [71]:
topic_num=914
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=topic_num, num_docs=2)
    
result_df = df.iloc[document_ids]
result_df["document_scores"] = document_scores

for index,row in result_df.iterrows():
    print(f"Document: {index}, Score: {row.document_scores}")
    print(f"Title: {row.title}")
    print("-----------")
    print(row.content)
    print("-----------")
    print()

Document: 30278, Score: 0.8098168969154358
Title: Coronavirus threatens jobs of garment workers in Southeast Asia — Quartz
-----------
Across much of Asia, Europe, and the US, Covid-19 has brought shopping for anything but necessities practically to a standstill. Among the industries most at risk is clothing. Fashion retailers have shut stores as part of social-distancing measures and watched sales plunge as home-bound shoppers pause spending on non-essential items. The impact is rippling through fashion’s supply chain, putting at risk the livelihoods of garment workers, who are already some of the most vulnerable workers in the global economy. To keep costs down, mass-market fashion companies do much of their manufacturing in low-wage countries across Southeast Asia, on the peripheries of Europe, and in locations such as Ethiopia. The pay can be barely enough for workers and their families to survive on, providing just the thinnest of protections from poverty. Many garment factories w

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df["document_scores"] = document_scores


### Search articles by Keywords

In [33]:
documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=["supply_chain", "disrupt"], num_docs=2)
result_df = df.iloc[document_ids]
result_df["document_scores"] = document_scores

for index,row in result_df.iterrows():
    print(f"Document: {index}, Score: {row.document_scores}")
    print(f"Title: {row.title}")
    print("-----------")
    print(row.content)
    print("-----------")
    print()

Document: 262146, Score: 0.4479290246963501
Title: Risk, resilience, and rebalancing in global value chains
-----------
People create and sustain change. Unleash their potential. Digital upends old models. Reinvent your business. Most transformations fail. Flip the odds. Practical resources to help leaders navigate to the next normal: guides, tools, checklists, interviews and more. Our flagship business publication has been defining and informing the senior-management agenda since 1964. Practical resources to help leaders navigate to the next normal: guides, tools, checklists, interviews and more Learn what it means for you, and meet the people who create it In recent decades, value chains have grown in length and complexity as companies expanded around the world in pursuit of margin improvements. Since 2000, the value of intermediate goods traded globally has tripled to more than $10 trillion annually. Businesses that successfully implemented a lean, global model of manufacturing achi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df["document_scores"] = document_scores


### Find Similar Words

In [36]:
# Get words in vocab
vocab_length = len(model._get_word_vectors())
print(vocab_length)

vocab = []
for n in range(vocab_length):
    vocab.append(model._index2word(n))

54336


In [39]:
[x for x in vocab if 'digital' in x]

['digital',
 'digitally',
 'digitalization',
 'digitalisation',
 'digitalfirst',
 'digitalonly',
 'digitalize',
 'alldigital',
 'digitale',
 'digitalise',
 'digitalready',
 'interdigital',
 'digitallyenable']

In [52]:
print(f'vocabulary length: {len(model._get_word_vectors())}')

words_model, word_scores = model.similar_words(keywords=["digitalization"], num_words=20)
for word, score in zip(words_model, word_scores):
    print(f"{word} {score}")

vocabulary length: 54336
digitization 0.8093907367346862
digitalize 0.7284561756506189
digitalisation 0.6994283029295847
digitisation 0.6479801355874251
digital 0.6282951760960649
efficiency 0.6036911458771772
agile 0.5995618315335657
accelerate 0.5975370370189995
agility 0.5948758156594689
innovation 0.591975953598644
competitiveness 0.5851379284566436
transformation 0.5846937745295936
solution 0.5845464684656394
customercentric 0.5804240349831157
enable 0.5761792252716045
automation 0.5668671251419835
ecosystem 0.5617250565108862
efficient 0.5605026217141098
digitalise 0.5577485939121402
continuously 0.5546740647866372


In [9]:
#model._words2word_vectors(['supply'])
model._get_word_vectors() # word embeddings
model._index2word(1)

'company'