In [1]:
import pandas as pd
#from top2vec import Top2Vec
import os
import collections
import csv
import logging
import numpy as np
import datetime as datetime
import types

from tensorflow.keras.layers import Input
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Reshape, Embedding, Concatenate, dot
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.losses import cosine_similarity
from tensorflow.keras.callbacks import TensorBoard
from tensorboard.plugins import projector


In [2]:
!which jupyter

/home/ubuntu/thesis_env2/bin/jupyter


In [36]:
import os
#os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

tf.debugging.set_log_device_placement(True)

physical_devices = tf.config.list_physical_devices('GPU')
for gpu_instance in physical_devices: 
    tf.config.experimental.set_memory_growth(gpu_instance, True)
print(physical_devices)
#tf.config.set_visible_devices(physical_devices[0],'GPU')
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("GPUs: ", len(tf.config.experimental.list_physical_devices('GPU')))
os.getcwd()


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
Num GPUs Available:  2
GPUs:  2


'/home/ubuntu/thesis/Thesis'

In [4]:
df = pd.read_pickle('./data/df_processed_bigrams.pickle')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365200 entries, 0 to 369046
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   author             181507 non-null  object        
 1   date               365200 non-null  datetime64[ns]
 2   domain             365200 non-null  object        
 3   title              365115 non-null  object        
 4   url                365200 non-null  object        
 5   content            365200 non-null  object        
 6   topic_area         365200 non-null  object        
 7   content_processed  365200 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 25.1+ MB


In [6]:
df.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...


In [7]:
# Note to do - need to add time element

def log_newline(self, how_many_lines=1):
    file_handler = None
    if self.handlers:
        file_handler = self.handlers[0]

    # Switch formatter, output a blank line
    file_handler.setFormatter(self.blank_formatter)
    for i in range(how_many_lines):
        self.info('')

    # Switch back
    file_handler.setFormatter(self.default_formatter)

def logger_w2v():
    
    log_file = os.path.join('./data', 'word2vec.log')
    print('log file location: ', log_file)
    
    log_format= '%(asctime)s - %(levelname)s - [%(module)s]\t%(message)s'
    formatter = logging.Formatter(fmt=(log_format))
    
    fhandler = logging.FileHandler(log_file)
    fhandler.setFormatter(formatter)
    
    logger = logging.getLogger('word2vec')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(fhandler)
    logger.default_formatter = formatter
    logger.blank_formatter = logging.Formatter(fmt="")
    logger.newline = types.MethodType(log_newline, logger)
    
    return logger
    

In [37]:
class Word2Vec:
    """
    apply word2vec to text
    """

    def __init__(self, logger, vocab_size, vector_dim, input_target, input_context,
                 load_pretrained_weights, weights_file_name, train_model_flag, checkpoint_file):
        """
        Args:
            vocab size: integer of number of words to form vocabulary from
            vector_dim: integer of number of dimensions per word
            input_target: tensor representing target word
            input_context: tensor representing context word
        """
        self.logger = logger        
        self.vocab_size = vocab_size
        self.vector_dim = vector_dim
        self.input_target = input_target
        self.input_context = input_context
        self.load_pretrained_weights = load_pretrained_weights
        self.weights_file_name = weights_file_name
        self.checkpoint_file = checkpoint_file
        self.train_model_flag = train_model_flag
        self.model = self.create_model()
        
    def build_dataset(self, words):
        """
        :process raw inputs into a dataset

        Args:
            words: list of strings

        Returns:
            tuple:
                data: list of integers representing words in words
                count: list of count of most frequent words with size n_words
                dictionary: dictionary of word to unique integer
                reverse dictionary: dictionary of unique integer to word
        """
        self.logger.info("Building dataset")

        count = [['UNK', -1]]
        words = [item for sublist in words for item in sublist]
        print(len(words))
        count.extend(collections.Counter(words).most_common(self.vocab_size - 1))
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        unk_count = 0        
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        self.dictionary = dictionary

        # Save dictionary
        dict_path = './data'
        dict_file = 'dictionary.csv'
        dict_file = os.path.join(dict_path,dict_file)
        
        with open(dict_file, 'w') as f:
            for key in dictionary.keys():
                f.write("%s,%s\n"%(key,dictionary[key]))

        return data, count, dictionary, reversed_dictionary
    
    def get_training_data(self, data, window_size):
        """
        :create text and label pairs for model training

        Args:
            data: list of integers representing words in words
            window_size: integer of number of words around the target word that
                         will be used to draw the context words from.

        Returns:
            tuple:
                word_target: list of arrays representing target word in integer form
                word_context: list of arrays representing context word in 
                              relation to target word in integer form
                labels: list containing 1 for true context, 0 for false context
                couples: list of pairs of word indexes aligned with labels
        """
        # the probability of sampling the word i-th most common word 
        sampling_table = sequence.make_sampling_table(self.vocab_size)
        
        self.logger.info("finding training data with labels")
        couples, labels = skipgrams(data, self.vocab_size, window_size=window_size, 
                                    sampling_table=sampling_table)

        print(len(couples))
        self.logger.info("define target and context variables")
        #word_target, word_context = zip(*couples) cannot handle long lists
        word_target = [c[0] for c in couples]
        word_context = [c[1] for c in couples]
        self.logger.info("converting to numpy arrays")
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")
        
        self.logger.info("training data acquired")

        return word_target, word_context, labels

    def create_model(self):
        """
        :keras functional API and embedding layers

        Returns:
            model: untrained word2vec model
        """
        
        # embedding layer
        embedding = Embedding(self.vocab_size, self.vector_dim, input_length=1, name='embedding')

        # embedding vectors
        target = embedding(self.input_target)
        target = Reshape((self.vector_dim, 1))(target)
        context = embedding(self.input_context)
        context = Reshape((self.vector_dim, 1))(context)

        # dot product operation to get a similarity measure
        dot_product = dot([target, context], axes=1, normalize=False)
        dot_product = Reshape((1,))(dot_product)

        # add the sigmoid output layer
        output = Dense(1, activation='sigmoid')(dot_product)

        # create the training model
        self.model = Model(inputs=[self.input_target, self.input_context], outputs=output)

        return self.model

    def train_model(self, epochs, batch_size, word_target, word_context, labels):
        """
        :trains word2vec model

        Args:
            model: word2vec model
            epochs: integer of number of iterations to train model on
            batch_size: integer of number of words to pass to epoch
            word_target: list of arrays representing target word 
            word_context: list of arrays representing context word in relation 
                          to target word
            labels: list containing 1 for true context, 0 for false context

        Returns:
            model: trained word2vec model
        """
        optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
        #loss = tf.keras.losses.BinaryCrossentropy()
        self.model.compile(loss='binary_crossentropy', optimizer=optimizer)

        # tensorboard callback
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir='tensorboard_log/' + current_time
        summary_writer = tf.summary.create_file_writer(log_dir)

        if self.load_pretrained_weights:
            self.load_prior_weights()
            if not self.train_model_flag:
                return self.model

        arr_1 = np.zeros((batch_size,))
        arr_2 = np.zeros((batch_size,))
        arr_3 = np.zeros((batch_size,))
        
        for i in range(epochs):
            idx = np.random.choice(list(range(len(labels))), size=batch_size, replace=False)
            arr_1[:] = np.array([word_target[i] for i in idx])
            arr_2[:] = np.array([word_context[i] for i in idx])
            arr_3[:] = np.array([labels[i] for i in idx])
            loss = self.model.train_on_batch([arr_1, arr_2], arr_3)
            with summary_writer.as_default():
                tf.summary.scalar('loss', loss, step=i)
            if (i+1) % 500 == 0:
                print("Iteration {}, loss={}".format(i+1, loss))
            if (i+1) % 1000 == 0:
                checkpoint_dir = './model/model_weights'
                checkpoint_file = f"cp-epoch-{i+1:010d}.h5"
                checkpoint_path = os.path.join(checkpoint_dir,checkpoint_file)
                self.model.save_weights(checkpoint_path)
                self.embedding_projector(log_dir)

        return self.model
    
    def embedding_projector(self, log_dir):
        """
        :visualise embeddings in tensorboard
        """
        # Save Labels separately on a line-by-line manner.
        with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
            for subwords in self.dictionary.keys():
                f.write("{}\n".format(subwords))
            # Fill in the rest of the labels with "unknown"
            for unknown in range(1, self.vocab_size - len(self.dictionary.keys())):
                f.write("unknown #{}\n".format(unknown))

        # Save the weights we want to analyse as a variable. 
        weights = tf.Variable(self.model.layers[2].get_weights()[0])
        checkpoint_w = tf.train.Checkpoint(embedding=weights)
        checkpoint_w.save(os.path.join(log_dir, "embedding.ckpt"))

        # Set up config
        config_tb = projector.ProjectorConfig()
        embedding_tb = config_tb.embeddings.add()
        embedding_tb.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
        embedding_tb.metadata_path = 'metadata.tsv'
        projector.visualize_embeddings(log_dir, config_tb)
        
        
    def load_prior_weights(self):
        """
        :load prior weights if load_pretrained_weights = True in main file
        """ 
        #abs_path = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..'))
        #checkpoint_dir = os.path.join(abs_path, self.config['model']['model_dir'], self.config['model']['model_weights'])
        #checkpoint_path = os.path.join(checkpoint_dir,self.checkpoint_file)
        checkpoint_dir = './model/model_weights'
        checkpoint_file = self.weights_file_name
        checkpoint_path = os.path.join(checkpoint_dir,checkpoint_file)
        self.model.load_weights(checkpoint_path)
        self.logger.info('Loaded pre trained wweights from {}'.format(str(checkpoint_path)))

In [9]:
def get_word_vectors(model):
    
    embedding_weights = model.layers[2].get_weights()[0]
    #word_embeddings = {w:embedding_weights[idx] for w, idx in dictionary.items()}
    
    return embedding_weights

In [10]:
def tokenise_dataset(df):

    tokens = df['content_processed'].str.split(" ")

    return tokens

In [11]:
#words = df['content_processed'][:50000]
words = tokenise_dataset(df)
sorted(words[4][:10])

['carpets',
 'celebrities',
 'coronavirus',
 'life',
 'normal',
 'pandemic',
 'premieres',
 'red',
 'returns',
 'walk']

In [38]:
logger = logger_w2v()

vocab_size = 10000
vector_dim = 300
input_target = Input((1,))
input_context = Input((1,))
load_pretrained_weights = False
weights_file_name = f"cp-epoch-0000001000-210808.h5"
checkpoint_file = None
train_model_flag = True

word2vec = Word2Vec(logger, vocab_size, vector_dim, input_target, input_context,
                    load_pretrained_weights, weights_file_name, train_model_flag, checkpoint_file)

data, count, dictionary, reversed_dictionary = word2vec.build_dataset(words)

log file location:  ./data/word2vec.log
152734382


In [16]:
print(len(dictionary))
print(len(data))
#count
#reversed_dictionary
#dictionary

10000
152734382


In [17]:
data[:5]

[75, 9, 4405, 216, 12]

In [25]:
#dictionary.keys()

In [18]:
dictionary['supply_chain']

KeyError: 'supply_chain'

In [28]:
process = False
window_size = 3

if process:
    word_target, word_context, labels = word2vec.get_training_data(data, window_size)
    np.save('word_target', word_target)
    np.save('word_context', word_context)
    labels = np.array(labels, dtype="int32")
    np.save('labels', labels)
else:
    word_target = np.load('word_target.csv.npy')
    word_context = np.load('word_context.csv.npy')
    labels = np.load('labels.csv.npy')
    
print(len(word_target))
print(len(word_context))
print(len(labels))

378097114
378097114
378097114


In [29]:
print(word_target[:5])
print(word_context[:5])
print(labels[:5])

[1371  178 2317  352 1146]
[9860 1223  328   16 1834]
[0 1 0 1 1]


### Parameter Notes

BATCH SIZE ADJ  
20210414-082710 - shows training improvement from 0.69 to 0.43 loss  
aritcles 50,000  
batch_size 1000  
epochs 5000  

ARTICLES PROCESSED ADJ   
20210413-204616 - shows training improvement from 0.69 to 0.52 loss  
aritcles 20,000  
batch_size 100  
epochs 5000 

LEARNING RATE ADJ  
20210413-161618 - shows training improvement from 0.69 to 0.68 loss  
aritcles 50,000  
batch_size 100  
epochs 5000
learning rate = 1e-4 (normally 1e-3)

In [39]:
epochs = 2000
batch_size = 1000

logger.info("Training model with {} epochs".format(epochs))

model = word2vec.train_model(epochs, batch_size, word_target, word_context, labels)


KeyboardInterrupt: 

# Document Vectors and Topic Modelling

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
#import umap
import umap.umap_ as umap
import hdbscan
from sklearn.preprocessing import normalize
from sklearn.cluster import dbscan

In [14]:
word_embeddings = get_word_vectors(model)
print(len(word_embeddings))
word_embeddings

NameError: name 'model' is not defined

In [48]:
documents = df['content_processed'][:50000]

In [49]:
print(type(word_embeddings))
print(type(documents))
print(type(reversed_dictionary))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
<class 'dict'>


In [73]:
class Doc2VecCustom:
    """
    apply doc2vec to text
    """

    def __init__(self, logger, documents, reversed_dictionary, word_embeddings):
        """
        Args:
        """
        self.logger = logger        
        self.documents = documents
        self.index_to_word_dict = reversed_dictionary
        self.word_embeddings = word_embeddings

        logger.info('Pre-processing documents for training')

        train_corpus = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
        
        logger.info('Creating joint document/word embedding')
        #self.model = Doc2Vec(**doc2vec_args)
        self.model = Doc2Vec(vector_size = 250,
                min_count = 50, # ignores words with total frequency lower than this
                window = 15, # maximum distance between the current and predicted word within a sentence
                sample = 1e-5, # threshold for configuring which higher frequency words are randomly downsampled
                workers = 8, # CPU's to use
                negative = 0, # 0 = no negative sampling
                hs = 1, # 1 = hierarchical softmax, 0 + neg non-zero = negative sampling
                epochs = 200,
                dm = 0, # 0 = Distributed bag of words (PV_DBOW), 1 = Distributed memory (PV-DM)
                dbow_words = 0, # 1 = train word-vecctors, 0 = only train doc-vectors
                documents = train_corpus)
        
        print('point 1')
        
        
        # create 5D embeddings of documents
        logger.info('Creating lower dimension embedding of documents')
        
        umap_args = {'n_neighbors': 15,
                     'n_components': 5,
                     'metric': 'cosine'}
        
        umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))

        # find dense areas of document vectors
        logger.info('Finding dense areas of documents')
        
        hdbscan_args = {'min_cluster_size': 15,
                         'metric': 'euclidean',
                         'cluster_selection_method': 'eom'}

        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
        
        # calculate topic vectors from dense areas of documents
        logger.info('Finding topics')

        # create topic vectors
        self._create_topic_vectors(cluster.labels_)

        # deduplicate topics
        self._deduplicate_topics()

        # find topic words and scores
        self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
        
        # assign documents to topic
        self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
                                                                      self._get_document_vectors())

        # calculate topic sizes
        self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)

        # re-order topics
        self._reorder_topics(hierarchy=False)
        
    @staticmethod
    def _l2_normalize(vectors):

        if vectors.ndim == 2:
            return normalize(vectors)
        else:
            return normalize(vectors.reshape(1, -1))[0]
    
    def _get_document_vectors(self, norm=True):

        if norm:
            #self.model.dv.init_sims()
            #return self.model.dv.vectors_docs_norm
            return self.model.dv.get_normed_vectors()
        else:
            return self.model.dv.vectors
    
    def _create_topic_vectors(self, cluster_labels):

        unique_labels = set(cluster_labels)
        if -1 in unique_labels:
            unique_labels.remove(-1)
        self.topic_vectors = self._l2_normalize(
            np.vstack([self._get_document_vectors(norm=False)[np.where(cluster_labels == label)[0]]
                      .mean(axis=0) for label in unique_labels]))

    def _deduplicate_topics(self):
        core_samples, labels = dbscan(X=self.topic_vectors,
                                      eps=0.1,
                                      min_samples=2,
                                      metric="cosine")

        duplicate_clusters = set(labels)

        if len(duplicate_clusters) > 1 or -1 not in duplicate_clusters:

            # unique topics
            unique_topics = self.topic_vectors[np.where(labels == -1)[0]]

            if -1 in duplicate_clusters:
                duplicate_clusters.remove(-1)

            # merge duplicate topics
            for unique_label in duplicate_clusters:
                unique_topics = np.vstack(
                    [unique_topics, self._l2_normalize(self.topic_vectors[np.where(labels == unique_label)[0]]
                                                       .mean(axis=0))])

            self.topic_vectors = unique_topics
            
    def _index2word(self, index):
        return self.index_to_word_dict[index]

    def _get_word_vectors(self):
        return self.word_embeddings
            
    def _find_topic_words_and_scores(self, topic_vectors):
        topic_words = []
        topic_word_scores = []

        res = np.inner(topic_vectors, self._get_word_vectors())
        top_words = np.flip(np.argsort(res, axis=1), axis=1)
        top_scores = np.flip(np.sort(res, axis=1), axis=1)

        for words, scores in zip(top_words, top_scores):
            topic_words.append([self._index2word(i) for i in words[0:50]])
            topic_word_scores.append(scores[0:50])

        topic_words = np.array(topic_words)
        topic_word_scores = np.array(topic_word_scores)

        return topic_words, topic_word_scores
    
    @staticmethod
    def _calculate_documents_topic(topic_vectors, document_vectors, dist=True):
        batch_size = 10000
        doc_top = []
        if dist:
            doc_dist = []

        if document_vectors.shape[0] > batch_size:
            current = 0
            batches = int(document_vectors.shape[0] / batch_size)
            extra = document_vectors.shape[0] % batch_size

            for ind in range(0, batches):
                res = np.inner(document_vectors[current:current + batch_size], topic_vectors)
                doc_top.extend(np.argmax(res, axis=1))
                if dist:
                    doc_dist.extend(np.max(res, axis=1))
                current += batch_size

            if extra > 0:
                res = np.inner(document_vectors[current:current + extra], topic_vectors)
                doc_top.extend(np.argmax(res, axis=1))
                if dist:
                    doc_dist.extend(np.max(res, axis=1))
            if dist:
                doc_dist = np.array(doc_dist)
        else:
            res = np.inner(document_vectors, topic_vectors)
            doc_top = np.argmax(res, axis=1)
            if dist:
                doc_dist = np.max(res, axis=1)

        if dist:
            return doc_top, doc_dist
        else:
            return doc_top
        
    def _calculate_topic_sizes(self, hierarchy=False):
        if hierarchy:
            topic_sizes = pd.Series(self.doc_top_reduced).value_counts()
        else:
            topic_sizes = pd.Series(self.doc_top).value_counts()

        return topic_sizes
    
    def _reorder_topics(self, hierarchy=False):

        if hierarchy:
            self.topic_vectors_reduced = self.topic_vectors_reduced[self.topic_sizes_reduced.index]
            self.topic_words_reduced = self.topic_words_reduced[self.topic_sizes_reduced.index]
            self.topic_word_scores_reduced = self.topic_word_scores_reduced[self.topic_sizes_reduced.index]
            old2new = dict(zip(self.topic_sizes_reduced.index, range(self.topic_sizes_reduced.index.shape[0])))
            self.doc_top_reduced = np.array([old2new[i] for i in self.doc_top_reduced])
            self.hierarchy = [self.hierarchy[i] for i in self.topic_sizes_reduced.index]
            self.topic_sizes_reduced.reset_index(drop=True, inplace=True)
        else:
            self.topic_vectors = self.topic_vectors[self.topic_sizes.index]
            self.topic_words = self.topic_words[self.topic_sizes.index]
            self.topic_word_scores = self.topic_word_scores[self.topic_sizes.index]
            old2new = dict(zip(self.topic_sizes.index, range(self.topic_sizes.index.shape[0])))
            self.doc_top = np.array([old2new[i] for i in self.doc_top])
            self.topic_sizes.reset_index(drop=True, inplace=True)

In [74]:
doc2vec = Doc2VecCustom(logger, documents, reversed_dictionary, word_embeddings)

point 1


In [75]:
print(len(doc2vec.topic_words))
doc2vec.topic_words

183


array([['hiring rebounded faster', 'making detection difficult', 'rel=',
        ..., 'cabinet', 'regulation services provider', 'dermatologists'],
       ['tsx', 'company', 'cnw/', ..., 'greenback', 'equity $',
        'gas stations'],
       ['equity $', 'jamie freed', 'total liabilities', ...,
        'people republic', 'listing rules', 'investment properties'],
       ...,
       ['ordered', 'businesses', 'service', ..., 'impacting',
        'resilience', 'mortgages'],
       ['contact', 'circular', 'accuracy', ..., 'http', 'tissues',
        'cnw/'],
       ['time', 'highest', 'reduction', ..., 'previous year', 'notes',
        'repayable']], dtype='<U90')

In [76]:
doc2vec.topic_words[0]

array(['hiring rebounded faster', 'making detection difficult', 'rel=',
       'gives small employers significant leeway',
       'international financial reporting standards', 'symptoms overlap',
       'constitution', 'mark heinrich', 'disinfect hard surfaces',
       'unexpected improvement', 'unedited', 'deny leave', 'uefa',
       'proposed', 'coronavirus symptoms',
       'consolidated financial statements', 'irrational panic shoppers]',
       'declare', 'larry king', 'include toe lesions', 'bats',
       'damp conditions', 'nonporous', 'union', 'finance leases', 'nhl',
       'meeting', 'geneva', 'international olympic committee',
       'accepts responsibility', 'ias', 'http', 'new standard',
       'investor relations', 'financing activities', 'nofollow',
       'stay-at-home mandates began', 'secretary', 'form 10-q',
       'paragraph', 'like part-time', 'surface transmission',
       'non-ifrs measures', 'general meeting', 'measure excludes',
       'visit http', 'psycholog

In [77]:
doc2vec.topic_words[10]

array(['stand ready', 'charities', 'essential travel', 'sun',
       'february reported losing', 'studio', 'sick pay', 'foreign',
       'currently working', 'quarantined', 'association',
       'text-decoration', 'watching', 'clip', 'world', 'email', 'coffee',
       'co', 'trains', 'stockholm', 'commonwealth office',
       'coronavirus lockdown', 'britain brave nhs heroes', 'showing',
       'self-isolate', 'cyprus', 'welcoming', 'childcare', 'school',
       'issues', 'live blog', 'welcomed', 'quote', 'settled', 'ended —',
       'pakistan', 'thank', 'tom hanks', 'monaco', 'furloughed',
       'workers living', 'self-employed people', 'actors', 'like', 'boat',
       'migration', 'come home', 'australians', 'flight attendants',
       'wednesday afternoon'], dtype='<U90')

In [80]:
doc2vec.topic_words[75]

array(['total liabilities', 'up-to-the-minute coverage',
       'share attributable', 'use disinfecting wipes', 'rmb2',
       'total current liabilities', 'non-current',
       'disinfect hard surfaces', 'potentially sick people', 'seat',
       'accrued expenses', 'hands', 'ordinary shareholders', 'compared',
       'allocation', 's$', 'repayable', 'total assets', 'mouth',
       'non-controlling interest', 'un agency advises people', 'equity $',
       'breadth', 'owners', 'payables', 'authorisation', 'rmb',
       'specific cure', 'mainly attributable', 'changyou',
       'restricted cash', 'new shares', 'turnover', 'ordinary shares',
       'money markets', 'rmb3', 'directors', 'mortgage-backed securities',
       'advances', 'rmb0', 'properties', 'household earning', 'net sales',
       'unaudited', 'coronavirus symptoms', 'billion yuan', 'primarily',
       'oral statements', 'dispose', 'like part-time'], dtype='<U90')

In [81]:
doc2vec.topic_vectors

array([[-0.08656428, -0.10526719, -0.00490898, ...,  0.07338336,
        -0.07652663, -0.04370673],
       [ 0.0728389 ,  0.03991343,  0.01279974, ...,  0.07452159,
        -0.06001463, -0.0064584 ],
       [-0.07564057,  0.13048364,  0.07646126, ...,  0.005602  ,
        -0.04271905,  0.05587185],
       ...,
       [ 0.05725852, -0.06412213, -0.06018567, ..., -0.03334264,
        -0.05080355,  0.02289374],
       [ 0.15514138, -0.11167125, -0.00980208, ..., -0.0782685 ,
         0.00740562, -0.03646227],
       [ 0.1544364 , -0.11470329, -0.10446167, ..., -0.02850814,
         0.03240791,  0.01526658]], dtype=float32)

In [3]:
tuples_test = [(56,65), (67,76), (87,78)]
tuples_test_col1, tuples_test_col2 = zip(*tuples_test) 
print(type(tuples_test_col1), tuples_test_col1)
print(tuples_test_col2)

<class 'tuple'> (56, 67, 87)
(65, 76, 78)
