In [83]:
import pandas as pd
#from top2vec import Top2Vec
import os
import collections
import csv
import logging

from tensorflow.keras.layers import Input

In [84]:
# Load the processed DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that were produced by a trusted source.
df = pd.read_pickle('./Data/df_processed.pickle')

In [85]:
# Print a summary of the columns: non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367947 entries, 0 to 367946
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   author             181781 non-null  object        
 1   date               367947 non-null  datetime64[ns]
 2   domain             367947 non-null  object        
 3   title              367862 non-null  object        
 4   url                367947 non-null  object        
 5   content            367947 non-null  object        
 6   topic_area         367947 non-null  object        
 7   content_processed  367947 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 22.5+ MB


In [86]:
# Display the first row to see what each column looks like.
df.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,"[end, year, corner, s past time, think, positi..."


In [87]:
def logger_w2v():
    """
    Create (or fetch) the 'word2vec' logger that writes to ./Data/word2vec.log.

    The function is safe to call repeatedly (e.g. when the notebook cell is
    re-run): the file handler is attached only once, so log records are not
    written several times and no extra file handles are opened.

    Returns:
        logging.Logger: the logger named 'word2vec', set to level DEBUG and
        carrying a FileHandler for the log file.
    """
    log_dir = './Data'
    log_file = os.path.join(log_dir, 'word2vec.log')
    print('log file location: ', log_file)

    # FileHandler raises if the directory is missing, so make sure it exists.
    os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger('word2vec')
    logger.setLevel(logging.DEBUG)

    # logging.getLogger returns the same object for the same name, so a
    # handler added by an earlier call is still attached. FileHandler stores
    # the absolute path in baseFilename, hence the abspath comparison.
    target = os.path.abspath(log_file)
    already_attached = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == target
        for handler in logger.handlers
    )
    if not already_attached:
        logger.addHandler(logging.FileHandler(log_file))

    return logger
    

In [88]:
class Word2Vec:
    """
    apply word2vec to text

    Holds the configuration passed to the constructor and provides
    build_dataset, which turns tokenised documents into integer ids and
    saves the word -> id mapping to disk.
    """

    # Location where build_dataset saves the word -> index mapping.
    DICT_DIR = './Data'
    DICT_FILE = 'dictionary.csv'

    def __init__(self, logger, vocab_size, vector_dim, input_target, input_context,
                 load_pretrained_weights, checkpoint_file):
        """
        Args:
            logger: logging.Logger-like object; build_dataset calls its
                .info() method
            vocab_size: integer of number of words to form vocabulary from
                (index 0 is reserved for 'UNK')
            vector_dim: integer of number of dimensions per word
            input_target: tensor representing target word
            input_context: tensor representing context word
            load_pretrained_weights: stored on the instance; not read by the
                methods defined in this class body
            checkpoint_file: stored on the instance; not read by the methods
                defined in this class body
        """
        self.logger = logger
        self.vocab_size = vocab_size
        self.vector_dim = vector_dim
        self.input_target = input_target
        self.input_context = input_context
        self.load_pretrained_weights = load_pretrained_weights
        self.checkpoint_file = checkpoint_file
        #self.model = self.create_model()

    def build_dataset(self, words):
        """
        :process raw inputs into a dataset

        Side effects: stores the word -> index mapping on self.dictionary and
        writes it as "word,index" rows to DICT_DIR/DICT_FILE (UTF-8).

        Args:
            words: iterable of documents, each an iterable of string tokens
                (the documents are flattened into one token stream)

        Returns:
            tuple:
                data: list of integers, one per token, 0 for out-of-vocabulary
                count: list whose first item is ['UNK', <number of
                    out-of-vocabulary tokens>] followed by (word, frequency)
                    tuples of the vocab_size - 1 most common words
                dictionary: dictionary of word to unique integer
                reverse dictionary: dictionary of unique integer to word
        """
        self.logger.info("Building dataset")

        # Flatten the per-document token lists into a single token stream.
        tokens = [token for document in words for token in document]

        count = [['UNK', -1]]
        count.extend(collections.Counter(tokens).most_common(self.vocab_size - 1))

        dictionary = {}
        for word, _ in count:
            # setdefault keeps index 0 for 'UNK' even if the literal token
            # 'UNK' is among the most common words; plain assignment would
            # overwrite it and give two different words the same index.
            dictionary.setdefault(word, len(dictionary))

        data = []
        unk_count = 0
        for token in tokens:
            index = dictionary.get(token)
            if index is None:
                index = 0  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count

        reversed_dictionary = {index: word for word, index in dictionary.items()}
        self.dictionary = dictionary

        # Save dictionary
        if self.DICT_DIR:
            os.makedirs(self.DICT_DIR, exist_ok=True)
        dict_file = os.path.join(self.DICT_DIR, self.DICT_FILE)

        # csv.writer quotes tokens that contain commas, quotes or newlines, so
        # every row can be parsed back; lineterminator='\n' keeps the output
        # identical to plain "word,index" lines for ordinary tokens.
        with open(dict_file, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerows(dictionary.items())

        return data, count, dictionary, reversed_dictionary

In [93]:
# Work on a small sample: the processed token lists of the first 10 rows.
words = df['content_processed'][:10]
# Peek at the first ten tokens of the entry labelled 4, in alphabetical order.
sorted(words[4][:10])

['awards shows',
 'celebrities',
 'coronavirus pandemic',
 'life returns',
 'like',
 'normal',
 'parties',
 'premieres',
 'things',
 'walk red carpets']

In [94]:
# Logger that writes to ./Data/word2vec.log (the path is printed on creation).
logger = logger_w2v()

# Configuration passed to the Word2Vec constructor.
vocab_size = 10000   # number of words to form the vocabulary from
vector_dim = 250     # number of dimensions per word
# Keras input tensors of shape (1,) for the target and the context word.
input_target = Input((1,))
input_context = Input((1,))
load_pretrained_weights = False
checkpoint_file = None

word2vec = Word2Vec(logger, vocab_size, vector_dim, input_target, input_context,
                    load_pretrained_weights, checkpoint_file)

# Encode the sample documents as integer ids; this also writes the
# word -> index mapping to ./Data/dictionary.csv.
data, count, dictionary, reversed_dictionary = word2vec.build_dataset(words)

log file location:  ./Data/word2vec.log


In [97]:
# Display the word -> index mapping built from the sample documents.
dictionary
# count  (uncomment to inspect the frequency table instead)

{'UNK': 0,
 'year': 1,
 'company': 2,
 '': 3,
 'stock': 4,
 'stocks': 5,
 'market': 6,
 'according': 7,
 'analysts': 8,
 'expected': 9,
 'tesla': 10,
 'said': 11,
 'time': 12,
 'way': 13,
 'business': 14,
 'companies': 15,
 'people': 16,
 'like': 17,
 'rise': 18,
 'outlook': 19,
 'today': 20,
 'dividend': 21,
 'view': 22,
 'wuhan': 23,
 'going': 24,
 'investors': 25,
 'don': 26,
 'normal': 27,
 'https': 28,
 'growth': 29,
 'years': 30,
 'know': 31,
 'low': 32,
 'coronavirus': 33,
 'millions': 34,
 'week': 35,
 'pneumonia': 36,
 'lennar': 37,
 'think': 38,
 'far': 39,
 'write': 40,
 'style=': 41,
 'text-decoration': 42,
 'font-weight': 43,
 'color': 44,
 'target=': 45,
 'blank': 46,
 'rel=': 47,
 'nofollow': 48,
 'href=': 49,
 'www': 50,
 'ame': 51,
 'ricanconsumernews': 52,
 'net/scripts/click': 53,
 'aspx': 54,
 'caterpillar': 55,
 'deal': 56,
 'expect': 57,
 'looking': 58,
 'impact': 59,
 'china': 60,
 'increase': 61,
 'yield': 62,
 'complete': 63,
 'form': 64,
 'receive': 65,
 'late