In [21]:
import pandas as pd
import numpy as np
import multiprocessing
from time import time

In [22]:
# Main dataset: the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    '../data/tweets&sentiment&absoluteprices&topics.csv',
    index_col=0,
)

# Test dataset: loaded the same way, then its "index" column is dropped.
final_test_df = pd.read_csv(
    '../data/test_data_tweets&sentiment&topic&absoluteprices.csv',
    index_col=0,
)
final_test_df = final_test_df.drop(columns=["index"])

In [23]:
# (rows, columns) of the main dataset.
df.shape

(26231, 29)

In [25]:
# (rows, columns) of the test dataset.
final_test_df.shape

(1249, 29)

## Word embedding
Word2Vec is the chosen text embedding method for several reasons:
* The mapping from a target word to its context words implicitly embeds sub-linear relationships into the word vector space, so that analogies such as “king : man :: queen : woman” can be inferred from the word vectors.
* It is less computationally expensive than alternatives such as GloVe, BERT and ELMo. BERT with transfer learning and a BiLSTM was initially chosen for this problem, but given the relatively large dataset and limited computational power, training was extremely slow.

### Further process text for Word2Vec

In [28]:
import nltk

In [29]:
def process_text_w2v(paragraph):
    """Split a text into sentences and tokenize each sentence into words.

    Parameters
    ----------
    paragraph : str
        Text to process.

    Returns
    -------
    list of list of str
        One inner list per sentence produced by ``nltk.sent_tokenize``; each
        inner list holds that sentence's word-character runs, in order.
    """
    # The tokenizer does not depend on the sentence, so build it only once.
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    return [tokenizer.tokenize(line) for line in nltk.sent_tokenize(paragraph)]

In [30]:
# Tokenize the cleaned text of both datasets into per-sentence word lists.
df['cleaned_text_w2v'] = df['cleaned_text'].apply(process_text_w2v)
final_test_df['cleaned_text_w2v'] = final_test_df['cleaned_text'].apply(process_text_w2v)

In [31]:
# Flatten the per-row sentence lists of each dataset into a flat list of
# tokenized sentences, then join both lists into the Word2Vec corpus.
df_sentences = [
    sentence
    for row_sentences in df['cleaned_text_w2v']
    for sentence in row_sentences
]
final_test_df_sentences = [
    sentence
    for row_sentences in final_test_df['cleaned_text_w2v']
    for sentence in row_sentences
]

sentences = df_sentences + final_test_df_sentences

### Set parameters for word2vec model
`min_count` is set to 1 since we would like to obtain the embeddings of all words in our vocabulary for subsequent modelling to work. Normally, `min_count` is set to a larger value.

In [33]:
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

In [34]:
# Configure the Word2Vec model; vocabulary building and training happen in
# the following cells.
# NOTE: `size` is the gensim 3.x keyword for the embedding width (gensim 4
# renamed it to `vector_size`).
w2v_model = Word2Vec(
    size=64,
    window=3,
    min_count=1,  # keep every word so each token has an embedding
    negative=20,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
    workers=multiprocessing.cpu_count(),
)

### Building the Vocabulary Table
Scan all the sentences, collect the unique words, and compute some basic counts for them.

In [40]:
# Build the vocabulary from the full corpus and report the elapsed time.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_minutes))

Time to build vocab: 0.07 mins


### Train Word2Vec

In [41]:
# Train for 30 epochs over the corpus and report the elapsed time.
start_time = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_minutes = round((time() - start_time) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

Time to train the model: 0.23 mins


In [42]:
# Persist the trained model to disk so it can be reloaded later.
w2v_model.save('../model/word2vec/word2vec.model')

In [43]:
# NOTE(review): per the gensim 3.x docs, init_sims(replace=True) L2-normalises
# the word vectors in place and discards the raw ones, after which the model
# can no longer be trained — confirm against the installed gensim version.
w2v_model.init_sims(replace=True)

### Generate Word2Vec vectors
Generate the word embedding vectors and write them to the `../model/word2vec/` directory.

In [144]:
# # load the model from the model file
# w2v_model = Word2Vec.load('../model/word2vec/word2vec.model')

In [47]:
def write_vectors_to_file(df, filename, model=None, output_dir='../model/word2vec/'):
    """Write one averaged Word2Vec vector per row of ``df`` to a CSV file.

    Each row's vector is the element-wise mean of the embeddings of all
    tokens in its ``cleaned_text_w2v`` entry. Rows without any token get an
    all-zero vector. The file always starts with a header line
    ``0,1,...,dim-1`` where ``dim`` is the model's vector size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cleaned_text_w2v`` column holding, per row, a list of
        sentences, each sentence being a list of tokens.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    model : optional
        Trained Word2Vec-like model exposing ``model.wv[token]`` and
        ``model.wv.vector_size``. Defaults to the notebook-level
        ``w2v_model``.
    output_dir : str, optional
        Directory the file is written to.

    Raises
    ------
    KeyError
        If a token has no embedding in the model.
    """
    import os

    if model is None:
        model = w2v_model
    # Look vectors up through `.wv`; indexing the model object directly is
    # deprecated in gensim.
    keyed_vectors = model.wv
    dim = keyed_vectors.vector_size
    zero_line = ",".join("0" for _ in range(dim))

    path = os.path.join(output_dir, filename)
    with open(path, 'w') as word2vec_file:
        # Write the header up front rather than when an index label happens
        # to equal 0, so it appears exactly once for any DataFrame index.
        word2vec_file.write(",".join(str(col) for col in range(dim)))
        word2vec_file.write("\n")

        for sentences in df['cleaned_text_w2v']:
            # Average over every token of the row, not just the first sentence.
            tokens = [token for sentence in sentences for token in sentence]
            if tokens:
                mean_vector = np.mean([keyed_vectors[token] for token in tokens], axis=0)
                line = ",".join(str(value) for value in mean_vector.tolist())
            else:
                # No tokens: fall back to a vector of zeros.
                line = zero_line

            word2vec_file.write(line)
            word2vec_file.write("\n")

In [48]:
# Write the averaged embeddings of both datasets to CSV files.
write_vectors_to_file(df, 'word2vec_train.csv')
write_vectors_to_file(final_test_df, 'word2vec_test.csv')

  if __name__ == '__main__':


### Concatenate the word embeddings with other features in the datasets

In [53]:
# Read back the embedding CSVs written by write_vectors_to_file.
w2v_embeddings_train = pd.read_csv('../model/word2vec/word2vec_train.csv')
w2v_embeddings_test = pd.read_csv('../model/word2vec/word2vec_test.csv')

In [54]:
# The embedding CSVs were written in row order without an index column, so
# they come back with a fresh 0..n-1 index. Re-label them with the target
# frame's index so the column-wise concat pairs rows by position rather than
# by index label (set_index raises ValueError if the row counts differ).
df = pd.concat([df, w2v_embeddings_train.set_index(df.index)], axis=1)
final_test_df = pd.concat(
    [final_test_df, w2v_embeddings_test.set_index(final_test_df.index)],
    axis=1,
)

In [55]:
# Preview the first rows, including the appended embedding columns.
df.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,...,54,55,56,57,58,59,60,61,62,63
0,Twitter for iPhone,Thank you @HerschelWalker! https://t.co/XjlYe8...,thank,2020-09-30 23:45:25,19616,65721,False,1311512518800470016,[],['@HerschelWalker'],...,0.056449,0.303183,0.076234,0.011941,0.095509,-0.203082,-0.040352,-0.033276,-0.246406,0.033626
1,Twitter for iPhone,RT @GOPChairwoman: Big news!A Maine court side...,big news maine court side rnc uphold ban ballo...,2020-09-30 23:25:31,29393,0,True,1311507509958471680,[],[],...,-0.03375,0.029573,0.057895,-0.1129,0.000638,-0.08221,-0.112283,0.068804,-0.176641,0.058202
2,Twitter for iPhone,Thank you Paul! https://t.co/aAk1sfww0d,thank paul,2020-09-30 23:00:33,15992,63294,False,1311501225423073281,[],[],...,0.07169,0.201051,0.034127,-0.064866,-0.021847,-0.134962,-0.028476,0.004518,-0.294078,0.040799
3,Twitter for iPhone,100000 DEFECTIVE BALLOTS IN NEW YORK. THEY WAN...,defective ballot new york want replace happen ...,2020-09-30 22:59:02,51445,190750,False,1311500843309387781,[],[],...,-0.031608,-0.04166,0.075459,-0.142272,-0.035155,-0.01433,-0.070449,0.004803,-0.224101,0.003555
4,Twitter for iPhone,In just 3 and a half years we have secured Ame...,half years secure americas border rebuild awes...,2020-09-30 22:51:05,18885,70838,False,1311498845860196355,['#MAGA'],[],...,-0.06339,0.071681,0.035991,-0.082486,0.025676,-0.096789,-0.039431,0.056586,-0.196989,0.061467


In [56]:
# Preview the first rows, including the appended embedding columns.
final_test_df.head()

Unnamed: 0,source,text,cleaned_text,created_at,retweet_count,favorite_count,is_retweet,id_str,hashtag,mention,...,54,55,56,57,58,59,60,61,62,63
0,Twitter for iPhone,I won the debate big based on compilation of p...,debate big base compilation poll etc thank,2020-10-01 11:14:28,44961,337926,False,1311685923097260034,[],[],...,0.077325,0.078596,0.117261,-0.024885,-0.12847,-0.038628,-0.079148,-0.02373,-0.231218,0.10082
1,Twitter for iPhone,Why would I allow the Debate Commission to cha...,would allow debate commission change rule seco...,2020-10-01 14:15:26,41516,247053,False,1311731462589292544,[],[],...,-0.022978,0.015267,0.10507,-0.088523,-0.06597,-0.024742,-0.055147,0.007281,-0.172681,0.104142
2,Twitter for iPhone,THANK YOU! #MAGA https://t.co/nGfbRmfmG7,thank maga,2020-10-01 15:09:17,18014,63744,False,1311745016780460033,['#MAGA'],[],...,-0.022568,0.298728,0.0663,0.070448,0.074991,-0.213843,-0.072632,-0.007993,-0.230358,0.095514
3,Twitter for iPhone,Exclusive Excerpt--Lewandowski &amp; Bossie: ‘...,exclusive excerpt lewandowski amp bossie trump...,2020-10-01 17:12:22,6407,23646,False,1311775992847818754,[],['@BreitbartNews'],...,0.083277,0.078566,0.054099,-0.041604,0.008532,-0.16547,-0.03643,0.098156,-0.152624,0.033352
4,Twitter for iPhone,RT @GreggJarrett: Corrupt Comey conveniently c...,corrupt comey conveniently claim no memory par...,2020-10-01 17:14:12,4755,0,True,1311776453717942272,[],[],...,0.060734,-0.04741,0.050447,-0.125226,-0.044856,-0.033562,-0.058906,0.115426,-0.164322,-0.070981
