# Word2Vec embeddings for tweets
Refer: https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-10-neural-network-with-a6441269aa3c

Train embeddings using tweets from January - July 2018. 

Total unique tweets = 1899851

Embeddings are trained using both Word2Vec CBOW and Skipgram models for unigrams and bigrams (phrases).

### Saving all unique tweets from Jan-July 2018 in separate file

In [1]:
import pandas as pd
import pickle
import numpy as np


In [2]:
def get_tweets(file):
    df = pd.read_table(file)
    tweets = df['text'].tolist()
    tweets = list(set(tweets))
    tweets_all.extend(tweets)
    print(len(tweets_all))

In [3]:
import os
directory = '/pylon5/be5fpap/sanyabt/RITHM/parser_out/jan_july_2018/'
tweets_all = []
for filename in os.listdir(directory):
    if filename.endswith(".tsv"):
        print(filename)
        get_tweets(os.path.join(directory, filename))
        continue
    else:
        continue

20180227000000_data.tsv
9167
20180320000000_data.tsv
9708
20180625000000_data.tsv
21732
20180130000000_data.tsv
32411
20180722000000_data.tsv
42632
20180510000000_data.tsv
53602
20180205000000_data.tsv
62284
20180409000000_data.tsv
72775
20180417000000_data.tsv
83511
20180112000000_data.tsv
92292
20180607000000_data.tsv
105784
20180619000000_data.tsv
117650
20180725000000_data.tsv
131709
20180622000000_data.tsv
145191
20180320005116_data.tsv
155809
20180129000000_data.tsv
165099
20180327000000_data.tsv
176204
20180220000000_data.tsv
185030
20180707000000_data.tsv
196361
20180719000000_data.tsv
209369
20180115000000_data.tsv
217165
20180402131247_data.tsv
223281
20180410000000_data.tsv
227713
20180509000000_data.tsv
238480
20180517000000_data.tsv
253838
20180319162233_data.tsv
253839
20180202000000_data.tsv
263137
20180617000000_data.tsv
273755
20180609000000_data.tsv
283634
20180102000000_data.tsv
284472
20180710000000_data.tsv
299773
20180215000000_data.tsv
307709
20180419000000_data.

In [27]:
with open('/pylon5/be5fpap/sanyabt/RITHM/parser_out/jan_july_2018/unique_tweets.pickle', 'wb') as file_o:
    pickle.dump(tweets_all, file_o)

## Clean tweets for embeddings

1. Remove URL's, mentions and unicodes
2. Remove HTML characters and special symbols
3. Lowercase text
4. Expand contractions
5. Remove numbers
6. Remove punctuation symbols
7. Tokenize text

In [4]:
from nltk import tokenize
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup

contractions_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
neg_pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')
tzer = tokenize.RegexpTokenizer(r'[A-Za-z_]+')   

pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[^ ]+'
pat3 = r'www\.[^ ]+'
pat4 = r'\\u[^ ]+'
combined_pat = r'|'.join((pat1, pat2, pat3, pat4))
re_pat = re.compile(combined_pat)

def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text().lower()      # lowercase the whole thing here
    bomgone = souped.replace('ï¿½', ' ')
    re_cleaned = re_pat.sub(' ', bomgone)
    neg_handled = neg_pattern.sub(lambda x: contractions_dict[x.group()], re_cleaned)
    tokenized = tzer.tokenize(neg_handled)
    return " ".join(tokenized)

def remove_underscores(text):
    words = text.split(' ')
    for word in words:
        if '_' in word:
            if '_emoj' not in word:
                words[words.index(word)] = ''
    return ' '.join(words)

In [5]:
tweets_all[:100]

[' # ecig # vapecommunity # vapeon # vapefam FIRST LOOK - Stentorian Basilisk: Please note this is NOT a review -  this is a first look at the product only. Review will be up later this month. The Late Late UK Vape Show is ... https://t.co/fkbpuhBpah # VapingWithVic # Vape # vapingAuction https://t.co/FM5F9zlumV ',
 ' # ecig # vapecommunity # vapeon # vapefam Times Vape / TenaciousTX Vapes Dreamer Review - A solid mech mod...: The Times Vape and TenaciousTXVapes Dreamer mech has been creating a lot of waves over the past couple of months. ... https://t.co/EMT9NskKYJ # VapingWithVic # Vape # vapingdays https://t.co/gtWkPg5hzM ',
 ' all of my co workers including the manager r outside vaping and im alone at front counter ',
 " When I 'm too old to smoke I 'm gonna start vaping. Until then -  my lungs better do their job. ",
 ' Kansas man sues vape shop for e-cigarette battery explosion. In other words -  man attempts to blame his stupidity on others -  instead of owning it. Lawyer vultur

In [6]:
testing = tweets_all[:100]
result = []
for t in testing:
    tweet = tweet_cleaner(t)
    result.append(remove_underscores(tweet))
result

['ecig vapecommunity vapeon vapefam first look stentorian basilisk please note this is not a review this is a first look at the product only review will be up later this month the late late uk vape show is vapingwithvic vape vapingauction',
 'ecig vapecommunity vapeon vapefam times vape tenacioustx vapes dreamer review a solid mech mod the times vape and tenacioustxvapes dreamer mech has been creating a lot of waves over the past couple of months vapingwithvic vape vapingdays',
 'all of my co workers including the manager r outside vaping and im alone at front counter',
 'when i m too old to smoke i m gonna start vaping until then my lungs better do their job',
 'kansas man sues vape shop for e cigarette battery explosion in other words man attempts to blame his stupidity on others instead of owning it lawyer vultures looking for a dollar',
 'vapour freaks hannibal on ice juice review _emoj_en_dash_ empire vape co',
 'ecig vapecommunity vapeon vapefam wotofo serpent smm review coiling 

In [10]:
nums = [0,400000,800000,1200000,1600000,1899851]
print("Cleaning and parsing the tweets...\n")
clean_tweets = []
for i in range(nums[0],nums[1]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[1] ))                                                                   
    clean_tweets.append(tweet_cleaner(tweets_all[i]))

Cleaning and parsing the tweets...

Tweets 10000 of 400000 has been processed
Tweets 20000 of 400000 has been processed
Tweets 30000 of 400000 has been processed
Tweets 40000 of 400000 has been processed
Tweets 50000 of 400000 has been processed
Tweets 60000 of 400000 has been processed
Tweets 70000 of 400000 has been processed
Tweets 80000 of 400000 has been processed
Tweets 90000 of 400000 has been processed
Tweets 100000 of 400000 has been processed
Tweets 110000 of 400000 has been processed
Tweets 120000 of 400000 has been processed
Tweets 130000 of 400000 has been processed
Tweets 140000 of 400000 has been processed
Tweets 150000 of 400000 has been processed
Tweets 160000 of 400000 has been processed
Tweets 170000 of 400000 has been processed
Tweets 180000 of 400000 has been processed
Tweets 190000 of 400000 has been processed
Tweets 200000 of 400000 has been processed
Tweets 210000 of 400000 has been processed
Tweets 220000 of 400000 has been processed
Tweets 230000 of 400000 has

In [11]:
for i in range(nums[1],nums[2]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[1] ))                                                                   
    clean_tweets.append(tweet_cleaner(tweets_all[i]))

Tweets 410000 of 400000 has been processed
Tweets 420000 of 400000 has been processed
Tweets 430000 of 400000 has been processed
Tweets 440000 of 400000 has been processed
Tweets 450000 of 400000 has been processed
Tweets 460000 of 400000 has been processed
Tweets 470000 of 400000 has been processed
Tweets 480000 of 400000 has been processed
Tweets 490000 of 400000 has been processed
Tweets 500000 of 400000 has been processed
Tweets 510000 of 400000 has been processed
Tweets 520000 of 400000 has been processed
Tweets 530000 of 400000 has been processed
Tweets 540000 of 400000 has been processed
Tweets 550000 of 400000 has been processed
Tweets 560000 of 400000 has been processed
Tweets 570000 of 400000 has been processed
Tweets 580000 of 400000 has been processed
Tweets 590000 of 400000 has been processed
Tweets 600000 of 400000 has been processed
Tweets 610000 of 400000 has been processed
Tweets 620000 of 400000 has been processed
Tweets 630000 of 400000 has been processed
Tweets 6400

In [12]:
for i in range(nums[2],nums[3]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[1] ))                                                                   
    clean_tweets.append(tweet_cleaner(tweets_all[i]))

Tweets 810000 of 400000 has been processed
Tweets 820000 of 400000 has been processed
Tweets 830000 of 400000 has been processed
Tweets 840000 of 400000 has been processed
Tweets 850000 of 400000 has been processed
Tweets 860000 of 400000 has been processed
Tweets 870000 of 400000 has been processed
Tweets 880000 of 400000 has been processed
Tweets 890000 of 400000 has been processed
Tweets 900000 of 400000 has been processed
Tweets 910000 of 400000 has been processed
Tweets 920000 of 400000 has been processed
Tweets 930000 of 400000 has been processed
Tweets 940000 of 400000 has been processed
Tweets 950000 of 400000 has been processed
Tweets 960000 of 400000 has been processed
Tweets 970000 of 400000 has been processed
Tweets 980000 of 400000 has been processed
Tweets 990000 of 400000 has been processed
Tweets 1000000 of 400000 has been processed
Tweets 1010000 of 400000 has been processed
Tweets 1020000 of 400000 has been processed
Tweets 1030000 of 400000 has been processed
Tweets 

In [13]:
for i in range(nums[3],nums[4]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[1] ))                                                                   
    clean_tweets.append(tweet_cleaner(tweets_all[i]))

Tweets 1210000 of 400000 has been processed
Tweets 1220000 of 400000 has been processed
Tweets 1230000 of 400000 has been processed
Tweets 1240000 of 400000 has been processed
Tweets 1250000 of 400000 has been processed
Tweets 1260000 of 400000 has been processed
Tweets 1270000 of 400000 has been processed
Tweets 1280000 of 400000 has been processed
Tweets 1290000 of 400000 has been processed
Tweets 1300000 of 400000 has been processed
Tweets 1310000 of 400000 has been processed
Tweets 1320000 of 400000 has been processed
Tweets 1330000 of 400000 has been processed
Tweets 1340000 of 400000 has been processed
Tweets 1350000 of 400000 has been processed
Tweets 1360000 of 400000 has been processed
Tweets 1370000 of 400000 has been processed
Tweets 1380000 of 400000 has been processed
Tweets 1390000 of 400000 has been processed
Tweets 1400000 of 400000 has been processed
Tweets 1410000 of 400000 has been processed
Tweets 1420000 of 400000 has been processed
Tweets 1430000 of 400000 has bee

In [14]:
for i in range(nums[4],nums[5]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[1] ))                                                                   
    clean_tweets.append(tweet_cleaner(tweets_all[i]))

Tweets 1610000 of 400000 has been processed
Tweets 1620000 of 400000 has been processed
Tweets 1630000 of 400000 has been processed
Tweets 1640000 of 400000 has been processed
Tweets 1650000 of 400000 has been processed
Tweets 1660000 of 400000 has been processed
Tweets 1670000 of 400000 has been processed
Tweets 1680000 of 400000 has been processed
Tweets 1690000 of 400000 has been processed
Tweets 1700000 of 400000 has been processed
Tweets 1710000 of 400000 has been processed
Tweets 1720000 of 400000 has been processed
Tweets 1730000 of 400000 has been processed
Tweets 1740000 of 400000 has been processed
Tweets 1750000 of 400000 has been processed
Tweets 1760000 of 400000 has been processed
Tweets 1770000 of 400000 has been processed
Tweets 1780000 of 400000 has been processed
Tweets 1790000 of 400000 has been processed
Tweets 1800000 of 400000 has been processed
Tweets 1810000 of 400000 has been processed
Tweets 1820000 of 400000 has been processed
Tweets 1830000 of 400000 has bee

In [15]:
len(clean_tweets)

1899851

In [16]:
with open('/pylon5/be5fpap/sanyabt/RITHM/parser_out/jan_july_2018/clean_tweets.pickle', 'wb') as outfile:
    pickle.dump(clean_tweets, outfile)

## Word2Vec embeddings with tweets


In [17]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing
from sklearn import utils

In [2]:
with open('/pylon5/be5fpap/sanyabt/RITHM/parser_out/jan_july_2018/clean_tweets.pickle', 'rb') as file_i:
    tweets = pickle.load(file_i)

In [19]:
all_x = pd.Series(tweets)
def labelize_tweets_ug(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
    return result

In [20]:
all_x_w2v = labelize_tweets_ug(all_x, 'all')
cores = multiprocessing.cpu_count()

### Word2Vec CBOW model train - unigram only

Parameters:
1. Embedding size = 100
2. Window size = 2
3. Min_count of words = 2
4. Training epochs = 30
5. Alpha = 0.065

In [21]:
model_ug_cbow = Word2Vec(sg=0, size=150, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|██████████| 1899851/1899851 [00:00<00:00, 2556822.68it/s]


In [22]:
%%time
for epoch in range(30):
    model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_cbow.alpha -= 0.002
    model_ug_cbow.min_alpha = model_ug_cbow.alpha

100%|██████████| 1899851/1899851 [00:00<00:00, 2580541.66it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2611236.14it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2549674.13it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2288972.72it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2698154.00it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2665086.49it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2464568.91it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2652797.48it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2666535.71it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2730134.46it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2649098.76it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2496412.17it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2656492.37it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2535541.46it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2669439.76it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2593798.

CPU times: user 37min 10s, sys: 24.5 s, total: 37min 34s
Wall time: 10min 28s


In [25]:
model_ug_cbow.save('embedding_150/word2vec_cbow.model')

In [26]:
model_ug_cbow.most_similar('juul')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('juuls', 0.6018027067184448),
 ('jewel', 0.5215368866920471),
 ('chapstick', 0.5040896534919739),
 ('cucumber', 0.49874627590179443),
 ('phone', 0.4916815757751465),
 ('girlfriend', 0.4847780764102936),
 ('dumbass', 0.4741102159023285),
 ('laptop', 0.4717305302619934),
 ('blinker', 0.46918725967407227),
 ('blunt', 0.46792906522750854)]

In [27]:
model_ug_cbow.most_similar('hit')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('rip', 0.8320730328559875),
 ('smack', 0.7084908485412598),
 ('borrow', 0.6496420502662659),
 ('hits', 0.6466327905654907),
 ('hitting', 0.6198492050170898),
 ('rips', 0.6154642105102539),
 ('sip', 0.6020081639289856),
 ('shove', 0.5909634232521057),
 ('pull', 0.5908858776092529),
 ('ripped', 0.5871677398681641)]

In [28]:
model_ug_cbow.most_similar('twitter')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('ig', 0.6586094498634338),
 ('instagram', 0.6573286056518555),
 ('insta', 0.6345866918563843),
 ('facebook', 0.6197099089622498),
 ('snapchat', 0.6184684038162231),
 ('twt', 0.5588493943214417),
 ('finsta', 0.5585911273956299),
 ('reddit', 0.5442312359809875),
 ('youtube', 0.5370053648948669),
 ('tl', 0.5360217690467834)]

In [29]:
model_ug_cbow.most_similar('teen')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('youth', 0.5745497941970825),
 ('teenage', 0.5454083681106567),
 ('teens', 0.46677231788635254),
 ('student', 0.46478596329689026),
 ('epidemic', 0.4534418284893036),
 ('trend', 0.4261566698551178),
 ('child', 0.4252815842628479),
 ('teenager', 0.414495050907135),
 ('underage', 0.41388779878616333),
 ('teenagers', 0.4111449718475342)]

### Word2vec skipgram model train - unigram

In [30]:
model_ug_sg = Word2Vec(sg=1, size=150, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])

100%|██████████| 1899851/1899851 [00:00<00:00, 2390972.74it/s]


In [31]:
%%time
for epoch in range(30):
    model_ug_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_sg.alpha -= 0.002
    model_ug_sg.min_alpha = model_ug_sg.alpha

100%|██████████| 1899851/1899851 [00:00<00:00, 2560350.30it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2734093.11it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2644887.02it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2632814.48it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2600494.62it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2796271.84it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2864254.14it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2649239.68it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2706488.82it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2774624.62it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2765352.02it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2709288.07it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2614955.95it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2616467.12it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2707182.11it/s]
100%|██████████| 1899851/1899851 [00:00<00:00, 2742401.

CPU times: user 1h 21min 10s, sys: 16.6 s, total: 1h 21min 26s
Wall time: 10min 59s


In [32]:
model_ug_sg.save('embedding_150/word2vec_sg.model')

In [33]:
model_ug_sg.most_similar('juul')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('juuls', 0.6332941651344299),
 ('sallisaw', 0.5805323123931885),
 ('juulip', 0.5532362461090088),
 ('juulinator', 0.5477495789527893),
 ('vickii', 0.5462276339530945),
 ('thxxxxx', 0.5436975955963135),
 ('oneof', 0.5336681008338928),
 ('kaid', 0.531792163848877),
 ('_ansuryan', 0.5306997299194336),
 ('lipsss', 0.5270533561706543)]

In [34]:
model_ug_sg.most_similar('hit')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('rip', 0.7213974595069885),
 ('hits', 0.6625513434410095),
 ('smack', 0.6397694945335388),
 ('borrow', 0.5885465741157532),
 ('hitting', 0.584056556224823),
 ('bathroon', 0.583244264125824),
 ('pleaz', 0.5803359746932983),
 ('bunkmate', 0.5573672652244568),
 ('pleeaaseee', 0.5569705367088318),
 ('lwmkfkfkfkfkkfkfkflhlflgk', 0.5565747022628784)]

In [35]:
model_ug_sg.most_similar('twitter')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('flavourbossuk', 0.6281245946884155),
 ('instagram', 0.6255084276199341),
 ('insta', 0.6050515174865723),
 ('pizzabottle', 0.6037068367004395),
 ('psvxfacbgj', 0.5976096391677856),
 ('_clout', 0.5944842100143433),
 ('olyvaporworks', 0.5920782089233398),
 ('promoteyoutube', 0.5849149227142334),
 ('jrlnkebpi', 0.5764425992965698),
 ('facebook', 0.5750778317451477)]

### Load vectors and concatenate to give 200 dimensional vector for each word

In [36]:
from gensim.models import KeyedVectors
model_ug_cbow = KeyedVectors.load('embedding_150/word2vec_cbow.model')
model_ug_sg = KeyedVectors.load('embedding_150/word2vec_sg.model')

In [37]:
type(model_ug_cbow)

gensim.models.word2vec.Word2Vec

In [38]:
embeddings_index = {}
for w in model_ug_cbow.wv.vocab.keys():
    embeddings_index[w] = np.append(model_ug_cbow.wv[w],model_ug_sg.wv[w])

In [39]:
#save embeddings
with open('embedding_150/word2vec_unigram_combined.pickle', 'wb') as file_embed:
    pickle.dump(embeddings_index, file_embed)

### Word2vec phrase model - include bigrams in both CBOW and skipgram

(Other parameters same as unigram model)

In [40]:
from gensim.models.phrases import Phrases, Phraser

In [41]:
sent = [row.split() for row in tweets]

In [42]:
phrases = Phrases(sent, min_count=5, progress_per=10000)

In [43]:
bigram = Phraser(phrases)
sentences = bigram[sent]

In [44]:
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=2,
                     negative=5,
                     window=2,
                     size=150,
                     alpha=0.065, 
                     min_alpha=0.065,
                     workers=cores)

In [45]:
from time import time
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 1.59 mins


In [46]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 52.84 mins


In [47]:
w2v_model.init_sims(replace=True)

In [48]:
w2v_model.most_similar('juul')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('sourin', 0.43230533599853516),
 ('breakfast_lunch', 0.409820020198822),
 ('inhaler', 0.3965938687324524),
 ('quesadilla', 0.3898640275001526),
 ('dropkit', 0.3821324110031128),
 ('flash_drive', 0.37918415665626526),
 ('inlost', 0.37634336948394775),
 ('syste', 0.3712458908557892),
 ('portable_charger', 0.36969733238220215),
 ('jagerbombs', 0.36863893270492554)]

In [49]:
w2v_model.save('embedding_150/word2vec_bg_cbow.model')

In [50]:
w2v_bg_sg = Word2Vec(sg=1,
                    min_count=2,
                     negative=5,
                     window=2,
                     size=150,
                     alpha=0.065, 
                     min_alpha=0.065,
                     workers=cores)

In [51]:
from time import time
t = time()
w2v_bg_sg.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 1.49 mins


In [52]:
t = time()

w2v_bg_sg.train(sentences, total_examples=w2v_bg_sg.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 52.66 mins


In [53]:
w2v_bg_sg.save('embedding_150/word2vec_bg_sg.model')

In [54]:
w2v_bg_sg.most_similar('hit')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('justbro', 0.5578741431236267),
 ('bro_please', 0.5501075983047485),
 ('rip', 0.5482180714607239),
 ('phat_rip', 0.5365564823150635),
 ('promose', 0.5363010168075562),
 ('one_timmeeeee', 0.5326591730117798),
 ('fix_bro', 0.5315108299255371),
 ('remwmber', 0.5282833576202393),
 ('borrow', 0.52227383852005),
 ('_elalami', 0.5201666355133057)]

Combine bigram models

In [55]:
from gensim.models import KeyedVectors
model_bg_cbow = KeyedVectors.load('embedding_150/word2vec_bg_cbow.model')
model_bg_sg = KeyedVectors.load('embedding_150/word2vec_bg_sg.model')

In [56]:
type(model_ug_cbow)

gensim.models.word2vec.Word2Vec

In [57]:
embeddings_index_bg = {}
for w in model_bg_cbow.wv.vocab.keys():
    embeddings_index_bg[w] = np.append(model_bg_cbow.wv[w],model_bg_sg.wv[w])

In [58]:
#save embeddings
with open('embedding_150/word2vec_bigram_combined.pickle', 'wb') as file_embed2:
    pickle.dump(embeddings_index_bg, file_embed2)

In [59]:
w2v_bg_sg.most_similar('vape_shop')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('vape_bar', 0.7845531105995178),
 ('vape_life_parovarcustomvape', 0.7380775213241577),
 ('vapestationrussia', 0.7064079642295837),
 ('vapelife_vapeforlive', 0.7003614902496338),
 ('custom_vapeparts', 0.6982174515724182),
 ('buycapeoil_onlinecbdoil', 0.6335662603378296),
 ('cravevapes', 0.6272263526916504),
 ('haugb_xf', 0.6172439455986023),
 ('vapestationrussia_vapecommunity', 0.6148149371147156),
 ('very_very_vape', 0.5990586280822754)]

In [60]:
model_ug_cbow.most_similar('pod')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('pods', 0.6260148286819458),
 ('forist', 0.5223135352134705),
 ('charger', 0.49703001976013184),
 ('closedpod', 0.46530017256736755),
 ('julep', 0.4650610387325287),
 ('phix', 0.46202394366264343),
 ('yaclut', 0.4532976746559143),
 ('cartridge', 0.4521150290966034),
 ('demandan', 0.4488767087459564),
 ('podge', 0.4307107925415039)]

In [61]:
model_ug_cbow.most_similar('pods')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('pod', 0.626014769077301),
 ('forist', 0.5551401972770691),
 ('chargers', 0.5112075209617615),
 ('cartridges', 0.4846333861351013),
 ('julips', 0.48204678297042847),
 ('gos', 0.46834009885787964),
 ('carts', 0.4624638557434082),
 ('shells', 0.4553850591182709),
 ('oreos', 0.4523954689502716),
 ('vonearl', 0.4454173445701599)]

In [62]:
model_ug_cbow.most_similar('_emoj_dash_')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('thechallengexxx', 0.7865304946899414),
 ('pufffffff', 0.7846342325210571),
 ('partnerwanted', 0.7763417959213257),
 ('_emoj_wind_blowing_face_', 0.7755203247070312),
 ('freebrxvn', 0.7680383920669556),
 ('mmxii', 0.7558403611183167),
 ('evox', 0.7524769902229309),
 ('vapetesla', 0.7509241104125977),
 ('bluntugly', 0.7381649017333984),
 ('_tg', 0.7282710075378418)]

In [63]:
model_ug_sg.most_similar('_emoj_dash_')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('dailytweet', 0.8055098056793213),
 ('seshhh', 0.7885046601295471),
 ('partnerwanted', 0.7725472450256348),
 ('connectsesh', 0.771475613117218),
 ('oyats', 0.7617258429527283),
 ('evox', 0.7600874304771423),
 ('tothetop', 0.7587347626686096),
 ('thechallengexxx', 0.7572304010391235),
 ('gameofthedaywinner', 0.7520869374275208),
 ('mobliquidlabs', 0.7393802404403687)]

In [64]:
model_ug_cbow.most_similar('_emoj_wind_blowing_face_')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('_emoj_dash_', 0.7755203247070312),
 ('connectsesh', 0.6801813840866089),
 ('ridgeforlife', 0.6672093272209167),
 ('mmxii', 0.6649066805839539),
 ('freebrxvn', 0.6639013290405273),
 ('thechallengexxx', 0.6637577414512634),
 ('pufffffff', 0.645920991897583),
 ('planetavapeo', 0.6434254050254822),
 ('calicrushergrinders', 0.6321915984153748),
 ('_tg', 0.6208866834640503)]

In [65]:
model_ug_cbow.most_similar('_emoj_en_dash_')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('rexambassador', 0.4956991374492645),
 ('_emoj_latin_ve_', 0.4739377498626709),
 ('navalisty', 0.45488712191581726),
 ('via', 0.45333296060562134),
 ('prost', 0.4365857243537903),
 ('recenze', 0.42468711733818054),
 ('review', 0.42369547486305237),
 ('vapejournal', 0.41930273175239563),
 ('vapertube_italia', 0.40626704692840576),
 ('perfrom', 0.40563878417015076)]

Count vocabulary of tweets used for embeddings

In [4]:
len(tweets)

1899851

In [5]:
from nltk import tokenize

In [6]:
vocab = []
for tweet in tweets:
    tokens = tokenize.word_tokenize(tweet)
    vocab.extend(tokens)
len(vocab)

31071566

In [7]:
vocab = list(set(vocab))
len(vocab)

343573