In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

from multiprocessing import cpu_count

import warnings
warnings.filterwarnings("ignore")

In [2]:
with open('data/tweet_data.pkl', 'rb') as f:
    tweet_data = pickle.load(f)

In [3]:
tweet_data.head(3)

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"


## Count vectorizer

In [4]:
count_vectorizer = CountVectorizer(max_df=0.9, max_features=1000, stop_words='english')

In [5]:
tweet_stemmed = tweet_data['tweet_stemmed'].apply(lambda x: ' '.join(x))

In [6]:
bow_stemmed_tweet = count_vectorizer.fit_transform(tweet_stemmed)

In [7]:
feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(bow_stemmed_tweet.toarray(), columns = feature_names).head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
tweet_lemmatized = tweet_data['tweet_lemmatized'].apply(lambda x: ' '.join(x))

In [9]:
bow_lemmatized_tweet = count_vectorizer.fit_transform(tweet_lemmatized)

In [10]:
pd.DataFrame(bow_lemmatized_tweet.toarray(), columns = feature_names).head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF IDF vectorizer

In [11]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=1000, stop_words='english')

In [12]:
bow_tfidf_tweet_stemmed = tfidf_vectorizer.fit_transform(tweet_stemmed)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
pd.DataFrame(bow_tfidf_tweet_stemmed.toarray(), columns = tfidf_feature_names).head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
bow_tfidf_tweet_lemmatized = tfidf_vectorizer.fit_transform(tweet_lemmatized)
pd.DataFrame(bow_tfidf_tweet_lemmatized.toarray(), columns = tfidf_feature_names).head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Word2Vec

In [14]:
data_tweet_token = tweet_data['tweet_token'].tolist()

In [15]:
modelW2V = Word2Vec(data_tweet_token, size=200, window=5,\
                 min_count=2, sg=1, hs=0, negative=10, workers=cpu_count(), seed=34)
                

    modelW2V.train(data_tweet_token, total_examples=len(data_tweet_token), epochs=20)

(10511001, 13485498)

In [16]:
modelW2V.most_similar(positive=['dinner'])

[('bihdaydinner', 0.5515379905700684),
 ('bolognese', 0.5208319425582886),
 ('newfriends', 0.5125149488449097),
 ('sissy', 0.5076766610145569),
 ('shawarma', 0.5060603618621826),
 ('hamburger', 0.5059299468994141),
 ('spaghetti', 0.5030279159545898),
 ('whoopppp', 0.5024327039718628),
 ('waterloo', 0.5008033514022827),
 ('sizzle', 0.49714696407318115)]

In [17]:
modelW2V.most_similar(positive=['trump'])

[('donald', 0.5408417582511902),
 ('suppoer', 0.5393491983413696),
 ('conman', 0.5132254362106323),
 ('donaldtrump', 0.5104513168334961),
 ('fuhered', 0.504746675491333),
 ('bernieorbust', 0.5041683912277222),
 ('impeachment', 0.503799557685852),
 ('unfavorability', 0.5008257627487183),
 ('unfit', 0.49849027395248413),
 ('jeffsessions', 0.4936893582344055)]

In [18]:
modelW2V['food'] 

array([ 2.84502178e-01, -9.02844906e-01, -4.70967174e-01,  5.86584151e-01,
       -7.31627285e-01, -4.31441158e-01,  5.20067692e-01, -1.81481346e-01,
        3.77497852e-01, -2.81644940e-01, -9.83989090e-02,  3.12586010e-01,
       -2.14669526e-01,  2.54437536e-01,  1.29709795e-01,  5.29490635e-02,
       -6.46490812e-01,  3.53596002e-01,  4.36361656e-02,  2.84376770e-01,
        4.86700445e-01, -3.66829515e-01,  4.04221267e-01, -7.42760241e-01,
        4.64694172e-01,  3.72989237e-01,  3.55989844e-01, -1.76287554e-02,
        3.74701530e-01, -1.14910007e+00,  2.44913295e-01, -2.60831714e-01,
       -2.71866083e-01,  2.85908934e-02, -1.30958050e-01, -4.54350829e-01,
       -1.94879875e-01, -2.75699235e-02,  8.12170133e-02,  4.31748271e-01,
       -2.49697208e-01, -1.33608103e-01,  1.74197674e-01, -1.94327813e-02,
        6.66695297e-01,  4.98798519e-01, -3.39991674e-02, -4.04605269e-02,
        7.33403936e-02, -1.06123783e-01, -2.57514238e-01,  5.69797158e-01,
        1.88739464e-01,  

In [24]:
def text2vec(text_list, size=200):
    n_words = len(text_list)
    vec = np.zeros((1, size))
    if n_words == 0:
        return vec
    for word in text_list:
        try:
            vec += modelW2V[word].reshape((1, size))
        except KeyError:
            continue
    return vec / n_words


In [25]:
wordvec_df = tweet_data['tweet_token'].apply(text2vec)
wordvec_df = np.concatenate(wordvec_df)
wordvec_df.shape

(49159, 200)