[Sentiment analysis on Twitter using word2vec and keras](https://ahmedbesbes.com/sentiment-analysis-on-twitter-using-word2vec-and-keras.html)

In [1]:
from copy import deepcopy
from string import punctuation
from random import shuffle

import pandas as pd
import numpy as np
import gensim
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [2]:
pd.options.mode.chained_assignment = None
LabeledSentence = gensim.models.doc2vec.LabeledSentence
tqdm.pandas(desc="progress-bar")
tokenizer = TweetTokenizer()

In [3]:
data = pd.read_csv('dados/tweets.csv', 
                names=['Sentiment', 'a', 'b', 'c', 
                'd','SentimentText'], 
                header=None,
                nrows=1000000)

In [4]:
data.tail()

Unnamed: 0,Sentiment,a,b,c,d,SentimentText
999995,4,1879942807,Thu May 21 23:36:19 PDT 2009,NO_QUERY,divabat,"@healingsinger thank you, i needed that"
999996,4,1879942922,Thu May 21 23:36:20 PDT 2009,NO_QUERY,nick1975,@vactress http://bit.ly/cADea Maybe this is m...
999997,4,1879942975,Thu May 21 23:36:21 PDT 2009,NO_QUERY,znmeb,"@Brat13 Hell, Windows 7 will be out of my pric..."
999998,4,1879943113,Thu May 21 23:36:22 PDT 2009,NO_QUERY,virmani,@jigardoshi neah.. i wish! just reminiscing r...
999999,4,1879943219,Thu May 21 23:36:24 PDT 2009,NO_QUERY,redcomet81,@MsTeagan ...and by the way: I rewatched Sun G...


In [5]:
data.drop(['a', 'b', 'c', 'd'], axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,Sentiment,SentimentText
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [7]:
data['Sentiment'] = data['Sentiment'].map(int)

In [8]:
data = data[data['SentimentText'].isnull() == False]

In [9]:
data.size

2000000

In [10]:
def tokenize(tweet):
    try:
        tweet = unicode(tweet.decode('utf-8').lower())
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        return tokens
    except:
        return 'NC'

In [11]:
def postprocess(data, n=1000000):
    data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

In [12]:
data = postprocess(data)

progress-bar: 100%|██████████| 1000000/1000000 [01:46<00:00, 9359.03it/s]


In [13]:
data.head()

Unnamed: 0,Sentiment,SentimentText,tokens
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[-, awww, ,, that's, a, bummer, ., you, should..."
1,0,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face..."
2,0,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, ., man..."
3,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,0,"@nationwideclass no, it's not behaving at all....","[no, ,, it's, not, behaving, at, all, ., i'm, ..."


# Criando o modelo word2vec

In [14]:
x_train, x_test, y_train, y_test = train_test_split(np.array(data.tokens),
                                                    np.array(data.Sentiment), test_size=0.2)

In [15]:
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [16]:
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

799677it [00:06, 119267.29it/s]
199920it [00:01, 105874.80it/s]


In [17]:
x_train[0]

TaggedDocument(words=[u'probly', u'gonna', u'arrive', u'about', u'1am', u'/', u'2am', u'thursday', u'morning', u'as', u'am', u'working', u'wednesday'], tags=['TRAIN_0'])

In [18]:
tweet_w2v = Word2Vec(size=200, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])

100%|██████████| 799677/799677 [00:00<00:00, 1250610.91it/s]


In [19]:
tweet_w2v.train([x.words for x in tqdm(x_train)], total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)

100%|██████████| 799677/799677 [00:00<00:00, 1342656.87it/s]


42971735

In [20]:
tweet_w2v['good']

  if __name__ == '__main__':


array([ -2.38566160e+00,   4.06078398e-01,  -5.04667908e-02,
         9.52386409e-02,   1.51315844e+00,  -1.00473917e+00,
         5.99583328e-01,  -9.32454228e-01,  -1.46664023e+00,
        -1.29542005e+00,   1.79037178e+00,   3.02402568e+00,
        -7.52400458e-01,  -6.26823783e-01,  -6.27870262e-01,
        -1.66928872e-01,   8.67299616e-01,   1.43131435e+00,
         1.54591215e+00,   1.38691127e+00,   3.11750472e-01,
        -1.84880066e+00,  -1.01360857e-01,   5.55457532e-01,
        -4.43711281e-01,   2.42189899e-01,   1.30460238e+00,
        -5.48027754e-01,  -1.92330432e+00,  -2.06450534e+00,
        -2.21499741e-01,   8.07935178e-01,  -2.17683449e-01,
        -8.87013972e-01,   1.43433422e-01,  -3.72754723e-01,
        -1.39276350e+00,  -7.72497356e-01,  -5.39225519e-01,
        -5.93225539e-01,  -1.19251192e-01,  -1.16920114e+00,
        -4.26614672e-01,   2.10708454e-01,   1.10911560e+00,
         3.17453244e-03,   1.92699254e+00,   9.76916432e-01,
        -1.18815207e+00,

In [21]:
tweet_w2v.most_similar('good')

  if __name__ == '__main__':


[(u'goood', 0.7151153087615967),
 (u'great', 0.6884024739265442),
 (u'fantastic', 0.6366561651229858),
 (u'rough', 0.6182270646095276),
 (u'pleasant', 0.6160627603530884),
 (u'successful', 0.6137896776199341),
 (u'tough', 0.6098060011863708),
 (u'brilliant', 0.6058828830718994),
 (u'nice', 0.6028398275375366),
 (u'terrible', 0.6026292443275452)]

In [22]:
tweet_w2v.most_similar('bar')

  if __name__ == '__main__':


[(u'cafe', 0.7435959577560425),
 (u'table', 0.6988866329193115),
 (u'grill', 0.6891735792160034),
 (u'pub', 0.6614775657653809),
 (u'restaurant', 0.6462408900260925),
 (u'club', 0.6290731430053711),
 (u'strip', 0.6282809972763062),
 (u'barn', 0.623151957988739),
 (u'tin', 0.6225892305374146),
 (u'lounge', 0.6155617237091064)]

How about visualizing these word vectors? We first have to reduce their dimension to 2 using t-SNE. Then, using an interactive visualization tool such as Bokeh, we can map them directly on 2D plane and interact with them.
Here's the script, and the bokeh chart below.

In [23]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [24]:
# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

In [25]:
# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]];

  from ipykernel import kernelapp as app


In [26]:
# dimensionality reduction. converting the vectors to 2d vectors

from sklearn.manifold import TSNE

'''
tsne_model = TSNE(n_components=2, verbose=1, 
                  random_state=0, perplexity=30, n_iter=300)
'''
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.078s...
[t-SNE] Computed neighbors for 5000 samples in 8.100s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.267522
[t-SNE] KL divergence after 250 iterations with early exaggeration: 89.819160
[t-SNE] Error after 1000 iterations: 2.865301


In [27]:
# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]

In [28]:
tsne_df.head()

Unnamed: 0,x,y,words
0,-41.760887,13.542832,woods
1,-14.564546,21.860941,spiders
2,-38.69706,13.825593,hanging
3,9.601433,17.217356,woody
4,-19.935688,-24.787437,canes


In [29]:
from bokeh.models import ColumnDataSource

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', 
                   y='y', 
                   source=ColumnDataSource(tsne_df))
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

# Criando um classificador de sentimentos

In [30]:
print 'building tf-idf matrix ...'
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print 'vocab size :', len(tfidf)

building tf-idf matrix ...
vocab size : 23033


In [31]:
# Now let's define a function that, given a list of tweet tokens, creates an averaged tweet vector.
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [32]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, 200) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, 200) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

100%|██████████| 799677/799677 [03:41<00:00, 3602.62it/s]
100%|██████████| 199920/199920 [00:55<00:00, 3618.57it/s]


We should now be ready to feed these vectors into a neural network classifier. In fact, using Keras is very easy to define layers and activation functions.
Here is a basic 2-layer architecture.

In [34]:
from keras.models import Sequential
from keras.layers import Dense, Activation


model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=9, batch_size=32, verbose=2)

Using TensorFlow backend.


Epoch 1/9
 - 31s - loss: -3.9312e+00 - acc: 0.4648
Epoch 2/9
 - 34s - loss: -4.1829e+00 - acc: 0.4816
Epoch 3/9
 - 28s - loss: -4.2458e+00 - acc: 0.4812
Epoch 4/9
 - 35s - loss: -4.2757e+00 - acc: 0.4809
Epoch 5/9
 - 28s - loss: -4.2959e+00 - acc: 0.4806
Epoch 6/9
 - 31s - loss: -4.3068e+00 - acc: 0.4807
Epoch 7/9
 - 37s - loss: -4.3167e+00 - acc: 0.4805
Epoch 8/9
 - 28s - loss: -4.3183e+00 - acc: 0.4810
Epoch 9/9
 - 30s - loss: -4.3265e+00 - acc: 0.4808


<keras.callbacks.History at 0x7fb515fd8050>

In [35]:
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print score[1]

0.481642657065
