In [2]:
import pandas as pd
import numpy as np
import math
import re
from nltk import tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Mining Sentiment on Twitter


### Overview

I experiement with four classifiers, and here are the overall results:

    unigram(tf-idf) - GradientBoosting : 70.7%
    bigram(tf-idf) - GradientBoosting : 62.7%
    word2vec - GradientBoosting : 61.3%
    LSTM (2 layer) on 12 epochs: 93.6%


### Part 1: Streaming tweets 

I streamed 'happy' and 'sad' tweets using the streaming API with the aid of the search API for historical data, as there are less tweets with ":(" than there are with ":)". I also set the language to be English('en') only and the region to be in the United States. Overall I have 29521 positive tweets, and 35422 negative tweets.

The code for streaming is in the appendex. 


In [5]:
happy = pd.read_csv("./positive-emo_2018-03-08.json", sep="\n", error_bad_lines=False, warn_bad_lines=False, engine='python')
happy.columns = ["tweets", "json"]
happy = happy.drop(['json'], axis=1)
sad = pd.read_csv("negative-emo_2018-03-04_to_2018-03-09.json", error_bad_lines=False, warn_bad_lines=False, engine='python')
sad.columns = ["tweets"]

In [218]:
print("There are", len(happy), "positive tweets")
print("There are", len(sad), "negative tweets")

There are 29521 positive tweets
There are 35422 negative tweets



### Part 2: Tweets cleanup

Now, we add in labels to mark the tweets as ":(" or ":)" for happy and sad faces. Then we merge the two datasets into one. 

Also we prepare the stopwords from nltk package, along with the symbols we see a lot in tweets - we add then into a set called stopws, along with the ":)" and ":(" emoticon from the tweets. Also nltk has a TweetTokenizer that helps with the clean up process.

Then we use a for loop to go through each tweets, using regex and stopwords we remove websites, usernames, emoticons, reweets etc; convert to all lowercases, and finally use the tokenize the tweets into lists of words/tokens.

In [6]:
happy['emo'] = ":)"
sad['emo'] = ":("
tweet = happy.merge(sad, how='outer')

In [7]:
stopws = stopwords.words("english")
stopws = set(stopws)
stopws.add("rt")
stopws.add(":)")
stopws.add(":(")
stopws.add(":")
stopws.add(",")
stopws.add(".")
stopws.add("!")

In [8]:
#Remove posts that contain both happy and sad emo
#Remove stop words, websites ("http..."), usernames ("@shilad"), and anything else that seems weird to you.

tknzr = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
for i in range(len(tweet.tweets)):
    tweet.tweets[i] = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", tweet.tweets[i]).split())
    tweet.tweets[i] = tknzr.tokenize(tweet.tweets[i])    
    for word in tweet.tweets[i]: # iterate over word_list
        if word in stopws: 
            tweet.tweets[i].remove(word)

tweet.head()

Unnamed: 0,tweets,emo
0,"[well, means, with, guys, and, pizza, on, firs...",:)
1,"[circle, giveaway, give, appreciation, an, awe...",:)
2,"[trump, so, strong, wonder, cnn, find, make, s...",:)
3,"[aksener, sanada, pembe, zarfta, koyarlar, par...",:)
4,"[hi, guys, m, writing, paper, my, public, rela...",:)


Now that we have the tokenize text data, we can go ahead and calculate the frequeny matrices for postive and negative tweets separately. 


### Part 3: Frequency matrix

In [224]:
#Create the frequency matrix.

pos_counts = defaultdict(int)
neg_counts = defaultdict(int)
for i in range(len(tweet)):    
    if tweet.emo[i] == ":)":
        for token in tweet.tweets[i]:
            pos_counts[token] += 1
    else:  #emo == ":("
        for token in tweet.tweets[i]:
            neg_counts[token] += 1
    
    pos_total = sum(pos_counts.values())
    neg_total = sum(neg_counts.values())
    
    for token in pos_counts:
        pos_counts[token] /= pos_total
    for token in neg_counts:
        neg_counts[token] /= neg_total

In [57]:
#print("The frequency matrix for postive tweets is ", pos_counts)
#print("The frequency matrix for negative tweets is ", neg_counts)

    - Output too long, didn't print. 


I realize that later on the CountVectorizer function only takes in strings, so I add in one more column that contains the string of tweets separated by space. So now our data frame looks like the following.

In [10]:
tweet['text'] = 0
for i in range(len(tweet)):
    tweet.text[i] = ' '.join(tweet.tweets[i])

tweet.head()

Unnamed: 0,tweets,emo,text
0,"[well, means, with, guys, and, pizza, on, firs...",:),well means with guys and pizza on first name b...
1,"[circle, giveaway, give, appreciation, an, awe...",:),circle giveaway give appreciation an awesome d...
2,"[trump, so, strong, wonder, cnn, find, make, s...",:),trump so strong wonder cnn find make sound lik...
3,"[aksener, sanada, pembe, zarfta, koyarlar, par...",:),aksener sanada pembe zarfta koyarlar pardon oy...
4,"[hi, guys, m, writing, paper, my, public, rela...",:),hi guys m writing paper my public relations cl...


### Part 4: Build Sentiment Classifier

Now we can train our classifier. First we split the data into 80% training data and the rest 20% testing data.

Then we use CountVectorizer and a TfidfTransformer functions to extract the Tf-Idf matrix of the training data, fit into the GradientBoosting Classifier, and use the confusion matrix to show the accuracy rate, which is about 70.7%.

In [232]:
#Split your tweets into training data (80% of your data) and testing data (20% of your data)
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(tweet.text)

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

(X_tfidf_train, X_tfidf_test, Y_train, Y_test) = train_test_split(X_tfidf, 
                                                                  tweet.emo, 
                                                                  test_size=0.2, random_state=1)

In [233]:
#Train your sentiment analysis classifier as we did in class, using the Tf-Idf matrix
gbc = GradientBoostingClassifier().fit(X_tfidf_train, Y_train)
Y_pred_uni = gbc.predict(X_tfidf_test)

In [81]:
#Show the confusion matrix for unigram
confusion_matrix(Y_test, Y_pred_uni)

array([[6365,  623],
       [3185, 2816]])

In [231]:
print("accuracy is ", (6365+2816)/(6365+2816+623+3185))

accuracy is  0.7068288551851567


Now we would like to experiment with bag of 2 words instead, so we use a TfidfVectorizer, which is similar to a functor of CountVectorizer follow by a TfidfTransformer. We set the ngram_range to be exact 2, and perform the same procedure as above. 

Yet, the accuracy rate is only 62.7%, which is actually 8% worse than just using unigram.

In [120]:
#Experiment with using bi-grams. Do they help?
bigram_vect_tfidf = TfidfVectorizer(ngram_range=(2, 2))
X_bigram = bigram_vect_tfidf.fit_transform(tweet.text)

(X_bigram_train, X_bigram_test, Y_bigram_train, Y_bigram_test) = train_test_split(X_bigram, 
                                                                  tweet.emo, 
                                                                  test_size=0.2, random_state=1)

In [121]:
gbc_bigram = GradientBoostingClassifier().fit(X_bigram_train, Y_bigram_train)
Y_pred_bi = gbc_bigram.predict(X_bigram_test)

In [122]:
#confusion matrix for bigram
confusion_matrix(Y_bigram_test, Y_pred_bi)

array([[6943,   45],
       [4806, 1195]])

In [225]:
print("accuracy is ", (6943+1195)/(6943+1195+45+4806))

accuracy is  0.626530140888444


### Part 5: Sentiment Classification in Action

Since we were querying the tweets using emoticons, we are training on a biased data. We are wondering how the trained classifier would perform on tweets without emoticons.

I decided to use the hashtag #MeToo, and queried the 14016 tweets in a 100 sec.

In [240]:
#Collect a stream of at least 1000 tweets and predict the sentiment of each

hashtag = pd.read_csv("hashtag.json", error_bad_lines=False, warn_bad_lines=False, engine='python')
hashtag.columns = ["tweets"]

In [238]:
print("There are", len(hashtag), "tweets with #MeToo tag")

There are 14016 tweets with #MeToo tag


We follow the same procedure as above to do data cleaning - remove websites, retweets, usernames, etc. And add in the text column into the data frame, and our hashtag data looks like the following after clean up.

In [241]:
# hashtag tweets clean up

for i in range(len(hashtag.tweets)):
    hashtag.tweets[i] = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", hashtag.tweets[i]).split())
    hashtag.tweets[i] = tknzr.tokenize(hashtag.tweets[i])    
    for word in hashtag.tweets[i]:
        if word in stopws: 
            hashtag.tweets[i].remove(word)

hashtag['text'] = 0
for i in range(len(hashtag)):
    hashtag.text[i] = ' '.join(hashtag.tweets[i])

hashtag.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,tweets,text
0,"[wanted, get, mad, this, she, right, i, was, a...",wanted get mad this she right i was always wok...
1,"[we, u2019re, hiring, front, end, dev, join, e...",we u2019re hiring front end dev join engineeri...
2,"[u201csexual, violence, knows, race, class, ge...",u201csexual violence knows race class gender t...
3,"[u6027, u66b4, u529b, u88ab, u5bb3, u3092, u54...",u6027 u66b4 u529b u88ab u5bb3 u3092 u544a u767...
4,"[hey, please, help, spread, word, s, latest, f...",hey please help spread word s latest film shin...


In [242]:
#vocab = count_vect.transform(tweet.text)
count_vect2 = CountVectorizer(vocabulary=count_vect.vocabulary_)
hashtag_counts = count_vect2.fit_transform(hashtag.text)
hashtag_tfidf = tfidf_transformer.fit_transform(hashtag_counts)
hashtag_pred = gbc.predict(hashtag_counts)

In [248]:
hashtag_pred

array([':)', ':)', ':(', ..., ':(', ':)', ':('], dtype=object)

After looking through the results, the performance of our classifier on the new dataset is worse than the predictions on our original datasets. Given the length of the new tweets I shall not print out the actual tweets, but have identified the following patterns:

1. The classifier tend to identify netural tweets or tweets with the hashtag but unrelated content as positive.
2. I might have not specified a correct encoder so there are many words that has a format similar to 'u00b4', and the classifier seems to tag it as positive for the most of the time.
3. Tweets show vulnerability but not negative seems to be classify as negative
4. Tweets that people accknowledge negative feelings/things but then show postive affirmations also tend to be labeled as negative. 

Overall there is no clear trend of how the classifier performs, and I have listed the top words that the classier has relied on predicting, other than some obvious positive words such as 'happy', 'great', 'good', 'awesome', the dictionary itself is not good enough with the prediction.

In [249]:
#hashtag.text
top_indexes = gbc.feature_importances_.argsort()[::-1]
names = count_vect.get_feature_names()
print([names[i] for i in top_indexes[:100]])

['u00b4', 'u2026', 'miss', 'yoongi', 'thanks', 'he', 'gracias', 'thank', 'cute', 'hemos', 'que', 'eu', 'you', 'sad', 'hermanas', 'happy', 'great', 'feliz', 'seokjin', 'day', 'how', 'stans', 'sorry', 'video', 'u0e19', 'quiero', 'queria', 'bts', 'ub2e4', 'quero', 'remember', 'smash', 'triste', 'weloveyouseokjin', 'uac', 'hi', 'ud558', 'u2019s', 'wish', 'u0e2d', 'like', 'internationalwomensday', 'love', 'u00e9', 'gt', 'u00e3o', 'good', 'retweet', 'so', 'uff', 'jin', 'jackson', 'uae', 'your', 'the', 'u0e22', 'era', 'check', 'new', 'smol', 'u00f1o', 'un', 'u2019t', 'alg', 'podem', 'para', 'oi', 'my', 'u0644', 'yo', 'u306a', 'abra', 'look', 'awesome', 'htt', 'know', 'switch', 'lt', 'rt', 'guys', 'uc774', 'women', 'today', 'necesito', 'welcome', 'ko', 'his', 'kca', 'meu', 'want', 'tweet', 'u06cc', 'us', 'pero', 'stop', 'btsarmy', 'u00fan', 'u0e29', 'de', 'birthday']


## Part 6: NPL and Deep Learning approaches: 

### Part 1: Word2Vec model 

After the experiment with the tf-idf matrix, we wonder if word2vec would perform better. We train a Word2Vec model using the gensim package by getting the word vectors of each tweets, then normalize on the column by column, and then build the vector using the word freqency from all the tweets in our data.


Again, we split the tweets into 80% training data and 20% testing data. But the result only yields a 61.3% accuracy. One observation though, the word2vec model seems to be able to predict the negative tweets better than the previous tf-idf martics.

In [86]:
# Word2Vec
import gensim
import logging
import os.path

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = gensim.models.Word2Vec(tweet.tweets, workers=4, size=50, sg=1, min_count=5)

In [250]:
tweet_vectors = []

num_known_words = 0
num_unknown_words = 0
for t in tweet.text:    
    v = np.zeros(50)
    n = 0
    for word in gensim.utils.tokenize(t, lowercase=True):
        if word in model:
            v += model[word]
            n += 1
            num_known_words += 1
        else:
            num_unknown_words += 1
    if n > 0: v /= n # average the word vectors
    tweet_vectors.append(v)
print(num_known_words, num_unknown_words)

tweet_w2v = np.array(tweet_vectors)

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


1231081 130107


In [208]:
(X_w2v_train, X_w2v_test, Y_w2v_train, Y_w2v_test) = train_test_split(tweet_w2v, 
                                                              tweet.emo, 
                                                              test_size=0.2, random_state=1)

In [209]:
gbc_w2v = GradientBoostingClassifier().fit(X_w2v_train, Y_w2v_train)
Y_pred_w2v = gbc_w2v.predict(X_w2v_test)

In [212]:
#show confusion matrix for 
confusion_matrix(Y_w2v_test, Y_pred_w2v)

array([[5548, 1440],
       [2027, 3974]])

In [213]:
print("accuracy is ", (5548+3974)/(5548+3974+2027+3974))

accuracy is  0.6134123558590479


### Part 2: LSTM model

Disappointed in the previous accuracy only stayed at low 70%, I wonder whether a neural network would perform better. 

Few years ago I heard the state of the art for sentiment classification is using Tree-LSTM model, given the limited computational power in my personal computer, I decided to go with a single LSTM model instead.

The implementation of the code, and especially the tunning parameters are inspired by the discussion under the Kaggle challenge: https://www.kaggle.com/crowdflower/first-gop-debate-twitter-sentiment/


I selected about 5000 features, and used softmax function for activation function instead of the traditional reLU on text for we are using the embeddings of categorical variables. 

I trained the model on 12 epochs, and we can see from the training progress output there, the 2nd epoch's accuracy already above 80%. While the 12th epoch has the accuracy of 94.5%. 

In the end we evaluated the trained model against the testing dataset, and got 93.6% accuracy - not bad!

In [15]:
tokenizer = Tokenizer(nb_words=5000, split=' ')
tokenizer.fit_on_texts(tweet.text.values)
X = tokenizer.texts_to_sequences(tweet.text.values)
X = pad_sequences(X)



In [16]:
X.shape

(64943, 241)

In [37]:
model = Sequential()
model.add(Embedding(5000, 241 ,input_length = X.shape[1]))
model.add(LSTM(200, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])

  This is separate from the ipykernel package so we can avoid doing imports until


In [38]:
Y = pd.get_dummies(tweet.emo).values
X_lstm_train, X_lstm_test, Y_lstm_train, Y_lstm_test = train_test_split(X, Y, test_size = 0.2, random_state = 1)
# print(X_train.shape,Y_train.shape)
# print(X_test.shape,Y_test.shape)

In [41]:
model.fit(X_lstm_train, Y_lstm_train, epochs = 12, batch_size=32, verbose = 2)

Epoch 1/12
 - 2915s - loss: 0.4233 - acc: 0.7892
Epoch 2/12
 - 2913s - loss: 0.3553 - acc: 0.8279
Epoch 3/12
 - 2910s - loss: 0.3147 - acc: 0.8505
Epoch 4/12
 - 2910s - loss: 0.2784 - acc: 0.8672
Epoch 5/12
 - 2922s - loss: 0.2428 - acc: 0.8840
Epoch 6/12
 - 2933s - loss: 0.2133 - acc: 0.8990
Epoch 7/12
 - 2916s - loss: 0.1869 - acc: 0.9117
Epoch 8/12
 - 2917s - loss: 0.1666 - acc: 0.9218
Epoch 9/12
 - 2913s - loss: 0.1511 - acc: 0.9283
Epoch 10/12
 - 2921s - loss: 0.1384 - acc: 0.9354
Epoch 11/12
 - 2914s - loss: 0.1262 - acc: 0.9408
Epoch 12/12
 - 2923s - loss: 0.1199 - acc: 0.9447


<keras.callbacks.History at 0x1a3c08a438>

In [54]:
acc, score = model.evaluate(X_lstm_test, Y_lstm_test, batch_size = 32, verbose = 2)

In [53]:
print("accuracy is %.3f" % (acc))

accuracy is 0.936


### Appendex

Here are the queries I ran to get the tweets. For security reason, I have removed the API token and keys.

Credit to many stackoverflow posts over setting the histrical time limit and Tweep overtime error.

In [None]:
import tweepy
from tweepy import OAuthHandler
import json
import datetime as dt
import time
import os
import sys

def load_api():
  
    consumer_key = ''
    consumer_secret = ''
    access_token = ''
    access_secret = ''
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    return tweepy.API(auth)

def tweet_search():
    search_phrases = ['#MeToo']
    time_limit = 1                           
    max_tweets = 2000                          
    min_days_old, max_days_old = 1, 7         
    USA = '39.8,-95.583068847656,2500km'
    api = load_api()
    for search_phrase in search_phrases:
        print('Search phrase =', search_phrase)
        name = search_phrase.split()[0]
        json_file_root = name + '/'  + name
        os.makedirs(os.path.dirname(json_file_root), exist_ok=True)

        if max_days_old - min_days_old == 1:
            d = dt.datetime.now() - dt.timedelta(days=min_days_old)
            day = '{0}-{1:0>2}-{2:0>2}'.format(d.year, d.month, d.day)
        else:
            d1 = dt.datetime.now() - dt.timedelta(days=max_days_old-1)
            d2 = dt.datetime.now() - dt.timedelta(days=min_days_old)
            day = '{0}-{1:0>2}-{2:0>2}_to_{3}-{4:0>2}-{5:0>2}'.format(
                  d1.year, d1.month, d1.day, d2.year, d2.month, d2.day)
        json_file = json_file_root + '_' + day + '.json'


        start = dt.datetime.now()
        end = start + dt.timedelta(hours=3)
        while dt.datetime.now() < end:
            tweets, max_id = tweet_search(api, search_phrase, max_tweets,
                                          max_id=max_id, since_id=since_id,geocode=USA)
                searched_tweets = []
            while len(searched_tweets) < max_tweets:
                remaining_tweets = max_tweets - len(searched_tweets)
                    try:
                        new_tweets = api.search(q=query, count=remaining_tweets,
                                    since_id=str(since_id), max_id=str(max_id-1))

                        print('found',len(new_tweets),'tweets')
                        if not new_tweets:
                            break
                    searched_tweets.extend(new_tweets)
                    max_id = new_tweets[-1].id
                    except tweepy.TweepError:
                        print('exception raised, waiting 15 minutes')
                        print('(until:', dt.datetime.now()+dt.timedelta(minutes=15), ')')
                        time.sleep(15*60)
                        break # stop the loop
                    if tweets:
                        with open(json_file, 'a') as f:
                            for tweet in tweets:
                                json.dump(tweet.text, f)
                                    f.write('\n')
        print("done!")
