In [1]:
import json
import oauth2 as oauth
import os
import emoji as emo
import urllib.parse as parse
import csv
from sklearn.feature_extraction.text import CountVectorizer
from string import punctuation
import numpy
import pymongo
import re
import unicodedata
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [2]:
# This function will use an online ready-made dictionary to find the sentiment of an emoji

emoji_scores = {}
with open('Emoji_Sentiment_Data_v1.0.csv', newline = '') as emoji_csv:
    emoji_reader = csv.reader(emoji_csv)
    next(emoji_reader, None)
    for row in emoji_reader:
        emoji = row[0]
        neg_score = float(int(row[4])/int(row[2]))
        neut_score = float(int(row[5])/int(row[2]))
        pos_score = float(int(row[6])/int(row[2]))
        sent_score = (-1 * neg_score + pos_score) - .305
        emoji_scores[emoji] = sent_score

In [3]:
consumer_key = os.environ.get('CONSUMER_KEY')
consumer_secret = os.environ.get('CONSUMER_SECRET')

access_token = os.environ.get('ACCESS_TOKEN')
access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')

consumer = oauth.Consumer(key=consumer_key, secret=consumer_secret)
access_token = oauth.Token(key=access_token, secret=access_token_secret)
client = oauth.Client(consumer, access_token)

In [4]:
# Text retrieval
mongo_uri = 'mongodb://heroku_xgnhblcr:' + os.environ.get('MONGODB_PASSWORD') + '@ds149511.mlab.com:49511/heroku_xgnhblcr'
client = pymongo.MongoClient(mongo_uri)
db = client.get_default_database()
myresults = list(db.Justin_Bieber.find())

In [5]:
# Text preprocessing
tweets = []
emoji_tweets = set()
for entry in myresults:
    tweets.append(entry['text'])
for tweet in tweets:
    tweet_emojis = []
    for char in tweet:
        if char in emo.UNICODE_EMOJI:
            tweet_emojis.append(char)
    if tweet_emojis:
        for emoji in tweet_emojis:
            tweet_emoji_scores = []
            try:
                tweet_emoji_scores.append(emoji_scores[emoji]) # emoji_scores from emoji dictionary
            except KeyError as e: # Some emojis are not in the emoji dictionary - these are ignored
                continue
                
            # Assigns the tweet an emoji score corresponding to the emoji with a score furthest from zero
            if tweet_emoji_scores:
                if max(tweet_emoji_scores) == max(numpy.absolute(tweet_emoji_scores)):
                    tweet_score = max(tweet_emoji_scores)
                else:
                    tweet_score = min(tweet_emoji_scores)
                emoji_tweets_element = tweet, tweet_score
                emoji_tweets.add(emoji_tweets_element)

In [6]:
# Remove retweets (starting with "RT")
# emoji_tweets must be a list of arbitrary order to be fed into the classifier
# Note that emoji_tweets was originally a set, in order to remove duplicates
emoji_tweets = [tweet for tweet in emoji_tweets if not tweet[0].startswith('RT')]

tweet_text = [tweet[0] for tweet in emoji_tweets]
    

In [7]:
print(len(emoji_tweets)) #This is the number of tweets containing emojis that are not duplicates and not retweets

14964


In [8]:
def preprocessor(tweet):
    
    # Removes accents
    normalized = unicodedata.normalize('NFKD', tweet)
    if normalized != tweet:
        tweet = ''.join([c for c in normalized if not unicodedata.combining(c)])
        
    tweet = tweet.lower()
    
    # Remove URLs
    tweet = re.sub(r'https?:\/\/.*?(\s|$)', '', tweet)
    
    tweet = re.sub(r'@\w*?(\s|$)', '', tweet)
    
    tweet = re.sub(r'#\w*?(\s|$)', '', tweet)
    
    # Note - the preprocessor already seems to be removing emojis - no need for extra code to do this
    
    # Regularize apostrophes from different parts of unicode
    tweet = re.sub('|'.join(['‘', '’', 'ʻ', 'ʼ']), '\'', tweet)
    
    return tweet
    

In [19]:
def tokenizer(tweet, verbose = False):
    
    tokens = []
    token_pattern = '(?=(^|(?!(\w|\')).)((\w|\')+)\W)'
    token_matches = re.finditer(token_pattern, tweet)
    for token_match in token_matches:
        if verbose:
            print(token_match)
        tokens.append(token_match.group(3))
    return tokens

print(tokenizer(preprocessor("Hard pass...I'll give as much respect as y'all did Barack...none 😒 https://t.co/UJcr4mrZRA")))

['hard', 'pass', "i'll", 'give', 'as', 'much', 'respect', 'as', "y'all", 'did', 'barack', 'none']


In [14]:
vectorizer = CountVectorizer(preprocessor=preprocessor, tokenizer=tokenizer, binary = True)
fit_tr = vectorizer.fit_transform(tweet_text)
# print(vectorizer.get_feature_names()) # Returns a list of all the words in all tweets (with emojis)

In [15]:
# This block is not currently used later in this notebook because binary is True in CountVectorizer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(fit_tr)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [16]:
# Tweets must have arbitrary order to be used with Multinomial NB

three_class = True # Set to False to reduce number of classes to two
threshold = 0.15 # If three classes, will set the range of the "neutral" class to -threshold to threshold
# Set to try to override the emoji score of the emoji "tears of joy" and make it negative
# This can be done if it is believed that users are using the "tears of joy" emoji for negative sentiment
tears_joy_neg = False

tweet_emos = []
for tweet in emoji_tweets:
    if tears_joy_neg and numpy.absolute(tweet[1] + 0.084) < 0.01:
        tweet_emos.append('neg')
        continue
    if three_class:
        if tweet[1] < -1*threshold:
            tweet_emos.append('neg')
        elif tweet[1] > threshold:
            tweet_emos.append('pos')
        else:
            tweet_emos.append('neu')
    else:
        tweet_emos.append(tweet[1] > 0)
        
    
# Cross-validation
from sklearn.model_selection import cross_val_score
          
# This takes a long time at cv=10, changed tfidf to fit_tf while binary in CountVectorizer is True    
scores = cross_val_score(MultinomialNB(alpha=.01), fit_tr, tweet_emos, cv=5)
print('MNB accuracy : %s' % scores)

# Accounts for the fact most tweets are positive anyway, so accuracy is not the best metric of model performance
scores2 = cross_val_score(MultinomialNB(alpha=.01), fit_tr, tweet_emos, cv=5, scoring = 'f1_macro')
print('MNB F-score: %s' % scores2)

# For three classes, these scores should be above .333 to indicate performance than randomly guessing sentiment
# For two classes, the scores should be above .5

MNB accuracy : [0.56613226 0.55763448 0.5606415  0.54278075 0.55180481]
MNB F-score: [0.52524635 0.5161027  0.51761293 0.5019726  0.51154371]


In [17]:
clf = MultinomialNB(alpha = .01)
clf.fit(fit_tr, tweet_emos)

# Fake tweets to test model

positive_tweet = "I love Justin Bieber. He is amazing!"
positive_tweet_vector = vectorizer.transform([positive_tweet])

neutral_tweet = "There are many people here."
neutral_tweet_vector = vectorizer.transform([neutral_tweet])

negative_tweet = "I hate Justin Bieber.  He sucks!"
negative_tweet_vector = vectorizer.transform([negative_tweet])

# Predictions are [[(Probability tweet is negative) (Probability of neutral) (Probability of positive)]]
numpy.set_printoptions(suppress=True)
print('Positive prediction: %s' % clf.predict_proba(positive_tweet_vector))
print('Neutral predication: %s' % clf.predict_proba(neutral_tweet_vector))
print('Negative prediction: %s' % clf.predict_proba(negative_tweet_vector))

Positive prediction: [[0.13977664 0.12565331 0.73457005]]
Neutral predication: [[0.53826463 0.37636321 0.08537216]]
Negative prediction: [[0.71088329 0.21909802 0.0700187 ]]
