In [3]:
#Find a way to save the tweet to disk

import json
import oauth2 as oauth
import os
import emoji as emo
import urllib.parse as parse
import csv
from sklearn.feature_extraction.text import CountVectorizer
from stemming.porter2 import stem
from string import punctuation
import numpy
import pymongo
import re
import unicodedata
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [4]:
#This function will use an online ready-made dictionary to find the sentiment of an emoji

#load csv info into dictionary
emoji_scores = {}
with open('Emoji_Sentiment_Data_v1.0.csv', newline = '') as emoji_csv:
    emoji_reader = csv.reader(emoji_csv)
    next(emoji_reader, None)
    for row in emoji_reader:
        emoji = row[0]
        neg_score = float(int(row[4])/int(row[2]))
        neut_score = float(int(row[5])/int(row[2]))
        pos_score = float(int(row[6])/int(row[2]))
        sent_score = (-1 * neg_score + pos_score) - .305
        emoji_scores[emoji] = sent_score
        
#print(emoji_scores)
        

In [5]:
consumer_key = os.environ.get('CONSUMER_KEY')
consumer_secret = os.environ.get('CONSUMER_SECRET')

access_token = os.environ.get('ACCESS_TOKEN')
access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')

consumer = oauth.Consumer(key=consumer_key, secret=consumer_secret)
access_token = oauth.Token(key=access_token, secret=access_token_secret)
client = oauth.Client(consumer, access_token)

In [6]:
# text preprocessing (partially from before partially from August)
mongo_uri = 'mongodb://heroku_xgnhblcr:' + os.environ.get('MONGODB_PASSWORD') + '@ds149511.mlab.com:49511/heroku_xgnhblcr'
client = pymongo.MongoClient(mongo_uri)
db = client.get_default_database()
myresults = list(db.Justin_Bieber.find())
tweets = []
emoji_tweets = set()
for entry in myresults:
    tweets.append(entry['text'])
#Doing the next part based on the earlier REST-API-using cell
for tweet in tweets:
    tweet_emojis = []
    for char in tweet:
        if char in emo.UNICODE_EMOJI:
            tweet_emojis.append(char)
    if tweet_emojis:
        for emoji in tweet_emojis:
            tweet_emoji_scores = []
            try:
                tweet_emoji_scores.append(emoji_scores[emoji]) # emoji_scores from emoji dictionary
            except KeyError as e:
                continue
            if tweet_emoji_scores:
                if max(tweet_emoji_scores) == max(numpy.absolute(tweet_emoji_scores)):
                    tweet_score = max(tweet_emoji_scores)
                else:
                    tweet_score = min(tweet_emoji_scores)
                emoji_tweets_element = tweet, tweet_score
                emoji_tweets.add(emoji_tweets_element)
    
print(emoji_tweets)



In [7]:
# Remove retweets (starting with "RT")
# emoji_tweets ust be a list of arbitrary order to be fed into the classifier
# note that emoji_tweets was originally a set
emoji_tweets = [tweet for tweet in emoji_tweets if not tweet[0].startswith('RT')]

tweet_text = [tweet[0] for tweet in emoji_tweets]
    

In [8]:
print(len(emoji_tweets)) #This is the tweets that are not duplicates and not retweets

11486


In [119]:
def preprocessor(tweet):
    
    #remove accents
    normalized = unicodedata.normalize('NFKD', tweet)
    if normalized != tweet:
        tweet = ''.join([c for c in normalized if not unicodedata.combining(c)])
        
    tweet = tweet.lower()
    
    #remove URLs
    tweet = re.sub(r'https?:\/\/.*?(\s|$)', '', tweet)
    tweet = re.sub(r'@\w*?(\s|$)', '', tweet)
    tweet = re.sub(r'#\w*?(\s|$)', '', tweet)
    
    #Note - the preprocessor already seems to be removing emojis - no need for extra code
    
    # Regularize apostrophes from different parts of unicode
    tweet = re.sub('|'.join(['‘', '’', 'ʻ', 'ʼ']), '\'', tweet)
    
    return tweet
    

In [120]:
def tokenizer(tweet, verbose = False): #Currently troubleshooting function
    
    tokens = []
    token_pattern = '(?=(^|(?!(\w|\')).)((\w|\')+)\W)'
    token_matches = re.finditer(token_pattern, tweet)
    for token_match in token_matches:
        if verbose:
            print(token_match)
        tokens.append(token_match.group(3))
    return tokens

print(tokenizer("Hard pass...I'll give as much respect as y'all did Barack...none 😒 https://t.co/UJcr4mrZRA"))

['Hard', 'pass', "I'll", 'give', 'as', 'much', 'respect', 'as', "y'all", 'did', 'Barack', 'none', 'https', 't', 'co']


In [121]:
vectorizer = CountVectorizer(preprocessor=preprocessor, tokenizer=tokenizer, binary = True) #Make binary?  Does this prevent it from picking up one word twice in a tweet?
fit_tr = vectorizer.fit_transform(tweet_text)
print(vectorizer.get_feature_names())
#   for word in emoji_tweets.split(): #This and following line should be removed and use regex instead.  Follow Downloads/pak-paroubek.pdf for preprocessing ideas, as well as downloads that were downloaded at a similar time
#      if word[0:4] == 'http':
            
#CountVectorizer().build_tokenizer()(tweet_text) #stop_words param to 'english' once you have only Eng tweets, this was indented within for loop at the beginning of August



In [122]:
transformer = TfidfTransformer() # Not used because binary is true in CountVectorizer
tfidf = transformer.fit_transform(fit_tr)

In [178]:
#Tweets must have arbitrary order to be used with Multinomial NB
three_class = True
threshold = 0.15
tears_joy_neg = False

tweet_emos = []
for tweet in emoji_tweets:
    if tears_joy_neg and numpy.absolute(tweet[1] + 0.084) < 0.01:
        tweet_emos.append('neg')
        continue
    if three_class:
        if tweet[1] < -1*threshold:
            tweet_emos.append('neg')
        elif tweet[1] > threshold:
            tweet_emos.append('pos')
        else:
            tweet_emos.append('neu')
    else:
        tweet_emos.append(tweet[1] > 0)
        
    
    #Cross-validation
from sklearn.model_selection import cross_val_score
          
scores = cross_val_score(MultinomialNB(alpha=.01), fit_tr, tweet_emos, cv=5) #this takes forever at cv=10, changed tfidf to fit_tf  while binary in CountVectorizer is true
print('MNB accuracy : %s' % scores)

#accounts for the fact most tweets are positive anyway
scores2 = cross_val_score(MultinomialNB(alpha=.01), fit_tr, tweet_emos, cv=5, scoring = 'f1_macro') #this takes forever at cv=10, changed tfidf to fit_tf  while binary in CountVectorizer is true
print('MNB F-score: %s' % scores2)
    
    

MNB accuracy : [ 0.59330144  0.58162821  0.5755333   0.58206356  0.58275261]
MNB F-score: [ 0.53945077  0.52406726  0.52818048  0.52690259  0.52913068]


In [181]:
clf = MultinomialNB(alpha = .01)
clf.fit(fit_tr, tweet_emos)

positive_tweet = "I love Justin Bieber. He is amazing!"
positive_tweet_vector = vectorizer.transform([positive_tweet])

neutral_tweet = "There are many people here."
neutral_tweet_vector = vectorizer.transform([neutral_tweet])

negative_tweet = "I hate Justin Bieber.  He sucks!"
negative_tweet_vector = vectorizer.transform([negative_tweet])

numpy.set_printoptions(suppress=True)
print('Positive prediction: %s' % clf.predict_proba(positive_tweet_vector))
print('Neutral predication: %s' % clf.predict_proba(neutral_tweet_vector))
print('Negative prediction: %s' % clf.predict_proba(negative_tweet_vector))

Positive prediction: [[ 0.04091313  0.10169943  0.85738744]]
Neutral predication: [[ 0.61408808  0.22107124  0.16484068]]
Negative prediction: [[ 0.90124642  0.00129526  0.09745832]]


In [191]:
clf.predict(vectorizer.transform([emoji_tweets[0:50]][0]))

TypeError: normalize() argument 2 must be str, not tuple

In [24]:
    
    
    print('_' * 80)
    print("Training: ")
    print(MultinomialNB(alpha=.01))
    t0 = time()
    MultinomialNB(alpha=.01).fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

________________________________________________________________________________
Training: 
MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)


NameError: name 'time' is not defined