In [1]:
!pip install nltk



In [45]:
import nltk
import re
import string
import numpy as np
from os import getcwd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import twitter_samples
import joblib

In [14]:
nltk.download('stopwords')
nltk.download('twitter_samples')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [15]:
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

In [16]:
def process_tweet(tweet):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # removing stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # removing old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # removing hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # removing hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # removing stopwords
                word not in string.punctuation):  # removing punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            tweets_clean.append(stem_word)

    return tweets_clean

In [17]:
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

In [18]:
# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into two pieces, one for training and one for testing (validation set)
test_pos   = all_positive_tweets[4000:]
train_pos  = all_positive_tweets[:4000]
test_neg   = all_negative_tweets[4000:]
train_neg  = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x  = test_pos + test_neg

# avoid assumptions about the length of all_positive_tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y  = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [20]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned tweet
print(process_tweet(custom_tweet))

['hello', 'great', 'day', ':)', 'good', 'morn']


In [21]:
def count_tweets(result, tweets, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    for y,tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1
    return result

In [22]:
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1,0,0,0,0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [23]:
freqs = count_tweets({}, train_x, train_y)

In [24]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the tweets (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    '''
    loglikelihood = {}
    logprior = 0

    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    N_pos=N_neg=V_pos=V_neg=0

    for pair in freqs.keys():
        if pair[1] > 0:
            V_pos += 1
            N_pos += freqs[pair]

        else:
            V_neg += 1
            N_neg += freqs[pair]

    D = train_y.shape[0]
    D_pos = train_y[train_y == 1].shape[0]
    D_neg = train_y[train_y == 0].shape[0]
    logprior  = np.log(D_pos / D) - np.log(D_neg / D)

    for word in vocab:
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [25]:
logprior,loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9161


In [26]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the tweet (if found in the dictionary) + logprior (a number)

    '''
    word_l = process_tweet(tweet)
    p = 0
    p += logprior
    for word in word_l:

        if word in loglikelihood:

            p += loglikelihood[word]
    return p

In [27]:
my_tweet = 'She smiled.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is 1.557492820301094


In [28]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    acuracy = 0
    y_hats = []
    for tweet in test_x:

        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:

            y_hat_i = 1
        else:

            y_hat_i = 0

        y_hats.append(y_hat_i)

    error = np.mean(np.abs(np.asarray(y_hats) - test_y))

    accuracy = 1 - error


    return accuracy

In [29]:
print("Naive Bayes accuracy = %0.4f" % (test_naive_bayes(test_x,test_y,logprior,loglikelihood)))

Naive Bayes accuracy = 0.9955


In [30]:
some_tweets = ['I am happy', 'I am bad', 'this movie should have been great.',
               'great', 'great great', 'great great great', 'great great great great']

for tweet in some_tweets:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.14
I am bad -> -1.31
this movie should have been great. -> 2.12
great -> 2.13
great great -> 4.26
great great great -> 6.39
great great great great -> 8.52


In [31]:

my_tweet = 'you are bad :('
naive_bayes_predict(my_tweet, logprior, loglikelihood)

-8.837962482271395

In [34]:
def lookup(freqs, word, label):
    """
    Input:
    freqs: A dictionary with keys as (word, label) and values as the count of word occurrences for the given label.
    word: The word you want to look up.
    label: The label (1 for positive, 0 for negative) you want to search for.

    Output:
    The count of the word in the specified label. If the word-label pair doesn't exist in freqs, return 0.
    """
    if (word, label) in freqs:
        return freqs[(word, label)]
    else:
        return 0

def get_ratio(freqs, word):
    """
    Input:
    freqs: A dictionary containing the words and their frequencies.
    word: The word for which you want to calculate the positive-to-negative ratio.

    Output:
    A dictionary with keys 'positive', 'negative', and 'ratio'.
    Example: {'positive': 10, 'negative': 20, 'ratio': 0.5}
    """
    pos_neg_ratio = {'positive': 0, 'negative': 0, 'ratio': 0.0}
    pos_neg_ratio['positive'] = lookup(freqs, word, 1)
    pos_neg_ratio['negative'] = lookup(freqs, word, 0)
    pos_neg_ratio['ratio'] = (pos_neg_ratio['positive'] + 1) / (pos_neg_ratio['negative'] + 1)

    return pos_neg_ratio


In [35]:
get_ratio(freqs,'happi')

{'positive': 162, 'negative': 18, 'ratio': 8.578947368421053}

In [36]:
def get_words_by_threshold(freqs,label,threshold):
    '''
    Input:
        freqs: dictionary of words
        pos_neg_ratio: dictionary of positive counts, negative counts, and ratio of positive / negative counts.
        label: 1 for positive, 0 for negative
        threshold: ratio that will be used as the cutoff for including a word in the returned dictionary
    Output:
        word_set: dictionary containing the word and information on its positive count, negative count, and ratio of positive to negative counts.
        example of a key value pair:
        {'happi':
            {'positive': 10, 'negative': 20, 'ratio': 0.5}
        }
    '''
    word_list = []
    for key in freqs.keys():
        word, _ = key
        pos_neg_ratio = get_ratio(freqs, word)
        word_ratio_dict = {word: pos_neg_ratio}
        if label == 1 and pos_neg_ratio['ratio'] >= threshold:
            word_list.append(word_ratio_dict)
        elif label == 0 and pos_neg_ratio['ratio'] <= threshold:

            word_list.append(word_ratio_dict)

    return word_list

In [37]:
get_words_by_threshold(freqs,label=0,threshold=0.05)

[{':(': {'positive': 1, 'negative': 3675, 'ratio': 0.000544069640914037}},
 {':(': {'positive': 1, 'negative': 3675, 'ratio': 0.000544069640914037}},
 {':-(': {'positive': 0, 'negative': 386, 'ratio': 0.002583979328165375}},
 {'zayniscomingbackonjuli': {'positive': 0, 'negative': 19, 'ratio': 0.05}},
 {'26': {'positive': 0, 'negative': 20, 'ratio': 0.047619047619047616}},
 {'>:(': {'positive': 0, 'negative': 43, 'ratio': 0.022727272727272728}},
 {'lost': {'positive': 0, 'negative': 19, 'ratio': 0.05}},
 {'♛': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996}},
 {'》': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996}},
 {'beli̇ev': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}},
 {'wi̇ll': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}},
 {'justi̇n': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}},
 {'ｓｅｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}},
 {'ｍｅ': {'positive': 0, 'negative': 35, 

In [38]:

print('Truth Predicted Tweet')
for x,y in zip(test_x,test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        print ('%d\t%0.2f\t%s' % (y, np.sign(y_hat)>0, ' '.join(process_tweet(x)).encode('ascii', 'ignore')))

Truth Predicted Tweet
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall 94 harri born ik stupid wanna chang :d'
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'
0	1.00	b'u prob fun david'
0	1.00	b'pat jay'
0	1.00	b'sr financi analyst expedia inc bellevu wa financ expediajob job job hire'


In [43]:
my_tweet = 'I am happy because I am learning :)'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print(p)


9.570227756170972


In [46]:

joblib.dump((logprior, loglikelihood), 'sentiment_model.joblib')

['sentiment_model.joblib']