## Tasks:

- Train a Naive Bayes model on a Sentiment Analysis Task
- Test using the model
- Compute ratios of positive words to negative words
- Do some error analysis
- Predict on custom tweets

In [1]:
import pdb
import string
from os import getcwd

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples

from utils import process_tweet, lookup

In [2]:
nltk.download('stopwords')
nltk.download('twitter_samples')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [3]:
# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into training and testing sets
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]
train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[4000:]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# avoid assumptions about the length of all positive tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

## Part 1: Process the Data
- Remove noise. Remove any words that doesn't tell much about the content.
- Remove stock market tickers, hyperlinks, and hashtags.
- Remove all punctuations. We don't want to treat "happy", "happy!", "happy?", as different words.
- Use stemming to keep track of different variations of one word. 

In [4]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

# print cleaned_tweet
process_tweet(custom_tweet)

['hello', 'great', 'day', ':)', 'good', 'morn']

In [5]:
def count_tweets(result, tweets, ys):
    """
    A function count_tweets() that takes a list of tweets as input, cleans all of them, and returns a dictionary.
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of tweets
        ys: a list corresponding to the sentiment of each tweet (0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    """
    for tweet, y in zip(tweets, ys):
        for word in process_tweet(tweet):
            pair = (word, y)
            result[pair] = result.get(pair, 0) + 1
            
    return result

In [6]:
# Testing count_tweets()

result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

## Part 2: Training the model using Naive Bayes

In [7]:
# Build the freqs dictionary for later uses

freqs = count_tweets({}, train_x, train_y)

In [8]:
def train_naive_bayes(freqs, train_x, train_y):
    """
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels corresponding to the tweets (0,1)
    Output:
        logprior: the log prior. 
        loglikelihood: the log likelihood of the Naive Bayes equation.
    """
    loglikelihood = {}
    logprior = 0
    
    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    
    # calculate N_pos and N_neg (number of unique positive and negative words)
    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1] > 0:
            N_pos += 1
        else:
            N_neg += 1
    
    # calculate D, the number of documents (tweets)
    D = len(train_x)
    
    # calculate D_pos and D_neg (number of positive and negative tweets)
    D_pos = sum(train_y)
    D_neg = D - D_pos
    
    # calculate log prior
    logprior = np.log(D_pos) - np.log(D_neg)
    
    # for each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency
        freq_pos = lookup(freqs, word, 1)
        freq_neg = lookup(freqs, word, 0)
        
        # calculate the probability that each word is positive or negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)
        
        # calculate the log likelihood
        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)
        
    return logprior, loglikelihood

In [9]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9089


## Part 3: Test the Naive Bayes

In [10]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the loglikelihoods of each word in the tweet (if found in the dictionary) + logprior
    """
    # process the tweet to get the list of words
    word_l = process_tweet(tweet)
    
    # initialize the probability and add logprior
    p = logprior
    
    for word in word_l:
        p += loglikelihood.get(word, 0)
        
    return p

In [11]:
# Experiment with my own tweet

my_tweet = 'She smiled.'
naive_bayes_predict(my_tweet, logprior, loglikelihood)

1.5602159227154768

In [12]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Function to check the accuracy of the predictions.
    Input: 
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the log prior
        loglikelihood: a dictionary with the loglikelihoods of each word
    Output:
        accuracy: (# of tweets predicted correctly) / (total # of tweets)
    """
    accuracy = 0
    
    y_hats = []
    for tweet in test_x:
        y_hat_i = 1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        y_hats.append(y_hat_i)
        
    # error is the average of the abs values of the differences between y_hats and test_y
    error = sum(np.abs(y_hats - test_y)) / len(y_hats)
    accuracy = 1 - error
    
    return accuracy

In [13]:
print("Naive Bayes accuracy = %0.4f" % test_naive_bayes(test_x, test_y, logprior, loglikelihood))

Naive Bayes accuracy = 0.9945


In [14]:
# Testing the model 

for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 
              'great', 'great great', 'great great great', 'great great great great']:
    p = naive_bayes_predict(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.13
I am bad -> -1.31
this movie should have been great. -> 2.12
great -> 2.12
great great -> 4.25
great great great -> 6.37
great great great great -> 8.50


In [16]:
my_tweet = "you are bad :("

naive_bayes_predict(my_tweet, logprior, loglikelihood)

-8.829246519761071

## Part 4: Filter Words by Ratio of positive to negative counts

In [17]:
def get_ratio(freqs, word):
    """
    A function to compare the positive frequency of a word to its negative frequency
    Input:
        freqs: a dictionary containing the words
        word: string to lookup
    Output:
        a dictionary with keys 'positive', 'negative' and 'ratio'
    """
    pos_neg_ratio = {'positive': 0, 'negative': 0, 'ratio': 0}
    
    pos_neg_ratio['positive'] = lookup(freqs, word, 1)
    pos_neg_ratio['negative'] = lookup(freqs, word, 0)
    pos_neg_ratio['ratio'] = (pos_neg_ratio['positive'] + 1) / (pos_neg_ratio['negative'] + 1)
    
    return pos_neg_ratio

In [18]:
get_ratio(freqs, 'happi')

{'positive': 161, 'negative': 18, 'ratio': 8.526315789473685}

In [19]:
def get_words_by_threshold(freqs, label, threshold):
    """
    Input: 
        freqs: a dictionary of words
        label: 1 for positive, 0 for negative
        threshold: ratio that will be used as the cutoff for including word in a dictionary
    Output:
        word_set: a dictionary containing the word and information on its positive count, negative count, 
                  and ratio of positive to example of a key-value pair
    """
    word_list = {}
    
    for word, _ in freqs.keys():
        pos_neg_ratio = get_ratio(freqs, word)
        
        if label == 1 and pos_neg_ratio['ratio'] >= threshold:
            word_list[word] = pos_neg_ratio
        
        elif label == 0 and pos_neg_ratio['ratio'] <= threshold:
            word_list[word] = pos_neg_ratio
            
    return word_list

In [20]:
# Testing the function; find words at or below the threshold
get_words_by_threshold(freqs, label=0, threshold=0.05)

{':(': {'positive': 1, 'negative': 3663, 'ratio': 0.0005458515283842794},
 ':-(': {'positive': 0, 'negative': 378, 'ratio': 0.002638522427440633},
 'zayniscomingbackonjuli': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '26': {'positive': 0, 'negative': 20, 'ratio': 0.047619047619047616},
 '>:(': {'positive': 0, 'negative': 43, 'ratio': 0.022727272727272728},
 'lost': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '♛': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 '》': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 'beli̇ev': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'wi̇ll': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'justi̇n': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｓｅｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｍｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}}

In [21]:
# Testing the function; find words at or above the threshold
get_words_by_threshold(freqs, label=1, threshold=10)

{'followfriday': {'positive': 23, 'negative': 0, 'ratio': 24.0},
 'commun': {'positive': 27, 'negative': 1, 'ratio': 14.0},
 ':)': {'positive': 2847, 'negative': 2, 'ratio': 949.3333333333334},
 'flipkartfashionfriday': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':D': {'positive': 498, 'negative': 0, 'ratio': 499.0},
 ':p': {'positive': 103, 'negative': 0, 'ratio': 104.0},
 'influenc': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':-)': {'positive': 543, 'negative': 0, 'ratio': 544.0},
 "here'": {'positive': 20, 'negative': 0, 'ratio': 21.0},
 'youth': {'positive': 14, 'negative': 0, 'ratio': 15.0},
 'bam': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'warsaw': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'shout': {'positive': 11, 'negative': 0, 'ratio': 12.0},
 ';)': {'positive': 22, 'negative': 0, 'ratio': 23.0},
 'stat': {'positive': 51, 'negative': 0, 'ratio': 52.0},
 'arriv': {'positive': 57, 'negative': 4, 'ratio': 11.6},
 'via': {'positive': 60, 'negative': 1, 

## Part 5: Error Analysis

In [24]:
print("Truth Predicted Tweet")
for x, y in zip(test_x, test_y):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        print("%d\t%0.2f\t%s" % (y, np.sign(y_hat) > 0, ' '.join(
        process_tweet(x)).encode('ascii', 'ignore')))

Truth Predicted Tweet
1	0.00	b''
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall 94 harri born ik stupid wanna chang :D'
1	0.00	b''
1	0.00	b''
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'
0	1.00	b'u prob fun david'
0	1.00	b'pat jay'


## Part 6: Predict with custom tweet

In [25]:
my_tweet = 'I am happy because I am learning :)'

naive_bayes_predict(my_tweet, logprior, loglikelihood)

9.53333314227002