In [6]:
import numpy as np 
import pandas as pd
import nltk 
import string 

In [7]:
# One-time setup: uncomment and run these if the corpora are not yet installed locally.
# nltk.download('stopwords')
# nltk.download('twitter_samples')
from nltk.corpus import stopwords,twitter_samples

In [8]:
# Load the tweet strings from the positive and the negative sample files.
positive_tweet, negative_tweet = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [13]:
# Number of tweets per class used for training; the rest of each class is held out for testing.
TRAIN_SIZE = 4000

train_pos, test_pos = positive_tweet[:TRAIN_SIZE], positive_tweet[TRAIN_SIZE:]
train_neg, test_neg = negative_tweet[:TRAIN_SIZE], negative_tweet[TRAIN_SIZE:]

# Labels are 1.0 for positive and 0.0 for negative tweets, built in the same
# order in which the tweet lists are concatenated.
train_X = train_pos + train_neg
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

test_X = test_neg + test_pos
test_y = np.append(np.zeros(len(test_neg)), np.ones(len(test_pos)))

In [31]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [47]:
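# Data preprocessing steps applied to each tweet:
# 1. remove hyperlinks and the '#' sign of hashtags
# 2. tokenize (lower-cased, @handles stripped)
# 3. remove stopwords and punctuation
# 4. stem each remaining token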

def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    The text is stripped of a leading old-style "RT" marker, of hyperlinks
    and of '#' characters, then tokenized (lower-cased, @handles removed,
    repeated characters shortened), filtered for English stopwords and
    punctuation, and finally stemmed with the Porter stemmer.

    Args:
        tweet: the raw tweet text.

    Returns:
        A list of stemmed tokens, in their original order.
    """
    # remove old style retweet text "RT" (only when it starts the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; the pattern consumes everything from the URL start
    # to the end of that line, including trailing line breaks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # only the hash sign is removed, the hashtag word itself is kept
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)

    # A set gives constant-time membership tests instead of scanning the list per token.
    stopwords_english = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # `token not in string.punctuation` is a substring test against the
    # punctuation string, so emoticons such as ':)' are kept as tokens.
    return [
        stemmer.stem(token)
        for token in tokens
        if token not in stopwords_english and token not in string.punctuation
    ]

In [48]:
# Sanity check: run the preprocessing on one sample tweet and print the tokens.
a = process_tweet(
    '#FollowFriday @NGourd @Locita @D_Robert_Kelly '
    'for being top influencers in my community this week :)'
)
print(a)

['followfriday', 'top', 'influenc', 'commun', 'week', ':)']


In [138]:
def map_pair_with_freq(result,tweets,y_value_sentiment):
    """Count how often each processed word occurs under each label.

    Args:
        result: dictionary to update; keys are (word, label) tuples and
            values are occurrence counts. It is modified in place.
        tweets: iterable of raw tweet texts.
        y_value_sentiment: iterable of labels, paired with `tweets` by position.

    Returns:
        The same `result` dictionary, after updating.
    """
    for text, sentiment in zip(tweets, y_value_sentiment):
        for token in process_tweet(text):
            # the key combines the processed word with the label of its tweet
            key = (token, sentiment)
            # start from zero for a key that has not been seen yet
            result[key] = result.get(key, 0) + 1
    return result

In [139]:
# Toy example: six short tweets with their labels (1 = positive, 0 = negative).
result = {}
tweets = [
    'i am happy',
    'he is happy',
    'i am tricked',
    'i am sad',
    'i am blessed',
    'i am tired',
]
y_value_sentiment = [1, 1, 0, 0, 1, 0]
# The call fills `result` in place and returns it, so the cell displays the counts.
map_pair_with_freq(result, tweets, y_value_sentiment)

{('happi', 1): 2,
 ('trick', 0): 1,
 ('sad', 0): 1,
 ('bless', 1): 1,
 ('tire', 0): 1}

In [140]:
# Build the (word, label) -> count dictionary from the training tweets.
freqs = map_pair_with_freq(dict(), train_X, train_y)

In [153]:
#freqs

In [83]:
# Count the training labels that are positive (greater than zero).
sum(1 for label in train_y if label > 0)

4000

In [95]:
def lookup(freqs,word,label):
    """Return how many times `word` was counted under `label`.

    Args:
        freqs: a dictionary with the frequency of each (word, label) tuple.
        word: the word to look up.
        label: the label corresponding to the word.

    Returns:
        The stored count for (word, label), or 0 when the pair is absent.
    """
    return freqs.get((word, label), 0)

In [120]:
def train_naive_bayes(freqs, train_x, train_y):
    """Estimate the log prior and per-word log likelihood ratios.

    Word probabilities use Laplace (add-one) smoothing:
    P(w | class) = (freq(w, class) + 1) / (N_class + V).

    Args:
        freqs: dictionary mapping (word, label) tuples to occurrence counts.
            Pairs whose label is greater than zero count towards the positive
            class, all others towards the negative class. Per-word counts are
            read from the keys (word, 1) and (word, 0).
        train_x: not used by the computation; kept so the call signature
            stays unchanged.
        train_y: iterable of training labels; values > 0 are counted as
            positive documents, values <= 0 as negative documents.

    Returns:
        A tuple (logprior, loglikelihood) where logprior is
        log(D_pos) - log(D_neg) and loglikelihood maps every vocabulary word
        to log(P(w | pos) / P(w | neg)).
    """
    loglikelihood = {}

    # V: number of unique words in the vocabulary
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # N_pos / N_neg: total word occurrences observed under each class
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # D_pos / D_neg: number of positive and negative documents
    D_pos = sum(1 for label in train_y if label > 0)
    D_neg = sum(1 for label in train_y if label <= 0)

    logprior = np.log(D_pos) - np.log(D_neg)

    # The smoothing denominators do not depend on the word, so compute them once.
    pos_denominator = N_pos + V
    neg_denominator = N_neg + V

    for word in vocab:
        # counts default to 0 when the word never occurred under that label
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        p_w_pos = (freq_pos + 1) / pos_denominator
        p_w_neg = (freq_neg + 1) / neg_denominator

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood

In [143]:
# Train the classifier on the frequency dictionary and the training labels.
logprior, loglikelihood = train_naive_bayes(freqs=freqs, train_x=train_X, train_y=train_y)

In [144]:
def calculate_numerical_value_of_sentence(tweet,logprior,loglikelihood):
    """Score a tweet as the log prior plus the log likelihoods of its words.

    Args:
        tweet: the raw tweet text; it is run through process_tweet first.
        logprior: the value the score starts from.
        loglikelihood: dictionary mapping a processed word to its log likelihood.

    Returns:
        logprior plus the loglikelihood entry of every processed word that is
        present in the dictionary (repeated words are added once per occurrence).
    """
    score = logprior
    for token in process_tweet(tweet):
        if token not in loglikelihood:
            # words without an entry contribute nothing to the score
            continue
        score = score + loglikelihood[token]
    return score

In [146]:
# Score a hand-written tweet; a score above zero is classified as positive.
my_tweet = 'I am very happy. I am good. I am fortunate. I am lucky.'
p = calculate_numerical_value_of_sentence(
    tweet=my_tweet,
    logprior=logprior,
    loglikelihood=loglikelihood,
)
print('The expected output is', p)

The expected output is 2.505426965863066


In [149]:
def test_naive_bayes(test_X,test_y,logprior,loglikehood):
    """Compute the classifier's accuracy on a labelled set of tweets.

    A tweet is predicted positive (1) when its score from
    calculate_numerical_value_of_sentence is greater than zero, otherwise
    negative (0).

    Args:
        test_X: iterable of raw tweet texts.
        test_y: labels for `test_X`, in the same order; a list, a 1-D array or
            a column vector are all accepted.
        logprior: the log prior passed on to the scoring function.
        loglikehood: dictionary of per-word log likelihoods passed on to the
            scoring function.

    Returns:
        The accuracy, computed as 1 minus the mean absolute difference
        between predictions and labels.
    """
    y_hats = np.array([
        1 if calculate_numerical_value_of_sentence(tweet, logprior, loglikehood) > 0 else 0
        for tweet in test_X
    ])
    # Coerce the labels to a flat array: a plain list cannot be subtracted from
    # a list of predictions, and an (n, 1) column would broadcast to (n, n).
    labels = np.asarray(test_y).ravel()

    # error is the average of the absolute values of the differences between y_hats and the labels
    error = np.mean(np.absolute(y_hats - labels))

    # Accuracy is 1 minus the error
    return 1 - error

In [150]:
# Evaluate on the held-out tweets and report the accuracy to four decimal places.
print("Naive Bayes accuracy = %0.4f" % test_naive_bayes(test_X, test_y, logprior, loglikelihood))

Naive Bayes accuracy = 0.9940


In [151]:
# Score a few example tweets; repeating a word adds its log likelihood once per occurrence.
# The scorer defined in this notebook is calculate_numerical_value_of_sentence.
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    p = calculate_numerical_value_of_sentence(tweet, logprior, loglikelihood)
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.15
I am bad -> -1.29
this movie should have been great. -> 2.14
great -> 2.14
great great -> 4.28
great great great -> 6.41
great great great great -> 8.55


In [152]:
# Score a clearly negative tweet with the scorer defined in this notebook.
my_tweet = 'you are bad :('
calculate_numerical_value_of_sentence(my_tweet, logprior, loglikelihood)

-8.801897103167159