# Naive Bayes

* Text Preprocessing
* Create frequency mapping dictionary
* Calculate likelihood with Laplacian smoothing
* Calculate log likelihood
* Calculate log prior

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, twitter_samples   # Library for twitter_samples and stopwords
from termcolor import colored

In [2]:
# Load the positive and negative tweet corpora from NLTK's twitter_samples.
positive_tweets, negative_tweets = (
    twitter_samples.strings(file_name)
    for file_name in ('positive_tweets.json', 'negative_tweets.json')
)

# Report how many tweets each corpus contains.
print(*map(len, (positive_tweets, negative_tweets)))

5000 5000


In [3]:
# Hold out everything after the first 4000 tweets of each class for testing.
train_pos, test_pos = positive_tweets[:4000], positive_tweets[4000:]
train_neg, test_neg = negative_tweets[:4000], negative_tweets[4000:]

# Positive examples always come first, followed by the negative ones.
X_train = train_pos + train_neg
X_test = test_pos + test_neg

# Labels follow the same ordering: 1.0 for positive, 0.0 for negative.
y_train = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
y_test = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

In [4]:
# Preprocessing
# process_tweet comes from the external `utils` module (not the `utils` class
# defined later in this notebook).
from utils import process_tweet

# Show the first training tweet as-is (green), a separator line (blue), and the
# value process_tweet returns for that same tweet (red).
print(colored(X_train[0], 'green'))
print(colored('#'*100, 'blue'))
print(colored(process_tweet(X_train[0]), 'red'))

[32m#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)[0m
[34m####################################################################################################[0m
[31m['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)'][0m


# Create frequency mapping dictionary

* frequency_mapping_dict = {('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}
* `look_up` returns the count stored for a (word, label) pair in frequency_mapping_dict, e.g. for the pair ('tire', 0) the value is 2; pairs that are not in the dictionary give 0.

In [39]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

class utils:
    """Text-cleaning and word-frequency helpers used by the classifier.

    Holds a Porter stemmer, the English stop-word list and a TweetTokenizer
    configured with preserve_case=False, strip_handles=True, reduce_len=True.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords_english = stopwords.words('english')
        self.tokenizer = TweetTokenizer(
            preserve_case=False,
            strip_handles=True,
            reduce_len=True,
        )

    def pre_process(self, text):
        """Clean, tokenize, filter and stem one piece of text.

        Input:
            text: the raw string to process
        Output:
            list of stemmed tokens, in their original order
        """
        # Applied in this order: '$' followed by word characters, a leading
        # "RT" marker, a URL together with the rest of its line, then every
        # '#' character (the hashtag word itself is kept).
        removal_patterns = (
            r'\$\w*',
            r'^RT[\s]+',
            r'https?:\/\/.*[\r\n]*',
            r'#',
        )
        for pattern in removal_patterns:
            text = re.sub(pattern, '', text)

        # string.punctuation is a str, so `in` is a substring test: single
        # punctuation marks are dropped while tokens such as ':)' survive.
        return [
            self.stemmer.stem(token)
            for token in self.tokenizer.tokenize(text)
            if token not in self.stopwords_english
            and token not in string.punctuation
        ]

    def build_frequency(self, X, y):
        """Count how often each processed word occurs under each label.

        Input:
            X: iterable of raw texts
            y: iterable of labels, aligned with X
        Output:
            dictionary mapping (word, label) pairs to occurrence counts
        """
        counts = {}
        for document, label in zip(X, y):
            for token in self.pre_process(document):
                key = (token, label)
                if key in counts:
                    counts[key] += 1
                else:
                    counts[key] = 1
        return counts

    def look_up(self, freqs, word, label):
        """Return the count stored for (word, label) in freqs, or 0 if absent."""
        return freqs.get((word, label), 0)
        

## Naive Bayes Classifier

* NB_pred method predicts for a single tweet
* predict method gives prediction for the entire batch

In [42]:
class NaiveBayes:
    """Binary Naive Bayes text classifier working in log-odds space.

    After `fit`:
        logprior:      log(D_pos) - log(D_neg), where D_pos / D_neg are the
                       numbers of training documents with label > 0 / <= 0.
        loglikelihood: dictionary mapping each vocabulary word to
                       log(P(word | positive) / P(word | negative)), with
                       add-one (Laplacian) smoothing.
    """

    def __init__(self):
        self.utils = utils()
        self.logprior = None
        self.loglikelihood = None

    def fit(self, X, y):
        """Estimate the log prior and per-word log likelihood ratios.

        Input:
            X: iterable of raw texts
            y: iterable of labels aligned with X; values > 0 are treated as
               the positive class, everything else as the negative class
        Output:
            None; results are stored in self.logprior and self.loglikelihood.

        Note: if one of the classes has no documents, the log prior is
        infinite (NumPy emits a divide-by-zero warning).
        """
        self.loglikelihood = {}
        self.logprior = 0

        freqs = self.utils.build_frequency(X, y)
        vocab = {word for word, _ in freqs}
        V = len(vocab)

        # Total number of word occurrences observed in each class.
        total_pos = total_neg = 0
        for (_, label), count in freqs.items():
            if label > 0:
                total_pos += count
            else:
                total_neg += count

        # Count documents per class from the labels passed to this call.
        # (Previously this read the notebook-level `y_train`, which gave a
        # wrong prior, or a NameError, for any other training set.)
        D_pos = sum(1 for label in y if label > 0)
        D_neg = sum(1 for label in y if label <= 0)

        # Calculate log prior
        self.logprior = np.log(D_pos) - np.log(D_neg)

        for word in vocab:
            pos_freq = self.utils.look_up(freqs, word, 1)
            neg_freq = self.utils.look_up(freqs, word, 0)

            # Add-one smoothing keeps both probabilities non-zero for words
            # that were seen in only one class.
            p_pos = (pos_freq + 1) / (total_pos + V)
            p_neg = (neg_freq + 1) / (total_neg + V)

            self.loglikelihood[word] = np.log(p_pos / p_neg)

    def NB_pred(self, x):
        """Return the log-odds score for a single text.

        The score is the log prior plus the log likelihood ratio of every
        processed word that is in the fitted vocabulary; other words are
        ignored. A repeated word contributes once per occurrence.
        """
        p = self.logprior
        for word in self.utils.pre_process(x):
            if word in self.loglikelihood:
                p += self.loglikelihood[word]
        return p

    def predict(self, X):
        """Return a list with 1 for each text whose score is > 0, else 0."""
        return [1 if self.NB_pred(tweet) > 0 else 0 for tweet in X]


In [43]:
clf = NaiveBayes()

In [44]:
clf.fit(X_train, y_train)

In [45]:
print(clf.logprior, len(clf.loglikelihood))

0.0 9085


In [46]:
y_hats = clf.predict(X_test)

In [47]:
# Mean absolute difference between predicted and true 0/1 labels.
error = np.mean(np.abs(np.asarray(y_hats) - y_test))

In [48]:
# Accuracy is the complement of the error rate computed above.
accuracy = 1 - error
print(error, accuracy, sep='\n')

0.006
0.994


In [49]:
# NB_pred returns the raw score for one text: the log prior plus the
# log-likelihood of every known word. It is not a probability; predict()
# treats a score > 0 as the positive class.
my_tweet = 'She smiled.'
p = clf.NB_pred(my_tweet)
print('The expected output is', p)

The expected output is 1.5737794405738943


In [50]:
# Score a handful of hand-written examples. Each occurrence of a known word
# adds that word's log-likelihood, so repeating 'great' raises the score with
# every repetition.
l = ['She smiled.', 'I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']
for tweet in l:
    p = clf.NB_pred(tweet)
    print(f'{tweet} -> {p:.2f}')

She smiled. -> 1.57
I am happy -> 2.15
I am bad -> -1.29
this movie should have been great. -> 2.14
great -> 2.14
great great -> 4.28
great great great -> 6.41
great great great great -> 8.55


In [51]:
my_tweet = 'you are bad :('
clf.NB_pred(my_tweet)

-8.802119484044237

## Error Analysis on misclassified points

In [52]:
# List every test tweet whose predicted class differs from its true label.
print('Truth Predicted Tweet')
for x, y in zip(X_test, y_test):
    y_hat = clf.NB_pred(x)
    predicted_positive = np.sign(y_hat) > 0
    if y == predicted_positive:
        continue
    # Non-ASCII characters are dropped so each row prints as plain bytes.
    processed = ' '.join(process_tweet(x)).encode('ascii', 'ignore')
    print('%d\t%0.2f\t%s' % (y, predicted_positive, processed))

Truth Predicted Tweet
1	0.00	b''
1	0.00	b'truli later move know queen bee upward bound movingonup'
1	0.00	b'new report talk burn calori cold work harder warm feel better weather :p'
1	0.00	b'harri niall 94 harri born ik stupid wanna chang :D'
1	0.00	b''
1	0.00	b''
1	0.00	b'park get sunlight'
1	0.00	b'uff itna miss karhi thi ap :p'
0	1.00	b'hello info possibl interest jonatha close join beti :( great'
0	1.00	b'u prob fun david'
0	1.00	b'pat jay'
0	1.00	b'whatev stil l young >:-('


# Interpretability in naive bayes

### positive negative ratio
* {'positive': 161, 'negative': 18, 'ratio': 8.526315789473685}

In [56]:
u = utils()

def get_ratio(freqs, word):
    """
    Input:
        freqs: dictionary mapping (word, label) pairs to counts
        word: the word whose counts are looked up

    Output: a dictionary with keys "positive", "negative" and "ratio", where
        ratio = (positive + 1) / (negative + 1)
    """
    positive_count = u.look_up(freqs, word, 1)
    negative_count = u.look_up(freqs, word, 0)

    # The +1 on both sides avoids division by zero for words that never
    # occur with the negative label.
    return {
        'positive': positive_count,
        'negative': negative_count,
        'ratio': (positive_count + 1) / (negative_count + 1),
    }
    

In [58]:
freqs = u.build_frequency(X_train, y_train)
get_ratio(freqs, 'happi')

{'positive': 161, 'negative': 18, 'ratio': 8.526315789473685}

In [59]:
def get_words_by_threshold(freqs, label, threshold):
    '''
    Input:
        freqs: dictionary mapping (word, label) pairs to counts
        label: 1 to select positive words, 0 to select negative words
        threshold: cutoff applied to each word's positive/negative ratio;
            with label 1 a word is kept when ratio >= threshold,
            with label 0 a word is kept when ratio <= threshold
    Output:
        dictionary mapping each selected word to the dictionary returned by
        get_ratio for it, for example:
        {'happi':
            {'positive': 10, 'negative': 20, 'ratio': 0.5}
        }
        Any other label value selects nothing.
    '''
    selected = {}

    for word, _ in freqs:
        stats = get_ratio(freqs, word)
        ratio = stats['ratio']

        wanted_positive = label == 1 and ratio >= threshold
        wanted_negative = label == 0 and ratio <= threshold
        if wanted_positive or wanted_negative:
            selected[word] = stats

    return selected

In [60]:
# Test your function: find negative words at or below a threshold
get_words_by_threshold(freqs, label=0, threshold=0.05)

{':(': {'positive': 1, 'negative': 3663, 'ratio': 0.0005458515283842794},
 ':-(': {'positive': 0, 'negative': 378, 'ratio': 0.002638522427440633},
 'zayniscomingbackonjuli': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '26': {'positive': 0, 'negative': 20, 'ratio': 0.047619047619047616},
 '>:(': {'positive': 0, 'negative': 43, 'ratio': 0.022727272727272728},
 'lost': {'positive': 0, 'negative': 19, 'ratio': 0.05},
 '♛': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 '》': {'positive': 0, 'negative': 210, 'ratio': 0.004739336492890996},
 'beli̇ev': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'wi̇ll': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'justi̇n': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｓｅｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776},
 'ｍｅ': {'positive': 0, 'negative': 35, 'ratio': 0.027777777777777776}}

In [61]:
# Test your function; find positive words at or above a threshold
get_words_by_threshold(freqs, label=1, threshold=10)

{'followfriday': {'positive': 23, 'negative': 0, 'ratio': 24.0},
 'commun': {'positive': 27, 'negative': 1, 'ratio': 14.0},
 ':)': {'positive': 2847, 'negative': 2, 'ratio': 949.3333333333334},
 'flipkartfashionfriday': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':D': {'positive': 498, 'negative': 0, 'ratio': 499.0},
 ':p': {'positive': 103, 'negative': 0, 'ratio': 104.0},
 'influenc': {'positive': 16, 'negative': 0, 'ratio': 17.0},
 ':-)': {'positive': 543, 'negative': 0, 'ratio': 544.0},
 "here'": {'positive': 20, 'negative': 0, 'ratio': 21.0},
 'youth': {'positive': 14, 'negative': 0, 'ratio': 15.0},
 'bam': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'warsaw': {'positive': 44, 'negative': 0, 'ratio': 45.0},
 'shout': {'positive': 11, 'negative': 0, 'ratio': 12.0},
 ';)': {'positive': 22, 'negative': 0, 'ratio': 23.0},
 'stat': {'positive': 51, 'negative': 0, 'ratio': 52.0},
 'arriv': {'positive': 57, 'negative': 4, 'ratio': 11.6},
 'via': {'positive': 60, 'negative': 1, 