# Naive Bayes

In [1]:
# fmt: off
import os
import sys

sys.path.append(os.path.join(os.getcwd(), ".."))
# fmt: on

import math
import os
from functools import partial

import nltk
import numpy  # Library for linear algebra and math utils
import pandas  # Dataframe library
import plotly.express

from utils import build_freqs, tweet_to_stems

## Training

### Loading Sample Dataset

In [2]:
# Read both labelled tweet collections from NLTK's twitter_samples corpus.
positive_tweets, negative_tweets = (
    nltk.corpus.twitter_samples.strings(file_name)
    for file_name in ("positive_tweets.json", "negative_tweets.json")
)

# The first 4000 tweets of each class form the training split.
train_positive_tweets, train_negative_tweets = positive_tweets[:4000], negative_tweets[:4000]

### Building Log-Likelihood Map

In [3]:
def get_log_likelihoods(positive_tweets, negative_tweets):
    """
    Build the per-word log-likelihood table for Naive Bayes.

    Counts are obtained from ``build_freqs``, called with the concatenated
    tweets and a flat label array (1.0 for every positive tweet, 0.0 for every
    negative tweet). The result is read as a mapping keyed by
    ``(word, label)`` pairs; a missing pair counts as zero.
    NOTE(review): the exact tokenisation/stemming is done inside
    ``build_freqs`` and is not visible here.

    Each probability uses Laplacian (add-one) smoothing:

        P(word | class) = (count(word, class) + 1) / (N_class + V)

    where ``N_class`` is the summed count of all words in that class and ``V``
    is the number of distinct words.

    Returns:
        dict mapping each word to
        ``{"positive": log P(word|positive), "negative": log P(word|negative)}``.
    """
    all_tweets = positive_tweets + negative_tweets
    # numpy.append without an axis flattens the two column vectors into 1-D.
    labels = numpy.append(
        numpy.ones(shape=(len(positive_tweets), 1)),
        numpy.zeros(shape=(len(negative_tweets), 1)),
    )
    freqs = build_freqs(tweets=all_tweets, ys=labels)

    # Per-word counts for each class.
    vocabulary = set((key[0] for key in freqs.keys()))
    word_counts = {
        word: {
            "positive": freqs.get((word, 1.0), 0),
            "negative": freqs.get((word, 0.0), 0),
        }
        for word in vocabulary
    }

    # Class totals across the whole vocabulary.
    positive_total = negative_total = 0
    for counts in word_counts.values():
        positive_total += counts["positive"]
        negative_total += counts["negative"]

    # Smoothed denominators: class total plus vocabulary size.
    vocabulary_size = len(word_counts)
    positive_denominator = positive_total + vocabulary_size
    negative_denominator = negative_total + vocabulary_size

    log_likelihoods = {}
    for word, counts in word_counts.items():
        log_likelihoods[word] = {
            "positive": math.log((counts["positive"] + 1) / positive_denominator),
            "negative": math.log((counts["negative"] + 1) / negative_denominator),
        }
    return log_likelihoods

In [4]:
log_likelihoods = get_log_likelihoods(positive_tweets=train_positive_tweets, negative_tweets=train_negative_tweets)

# Show a handful of entries as a sanity check of the table.
for item in list(log_likelihoods.items())[:4]:
    print(item)

# Log prior = log(P(positive) / P(negative)). Both class probabilities share
# the same denominator (total number of training tweets), so the ratio reduces
# to the ratio of the class sizes. The positive count must be the minuend;
# subtracting the negative count from itself would always yield 0 and hide any
# class imbalance.
log_prior = math.log(len(train_positive_tweets)) - math.log(len(train_negative_tweets))
print(log_prior)

('payment', {'positive': -9.796125034207629, 'negative': -9.108141519655936})
('tasti', {'positive': -9.796125034207629, 'negative': -10.494435880775827})
('🍵', {'positive': -10.489272214767574, 'negative': -9.395823592107716})
('submiss', {'positive': -9.796125034207629, 'negative': -10.494435880775827})
0.0


## Prediction

In [5]:
def predict_sentiment(tweet, log_likelihoods, log_prior):
    """
    Classify a tweet as positive (1.0) or negative (0.0).

    Starting from ``log_prior``, the score accumulates
    ``log_likelihoods[stem]["positive"] - log_likelihoods[stem]["negative"]``
    for every stem produced by ``tweet_to_stems(tweet)`` that is present in
    ``log_likelihoods``; stems missing from the table are ignored.

    Returns:
        1.0 when the final score is strictly greater than zero, else 0.0
        (a score of exactly zero is classified as negative).
    """
    score = log_prior
    for stem in tweet_to_stems(tweet):
        if stem not in log_likelihoods:
            # Unseen stems carry no evidence either way.
            continue
        word_scores = log_likelihoods[stem]
        score += word_scores["positive"] - word_scores["negative"]

    if score > 0:
        return 1.0
    return 0.0

In [6]:
# Bind the trained parameters once so that `predict` only needs the tweet text.
# (`partial` is imported in the notebook's top import cell.)
predict = partial(predict_sentiment, log_likelihoods=log_likelihoods, log_prior=log_prior)

# Smoke test on one training example from each class: print the tweet followed
# by its predicted label.
for sample_tweet in (train_positive_tweets[0], train_negative_tweets[0]):
    print(sample_tweet)
    print(predict(sample_tweet))

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
1.0
hopeless for tmr :(
0.0


## Measuring Accuracy

In [7]:
# Everything after the first 4000 tweets of each class (the training slice) is
# held out for testing.
test_positive_tweets, test_negative_tweets = positive_tweets[4000:], negative_tweets[4000:]
test_tweets = test_positive_tweets + test_negative_tweets

# Flat label vector: 1.0 for each positive test tweet, then 0.0 for each negative one.
test_sentiments = numpy.concatenate(
    (
        numpy.ones(len(test_positive_tweets)),
        numpy.zeros(len(test_negative_tweets)),
    )
)

predictions = list(map(predict, test_tweets))

# Predictions and labels are both 0.0/1.0, so the mean absolute difference is
# the error rate; accuracy is its complement.
1 - numpy.abs(numpy.asarray(predictions) - test_sentiments).mean()

0.994