In [363]:
import numpy as np
import regex as re
import pandas as pd
import math

### Naive Bayes

In [384]:
# TODO: build a frequency dictionary for each class from the strings below
# Each string holds two example tweets for one class, separated by a newline.
# NOTE(review): the text after the newline in neg_tweet starts with " am sad" --
# the leading "I" looks like it was dropped, which is why "I" is counted 2 times
# for the negative class but 3 times for the positive one. Confirm before fixing.
pos_tweet = "I am happy because I am learning NLP\nI am happy, not sad."
neg_tweet = "I am sad, I am not learning NLP\n am sad, not happy"

In [385]:
# text normalization
def preprocess(text, PATTERN = r"[^A-Za-z\s]"):
    """
    Normalize a string and tokenize it on whitespace.

    Every match of the regular expression PATTERN is deleted from text, and
    what remains is split on runs of whitespace. With the default pattern only
    ASCII letters and whitespace survive, so punctuation and digits are
    dropped while letter case is kept.

    Returns the list of tokens (an empty list when nothing is left).
    """
    # strip the unwanted characters first, then tokenize
    cleaned = re.sub(PATTERN, '', text)
    tokens = cleaned.split()
    return tokens

# tokenize both tweets with the default pattern (positive first, then negative)
pos_tokens, neg_tokens = map(preprocess, (pos_tweet, neg_tweet))

In [386]:
# build vocab
# Sort the unique tokens so the vocabulary -- and the row order of every table
# derived from it -- is the same after each kernel restart. Iteration order of
# a set of strings is not stable between interpreter sessions.
vocab = sorted(set(pos_tokens + neg_tokens))

In [395]:
# create word freq dictionary
def word_freq_dict(tokens, vocab):
    """
    Count how often each vocabulary word occurs in tokens.

    tokens: iterable of token strings to count.
    vocab:  iterable of words to report on.

    Returns a dict with one key per distinct vocab word, in the order the
    words first appear in vocab, mapped to a plain Python int count. Vocab
    words that never occur in tokens map to 0; tokens that are not in vocab
    are ignored.
    """
    # Use a local name distinct from the function itself, and seed the counts
    # with plain int zeros so no NumPy scalars leak into the returned dict.
    counts = dict.fromkeys(vocab, 0)
    for token in tokens:
        if token in counts:
            counts[token] += 1
    return counts

# per-class word counts over the shared vocabulary
pos_word_freq, neg_word_freq = (
    word_freq_dict(class_tokens, vocab) for class_tokens in (pos_tokens, neg_tokens)
)

In [388]:
# build dataframe to specifications

def _freq_frame(word_freq):
    """Turn a {word: count} dict into a one-column 'freq' table indexed by 'word'."""
    columns = {'word': list(word_freq.keys()),
               'freq': list(word_freq.values())}
    return pd.DataFrame(columns).set_index('word')

pos_freq_df = _freq_frame(pos_word_freq)
neg_freq_df = _freq_frame(neg_word_freq)

In [389]:
# combine the per-class counts on the shared 'word' index; the suffixes turn
# the two 'freq' columns into 'freq_pos' and 'freq_neg'
freq_df = pos_freq_df.merge(neg_freq_df, on='word', suffixes=('_pos', '_neg'))
freq_df

Unnamed: 0_level_0,freq_pos,freq_neg
word,Unnamed: 1_level_1,Unnamed: 2_level_1
because,1,0
sad,1,2
not,1,2
am,3,3
NLP,1,1
happy,2,1
I,3,2
learning,1,1


In [390]:
# conditional probability table:
# each count divided by the total count of its own column (class)
cond_prob_df = (
    freq_df
    .div(freq_df.sum())
    .set_axis(['prob_pos', 'prob_neg'], axis='columns')
)
cond_prob_df

Unnamed: 0_level_0,prob_pos,prob_neg
word,Unnamed: 1_level_1,Unnamed: 2_level_1
because,0.076923,0.0
sad,0.076923,0.166667
not,0.076923,0.166667
am,0.230769,0.25
NLP,0.076923,0.083333
happy,0.153846,0.083333
I,0.230769,0.166667
learning,0.076923,0.083333


In [391]:
# naive bayes inference without smoothing
# 'because' never occurs in the negative class, so its prob_neg is 0; that
# word's ratio is infinite and the whole product comes out as inf
math.prod(cond_prob_df['prob_pos'].div(cond_prob_df['prob_neg']))

inf

In [392]:
# naive bayes inference with Laplacian smoothing
# add 1 to every count in the numerator and the vocabulary size to the denominator:
# one extra count is added per vocabulary word, so each class total grows by len(vocab)
cond_prob_df_ls = (
    freq_df.add(1)
    .div(freq_df.sum().add(len(vocab)))
    .set_axis(['prob_pos', 'prob_neg'], axis='columns')
)
cond_prob_df_ls

Unnamed: 0_level_0,prob_pos,prob_neg
word,Unnamed: 1_level_1,Unnamed: 2_level_1
because,0.095238,0.05
sad,0.095238,0.15
not,0.095238,0.15
am,0.190476,0.2
NLP,0.095238,0.1
happy,0.142857,0.1
I,0.190476,0.15
learning,0.095238,0.1


In [393]:
# product of the smoothed per-word ratios prob_pos / prob_neg
math.prod(cond_prob_df_ls['prob_pos'] / cond_prob_df_ls['prob_neg'])

1.2032699769398878

### Log Likelihood

In [394]:
# likelihood = product( P(word|pos) / P(word|neg) )
# store the per-word ratio as its own column
cond_prob_df_ls['ratio'] = cond_prob_df_ls['prob_pos'].div(cond_prob_df_ls['prob_neg'])

# likelihood: product of the per-word ratios
print(f"likelihood: {math.prod(cond_prob_df_ls['ratio'])}")

# log likelihood: lambda is the natural log of each ratio, and the sum of the
# lambdas is the log of the product printed above
cond_prob_df_ls['lambda'] = np.log(cond_prob_df_ls['ratio'])
print(f"log likelihood: {cond_prob_df_ls['lambda'].sum()}")


likelihood: 1.2032699769398878
log likelihood: 0.1850428315481053


In [None]:
# If the dataset is imbalanced, add the log prior to the log likelihood.
# log prior = log(P(pos) / P(neg)), i.e. the log of the ratio of positive to
# negative class instances (how likely an example is to be positive before
# looking at any of its words).