In [1]:
import re
import string
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

# Data Creation

In [2]:
# Only the rating and the review text are used downstream, so ask the parser for
# just those two columns instead of loading every column in the file.
# The explicit [['Score', 'Text']] keeps the column order fixed, because `usecols`
# returns columns in file order.
# drop_duplicates() returns a new frame (surviving rows keep their original index
# labels), which avoids mutating a derived frame in place.
all_df = (
    pd.read_csv('../Reviews.csv', usecols=['Score', 'Text'])[['Score', 'Text']]
    .drop_duplicates()
)

In [3]:
# A review counts as positive only when its Score is exactly 5; every other row
# (any Score that is not 5) goes to the negative class.
is_five_star = all_df.Score == 5

# Keep just the review text of each class, as plain Python lists of strings.
positive_reviews = all_df.loc[is_five_star, 'Text'].astype(str).values.tolist()
negative_reviews = all_df.loc[~is_five_star, 'Text'].astype(str).values.tolist()

In [4]:
# Per class: the first 80% of the list is the training set, the remaining tail is
# the test (validation) set. The lists are sliced in their existing order.
pos_cutoff = int(0.8 * len(positive_reviews))
neg_cutoff = int(0.8 * len(negative_reviews))

train_pos, test_pos = positive_reviews[:pos_cutoff], positive_reviews[pos_cutoff:]
train_neg, test_neg = negative_reviews[:neg_cutoff], negative_reviews[neg_cutoff:]

In [5]:
# Positive reviews first, negative reviews after them, in both splits.
train_x = [*train_pos, *train_neg]
test_x = [*test_pos, *test_neg]

In [6]:
# Label vectors: a 1.0 for every positive review followed by a 0.0 for every
# negative review, one entry per review in each split.
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

In [7]:
# Show the shape of the training and test label vectors
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (314940,)
test_y.shape = (78735,)


# Preprocessing  
### Stemming, remove stop words

In [8]:
def process_review(review):
    """Clean, tokenize and stem a single review.

    Input:
        review: a string containing a review
    Output:
        reviews_clean: a list of stemmed tokens from the review, with English
            stopwords and punctuation tokens left out
    """
    stemmer = SnowballStemmer('english')
    # A set gives constant-time membership tests; the plain list returned by
    # stopwords.words() would be scanned once for every token of the review.
    stopwords_english = set(stopwords.words('english'))

    # replace html tags like <br /> with a space (.*? is the non-greedy form of .*)
    review = re.sub(r'<.*?>', ' ', review)
    # replace --- first, then any remaining --, with a space
    review = re.sub(r'---', ' ', review)
    review = re.sub(r'--', ' ', review)
    # remove digits
    review = re.sub(r'[0-9]', '', review)
    # remove the # character
    review = re.sub(r'#', '', review)

    # tokenize review
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    review_tokens = tokenizer.tokenize(review)

    # Note: `word not in string.punctuation` is a substring test against the
    # punctuation string, so it drops single punctuation characters (and any
    # multi-character token that happens to appear contiguously in that string).
    return [
        stemmer.stem(word)
        for word in review_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

In [9]:
def build_freqs(reviews, ys):
    """Count how often each processed word occurs under each sentiment label.

    Input:
        reviews: a list of reviews
        ys: the sentiment label of each review (either 0 or 1), in the same
            order as `reviews`
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its frequency
    """
    counts = {}
    for label, text in zip(ys, reviews):
        for token in process_review(text):
            key = (token, label)
            # first sighting of a pair starts its count at 1
            if key in counts:
                counts[key] += 1
            else:
                counts[key] = 1

    return counts

In [10]:
# (word, label) -> count table built from the training split only
freqs = build_freqs(reviews=train_x, ys=train_y)

# Naive bayes

Given a freqs dictionary, `train_x` (a list of reviews) and a `train_y` (a list of labels for each review), implement a naive bayes classifier.

#####  $V$
- Count the number of unique words that appear in the `freqs` dictionary to get $V$.

#####  $freq_{pos}$ and $freq_{neg}$
- By using `freqs` dictionary, we can compute the positive and negative frequency of each word $freq_{pos}$ and $freq_{neg}$.

#####  $N_{pos}$ and $N_{neg}$
- By using `freqs` dictionary, we can also compute the total number of positive words and total number of negative words $N_{pos}$ and $N_{neg}$.

#####  $D$, $D_{pos}$, $D_{neg}$
- By using the `train_y` input list of labels, calculate the number of documents (reviews) $D$, as well as the number of positive documents (reviews) $D_{pos}$ and number of negative documents (reviews) $D_{neg}$.
- Calculate the probability that a document (review) is positive $P(D_{pos})$, and the probability that a document (review) is negative $P(D_{neg})$

#####  logprior
- The logprior is $log\left(\frac{P(D_{pos})}{P(D_{neg})}\right) = log(D_{pos}) - log(D_{neg})$ (the total $D$ cancels in the ratio).

#####  log likelihood
- Finally, we can iterate over each word in the vocabulary and look it up in the `freqs` dictionary to get the positive frequency, $freq_{pos}$, and the negative frequency, $freq_{neg}$, for that specific word (a missing pair counts as 0).
- Compute the positive probability of each word $P(W_{pos})$, negative probability of each word $P(W_{neg})$ using equations 4 & 5.

$$ P(W_{pos}) = \frac{freq_{pos} + 1}{N_{pos} + V}\tag{4} $$
$$ P(W_{neg}) = \frac{freq_{neg} + 1}{N_{neg} + V}\tag{5} $$

- We can then compute the loglikelihood: $log \left( \frac{P(W_{pos})}{P(W_{neg})} \right)$.

In [11]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Fit the Naive Bayes parameters from the word/label frequency table.

    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of reviews (not read by this function)
        train_y: a list of labels corresponding to the reviews (0,1)
    Output:
        logprior: the log prior, log(D_pos) - log(D_neg)
        loglikelihood: dictionary from each vocabulary word to
            log(P(W_pos) / P(W_neg)), with add-one smoothing
    '''
    # V: size of the vocabulary (distinct words across both labels)
    vocab = {key[0] for key in freqs}
    V = len(vocab)

    # N_pos / N_neg: total word occurrences under a label > 0 / under any other label
    N_pos = 0
    N_neg = 0
    for key, count in freqs.items():
        if key[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # D_pos / D_neg: number of positive / negative documents
    D_pos = sum(train_y)
    D_neg = len(train_y) - D_pos

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # smoothed probability of the word under each class
        p_w_pos = (freqs.get((word, 1), 0) + 1)/(N_pos + V)
        p_w_neg = (freqs.get((word, 0), 0) + 1)/(N_neg + V)

        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood

# Training

In [12]:
# Fit the model parameters on the training split
logprior, loglikelihood = train_naive_bayes(freqs=freqs, train_x=train_x, train_y=train_y)

# Testing

In [13]:
def naive_bayes_predict(review, logprior, loglikelihood):
    '''
    Score a single review with the trained Naive Bayes parameters.

    Input:
        review: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: logprior plus the sum of the loglikelihoods of every processed word
           of the review that is found in the dictionary (a number)

    '''
    # start at zero and add the log prior
    score = 0 + logprior

    for token in process_review(review):
        # words missing from the dictionary contribute nothing
        if token not in loglikelihood:
            continue
        score += loglikelihood[token]

    return score

In [23]:
def test_naive_bayes(test_x, logprior, loglikelihood):
    """
    Predict a class for every review in a list.

    Input:
        test_x: A list of reviews
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        y_hats: a list with one prediction per review, 1 when the review's
            Naive Bayes score is greater than 0 and 0 otherwise
    """
    return [
        1 if naive_bayes_predict(review, logprior, loglikelihood) > 0 else 0
        for review in test_x
    ]

In [16]:
# Predicted class (0 or 1) for every review in the test split
y_hat = test_naive_bayes(test_x=test_x, logprior=logprior, loglikelihood=loglikelihood)

# Scores

In [17]:
def scoring(test_y, y_hat):
    """
    Compute binary classification metrics, treating label 1 as the positive class.

    Input:
        test_y: true labels for the list of reviews; any array-like, (m,) or
            (m, 1), is flattened before use
        y_hat: the predictions, same number of entries as test_y
    Output:
        accuracy: (# of reviews classified correctly) / (total # of reviews)
        precision: TP/(TP+FP)
        recall: TP/(TP+FN)
        f1_score: 2*(precision*recall)/(precision+recall)
        Any ratio whose denominator is zero is reported as 0.0.
    Raises:
        ValueError: if test_y and y_hat do not contain the same number of entries
    """
    actual = np.ravel(np.asarray(test_y)) == 1
    predicted = np.ravel(np.asarray(y_hat)) == 1
    if actual.shape != predicted.shape:
        raise ValueError(
            f"test_y and y_hat must have the same length, "
            f"got {actual.shape[0]} and {predicted.shape[0]}"
        )

    # Count the four confusion-matrix cells directly from the boolean masks, so
    # a class that never occurs simply yields a zero count instead of a missing
    # row/column.
    tp = int(np.count_nonzero(actual & predicted))
    tn = int(np.count_nonzero(~actual & ~predicted))
    fp = int(np.count_nonzero(~actual & predicted))
    fn = int(np.count_nonzero(actual & ~predicted))

    def _ratio(numerator, denominator):
        # Degenerate cases (e.g. no positive predictions) give 0.0, not an error.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tn + tp, tn + tp + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    return accuracy, precision, recall, f1_score

In [21]:
# scoring returns (accuracy, precision, recall, f1_score), in that order
scores = scoring(test_y, y_hat)
for metric_name, metric_value in zip(("accuracy", "precision", "recall", "f1-score"), scores):
    print(f"Naive bayes model's {metric_name} = {metric_value:.4f}")

Naive bayes model's accuracy = 0.7927
Naive bayes model's precision = 0.8167
Naive bayes model's recall = 0.8697
Naive bayes model's f1-score = 0.8424


# Predict my own review

In [35]:
my_review = 'This food tasted meh'
# show the tokens the model actually sees, then the raw score
print(process_review(my_review))
y_pred = naive_bayes_predict(my_review, logprior, loglikelihood)
print(y_pred)
# a score above zero means the positive class
print('Positive sentiment' if y_pred > 0 else 'Negative sentiment')

['food', 'tast', 'meh']
-0.6344431419568566
Negative sentiment
