In [61]:
# import libraries
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from nltk.stem.porter import *
from collections import defaultdict
import math

In [27]:
# nltk.download("all")

In [None]:
# Load the amazon review dataset
df = pd.read_csv('./data/IMDB Dataset.csv')

In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [28]:
word_tokenize(text='AAA', language='english')

['AAA']

In [34]:
df = df[:1000]

In [50]:
# create preprocess_text function
def preprocess_text(text:str):
    
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    
    tokens = word_tokenize(text.lower())

    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]

    stemmer =  PorterStemmer()
    
    stopwords_set = set(stopwords.words("english"))
    
    tokens_2 = [stemmer.stem(token) for token in filtered_tokens if token not in stopwords_set]

    return tokens_2

In [51]:
df

Unnamed: 0,review,sentiment,review_processed
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, 1, oz, episod, y..."
1,A wonderful little production. <br /><br />The...,positive,"[wonder, littl, product, br, br, film, techniq..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonder, way, spend, time, hot, summe..."
3,Basically there's a family where a little boy ...,negative,"[basic, there, famili, littl, boy, jake, think..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visual, st..."
...,...,...,...
995,Nothing is sacred. Just ask Ernie Fosselius. T...,positive,
996,I hated it. I hate self-aware pretentious inan...,negative,
997,I usually try to be professional and construct...,negative,
998,If you like me is going to see this in a film ...,negative,


In [None]:
def calculate_word_counts(tweets):
    word_count = defaultdict(int)
    
    for tweet in tweets:
        tokens = preprocess_text(tweet)
        
        for token in tokens:
            word_count[token] += 1
    
    return word_count

In [None]:
def calculate_likelihood(word_count, total_words, laplacian_smoothing=1):
    likelihood = {}
    
    vocabulary_size = len(word_count)

    for word, count in word_count.items():
        likelihood[word] = (count + laplacian_smoothing) / (total_words + laplacian_smoothing * vocabulary_size)

    # Return the calculated likelihood dictionary
    return likelihood

In [None]:

def calculate_log_prior(sentiment, data):
    return math.log(len(data[data['sentiment'] == sentiment]) / len(data))

In [None]:
def classify_tweet_with_scores(tweet, log_likelihood_positive, log_likelihood_negative,log_prior_positive, log_prior_negative):
    tokens = preprocess_text(tweet)

    log_score_positive = log_prior_positive + sum([log_likelihood_positive.get(token, 0) for token in tokens])
    log_score_negative = log_prior_negative + sum([log_likelihood_negative.get(token, 0) for token in tokens])

    sentiment_scores = {
        'positive': log_score_positive,
        'negative': log_score_negative
    }

    predicted_sentiment = max(sentiment_scores, key=sentiment_scores.get)
    return predicted_sentiment, sentiment_scores


In [None]:

tests = calculate_word_counts(df['review'][:150])

word_count_positive = calculate_word_counts(df[df['sentiment'] == 'positive']['review'])
word_count_negative = calculate_word_counts(df[df['sentiment'] == 'negative']['review'])

total_positive_words = sum(word_count_positive.values())
total_negative_words = sum(word_count_negative.values())

likelihood_positive = calculate_likelihood(word_count_positive, total_positive_words)
likelihood_negative = calculate_likelihood(word_count_negative, total_negative_words)

positive = calculate_log_prior('positive', df)
negative = calculate_log_prior('negative', df)

log_likelihood_positive = {word: math.log(prob) for word, prob in likelihood_positive.items()}
log_likelihood_negative = {word: math.log(prob) for word, prob in likelihood_negative.items()}

"This is like a zoology textbook, given that its depiction of animals is so accurate. However, here are a few details that appear to have been slightly modified during the transition to film:<br /><br />- Handgun bullets never hit giant Komodo dragons. It doesn't matter how many times you shoot at the Komodo, bullets just won't go near it.<br /><br />- The best way to avoid being eaten by a giant Cobra, or a giant Komodo dragon, is just to stand there. The exception to this rule is if you've been told to stay very still, in which case you should run off, until the Komodo is right next to you, and then you should stand there, expecting defeat.<br /><br />- Minutes of choppy slow motion footage behind the credits really makes for enjoyable watching.<br /><br />- $5,000 is a memory enhancement tool, and an ample substitute for losing your boating license/getting arrested.<br /><br />- Members of elite army units don't see giant Komodo dragons coming until they are within one metre of the 

In [91]:
predicted_sentiment, sentiment_scores = classify_tweet_with_scores(
   " This is like a zoology textbook",log_likelihood_positive, log_likelihood_negative,positive, negative)

In [93]:
predicted_sentiment,sentiment_scores

('positive', {'positive': -5.928964197761661, 'negative': -26.253354757345548})