# Naive Bayes for Sentiment Analysis

In [1]:
import re
import math
import nltk

import pandas as pd
import numpy as np
from numpy.lib.function_base import vectorize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

In [2]:
data = pd.read_csv('Datasets/IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
def remove_tags(string):
    result = re.sub(r'<[^>]+>', '', string) # Remove HTML tags
    result = re.sub('https?://\S+', '', result) # Remove URLs
    result = re.sub(r'[^\w\s'']', ' ', result) # remove nonalphanumeric chars
    result = result.lower()
    return result

In [4]:
data['review'] = data['review'].apply(lambda cw: remove_tags(cw))

In [5]:
# nltk.download('stopwords')

In [6]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [7]:
stop_words = set(stop_words)

In [8]:
data['review'][1]

'a wonderful little production  the filming technique is very unassuming  very old time bbc fashion and gives a comforting  and sometimes discomforting  sense of realism to the entire piece  the actors are extremely well chosen  michael sheen not only  has got all the polari  but he has all the voices down pat too  you can truly see the seamless editing guided by the references to williams  diary entries  not only is it well worth the watching but it is a terrificly written and performed piece  a masterful production about one of the great master s of comedy and his life  the realism really comes home with the little things  the fantasy of the guard which  rather than use the traditional  dream  techniques remains solid then disappears  it plays on our knowledge and our senses  particularly with the scenes concerning orton and halliwell and the sets  particularly of their flat with halliwell s murals decorating every surface  are terribly well done '

In [9]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([
        lemmatizer.lemmatize(x)
        for x in w_tokenizer.tokenize(text)
    ])

data['review'] = data['review'].apply(lemmatize_text)

In [10]:
# nltk.download('wordnet')

In [11]:
data['review'][5]

'probably my all time favorite movie a story of selflessness sacrifice and dedication to a noble cause but it s not preachy or boring it just never get old despite my having seen it some 15 or more time in the last 25 year paul lukas performance brings tear to my eye and bette davis in one of her very few truly sympathetic role is a delight the kid are a grandma say more like dressed up midget than child but that only make them more fun to watch and the mother s slow awakening to what s happening in the world and under her own roof is believable and startling if i had a dozen thumb they d all be up for this movie'

'probably time favorite movie story selflessness sacrifice dedication noble cause preachy boring never get old despite seen 15 time last 25 year paul lukas performance brings tear eye bette davis one truly sympathetic role delight kid grandma say like dressed midget child make fun watch mother slow awakening happening world roof believable startling dozen thumb movie'

In [12]:
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [13]:
data['review'][5]

'probably time favorite movie story selflessness sacrifice dedication noble cause preachy boring never get old despite seen 15 time last 25 year paul lukas performance brings tear eye bette davis one truly sympathetic role delight kid grandma say like dressed midget child make fun watch mother slow awakening happening world roof believable startling dozen thumb movie'

In [14]:
reviews = data['review'].values
labels = data['sentiment'].values

In [15]:
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

In [16]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels)

In [17]:
len({x for w in reviews for x in w.split()})

94333

In [18]:
vectorizer = CountVectorizer(max_features=1000, min_df=5, stop_words='english', max_df=0.8)

x = vectorizer.fit_transform(train_sentences)
vocab = vectorizer.get_feature_names_out()
x = x.toarray()

In [19]:
word_count = {l: defaultdict(int) for l in range(2)}

for i in range(x.shape[0]):
    l = train_labels[i]
    for j in range(len(vocab)):
        count = x[i, j]
        if count > 0:
            word_count[l][vocab[j]] += count


In [31]:
def laplace_smoothing(n_label_items, vocab, word_count, word, text_label):
    a = word_count[text_label][word] + 1
    b = n_label_items[text_label] + len(vocab)
    return math.log(a / b)

In [32]:
def group_by_label(x, y, labels):
    data = {}
    for l in labels:
        data[l] = x[np.where(y == l)]
    return data

In [None]:
def fit

In [None]:
# https://www.analyticsvidhya.com/blog/2022/03/building-naive-bayes-classifier-from-scratch-to-perform-sentiment-analysis/