# Sentiment Analysis Using Naive Bayes

Workshopping the model for chat sentiment analysis

In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
import string

nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Replace dataset with a more appropriate one
data = pd.read_csv('../IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
def clean_string(text):
    final_string = ""
    
    text = text.lower()
    
    text = re.sub(r'\n', '', text)
    translator = str.maketrans('', '', string.punctuation)
    
    text = text.translate(translator)
    text = text.split()
    
    useless_words = stop_words + ['im']
    
    text_filtered = [word for word in text if not word in useless_words]
    text_filtered = [re.sub(r'\w*\d\w*', '', w) for w in text_filtered]

    final_string = ' '.join(text_filtered)

    return final_string

data['review'] = data['review'].apply(lambda cw : clean_string(cw)) 
data

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode y...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary schools n...,negative
49998,going disagree previous comment side maltin on...,negative


In [4]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st

data['review'] = data.review.apply(lemmatize_text)
data

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode you...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,going disagree previous comment side maltin on...,negative


In [5]:
reviews = data['review'].values
labels = data['sentiment'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

In [6]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

In [7]:
vec = CountVectorizer(max_features = 3000)
X = vec.fit_transform(train_sentences)
vocab = vec.get_feature_names_out()
X = X.toarray()
word_counts = {}
for l in range(2):
    word_counts[l] = defaultdict(lambda: 0)
for i in range(X.shape[0]):
    l = train_labels[i]
    for j in range(len(vocab)):
        word_counts[l][vocab[j]] += X[i][j]

In [8]:
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    a = word_counts[text_label][word] + 1
    b = n_label_items[text_label] + len(vocab)
    return math.log(a/b)

In [9]:
def group_by_label(x, y, labels):
    data = {}
    for l in labels:
        data[l] = x[np.where(y == l)]
    return data

In [10]:
def fit(x, y, labels):
    n_label_items = {}
    log_label_priors = {}
    n = len(x)
    grouped_data = group_by_label(x, y, labels)
    for l, data in grouped_data.items():
        n_label_items[l] = len(data)
        log_label_priors[l] = math.log(n_label_items[l] / n)
    return n_label_items, log_label_priors

In [11]:
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        words = set(w_tokenizer.tokenize(text))
        for word in words:
            if word not in vocab: continue
            for l in labels:
                log_w_given_l = laplace_smoothing(n_label_items, vocab, word_counts, word, l)
                label_scores[l] += log_w_given_l
        result.append(max(label_scores, key=label_scores.get))
        #result.append(label_scores)
    return result

In [12]:
# TESTING
# labels = [0,1]
# n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)
# pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, ['i am good', 'i am bad'])
# pred

In [13]:
labels = [0,1]
n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)

In [14]:
#pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_sentences)
#print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred))

## Testing the model

In [15]:
data = ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']

In [16]:
import pickle
pickle.dump(predict, open('models/sentiment_model.pkl', 'wb'))

FileNotFoundError: [Errno 2] No such file or directory: 'models/sentiment_model.pkl'

In [None]:
loaded_model = pickle.load(open('models/sentiment_model.pkl', 'rb'))

In [None]:
# loaded_model

In [None]:
# loaded_model(n_label_items, vocab, word_counts, log_label_priors, labels, ['i am sad'])

In [None]:
# arbitrary threshold for pos
pos = 5
neg = 0

for text in data:
    pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, [text])
    if pred[0] == 1:
        pos += 1
    else:
        neg += 1
        
    if pos < neg:
        print("I think you're too sad")
        break
print(pos, neg)

In [17]:
print(n_label_items, vocab, word_counts, log_label_priors, labels)

