In [23]:
# This is an example of a naive Bayes sentiment analyzer from the book "Natural Language Processing in Action" by Lane, Howard and Hapke.
# I only added a few more comments and made it work without installing the NLPIA package.

import pandas as pd
from nltk.tokenize import casual_tokenize
from collections import Counter
from sklearn.naive_bayes import MultinomialNB

# load the tab-separated review snippets: index_col=0 turns the first column of
# the file into the row index, the two data columns are named sentiment and text
movies = pd.read_csv(
    'movieReviewSnippets_GroundTruth.txt',
    sep='\t',
    names=['sentiment', 'text'],
    index_col=0,
)

In [24]:
# let's take a quick look at our data: describe() summarises the numeric columns only,
# so the table below covers `sentiment` and leaves out the free-text `text` column
movies.describe()

Unnamed: 0,sentiment
count,10605.0
mean,0.004831
std,1.92205
min,-3.875
25%,-1.769231
50%,-0.08
75%,1.833333
max,3.941176


In [25]:
# the sentiment scores live on a scale from -4 to 4

# turn every review into its bag of words: a Counter behaves like a dict that
# maps each token to the number of times it occurs in that review
bags_of_words = [Counter(casual_tokenize(review)) for review in movies.text]

# build one table out of all bags: one row per review, one column per token.
# a token that is missing from a review shows up as NaN, so put 0 there first
# and then cast the counts to int
df_bows = pd.DataFrame.from_records(bags_of_words).fillna(0).astype(int)

# (number of reviews, number of distinct tokens)
df_bows.shape

(10605, 20756)

In [26]:
# the 10605 reviews contain 20756 distinct tokens
# what threw me off at first: each Counter is a compact bag of words - it only stores the tokens that actually occur in that single review.
# the table only becomes sparse (mostly zeros) once we build the dataframe - from_records creates one column per unique key across all bags of words and puts NaN wherever a review does not contain that token (which we then replaced by 0).

In [27]:
# now the model itself
# warning: as in the book, there is no train/test split here - the model is
# fitted and evaluated on the very same reviews!
# the target `movies.sentiment > 0` turns the continuous score into a boolean label
nb = MultinomialNB().fit(df_bows, movies.sentiment > 0)
# * 8 - 4 maps a positive prediction to 4 and a negative one to -4, so the
# prediction can be compared with the original score
movies['predicted_sentiment'] = nb.predict(df_bows) * 8 - 4
# absolute difference between prediction and true score for every review ...
movies['error'] = (movies['predicted_sentiment'] - movies['sentiment']).abs()
# ... and its mean over all reviews, rounded to one decimal
movies['error'].mean().round(1)

2.4

In [28]:
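# the MAE (mean absolute error) is 2.4 - that looks large, but the model can only answer -4 or 4 while the true scores are spread continuously between those two values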

In [30]:
# reduce both the true score and the prediction to a 0/1 "is positive" flag
movies['sentiment_ispositive'] = movies['sentiment'].gt(0).astype(int)
movies['predicted_ispositive'] = movies['predicted_sentiment'].gt(0).astype(int)
# show scores and flags side by side for the first 8 reviews
movies[
    [
        'sentiment',
        'predicted_sentiment',
        'sentiment_ispositive',
        'predicted_ispositive',
    ]
].head(8)

Unnamed: 0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispositive
1,2.266667,4,1,1
2,3.533333,4,1,1
3,-0.6,-4,0,0
4,1.466667,4,1,1
5,1.733333,4,1,1
6,2.533333,4,1,1
7,2.466667,4,1,1
8,1.266667,-4,1,0


In [31]:
# accuracy: share of reviews whose predicted flag equals the true flag
# (measured on the same reviews the model was fitted on)
(movies['predicted_ispositive'] == movies['sentiment_ispositive']).mean()

0.9344648750589345