In [2]:
# This is an example of a naive Bayes sentiment analyzer from the book "Natural Language Processing in Action" by Lane, Howard and Hapke.
# I only added a few more comments and made it work without installing the NLPIA package.

import pandas as pd
from nltk.tokenize import casual_tokenize
from collections import Counter
from sklearn.naive_bayes import MultinomialNB

# load the movie review snippets: a tab-separated text file without a header row,
# so the column names are given here; the first column of the file becomes the row index
movies = pd.read_csv(
    'movieReviewSnippets_GroundTruth.txt',
    sep='\t',
    index_col=0,
    names=['sentiment', 'text'],
)

In [4]:
# let's take a quick look at our data: summary statistics
# (count, mean, std, min, quartiles, max) of the numeric columns
movies.describe()

Unnamed: 0,sentiment
count,10605.0
mean,0.004831
std,1.92205
min,-3.875
25%,-1.769231
50%,-0.08
75%,1.833333
max,3.941176


In [6]:
# the reviews are rated on a scale from -4 to 4

# turn every review into its bag-of-words representation: tokenize the text and
# count how often each token occurs (a Counter is a dict of token -> count)
bags_of_words = [Counter(casual_tokenize(review)) for review in movies.text]

# build one table from all bags: one row per review, one column per distinct token.
# a token that does not occur in a review comes out as NaN, so we replace NaN with 0,
# which in turn allows us to store the counts as int
df_bows = (
    pd.DataFrame.from_records(bags_of_words)
    .fillna(0)
    .astype(int)
)

# let's see how large our bow table has become: (number of reviews, number of distinct tokens)
df_bows.shape

(10605, 20756)

In [8]:
# the 10605 reviews include 20756 different tokens
# what threw me off at first is: the bag-of-words conversion via the Counter returns compact representations - only the tokens actually present in a review are included in its bow representation.
# these become sparse (mostly zeros) when creating the dataframe - from_records creates one column per unique key across all bow representations and fills the cells of tokens that are missing from a review with NaN (which we then replaced by 0).

In [10]:
# let's train the model itself
# warning: this example from the book does not split test and training data -
# the model is evaluated below on the very same reviews it was fitted on!
nb = MultinomialNB().fit(df_bows, movies['sentiment'] > 0)  # `sentiment > 0` turns the score into a discrete label

# predict() returns the binary label; map False/True onto -4/4 so that the
# prediction can be compared with the original sentiment score
predicted_label = nb.predict(df_bows)
movies['predicted_sentiment'] = predicted_label * 8 - 4

# let's have a look at the model's performance: the mean absolute difference
# between the predicted and the true sentiment
movies['error'] = (movies['predicted_sentiment'] - movies['sentiment']).abs()
movies['error'].mean().round(1)

2.4

In [12]:
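# the mean absolute error (MAE) is 2.4 (the predictions are always exactly -4 or 4, while the true ratings lie anywhere in between)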

In [14]:
# reduce both the true and the predicted sentiment to a 0/1 flag (1 = score above zero)
# so they can be compared as class labels
movies['sentiment_ispositive'] = movies['sentiment'].gt(0).astype(int)
movies['predicted_ispositive'] = movies['predicted_sentiment'].gt(0).astype(int)

# show the scores and flags side by side for the first few reviews
comparison_columns = [
    'sentiment',
    'predicted_sentiment',
    'sentiment_ispositive',
    'predicted_ispositive',
]
movies[comparison_columns].head(8)

Unnamed: 0,sentiment,predicted_sentiment,sentiment_ispositive,predicted_ispositive
1,2.266667,4,1,1
2,3.533333,4,1,1
3,-0.6,-4,0,0
4,1.466667,4,1,1
5,1.733333,4,1,1
6,2.533333,4,1,1
7,2.466667,4,1,1
8,1.266667,-4,1,0


In [16]:
# accuracy on the movie reviews: share of rows whose predicted flag equals the true flag
movie_hits = movies['predicted_ispositive'] == movies['sentiment_ispositive']
movie_hits.sum() / len(movies)

0.9344648750589345

In [32]:
# let's see how our model performs on a different dataset: product review snippets
# stored in the same tab-separated layout as the movie reviews
products = pd.read_csv('amazonReviewSnippets_GroundTruth.txt', sep='\t', index_col=0, names=['sentiment', 'text'])

# we convert these reviews to BoW in the same way as the movie reviews
bags_of_words = [Counter(casual_tokenize(text)) for text in products.text]
df_product_bows = pd.DataFrame.from_records(bags_of_words).fillna(0).astype(int)

# stack the movie rows and the product rows into one table whose columns are the
# union of both vocabularies (cells for tokens unknown to one dataset become NaN).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; the default arguments give the same result as append did.
df_all_bows = pd.concat([df_bows, df_product_bows])
df_all_bows.columns
# we now have more columns than in df_bows - the additional ones are tokens that
# occur in the product reviews but never in the movie reviews


     sentiment                                               text
1_1      -0.90  troubleshooting ad-2500 and ad-2600 no picture...
1_2      -0.15  repost from january 13, 2004 with a better fit...
1_3      -0.20  does your apex dvd player only play dvd audio ...
1_4      -0.10  or does it play audio and video but scrolling ...
1_5      -0.50  before you try to return the player or waste h...
   troubleshooting   ad    -  2500  and  2600   no  picture  scrolling    b  \
0              1.0  2.0  2.0   1.0  1.0   1.0  1.0      1.0        1.0  1.0   
1              NaN  NaN  NaN   NaN  NaN   NaN  NaN      NaN        NaN  NaN   
2              NaN  NaN  NaN   NaN  NaN   NaN  NaN      NaN        NaN  NaN   
3              NaN  NaN  NaN   NaN  2.0   NaN  NaN      NaN        1.0  NaN   
4              1.0  NaN  NaN   NaN  NaN   NaN  NaN      NaN        NaN  NaN   

   ...  undone  warrranty  expire  expired  voids  develops  soldier  serving  \
0  ...     NaN        NaN     NaN      NaN    Na

Index(['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'Century's',
       'new',
       ...
       'sligtly', 'owner', '81', 'defectively', 'warrranty', 'expire',
       'expired', 'voids', 'baghdad', 'harddisk'],
      dtype='object', length=23302)

In [35]:
# pull the product rows back out of the combined table (they come after the
# len(movies) movie rows) and keep only the token columns of df_bows, in the same
# order - these are the features the model was fitted on
df_product_bows = df_all_bows.iloc[len(movies):][df_bows.columns]
# we need to fill the NaN again that stem from the tokens available in the movie reviews, but not the product reviews.
df_product_bows = df_product_bows.fillna(0).astype(int)
print(df_product_bows.head())
products['ispos'] = (products.sentiment > 0).astype(int)
# pass the DataFrame itself instead of `.values`: the model was fitted on a DataFrame,
# so newer scikit-learn versions can check that the column names match the training
# features (and do not warn about missing feature names); the predictions are unchanged
products['predicted_ispositive'] = nb.predict(df_product_bows).astype(int)


   The  Rock  is  destined  to  be  the  21st  Century's  new  ...  Ill  \
0    0     0   0         0   0   0    0     0          0    0  ...    0   
1    0     0   0         0   0   0    0     0          0    0  ...    0   
2    0     0   0         0   0   0    0     0          0    0  ...    0   
3    0     0   0         0   0   0    0     0          0    0  ...    0   
4    0     0   0         0   1   0    2     0          0    0  ...    0   

   slummer  Rashomon  dipsticks  Bearable  Staggeringly  ’  ve  muttering  \
0        0         0          0         0             0  0   0          0   
1        0         0          0         0             0  0   0          0   
2        0         0          0         0             0  0   0          0   
3        0         0          0         0             0  0   0          0   
4        0         0          0         0             0  0   0          0   

   dissing  
0        0  
1        0  
2        0  
3        0  
4        0  

[5 rows

In [36]:
# accuracy on the product reviews: share of rows whose predicted flag equals the true flag
product_hits = products['predicted_ispositive'] == products['ispos']
product_hits.sum() / len(products)

0.5572476029328821