In [None]:
# This started out as an exercise on naive Bayes for text classification.
# Using the data provided (text for reviews on rottentomatoes.com 
# and the reviews' fresh or not designation, i.e., the text classification),
# I instead explored some related methods I was interested in, specifically:
# 
#   1. Implementing text normalization and preprocessing
#   2. Comparing outcomes from several text classification models (Naive Bayes, Logistic Regression, and Support Vector Machines)
#   3. And doing so with and without term-frequency/inverse document frequency (TF-IDF) Vectorization


In [22]:
# Read in and set up libraries
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# import seaborn as sns
from six.moves import range
import re
import nltk

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

import warnings
warnings.filterwarnings("ignore")

In [23]:
# Some EDA and a look at the distribution of fresh ratings
# For some unexplained and inexplicable reason, the original exercise examined the distribution of reviewers with counts > 100
# So I did histograms for both those reviews and the entire data set

In [24]:
critics = pd.read_csv('./critics.csv')
#let's drop rows with missing quotes
critics = critics[~critics.quote.isnull()]
critics.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
1,Derek Adams,fresh,114709,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
2,Richard Corliss,fresh,114709,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559,Toy story
3,David Ansen,fresh,114709,Newsweek,A winning animated feature that has something ...,2008-08-18,9559,Toy story
4,Leonard Klady,fresh,114709,Variety,The film sports a provocative and appealing st...,2008-06-09,9559,Toy story
5,Jonathan Rosenbaum,fresh,114709,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10,9559,Toy story


In [25]:
n_reviews = len(critics)
n_movies = critics.rtid.unique().size
n_critics = critics.critic.unique().size

In [26]:
# recode 'fresh' reviews with a boolean
df = critics.copy()
df['fresh'] = df.fresh == 'fresh'
# df.fresh.value_counts()per

In [27]:
grp = df.groupby('critic')
counts = grp.critic.count()  
# print (counts)

In [28]:
means = grp.fresh.mean()  

In [29]:
final = pd.concat([counts, means], axis=1)

In [30]:
df3 = final[final['critic'] > 100]  

In [31]:
df3

Unnamed: 0_level_0,critic,fresh
critic,Unnamed: 1_level_1,Unnamed: 2_level_1
Bosley Crowther,141,0.695035
Dave Kehr,338,0.591716
Derek Adams,151,0.615894
Desson Thomson,391,0.588235
Edward Guthmann,119,0.638655
Emanuel Levy,108,0.62963
Geoff Andrew,349,0.707736
Hal Hinson,183,0.530055
James Berardinelli,804,0.603234
Janet Maslin,515,0.708738


In [37]:
import plotly
plotly.tools.set_credentials_file(username="Ross.Brown.Ph.D.", api_key='yPNZCAkYOyi7wAKtZrSM')

In [38]:

import plotly.offline as pyo

from plotly.graph_objs import *

import plotly.plotly as py

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import numpy as np

from plotly import tools
from plotly.graph_objs import Bar, Data, Figure, Layout, Marker, Scatter

init_notebook_mode(connected=True)

import cufflinks as cf

In [None]:
df3['percent'] = 100
df3['ratio'] = df3['fresh'] * df3['percent']
df4 = df3['ratio']

In [21]:
# only reviewers with more than 100 reviews
# Need to add a note that this is only 37 reviewers
data = [go.Histogram(x=df4,
                    nbinsx=10,
                    autobinx = False)]
layout = go.Layout(
    title='How Often do Invididual Critics Rate a Movie Fresh?',
    xaxis=dict(
        range=[30,90],
        title='Percentage of Movies Deemed Fresh'
    ),
    yaxis=dict(
        range=[0,10],
        title='Number of Critics'
    ),
    bargap=0.03
)
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

In [None]:
# do that again with all the reviews

In [42]:
final['percent'] = 100

In [43]:
final['ratio'] = final['fresh'] * final['percent']

In [44]:
df5 = final['ratio']

In [47]:

data = [go.Histogram(x=df5)]
layout = go.Layout(
    title='How Often do Invididual Critics Rate a Movie Fresh?',
    xaxis=dict(
        # range=[30,90],
        title='Percentage of Movies Deemed Fresh'
    ),
    yaxis=dict(
        # range=[0,10],
        title='Number of Critics'
    ),
    bargap=0.03
)
fig = go.Figure(data=data, layout=layout)
pyo.iplot(fig)

In [None]:
# The distribution of all the review includes many more critics with 100% of their reviews as fresh,
# compared to the distribution of reviwers with 100 or more reviews. 
# The reviewers with more reviews have almost a normal distribution in terms of the percent of time
# they award fresh reviews; the distribution skews slightly toward more frequently rating movies fresh.

In [None]:
# Next steps: Text prep and vectorizing

In [59]:
critics = pd.read_csv('./critics.csv')
critics = critics[~critics.quote.isnull()]
critics['fresh'] = critics.fresh == 'fresh'
critics.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
1,Derek Adams,True,114709,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
2,Richard Corliss,True,114709,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559,Toy story
3,David Ansen,True,114709,Newsweek,A winning animated feature that has something ...,2008-08-18,9559,Toy story
4,Leonard Klady,True,114709,Variety,The film sports a provocative and appealing st...,2008-06-09,9559,Toy story
5,Jonathan Rosenbaum,True,114709,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10,9559,Toy story


In [60]:
# Text normalization/preprocessing

In [61]:
critics['quote'] = critics['quote'].str.lower()

In [62]:
# remove numbers
critics['quote'] = critics['quote'].str.replace("\d+", "")

In [63]:
# set the classification outcome
y = critics.fresh.values.astype(np.int)
y

array([1, 1, 1, ..., 1, 1, 1])

In [64]:
# Transform the movie review text data (the individual documents) into vectors, including
# preprocessing built into sklearn, i.e., removing stop words

In [66]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(critics['quote'], y, test_size=0.33, random_state=53)

count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)

count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])


['_and_', 'aaron', 'abandon', 'abandoned', 'abandonment', 'abandons', 'abbott', 'abc', 'abduct', 'abe']


In [None]:
# Fit three models on the on the vectorized corpus for comparing their performance at predicting 'fresh' reviews
# based on the text in the reviews

In [67]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)
pred = nb_classifier.predict(count_test)
metrics.accuracy_score(y_test, pred)
print("Naive Bayes classification accuracy score: {:.2f}".format(metrics.accuracy_score(y_test, pred)))



Naive Bayes classification accuracy score: 0.75


In [56]:
# from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
clf = LogisticRegression() 
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
metrics.accuracy_score(y_test, pred)
print("Logistic Regression classification accuracy score: {:.2f}".format(metrics.accuracy_score(y_test, pred)))


Logistic Regression classification accuracy score: 0.75


In [72]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier() 
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
metrics.accuracy_score(y_test, pred)
print("SGDClassifier accuracy score: {:.2f}".format(metrics.accuracy_score(y_test, pred)))

SGDClassifier accuracy score: 0.73


In [None]:
# Fit those models again with TF-IDF vectorization

In [75]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

X_train, X_test, y_train, y_test = train_test_split(critics['quote'], y, test_size=0.33, random_state=53)


count_vectorizer = CountVectorizer(stop_words='english')

X_train_counts = count_vectorizer.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [76]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
pred = nb_classifier.predict(count_test)
metrics.accuracy_score(y_test, pred)
print("TF-IDF V Naive Bayes classification accuracy score: {:.2f}".format(metrics.accuracy_score(y_test, pred)))

TF-IDF V Naive Bayes classification accuracy score: 0.73


In [78]:
# from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
clf = LogisticRegression() 
clf.fit(X_train_tfidf, y_train)
pred = clf.predict(count_test)
metrics.accuracy_score(y_test, pred)
print("TF-IDF V Logistic Regression classification accuracy score: {:.2f}".format(metrics.accuracy_score(y_test, pred)))

TF-IDF V Logistic Regression classification accuracy score: 0.74


In [79]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier() 
clf.fit(X_train_tfidf, y_train)
pred = clf.predict(count_test)
metrics.accuracy_score(y_test, pred)
print("TF-IDF V SGDClassifier accuracy score: {:.2f}".format(metrics.accuracy_score(y_test, pred)))

TF-IDF V SGDClassifier accuracy score: 0.74


In [None]:
# For this data set, outcomes were not different across the models with and without TF-IDF vectorization