In [4]:
#import statements
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

#display output directly below plotting commands inline
%matplotlib inline

In [5]:
#load the yelp review dataset with pandas, reading only the first 100,000 rows
REVIEWS_CSV = '../input files/yelp_review.csv'
MAX_ROWS = 100000
reviews = pd.read_csv(REVIEWS_CSV, nrows=MAX_ROWS)

In [6]:
#add a column holding the number of characters in each review's text
reviews['text length'] = reviews['text'].map(len)

#keep only the reviews rated exactly 1 star or 5 stars
is_one_or_five = reviews['stars'].isin([1, 5])
review_class = reviews[is_one_or_five]
review_class.shape

(55986, 10)

In [7]:
#create variables for review text and rating
X = review_class['text']
y = review_class['stars']

import string

#global stopwords, loaded once so the tokenizer does not reload them per review;
#kept as a set so each membership test is a hash lookup instead of a list scan
stoplist = set(stopwords.words('english'))

#translation table that deletes every character listed in string.punctuation
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def text_processor(text, stop_words=None):
    """Split a review into tokens, dropping punctuation, stop words and numbers.

    Punctuation characters are removed first, then the text is split on
    whitespace. A token is discarded when its lower-cased form is found in
    the stop words or when it consists only of digits. Kept tokens retain
    their original casing.

    Parameters:
        text: the review text to tokenize (a str).
        stop_words: optional container of lower-case words to discard.
            When omitted, the module-level ``stoplist`` is used, so the
            function can still be called with a single argument.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    if stop_words is None:
        #fall back to the shared global list, resolved at call time
        stop_words = stoplist
    nopunc = text.translate(_PUNCT_TABLE)
    return [word for word in nopunc.split()
            if word.lower() not in stop_words and not word.isdigit()]

In [8]:
#learn the vocabulary from the raw review text; the column is read straight from
#review_class because a later cell rebinds X to the vectorized matrix, so fitting
#on X would fail if this cell were re-run after that one
#NOTE(review): the vocabulary is learned from every selected review, including
#the ones that are later held out for testing
text_vector = CountVectorizer(analyzer=text_processor).fit(review_class['text'])

#size of the learned vocabulary
len(text_vector.vocabulary_)

101927

In [9]:
#vectorize the review text; transform the original column rather than X itself so
#that re-running this cell does not feed the already-vectorized matrix back in
X = text_vector.transform(review_class['text'])

In [10]:
#split the vectorized reviews and their ratings into training and testing sets
from sklearn.model_selection import train_test_split

#hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=101, test_size=0.2)

In [11]:
#import the multinomial naive bayes classifier and fit it on the training split
from sklearn.naive_bayes import MultinomialNB

#fit returns the estimator itself, so the fitted model can be bound directly
mnb = MultinomialNB().fit(X_train, y_train)
mnb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [34]:
print('Here`s a run of the mill positive review, they seem elated to have eaten here:')
print('\n')

#pull the review text stored under index label 1 and display it
positive_review = review_class.loc[1, 'text']
positive_review

Here`s a run of the mill positive review, they seem elated to have eaten here:




"Small unassuming place that changes their menu every so often. Cool decor and vibe inside their 30 seat restaurant. Call for a reservation. \n\nWe had their beef tartar and pork belly to start and a salmon dish and lamb meal for mains. Everything was incredible! I could go on at length about how all the listed ingredients really make their dishes amazing but honestly you just need to go. \n\nA bit outside of downtown montreal but take the metro out and it's less than a 10 minute walk from the station."

In [19]:
#convert the positive review into a count vector; it is wrapped in a list so the
#vectorizer receives a collection holding this one document
positive_review_transformed = text_vector.transform([positive_review])

#classify it and show the star rating predicted for that single row
print('Predicted Star Rating:')
positive_prediction = mnb.predict(positive_review_transformed)
positive_prediction[0]

Predicted Star Rating:


5

In [32]:
print('Here`s is an upset karaoke goer:')
print('\n')

#pull the review text stored under index label 36000 and display it
negative_review = review_class.loc[36000, 'text']
negative_review

Here`s is an upset karaoke goer:




'You didn\'t hear it through the grapevine...you heard it straight from the horses mouth. DO NOT GO HERE FOR KARAOKE. Go for people watching if anything. I should\'ve known when I called ahead. The sweet girl on the phone gave us directions. All of my friends were adamant about other places and I was rooting for the nice bartender and the possibility of a song.\n\nWhen we arrived the place was packed! Yet she still knew I was the one who called (very Children of the Corn). I immediately (at 8:15) put 2 songs in. I said you can just pick one if it fits your line-up. Hoping they\'d call me and then one of the 4 people I strung along would add a song in.\n\nThis did not happen. \n\nWhile we each had 3-4 drinks (and 14 songs played), yes, I counted them. Not one persons name was EVER called. It was like a 2-disc CD was playing and they already knew who was next. \n\nIf you are new and you are going here to sing. Wait to be disappointed. It\'s so weird. Not like a cool dive bar weird, but l

In [26]:
#convert the negative review into a count vector; it is wrapped in a list so the
#vectorizer receives a collection holding this one document
negative_review_transformed = text_vector.transform([negative_review])

#classify it and show the star rating predicted for that single row
print('Predicted Star Rating:')
negative_prediction = mnb.predict(negative_review_transformed)
negative_prediction[0]

Predicted Star Rating:


1