# Sentiment Analysis Yelp

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 200)

In [2]:
DATA_DIR = 'data/sentiment_labelled_sentences/'

IMDB_DATA_FILE = DATA_DIR + 'imdb_labelled.txt'
YELP_DATA_FILE = DATA_DIR + 'yelp_labelled.txt'
AMAZON_DATA_FILE = DATA_DIR + 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)
amazon_reviews = pd.read_table(AMAZON_DATA_FILE, names=COLUMN_NAMES)
imdb_reviews = pd.read_table(IMDB_DATA_FILE, names=COLUMN_NAMES)

In [3]:
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews])

In [4]:
review_data.sample(10)

Unnamed: 0,Review,Sentiment
233,"Ordered a double cheeseburger & got a single patty that was falling apart (picture uploaded) Yeah, still sucks.",0
722,If you haven't choked in your own vomit by the end (by all the cheap drama and worthless dialogue) you've must have bored yourself to death with this waste of time.,0
95,We'll never go again.,0
873,My sister has one also and she loves it.,1
23,I have yet to run this new battery below two bars and that's three days without charging.,1
667,"Looks good in the picture, but this case was a huge disappointment!!",0
22,Service is also cute.,1
581,"Verizon tech support walked my through a few procedures, none of which worked and I ended up having to do a hard re-set, wiping out all my data.",0
106,"The food was delicious, our bartender was attentive and personable AND we got a great deal!",1
576,I swung in to give them a try but was deeply disappointed.,0


In [5]:
review_data.Sentiment.value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [6]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')
    return text

In [7]:
review_model_data = review_data.copy()
review_model_data.Review = review_model_data.Review.apply(clean)

In [8]:
review_model_data.sample(10)

Unnamed: 0,Review,Sentiment
197,bad choice,0
533,it handles some tough issues with dignity and grace and of course has shocking spoiler here,1
57,not much seafood and like 5 strings of pasta at the bottom,0
176,if you see it you should probably just leave it on the shelf,0
398,the characters were all funny and had the peculiarity of not having a true lead character,1
76,imdb ratings only go as low 1 for awful it s time to get some negative numbers in there for cases such as these,0
856,went in for happy hour great list of wines,1
269,love this headset,1
960,insults profound deuchebaggery and had to go outside for a smoke break while serving just to solidify it,0
498,well just if you keep thinking how bad it is,0


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [10]:
tfidf = TfidfVectorizer()
log_reg = LogisticRegression()
log_tfidf = Pipeline([('vect', tfidf),
                       ('clf', log_reg)])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review, 
                                                    review_model_data.Sentiment, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [12]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [13]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 81%'

In [14]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0], dtype=int64)