# Sentiment Analysis Yelp

In [1]:
import pandas as pd

pd.set_option('display.max_colwidth', 200)

In [2]:
DATA_DIR = 'data/sentiment_labelled_sentences/'

IMDB_DATA_FILE = DATA_DIR + 'imdb_labelled.txt'
YELP_DATA_FILE = DATA_DIR + 'yelp_labelled.txt'
AMAZON_DATA_FILE = DATA_DIR + 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']

### Reviews data

In [3]:
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)
amazon_reviews = pd.read_table(AMAZON_DATA_FILE, names=COLUMN_NAMES)
imdb_reviews = pd.read_table(IMDB_DATA_FILE, names=COLUMN_NAMES)

In [4]:
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews])

In [5]:
review_data.sample(10)

Unnamed: 0,Review,Sentiment
875,There was hardly any meat.,0
827,Good case!.,1
30,The problem was the script.,0
978,I vomited in the bathroom mid lunch.,0
269,Love this headset!,1
225,These are certainly very comfortable and functionality is decent.,1
60,At least think to refill my water before I struggle to wave you over for 10 minutes.,0
974,This phone tries very hard to do everything but fails at it's very ability to be a phone.,0
66,Always a great time at Dos Gringos!,1
262,Works great.,1


In [6]:
review_data.Sentiment.value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [7]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')
    return text

In [8]:
review_model_data = review_data.copy()
review_model_data.Review = review_model_data.Review.apply(clean)

In [9]:
review_model_data.sample(10)

Unnamed: 0,Review,Sentiment
482,it was also the right balance of war and love,1
906,after the disappointing dinner we went elsewhere for dessert,0
411,i highly recommend this device to everyone,1
159,unfortunately we must have hit the bakery on leftover day because everything we ordered was stale,0
762,the chicken dishes are ok the beef is like shoe leather,0
675,a fantastic neighborhood gem,1
712,poor quality,0
325,good item low price,1
912,it always cuts out and makes a beep beep beep sound then says signal failed,0
826,not as good as i had hoped,0


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [11]:
tfidf = TfidfVectorizer()
log_reg = LogisticRegression()
log_tfidf = Pipeline([('vect', tfidf),
                       ('clf', log_reg)])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review, 
                                                    review_model_data.Sentiment, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [13]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [14]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 81%'

In [15]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0])