# Sentiment Analysis Yelp

In [1]:
import pandas as pd

pd.set_option('display.max_colwidth', 200)

In [2]:
DATA_DIR = 'data/sentiment_labelled_sentences/'

IMDB_DATA_FILE = DATA_DIR + 'imdb_labelled.txt'
YELP_DATA_FILE = DATA_DIR + 'yelp_labelled.txt'
AMAZON_DATA_FILE = DATA_DIR + 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']

### Yelp

In [3]:
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)
amazon_reviews = pd.read_table(AMAZON_DATA_FILE, names=COLUMN_NAMES)
imdb_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)

In [4]:
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews])

In [5]:
review_data.sample(10)

Unnamed: 0,Review,Sentiment
501,"I'd rather eat airline food, seriously.",0
364,Server did a great job handling our large rowdy table.,1
867,"For sushi on the Strip, this is the place to go.",1
363,Definitely a turn off for me & i doubt I'll be back unless someone else is buying.,0
665,I could not recommend these more.,0
625,Ambience is perfect.,1
775,Great bluetooth!.,1
358,Best fish I've ever had in my life!,1
189,"Also were served hot bread and butter, and home made potato chips with bacon bits on top....very original and very good.",1
290,Waited 2 hours & never got either of our pizzas as many other around us who came in later did!,0


In [6]:
review_data.Sentiment.value_counts()

1    1500
0    1500
Name: Sentiment, dtype: int64

In [7]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')
    return text

In [8]:
review_model_data = review_data.copy()
review_model_data.Review = review_model_data.Review.apply(clean)

In [9]:
review_model_data.sample(10)

Unnamed: 0,Review,Sentiment
625,ambience is perfect,1
943,not good by any stretch of the imagination,0
662,the owners are super friendly and the staff is courteous,1
551,thank you for such great service,1
348,this little device has transformed my organizational capability and made my life a whole lot easier,1
845,this place deserves no stars,0
246,excellent wallet type phone case,1
574,i ve had no trouble accessing the internet downloading ringtones or performing any of the functions,1
728,the han nan chicken was also very tasty,1
480,we asked for the bill to leave without eating and they did not bring that either,0


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [17]:
tfidf = TfidfVectorizer(strip_accents=None,
                        preprocessor=None,
                        lowercase=False)
log_reg = LogisticRegression(random_state=0, solver='lbfgs')
log_tfidf = Pipeline([('vect', tfidf),
                       ('clf', log_reg)])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review, 
                                                    review_model_data.Sentiment, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [19]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [20]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 89%'

In [21]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0])