# Sentiment Analysis Yelp

In [17]:
import pandas as pd

pd.set_option('display.max_colwidth', 200)

In [18]:
DATA_DIR = 'data/sentiment_labelled_sentences/'

IMDB_DATA_FILE = DATA_DIR + 'imdb_labelled.txt'
YELP_DATA_FILE = DATA_DIR + 'yelp_labelled.txt'
AMAZON_DATA_FILE = DATA_DIR + 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']

### Yelp

In [19]:
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)
amazon_reviews = pd.read_table(AMAZON_DATA_FILE, names=COLUMN_NAMES)
imdb_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)

In [20]:
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews])

In [21]:
review_data.sample(10)

Unnamed: 0,Review,Sentiment
742,I'm not sure how long we stood there but it was long enough for me to begin to feel awkwardly out of place.,0
937,Don't bother coming here.,0
668,"The owner used to work at Nobu, so this place is really similar for half the price.",1
441,I'd love to go back.,1
497,"This was my first crawfish experience, and it was delicious!",1
348,5 stars for the brick oven bread app!,1
165,DELICIOUS!!,1
370,I left with a stomach ache and felt sick the rest of the day.,0
290,Great Phone.,1
341,"Of all the dishes, the salmon was the best, but all were great.",1


In [22]:
review_data.Sentiment.value_counts()

1    1500
0    1500
Name: Sentiment, dtype: int64

In [23]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')
    return text

In [24]:
review_model_data = review_data.copy()
review_model_data.Review = review_model_data.Review.apply(clean)

In [25]:
review_model_data.sample(10)

Unnamed: 0,Review,Sentiment
295,you get incredibly fresh fish prepared with care,1
737,try them in the airport to experience some tasty food and speedy friendly service,1
487,i dont think i will be back for a very long time,0
360,the buffet is small and all the food they offered was bland,0
647,food was average at best,0
450,i m glad i found this product on amazon it is hard to find it was not high priced,1
22,service is also cute,1
546,last night was my second time dining here and i was so happy i decided to go back,1
354,it was delicious,1
308,in the summer you can dine in a charming outdoor patio so very delightful,1


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [27]:
tfidf = TfidfVectorizer(strip_accents=None,
                        preprocessor=None,
                        lowercase=False)
log_reg = LogisticRegression(random_state=0, solver='lbfgs')
log_tfidf = Pipeline([('vect', tfidf),
                       ('clf', log_reg)])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review, 
                                                    review_model_data.Sentiment, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [29]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,..., penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [30]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 89%'

In [31]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0], dtype=int64)