# Sentiment Analysis Yelp

In [130]:
import pandas as pd

pd.set_option('display.max_colwidth', 200)

In [129]:
DATA_DIR = 'data/sentiment_labelled_sentences/'

IMDB_DATA_FILE = DATA_DIR + 'imdb_labelled.txt'
YELP_DATA_FILE = DATA_DIR + 'yelp_labelled.txt'
AMAZON_DATA_FILE = DATA_DIR + 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']

### Yelp

In [193]:
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)
amazon_reviews = pd.read_table(AMAZON_DATA_FILE, names=COLUMN_NAMES)
imdb_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)

In [194]:
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews])

In [195]:
review_data.sample(10)

Unnamed: 0,Review,Sentiment
365,"I find wasting food to be despicable, but this just wasn't food.",0
924,Works good.,1
529,The bartender was also nice.,1
412,Jawbone Era is awesome too!,1
248,The lighting is just dark enough to set the mood.,1
613,I was amazed at the quick arrival of the two original lg cell phone batteries and and at a fraction of the price.,1
3,Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
457,Nargile - I think you are great.,1
452,I am far from a sushi connoisseur but I can definitely tell the difference between good food and bad food and this was certainly bad food.,0
624,Pretty awesome place.,1


In [196]:
review_data.Sentiment.value_counts()

1    1500
0    1500
Name: Sentiment, dtype: int64

In [207]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')
    return text

In [208]:
review_model_data = review_data.copy()
review_model_data.Review = review_dataset.Review.apply(clean)

In [209]:
review_model_data.sample(10)

Unnamed: 0,Review,Sentiment
12,if the two were seperated by a mere 5 ft i started to notice excessive static and garbled sound from the headset,0
533,if you love authentic mexican food and want a whole bunch of interesting yet delicious meats to choose from you need to try this place,1
465,the food was outstanding and the prices were very reasonable,1
136,i had a seriously solid breakfast here,1
559,none of it works just don t buy it,0
781,join the club and get awesome offers via email,1
550,i really do recommend this place you can go wrong with this donut place,1
408,plantronics bluetooth excelent buy,1
180,the burger had absolutely no flavor the meat itself was totally bland the burger was overcooked and there was no charcoal flavor,0
978,it fits so securely that the ear hook does not even need to be used and the sound is better directed through your ear canal,1


In [210]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [211]:
tfidf = TfidfVectorizer(strip_accents=None,
                        preprocessor=None,
                        lowercase=False)
log_reg = LogisticRegression(random_state=0, solver='lbfgs')
log_tfidf = Pipeline([('vect', tfidf),
                       ('clf', log_reg)])

In [212]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review, 
                                                    review_model_data.Sentiment, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [213]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,..., penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [217]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 89%'

In [218]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0], dtype=int64)