# Sentiment Analysis Yelp

In [7]:
import pandas as pd

pd.set_option('display.max_colwidth', 200)

In [8]:
DATA_DIR = 'data/sentiment_labelled_sentences/'

IMDB_DATA_FILE = DATA_DIR + 'imdb_labelled.txt'
YELP_DATA_FILE = DATA_DIR + 'yelp_labelled.txt'
AMAZON_DATA_FILE = DATA_DIR + 'amazon_cells_labelled.txt'

COLUMN_NAMES = ['Review', 'Sentiment']

### Reviews data

In [9]:
yelp_reviews = pd.read_table(YELP_DATA_FILE, names=COLUMN_NAMES)
amazon_reviews = pd.read_table(AMAZON_DATA_FILE, names=COLUMN_NAMES)
imdb_reviews = pd.read_table(IMDB_DATA_FILE, names=COLUMN_NAMES)

In [10]:
review_data = pd.concat([amazon_reviews, imdb_reviews, yelp_reviews])

In [11]:
review_data.sample(10)

Unnamed: 0,Review,Sentiment
574,"You learn a lot about the real inside emotions of people in this movie, and a lot about the movie business itself.",1
430,Fantastic earphones.,1
635,im surprised this is a good quality car charger and there's not much reviews about it.,1
487,"There are many continuity errors: one other user commented on different cars in the garage, Joe's glasses...the one that got to me the most was the fact Joe's facial hair configuration seemed to c...",0
591,Linked to my phone without effort.,1
593,"This film highlights the fundamental flaws of the legal process, that it's not about discovering guilt or innocence, but rather, is about who presents better in court.",1
330,Whatever prompted such a documentary is beyond me!,0
205,Very good lunch spot.,1
599,"For the price on Amazon, it is an excellent product, which I would highly recommend.",1
435,Warning - Stay away.,0


In [12]:
review_data.Sentiment.value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [13]:
import re

def clean(text):
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = text.replace('hadn t' , 'had not')\
               .replace('wasn t', 'was not')\
               .replace('didn t', 'did not')
    return text

In [14]:
review_model_data = review_data.copy()
review_model_data.Review = review_model_data.Review.apply(clean)

In [15]:
review_model_data.sample(10)

Unnamed: 0,Review,Sentiment
696,the phone can also take great pictures and even video clips,1
113,great product fast shipping,1
533,it handles some tough issues with dignity and grace and of course has shocking spoiler here,1
112,i mean this in a terrible way,0
699,comfortable fit you need your headset to be comfortable for at least an hour at a time if not for an entire day,1
889,no allergy warnings on the menu and the waitress had absolutely no clue as to which meals did or did not contain peanuts,0
212,great price also,1
905,not nearly as good looking as the amazon picture makes it look,0
545,tom wilkinson s character is a man who is not prepared for the ordeal that is about to begin but he takes the matter in hand as the story progresses and this great actor gives a performance that m...,1
388,it worked very well,1


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [17]:
tfidf = TfidfVectorizer(strip_accents=None,
                        preprocessor=None,
                        lowercase=False)
log_reg = LogisticRegression(random_state=0, solver='lbfgs')
log_tfidf = Pipeline([('vect', tfidf),
                       ('clf', log_reg)])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(review_model_data.Review, 
                                                    review_model_data.Sentiment, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [19]:
log_tfidf.fit(X_train.values, y_train.values)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [20]:
test_accuracy = log_tfidf.score(X_test.values, y_test.values)
'The model has a test accuracy of {:.0%}'.format(test_accuracy)

'The model has a test accuracy of 81%'

In [21]:
log_tfidf.predict(['I loved this place', 'I hated this place'])

array([1, 0])