# Yelp Data Challenge - NLP

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('last_2_years_restaurant_reviews.csv')

In [3]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2016-03-31,0,6SgvNWJltnZhW7duJgZ42w,5,This is mine and my fiancé's favorite steakhou...,0,oFyOUOeGTRZhFPF9uTqrTQ
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,0,2aeNFntqY2QDZLADNo8iQQ
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2015-03-16,0,UVUMu_bELdA56Ryfbur-DA,5,Every year a group of us (we had 6 this year) ...,1,gmPP4YFrgYsYQqPYokMgFA
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2016-02-10,0,UxFpgng8dPMWOj99653k5Q,5,Truly Fantastic! Best Steak ever. Service was...,0,aVOGlN9fZ-BXcbtj6dbf0g
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2017-02-14,0,Xp3ppynEvVu1KxDHQ3ae8w,5,Delmonico Steakhouse is a steakhouse owned by ...,0,KC8H7qTZVPIEnanw9fG43g


### Define feature variables - the text of the review

In [4]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df['text'].values

In [5]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
documents.dtype, documents.shape

(dtype('O'), (515752,))

In [6]:
documents[10]

"This was supposed to be a very special dinner for 10 for my husband's 50th birthday. Four other couples flew in  from all over to celebrate with us and the first problem is that they seated us at a table for 9. The hostess called that morning and said 9, I corrected her and let her know there were 10 and she said she'd call me back by noon to confirm but I never heard back. Regardless, this Open Table reso said 10 so I didn't worry about it until we all sat down, realized we were one short, and had to get back up and go wait in the bar. \nWhen were seated at a proper table, the appetizers that I'd pre-ordered were sitting out on the table, including fried oysters which had gone ice cold and soggy (very gross). We had to ask for bread long after entrees were served and it was cold, mushy things that looked like they used to be popovers (I thought I'd remembered a wonderful bread service before...?) And so on with the poor service...\nI've eaten here before and service and food were fiv

### Define the target variable (any categorical variable that may be meaningful)

#### I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [7]:
# Make a column and take the values, save to a variable named "target"
df['favorable'] = (df['stars'] > 4)
target = df['favorable'].values
target[:10]

array([ True, False,  True,  True,  True, False,  True,  True,  True,
       False])

#### The statistic of the target variable

In [8]:
# To be implemented
target.mean(), target.std()

(0.46397299477268145, 0.49870036584541505)

In [9]:
documents.shape, target.shape

((515752,), (515752,))

## Create training dataset and test dataset

In [10]:
from sklearn.cross_validation import train_test_split



In [12]:
# Documents is X and target is y
# Split to documents_train, documents_test, target_train, target_test
documents_train, documents_test,target_train,target_test = train_test_split(
    documents,
    target,
    test_size=0.8,
    random_state=42
)

## NLP representation of the documents

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [15]:
# Train the model with your training data
vector_train = vectorizer.fit_transform(documents_train).toarray()
vector_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()

In [17]:
# Use the trained model to transform your test data
vector_test = vectorizer.transform(documents_test).toarray()
vector_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Similar review search engine

In [18]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]]


In [19]:
# use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Draw an arbitrary review from test (unseen in training) documents
some_random_number = 42
search_query = documents_test[some_random_number]
search_queries = [search_query]
print(search_query)
print(search_queries)

Great food and great price, the only thing is the waiting time. During lunch and dinner hours, they are extremely busy. Call in ahead to get your order and pick up for faster service :)
['Great food and great price, the only thing is the waiting time. During lunch and dinner hours, they are extremely busy. Call in ahead to get your order and pick up for faster service :)']


In [21]:
# Transform the drawn review(s) to vector(s)
vector_search_queries = vectorizer.transform(search_queries).toarray()

In [22]:
# Calculate the similarity score(s) between vector(s) and training vectors
similarity_score =cosine_similarity(vector_search_queries, vector_train)

In [23]:
# Let's find top 5 similar reviews
n = 5
similar_reviews = get_top_values(similarity_score[0],n,documents_train)

In [24]:
print('Our search query:')
print(search_queries[0]) 

Our search query:
Great food and great price, the only thing is the waiting time. During lunch and dinner hours, they are extremely busy. Call in ahead to get your order and pick up for faster service :)


In [25]:
print('Most %s similar reviews:' % n)
for i, review in enumerate(similar_reviews):
    print('#%s:' %i)
    print(review)  

Most 5 similar reviews:
#0:
Great buffet and a great price. Ive eaten here 4 times. It was great each time. It can get busy though so i would suggest call ahead to find out the wait time
#1:
Well I'm updating my last review. As previously stated gave them 2 stars because the food is somewhat good. I don't like dealing with the depressed people inside so I decided moving forward I would place the order online and pick up. 
When I arrived a few minutes after my pick up time they told me sorry your order is not ready, it will be faster if you stand in line. How is it faster if I stand in line when have had my order for 45minutes?! They told me it would be an additional 20 minutes to make my order but they could magically make it faster if I stood in line in 10 min before ordering the exact same thing they had on the receipt in front of them. No sense in coming back. I have tried time and time again, and every time it gets worse. This will be my last trip to place I at one point in my life

The result makes senses. The keywords of our search query is 'great food', 'great price', and 'long waiting time'. In the top five similar reviews, we could find these keywords also.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [26]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()
model_nb.fit(vector_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
# Get score for training set
model_nb.score(vector_train, target_train)

0.8092195831313621

In [28]:
# Get score for test set
model_nb.score(vector_test, target_test)

0.8031250454433085

#### Logistic Regression Classifier

In [29]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()
model_lr.fit(vector_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
# Get score for training set
model_lr.score(vector_train, target_train)

0.8433446437227339

In [31]:
# Get score for test set
model_lr.score(vector_test, target_test)

0.8256964338515083

#### Key features(words) that make the positive prediction

In [32]:
# Let's find it out by ranking
n = 20
get_top_values(model_lr.coef_[0], n, words)

['amazing',
 'best',
 'incredible',
 'delicious',
 'awesome',
 'thank',
 'perfection',
 'perfect',
 'phenomenal',
 'fantastic',
 'favorite',
 'great',
 'perfectly',
 'excellent',
 'highly',
 'die',
 'heaven',
 'gem',
 'love',
 'bomb']

#### Key features(words) that make the negative prediction

In [33]:
# Let's find it out by ranking
n = 20
get_bottom_values(model_lr.coef_[0], n, words)

['worst',
 'ok',
 'horrible',
 'rude',
 'bland',
 'terrible',
 'okay',
 'mediocre',
 'slow',
 'disappointing',
 'dry',
 'meh',
 'lacking',
 'unfortunately',
 'overpriced',
 'wasn',
 'awful',
 'average',
 'poor',
 'decent']

#### Random Forest Classifier

In [36]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(max_features = 'auto',
                                  max_depth = 20,
                                  n_estimators = 200,
                                  min_samples_split = 2,
                                  min_samples_leaf = 2,
                                  random_state = 0,
                                  n_jobs = -1)
model_rf.fit(vector_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [37]:
# Get score for training set
model_rf.score(vector_train, target_train)

0.8152981095492002

In [38]:
# Get score for test set
model_rf.score(vector_test, target_test)

0.7826210246193669

#### Important features (words) by inspecting the RFC model

In [39]:
n = 20
get_top_values(model_rf.feature_importances_, n, words)

['amazing',
 'best',
 'great',
 'delicious',
 'ok',
 'bad',
 'awesome',
 'love',
 'wasn',
 'didn',
 'definitely',
 'worst',
 'horrible',
 'minutes',
 'good',
 'pretty',
 'vegas',
 'favorite',
 'highly',
 'like']

### Evaluate the classifiers using cross validation

In [40]:
from sklearn.model_selection import cross_val_score

cv_score = cross_val_score(model_lr,
                          vector_train,
                          target_train,
                          cv = 5,
                          scoring = 'accuracy')
cv_score

array([0.8253599 , 0.8266602 , 0.82598158, 0.83257392, 0.82010762])

### Use grid search to find best predictable classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = [{'penalty':['l1'], 'C':[0.1,100]},
              {'penalty':['l2'], 'C':[0.1,100]}]

scores = ['accuracy']

for score in scores:
    print('# Tuning hyperparameters for %s' % score + '\n\n')
    clf = GridSearchCV(LogisticRegression(),
                       param_grid,
                       cv = 5,
                       scoring = score)
    clf.fit(vector_train[:500, :], target[:500])
    print('Best parameters set found on development set: \n\n')
    print(clf.best_params_)
    print('\n Grid scores on development set: \n\n')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, param in zip(means, stds, clf.cv_results_['params']):
        print('%0.3f (+/-%0.03f) for %r' % (mean, std*2, param))
    
    print('\n Detailed classification report: \n')
    print('The model is trained on the full development set.')
    print('The score is computed on the full development set.')
    print('\n')
    y_true, y_pred = target_test, clf.predict(vector_test)
    print(classification_report(y_true, y_pred))
    print('\n')

# Tuning hyperparameters for accuracy


Best parameters set found on development set: 


{'C': 0.1, 'penalty': 'l1'}

 Grid scores on development set: 


0.568 (+/-0.005) for {'C': 0.1, 'penalty': 'l1'}
0.496 (+/-0.094) for {'C': 100, 'penalty': 'l1'}
0.568 (+/-0.005) for {'C': 0.1, 'penalty': 'l2'}
0.536 (+/-0.078) for {'C': 100, 'penalty': 'l2'}

 Detailed classification report: 

The model is trained on the full development set.
The score is computed on the full development set.


