# Yelp Data Challenge_NLP and Modeling

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('last_2_years_restaurant_reviews.csv')

In [4]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-02-14,0,VETXTwMw6qxzOVDlXfe6Tg,5,went for dinner tonight. Amazing my husband ha...,0,ymlnR8UeFvB4FZL56tCZsA
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-12-04,0,S8-8uZ7fa5YbjnEtaW15ng,5,This was an amazing dinning experience! ORDER ...,0,9pSSL6X6lFpY3FCRLEH3og
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2016-09-13,0,N1Z93BthdJ7FT2p5S22jIA,3,Went for a nice anniversary dinner. Researched...,0,CEtidlXNyQzgJSdF1ubPFw
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-05-20,0,Pnkrj90xfykhHyo4BSFRsw,5,ABSOLUTE MUST IN VEGAS! Loved everything my bo...,0,cZVQGCZ_fHtTdfiyGVJPdg
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-12-30,0,Oeh7e6U2xaDQI9L9i4x_Gw,2,I had high hopes for Delmonico's Steakhouse in...,0,li2cBZl60vgqihDJJG7jeA


### Define the feature variables, here is the text of the review

In [5]:
# Take the values of the column that contains review text data, save to variable named "documents"

documents = df['text'].values
#here.values makes result numpy array

In [6]:
# inspect the documents , e.g. check the size, take a peek at elements of the numpy array
documents.dtype

dtype('O')

In [7]:
documents.shape

(365550,)

In [8]:
documents[10]

"My husband and I came out to Vegas for a mini vacation. Along the way he wanted to try a good steak dinner and boy we found it! \r\n\r\nI ordered the bone out ribeye and my husband ordered the Japanese Wagyu. Side dishes were the mushrooms and steak fries. 2 bottles of Syrah for drinks and banana cream pie for dessert. \r\n\r\nEveryone from the hostess to the bussers and servers were nothing short of gracious. Our server was John and he was outstanding with his wine recommendations and overall service. \r\n\r\nOur steaks were cooked perfectly, our glasses were never empty. Definitely an experience to remember. Delmonico's will be a destination for us every time we come to the city."

### Define the target variable(any categorical variable that may be meaningful)

Here, I am interested in perfect(5 stars) and inperfect(1-4 stars)rating

In [9]:
# Make a column and take the value, save to a variable named "target"
df['favorable'] = df['stars'] >4
target = df['favorable'].values
target[:10]

array([ True,  True, False,  True, False, False, False, False,  True,
        True])

In [10]:
# check some statistics of the target variable
target.mean()

0.49467377923676653

In [11]:
target.std()

0.4999716305675566

In [12]:
documents.shape

(365550,)

In [13]:
target.shape

(365550,)

### Create training dataset and testing dataset

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split

In [15]:
import sklearn
print (sklearn.__version__)

0.20.0


In [16]:
# Documents is x, target is y
#Now split the data to training set and test set

In [17]:
#Split to documents_train, documents_test, target_train, target_test
documents_train, documents_test, target_train, target_test = train_test_split(
    documents,
    target,
    test_size = 0.8,
    random_state = 42
)

### Get NLP representation of the documents

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Create TfidfVectorizer and name it vectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 5000)

In [20]:
# Train the model with the training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [21]:
# Create the vocab of the Tfidf
words = vectorizer.get_feature_names()

In [22]:
vectors_train.shape

(73110, 5000)

In [23]:
# Use the trained model to transform the test data
vectors_test = vectorizer.transform(documents_test).toarray()

In [24]:
vectors_test.shape

(292440, 5000)

### Similar review search engine

In [23]:
# we will need these helper methods pretty soon
def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST
    
    Given a list of values, find the indices with highest n values.
    Return the labels for each of these indices.
    
    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ['cat', 'dog', 'mouse', 'pig', 'rabbit']
    output:['cat', 'pig']
    '''
    return[labels[i] for i in np.argsort(lst)[::-1][:n]] #np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST
    
    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.
    
    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ['cat', 'dog', 'mouse', 'pig', 'rabbit']
    output:['dog', 'mouse']
    '''
    return[labels[i] for i in np.argsort(lst)[:n]]

In [24]:
# Let's use cosing similarity
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Draw an arbitrary review from test(unseen in training) documents
some_random_number = 42
search_query = documents_test[some_random_number]
search_queries = [search_query] #Need to be put into a list - like format
print(search_query)
print(search_queries)

Antiques galore with great hospitality everything you'ld expect from an old school diner and more!   The staff here is extremely friendly and make sure you are well taken care of during your visit.  

Seems like the majority of costumers here are working class mixed with retirees and definitely regulars to this establishment.  All the decor takes you back to a time definitely not forgetten here from the 1950's telephone to the early 80's Polaroid camera the window booths have vintage conversation starters galore.

The food is delicious and appropriately prices on my first visit I had Curtis' Haystack ( biscuits and gravy topped with hashbrowns eggs and bacon) and it was a full meal for sure so being your best appetite for that one 4.25/5 

Today I had the French Toast Special and the French Toast was prepared perfectly a nice buttery flavor and disappeared quickly. 2 eggs over medium and bacon completed the meal. 4.5/5.

So if you are looking for a nice relaxing breakfast /lunc

In [26]:
# Transform the drawn review(s) to vector(s)
vector_search_queries = vectorizer.transform(search_queries).toarray()

In [27]:
# Calculate the similarity score(s) between vestor(s) and training vectors
similarity_scores = cosine_similarity(vector_search_queries, vectors_train)

In [28]:
# Let's find top 5 similar reviews
n = 5
returned_reviews= get_top_values(similarity_scores[0], n, documents_train)

In [29]:
print('Our search query:')
print(search_queries[0])

Our search query:
Antiques galore with great hospitality everything you'ld expect from an old school diner and more!   The staff here is extremely friendly and make sure you are well taken care of during your visit.  

Seems like the majority of costumers here are working class mixed with retirees and definitely regulars to this establishment.  All the decor takes you back to a time definitely not forgetten here from the 1950's telephone to the early 80's Polaroid camera the window booths have vintage conversation starters galore.

The food is delicious and appropriately prices on my first visit I had Curtis' Haystack ( biscuits and gravy topped with hashbrowns eggs and bacon) and it was a full meal for sure so being your best appetite for that one 4.25/5 

Today I had the French Toast Special and the French Toast was prepared perfectly a nice buttery flavor and disappeared quickly. 2 eggs over medium and bacon completed the meal. 4.5/5.

So if you are looking for a nice relaxi

In [30]:
print('query')
print(search_query)

print('\n\nMost %s similary reviews:' %n)
for i, review in enumerate(returned_reviews):
    print('#%s:' %i)
    print(review)

query
Antiques galore with great hospitality everything you'ld expect from an old school diner and more!   The staff here is extremely friendly and make sure you are well taken care of during your visit.  

Seems like the majority of costumers here are working class mixed with retirees and definitely regulars to this establishment.  All the decor takes you back to a time definitely not forgetten here from the 1950's telephone to the early 80's Polaroid camera the window booths have vintage conversation starters galore.

The food is delicious and appropriately prices on my first visit I had Curtis' Haystack ( biscuits and gravy topped with hashbrowns eggs and bacon) and it was a full meal for sure so being your best appetite for that one 4.25/5 

Today I had the French Toast Special and the French Toast was prepared perfectly a nice buttery flavor and disappeared quickly. 2 eggs over medium and bacon completed the meal. 4.5/5.

So if you are looking for a nice relaxing breakfast

#### The results really make sense

### Classifying positive/negative review

#### Naive-Bayes Classifier

In [31]:
# Build a Naive-Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
model_nb = MultinomialNB()
model_nb.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# Get score for training set
model_nb.score(vectors_train, target_train) # accuracy

0.820093010532075

In [33]:
# Get score for testing set
model_nb.score(vectors_test, target_test)

0.8120093010532075

#### Logistic Regression Classifier

In [34]:
# Build a Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

model_lrc = LogisticRegression()
model_lrc.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [35]:
# Get score for training set
model_lrc.score(vectors_train, target_train)

0.8515524552044864

In [36]:
# Get score for testing set
model_lrc.score(vectors_test, target_test)

0.833552181644098

#### What are the key features(words) that make the positive prediction?

In [37]:
# Let's find out by ranking
n = 20
get_top_values(model_lrc.coef_[0], n, words)

['amazing',
 'best',
 'delicious',
 'awesome',
 'incredible',
 'perfect',
 'perfection',
 'fantastic',
 'thank',
 'excellent',
 'great',
 'highly',
 'love',
 'die',
 'heaven',
 'regret',
 'favorite',
 'perfectly',
 'outstanding',
 'bomb']

##### All the positive words, people like to use amazing to express.

#### What are the key features(words) that make the negative prediction?

In [38]:
# Let's find out by ranking
n = 20
get_bottom_values(model_lrc.coef_[0], n, words)

['worst',
 'ok',
 'horrible',
 'rude',
 'slow',
 'disappointing',
 'terrible',
 'mediocre',
 'okay',
 'bland',
 'reason',
 'decent',
 'poor',
 'lacking',
 'average',
 'meh',
 'wasn',
 'worse',
 'overall',
 'unfortunately']

##### The worst is the most straightforward word to express

#### Random Forest Classifier

In [39]:
# Build a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

model_rfc = RandomForestClassifier(max_depth = 20,
                                   n_estimators = 50,
                                   min_samples_leaf = 10,
                                   n_jobs = -1)

model_rfc.fit(vectors_train, target_train)                  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
# Get score for training set
model_rfc.score(vectors_train, target_train)

0.7993708111065517

In [41]:
# Get score for testing set
model_rfc.score(vectors_test, target_test)

0.7853132266447819

#### Find out the important features in RFC model

In [42]:
n = 20
get_top_values(model_rfc.feature_importances_, n, words)

['amazing',
 'delicious',
 'best',
 'ok',
 'love',
 'bad',
 'minutes',
 'didn',
 'great',
 'awesome',
 'definitely',
 'wasn',
 'vegas',
 'place',
 'favorite',
 'worst',
 'friendly',
 'terrible',
 'rude',
 'said']

### Use cross validation to evaluate the classifiers
[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [43]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model_lrc,
                           vectors_train,
                           target_train,
                           cv=5,
                           scoring ='accuracy')
cv_scores

array([0.8349176 , 0.82383916, 0.82916154, 0.83256959, 0.83585254])

### Use grid search to find best predictable classifier

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

param_grid = [{'penalty':['l1'], 'C':[0.1, 100]},
              {'penalty':['l2'], 'C':[0.1, 100]}]

scores = ['accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score + "\n\n")
    clf = GridSearchCV(LogisticRegression(),
                       param_grid,
                       cv=5,
                       scoring=score)
    clf.fit(vectors_train[:500,:], target_train[:500])
    print("Best parameters set found on development set:\n\n")
    print(clf.best_params_)
    print("\nGrid scores on development set:\n\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("\n")
    y_true, y_pred = target_test, clf.predict(vectors_test)
    print(classification_report(y_true, y_pred))
    print("\n")

# Tuning hyper-parameters for accuracy


Best parameters set found on development set:


{'C': 100, 'penalty': 'l2'}

Grid scores on development set:


0.518 (+/-0.005) for {'C': 0.1, 'penalty': 'l1'}
0.718 (+/-0.085) for {'C': 100, 'penalty': 'l1'}
0.644 (+/-0.046) for {'C': 0.1, 'penalty': 'l2'}
0.720 (+/-0.079) for {'C': 100, 'penalty': 'l2'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.


              precision    recall  f1-score   support

       False       0.74      0.78      0.76    147599
        True       0.76      0.73      0.74    144841

   micro avg       0.75      0.75      0.75    292440
   macro avg       0.75      0.75      0.75    292440
weighted avg       0.75      0.75      0.75    292440



