# Yelp Data Challenge - NLP

Xueji Wang

Aug 2018

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/Users/xuejiwang/Downloads/yelp_dataset/last_2_years_restaurant_reviews.csv')

In [4]:
df.head(2)

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-02-14,0,VETXTwMw6qxzOVDlXfe6Tg,5,went for dinner tonight. Amazing my husband ha...,0,ymlnR8UeFvB4FZL56tCZsA
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Steakhouses, Restaurants, Cajun/Creole",4.0,0,2017-12-04,0,S8-8uZ7fa5YbjnEtaW15ng,5,This was an amazing dinning experience! ORDER ...,0,9pSSL6X6lFpY3FCRLEH3og


### Define feature variables, here is the text of the review

In [14]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df['text'].values

In [15]:
# inspect your documents, check the size, take a peek at elements of the numpy array
documents.dtype


dtype('O')

### Define target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [16]:
# Make a column and take the values, save to a variable named "target"
df['perfect'] = (df['stars'] > 4)
target = df['perfect'].values

In [17]:
target[0:5]

array([ True,  True,  True, False,  True])

#### Look at the statistic of the target variable

In [18]:
# To be implemented
target.mean()

0.4741461922405801

## Let's create training dataset and test dataset

In [19]:
from sklearn.cross_validation import train_test_split



In [50]:
# Documents is X, target is y
# Split the data to training set and test set
documents_train, documents_test, target_train, target_test = \
train_test_split(documents,target, test_size = 0.9, random_state = 7)


## Let's get NLP representation of the documents

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 1000)


In [26]:
# Train the model with training data
vectors_train = vectorizer.fit_transform(documents_train).todense()
vectors_test = vectorizer.transform(documents_test).todense()

In [27]:
vectors_train

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [29]:
vectors_train.shape

(64071, 1000)

In [30]:
# Get the vocab of your tfidf
wordsBag = vectorizer.get_feature_names()

In [31]:
# Transform all reviews to vector
vectorized_documents = vectorizer.transform(documents).todense()

In [33]:
vectorized_documents

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.09321124]])

## Similar review search engine

In [35]:
import numpy as np

# The following function will be used sooner

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]]


In [51]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
# Draw an arbitrary review from test (unseen in training) documents
random_state = 7
arbitrary_review = documents_test[random_state]
arbitrary_reviews = [arbitrary_review]
print(arbitrary_review)
print(arbitrary_reviews)


Poor service, mediocre food. With so many options in Vegas, there are many better choices
['Poor service, mediocre food. With so many options in Vegas, there are many better choices']


In [38]:
# Transform the drawn review(s) to vector(s)
vector_arbitrary_reviews = vectorizer.transform(arbitrary_reviews).todense()
vector_arbitrary_reviews 

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [39]:
# Calculate the similarity score(s) between vector(s) and training vectors
similarity_scores = cosine_similarity(vector_arbitrary_reviews, vectors_train)

In [40]:
# Let's find top 5 similar reviews
n = 5
similar_reviews = get_top_values(similarity_scores[0], n,documents_train)
similar_reviews

['Poor service, overpriced mediocre burgers. I have better luck at the Casino',
 'Poor poor poor very abrupt and vey condescending so for that they get a 1 star they should not give vip service to some and not others',
 'I was not thrilled with this restaurant. Quality of food was mediocre or poor for the price they charge. Service was slow. :(',
 'Skip this place, service is very slow. Food expensive.\n There are better choices in Vegas.',
 'Insane food for the healthy and the not so. Great choices for the wife and better choices for me :)']

In [41]:
print('An arbitrary review:')
print(arbitrary_reviews) 

An arbitrary review:
['Poor service, mediocre food. With so many options in Vegas, there are many better choices']


In [43]:
print('Most %s similar reviews:' % n)
for i, review in enumerate(similar_reviews):
    print('#%s:'%(i+1))
    print(review)

Most 5 similar reviews:
#1:
Poor service, overpriced mediocre burgers. I have better luck at the Casino
#2:
Poor poor poor very abrupt and vey condescending so for that they get a 1 star they should not give vip service to some and not others
#3:
I was not thrilled with this restaurant. Quality of food was mediocre or poor for the price they charge. Service was slow. :(
#4:
Skip this place, service is very slow. Food expensive.
 There are better choices in Vegas.
#5:
Insane food for the healthy and the not so. Great choices for the wife and better choices for me :)


#### Some insights

The chosen arbitrary review is negative. And using cosine similarity to choose 5 similar reviews. They are also negative reviews, the users is not satisfied with the service. The main words includs "poor', 'mediocre'.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [44]:
# Build a Naive-Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB()
nb_clf.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
# Get score for training set
nb_clf.score(vectors_train, target_train)

0.801080051817515

In [46]:
# Get score for test set
nb_clf.score(vectors_test, target_test)

0.7968237067044484

#### Logistic Regression Classifier

In [47]:
# Build a Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
logit_clf = LogisticRegression()
logit_clf.fit(vectors_train, target_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
# Get score for training set
logit_clf.score(vectors_train, target_train)

0.8241950336345617

In [49]:
# Get score for test set
logit_clf.score(vectors_test, target_test)

0.8166781410464288

#### Key features(words) that make the positive prediction

In [53]:
# Find it out by ranking
n = 20
get_top_values(logit_clf.coef_[0],n, wordsBag)

['amazing',
 'best',
 'incredible',
 'awesome',
 'delicious',
 'perfect',
 'perfection',
 'thank',
 'favorite',
 'phenomenal',
 'exceptional',
 'outstanding',
 'excellent',
 'love',
 'fantastic',
 'great',
 'notch',
 'glad',
 'highly',
 'gem']

**The positive prediction mainly have the following words:
amazing, best, incredible, awesome, delicious,..., excellent..fantastic..glad.**

#### Key features(words) that make the negative prediction

In [54]:
# Let's find it out by ranking
n = 20
get_bottom_values(logit_clf.coef_[0], n , wordsBag)

['worst',
 'horrible',
 'ok',
 'terrible',
 'bland',
 'slow',
 'mediocre',
 'disappointing',
 'rude',
 'okay',
 'reason',
 'average',
 'dry',
 'awful',
 'poor',
 'unfortunately',
 'overpriced',
 'worse',
 'decent',
 'wasn']

The result does make sense that negative reviews mainly inclde words like: worst, horrible, terrible, bland,...awful. 

#### Random Forest Classifier

In [58]:
# Build a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth = None, n_estimators = 20, min_samples_leaf = 3, random_state = 7)
rf_clf.fit(vectors_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [59]:
# Get score for training set
rf_clf.score(vectors_train, target_train)

0.9253640492578545

In [57]:
# Get score for test set
rf_clf.score(vectors_test, target_test)

0.7907350597505927

#### From training score and the testing score

That test error is much larger than trainging error, there is overfitting problem.

#### Feature/Word Importance

In [61]:
n = 20
get_top_values(rf_clf.feature_importances_, n, wordsBag)

['amazing',
 'best',
 'great',
 'delicious',
 'love',
 'awesome',
 'ok',
 'wasn',
 'vegas',
 'good',
 'worst',
 'bad',
 'definitely',
 'didn',
 'favorite',
 'friendly',
 'excellent',
 'said',
 'perfect',
 'place']

In [62]:
df_coef = pd.DataFrame(list(zip(wordsBag, rf_clf.feature_importances_))).sort_values(by = [1], ascending = False)
df_coef.columns = ['Words', 'Feature Importance']
df_coef

Unnamed: 0,Words,Feature Importance
28,amazing,0.043103
80,best,0.030295
372,great,0.029903
225,delicious,0.029081
505,love,0.019670
54,awesome,0.018155
587,ok,0.017674
960,wasn,0.016978
935,vegas,0.016846
367,good,0.012452


### Use cross validation to evaluate classifiers

In [63]:
from sklearn.model_selection import cross_val_score
# 10 folds cross validation
cv_scores = cross_val_score(logit_clf,
                           vectors_train,
                            target_train,
                            cv = 10,
                            scoring = 'accuracy'
                           )

In [64]:
cv_scores

array([0.80617978, 0.81098798, 0.82706415, 0.80771032, 0.81738723,
       0.81676292, 0.81645076, 0.82456688, 0.82253785, 0.81051974])

### Use grid search to find best predictable classifier

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [66]:
param_grid = [{'penalty' :['l1'], 'C':[0.1,100]},
              {'penalty' :['l2'], 'C':[0.1,100]}
             ]

In [67]:
scores = ['accuracy']

In [68]:
for score in scores:
    clf = GridSearchCV(LogisticRegression(),
                       param_grid,
                       cv = 10,
                       scoring = score
    
    )
    clf.fit(vectors_train[:500,:], target_train[:500])
    print(clf.best_params_)
    

{'C': 100, 'penalty': 'l1'}


In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
param_grid = {'n_estimators': [100,200], 
              'max_features': ['auto'], 
              'criterion': ['gini'],
              'max_depth': [15,20,25], 
              'min_samples_split': [2],
              'min_samples_leaf': [2,10,20],
              'n_jobs':[-1]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(roc_auc_score)

# Run the grid search
# read theory
grid_obj = GridSearchCV(clf, param_grid, cv=5, scoring=acc_scorer)
grid_obj = grid_obj.fit(vectors_train, target_train)

In [70]:
print(grid_obj.best_params_)

{'criterion': 'gini', 'max_depth': 25, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200, 'n_jobs': -1}


In [71]:
# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(vectors_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)