### Load Data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data\\review_classification.csv', encoding = "utf-8", usecols = ['stars','text'])

In [3]:
df.head(2)

Unnamed: 0,stars,text
0,5,went for dinner tonight. Amazing my husband ha...
1,5,This was an amazing dinning experience! ORDER ...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 586927 entries, 0 to 586926
Data columns (total 2 columns):
stars    586927 non-null int64
text     586927 non-null object
dtypes: int64(1), object(1)
memory usage: 9.0+ MB


### Define target variable

In [5]:
# set stars that higher than 4 to be positive, else negative
df['target'] = df['stars'] > 4
target = df['target'].values
target

array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)

In [6]:
target.mean(), target.std(), target.shape

(0.47496877805928167, 0.49937304485540107, (586927,))

### Define feature variables (Text Review)

In [7]:
documents = df['text'].values
documents[0]

"went for dinner tonight. Amazing my husband had lobster bisque and the T bone both were delish.I had the French onion soup and the pan seared duck. Cooked to perfection and I'm still raving about the flavor. If you are ever in Vegas this is a must try."

In [8]:
documents.dtype, documents.shape

(dtype('O'), (586927,))

### Split dataset into training set and testing set

In [9]:
from sklearn.cross_validation import train_test_split



In [10]:
documents_train, documents_test, y_train, y_test = train_test_split(documents, target, test_size = 0.2, random_state = 66) 

In [11]:
len(documents_train), len(documents_test), len(y_train), len(y_test)

(469541, 117386, 469541, 117386)

### Get NLP represetation of the documents

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# create TfidfVectorizer object
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 350)

In [14]:
# train the model with training data
x_train = vectorizer.fit_transform(documents_train).toarray()
x_train.shape

(469541, 350)

In [15]:
# get the vocab of tfidf
words = vectorizer.get_feature_names()

In [16]:
x_test = vectorizer.fit_transform(documents_test).toarray()
x_test.shape

(117386, 350)

### Similar review search engine

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
get_random_number = 66
search_query = documents_test[get_random_number]
search_queries = [search_query]  # need to be put into a list-like format
print(search_queries)

In [None]:
vector_search_queries = vectorizer.transform(search_queries).toarray()

In [None]:
similarity_scores = cosine_similarity(vector_search_queries, x_train)

In [None]:
similarity_scores

In [None]:
# pick top n similar reviews
n = 5
returned_reviews = [documents_train[i] for i in np.argsort(similarity_scores[0])[::-1][:n]] # argsort returns the indices that would sort an array

In [84]:
# print the output
print('Search query:')
print(search_query)

print('\nMost %s similar reviews:' % n)
for i, review in enumerate(returned_reviews):
    print('#%s:' % i)
    print(review)

Search query:
My boyfriend and I came here for the first time and I  think we might've found our new sushi spot! Fresh sushi, a lot of selections and great service. Wish I had gotten our servers name but he was an Asian dude with glasses and he was the best!! Super nice and attentive at all times. Thank you!

Most 5 similar reviews:
#0:
Fresh sushi, good service, great deals! I think I found my new sushi spot in mountains edge!
#1:
Nice spot with a good sushi. I ordered sushi for four people to go and paid around 45 dollars. I got huge pack of sushi and it was enough for big party. Thank you
#2:
The best sushi I think I've ever had especially on this side of town. Everything we ordered tasted super clean and fresh with minimal waiting time. We did AYCE and it was the first time I think I've ever stuffed my face with sushi and didn't feel like garbage after. The menu has a wide and unique variety of items that you typically can't find in sushi joints with all you can eat. I have definit

### Classifying positive/negative review

#### Naive-Bayes Classifier (Baseline Model)

In [18]:
from sklearn.naive_bayes import GaussianNB

In [19]:
model_nb = GaussianNB()
model_nb.fit(x_train, y_train)

GaussianNB(priors=None)

In [20]:
# Get score for training set
model_nb.score(x_train, y_train) # accuracy

0.7499685863428327

In [21]:
# Get score for test set
model_nb.score(x_test, y_test)

0.59709846148603751

### Grid search to find best predictale classifier

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [36]:
from sklearn.linear_model import LogisticRegression

In [32]:
# logistic regression
param_grid = [{'penalty':['l1'], 'C':[0.1, 1, 10]},
              {'penalty':['l2'], 'C':[0.1, 1, 10]}]

scores = ['accuracy']

In [35]:
for score in scores:
    
    print("# Tuning hyper-parameters for %s" % score + "\n\n")
    
    clf = GridSearchCV(LogisticRegression(),
                       param_grid,
                       cv=5,
                       scoring=score)
    clf.fit(x_train[:500,:], y_train[:500])
    print("Best parameters set found on development set:\n\n")
    print(clf.best_params_)
    print("\nGrid scores on development set:\n\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("\n")
    y_true, y_pred = y_test, clf.predict(x_test)
    print(classification_report(y_true, y_pred))
    print("\n")


# Tuning hyper-parameters for accuracy


Best parameters set found on development set:


{'C': 1, 'penalty': 'l2'}

Grid scores on development set:


0.520 (+/-0.000) for {'C': 0.1, 'penalty': 'l1'}
0.696 (+/-0.073) for {'C': 1, 'penalty': 'l1'}
0.690 (+/-0.123) for {'C': 10, 'penalty': 'l1'}
0.702 (+/-0.082) for {'C': 0.1, 'penalty': 'l2'}
0.732 (+/-0.108) for {'C': 1, 'penalty': 'l2'}
0.710 (+/-0.077) for {'C': 10, 'penalty': 'l2'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.


             precision    recall  f1-score   support

      False       0.63      0.81      0.71     61680
       True       0.69      0.47      0.56     55706

avg / total       0.66      0.65      0.64    117386





#### Logistic Regression Classifier

In [37]:
model_lrc = LogisticRegression(C =1 ,penalty = 'l2')
model_lrc.fit(x_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [38]:
# Get score for training set
model_lrc.score(x_train, y_train)

0.7952106418821786

In [39]:
# Get score for test set
model_lrc.score(x_test, y_test)

0.6345134854241562

In [40]:
# Top n key features(words) that make the positive prediction
n = 20
[words[i] for i in np.argsort(model_lrc.coef_[0])[::-1][:n]]

['amazing',
 'best',
 'thank',
 'awesome',
 'delicious',
 'highly',
 'perfect',
 'fantastic',
 'favorite',
 'excellent',
 'great',
 'wonderful',
 'love',
 'perfectly',
 'loved',
 'definitely',
 'happy',
 'vegas',
 'absolutely',
 'fresh']

In [41]:
# Top n key features(words) that make the negative prediction
[words[i] for i in np.argsort(model_lrc.coef_[0])[:n]]

['worst',
 'horrible',
 'ok',
 'okay',
 'slow',
 'dry',
 'decent',
 'wasn',
 'reason',
 'wouldn',
 'overall',
 'bad',
 'maybe',
 'cold',
 'money',
 'pretty',
 'didn',
 'used',
 'stars',
 'asked']

#### Random Forest Classifier

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
# random forest
param_grid = [{'n_estimators':[5, 10,15,20], 'min_samples_leaf':[1, 3, 5, 7]},
              {'n_estimators':[5, 10,15,20], 'min_samples_leaf':[1, 3, 5, 7]}]

scores = ['accuracy']

In [48]:
for score in scores:
    
    print("# Tuning hyper-parameters for %s" % score + "\n\n")
    
    clf = GridSearchCV(RandomForestClassifier(),
                       param_grid,
                       cv=5,
                       scoring=score)
    clf.fit(x_train[:500,:], y_train[:500])
    print("Best parameters set found on development set:\n\n")
    print(clf.best_params_)
    print("\nGrid scores on development set:\n\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("\n")
    y_true, y_pred = y_test, clf.predict(x_test)
    print(classification_report(y_true, y_pred))
    print("\n")

# Tuning hyper-parameters for accuracy


Best parameters set found on development set:


{'min_samples_leaf': 1, 'n_estimators': 15}

Grid scores on development set:


0.662 (+/-0.157) for {'min_samples_leaf': 1, 'n_estimators': 5}
0.710 (+/-0.066) for {'min_samples_leaf': 1, 'n_estimators': 10}
0.728 (+/-0.075) for {'min_samples_leaf': 1, 'n_estimators': 15}
0.696 (+/-0.059) for {'min_samples_leaf': 1, 'n_estimators': 20}
0.658 (+/-0.062) for {'min_samples_leaf': 3, 'n_estimators': 5}
0.690 (+/-0.055) for {'min_samples_leaf': 3, 'n_estimators': 10}
0.704 (+/-0.123) for {'min_samples_leaf': 3, 'n_estimators': 15}
0.702 (+/-0.060) for {'min_samples_leaf': 3, 'n_estimators': 20}
0.646 (+/-0.063) for {'min_samples_leaf': 5, 'n_estimators': 5}
0.660 (+/-0.104) for {'min_samples_leaf': 5, 'n_estimators': 10}
0.674 (+/-0.078) for {'min_samples_leaf': 5, 'n_estimators': 15}
0.684 (+/-0.082) for {'min_samples_leaf': 5, 'n_estimators': 20}
0.688 (+/-0.079) for {'min_samples_leaf': 7, 'n_estimat

In [64]:
model_rfc = RandomForestClassifier(max_depth = None,
                                   n_estimators = 15,
                                   min_samples_leaf = 1)
model_rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [65]:
# Get score for training set
model_rfc.score(x_train, y_train)

0.99445202868333116

In [66]:
# Get score for test set
model_rfc.score(x_test, y_test)

0.63728212904434944

In [67]:
# Top n most important features(words)
n = 20
[words[i] for i in np.argsort(model_rfc.feature_importances_)[::-1][:n]]

['amazing',
 'best',
 'great',
 'delicious',
 'love',
 'ok',
 'good',
 'didn',
 'awesome',
 'vegas',
 'food',
 'definitely',
 'place',
 'favorite',
 'excellent',
 'worst',
 'service',
 'bad',
 'perfect',
 'like']

### Use Cross Validation to check three methods performance

In [69]:
from sklearn.model_selection import cross_val_score
import time

In [70]:
start = time.time()
# naive bayes
cv_scores = cross_val_score(model_nb,
                            x_train,
                            y_train,
                            cv = 5,
                            scoring="accuracy")
execution_time = time.time() - start
cv_scores

array([ 0.74870353,  0.74915875,  0.75028752,  0.74974443,  0.7508732 ])

In [71]:
print("Navie Bayes - Execution Time: ",execution_time," Accuracy: ",np.mean(cv_scores))

Navie Bayes - Execution Time:  18.43092370033264  Accuracy:  0.749753484893


In [72]:
start = time.time()
# logistic regression
cv_scores = cross_val_score(model_lrc,
                            x_train,
                            y_train,
                            cv = 5,
                            scoring="accuracy")
execution_time = time.time() - start
cv_scores

array([ 0.79371519,  0.79588533,  0.79547003,  0.79477787,  0.79414959])

In [73]:
print("Logistic Regression - Execution Time: ",execution_time," Accuracy: ",np.mean(cv_scores))

Logistic Regression - Execution Time:  21.563192129135132  Accuracy:  0.794799604474


In [74]:
start = time.time()
# random forest
cv_scores = cross_val_score(model_rfc,
                            x_train,
                            y_train,
                            cv = 5,
                            scoring="accuracy")
execution_time = time.time() - start
cv_scores

array([ 0.77153414,  0.77388508,  0.77318226,  0.7724262 ,  0.77092473])

In [75]:
print("Random Forest - Execution Time: ",execution_time," Accuracy: ",np.mean(cv_scores))

Random Forest - Execution Time:  273.11503529548645  Accuracy:  0.772390485296


### In conclusion, Logistic Regression performs better than Random Forest. It is more 10 times faster tan random forest, but achieved 2 more percentage accuracy on the test set. However, random forest is overfitting. The accuracy of traning set is 99%. If more data are fed into random forest model. Theproblem can be remitted.