In [8]:
import pandas as pd, numpy as np
import re

In [9]:
reviews0 = pd.read_csv("Zomato_reviews.csv")

In [10]:
reviews0.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


In [11]:
reviews0.describe(include="all")

Unnamed: 0,rating,review_text
count,27762.0,27748
unique,,10548
top,,good
freq,,278
mean,3.665784,
std,1.284573,
min,1.0,
25%,3.0,
50%,4.0,
75%,5.0,


14 rows are missing the review text - need to get rid of these records

In [12]:
reviews1 = reviews0[~reviews0.review_text.isnull()].copy()
reviews1.reset_index(inplace=True, drop=True)

In [13]:
reviews0.shape, reviews1.shape

((27762, 2), (27748, 2))

#### Converting to list for easy manipulation

In [14]:
reviews_list = reviews1.review_text.values

In [15]:
len(reviews_list)

27748

### Text clean up 
- Normalize the case  
- Remove stop words
   - remove "not", "no" from the stop word list
- Remove punctuations

Normalizing case

In [16]:
reviews_lower = [txt.lower() for txt in reviews_list]

In [17]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly better cooked.\r\ntried 2 beverages, both were very sweet.']

Remove extra line breaks

In [18]:
reviews_lower = [" ".join(txt.split()) for txt in reviews_lower]

In [19]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better cooked. tried 2 beverages, both were very sweet.']

#### Tokenize

In [20]:
from nltk.tokenize import word_tokenize

In [21]:
print(word_tokenize(reviews_lower[0]))

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


In [22]:
reviews_tokens = [word_tokenize(sent) for sent in reviews_lower]
print(reviews_tokens[0])

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


### Remove stop words and punctuations

In [23]:
from nltk.corpus import stopwords
from string import punctuation

In [24]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

In [25]:
print(stop_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [26]:
stop_nltk.remove("no")
stop_nltk.remove("not")
stop_nltk.remove("don")
stop_nltk.remove("won")

In [27]:
"no" in stop_nltk

False

In [28]:
stop_final = stop_nltk + stop_punct + ["...", "``","''", "====", "must"]

In [29]:
def del_stop(sent):
    return [term for term in sent if term not in stop_final]

In [30]:
del_stop(reviews_tokens[1])

['really',
 'appreciate',
 'quality',
 'timing',
 'tried',
 'thattil',
 'kutti',
 'dosa',
 "'ve",
 'addicted',
 'dosa',
 'really',
 'chutney',
 'really',
 'good',
 'money',
 'worth',
 'much',
 'better',
 'thattukada',
 'try']

In [31]:
reviews_clean = [del_stop(sent) for sent in reviews_tokens]

In [32]:
reviews_clean = [" ".join(sent) for sent in reviews_clean]
reviews_clean[:2]

['service worst pricing menu different bill give bill increased pricing even serving water menu order need call 3-4 times even non busy day',
 "really appreciate quality timing tried thattil kutti dosa 've addicted dosa really chutney really good money worth much better thattukada try"]

### Separate X and Y and perform train test split, 70-30

In [33]:
len(reviews_clean)

27748

In [36]:
X = reviews_clean
y = reviews1.rating

Train test split

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)

### Document term matrix using TfIdf

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
vectorizer = TfidfVectorizer(max_features = 5000)

In [40]:
len(X_train), len(X_test)

(19423, 8325)

In [41]:
X_train_bow = vectorizer.fit_transform(X_train)

In [42]:
X_test_bow = vectorizer.transform(X_test)

In [43]:
X_train_bow.shape, X_test_bow.shape

((19423, 5000), (8325, 5000))

### Model building

In [91]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [92]:
?RandomForestRegressor

In [114]:
learner_rf = RandomForestRegressor(random_state=42)

In [115]:
%%time
learner_rf.fit(X_train_bow, y_train)

Wall time: 1min 23s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [116]:
y_train_preds = learner_rf.predict(X_train_bow)

In [123]:
from sklearn.metrics import mean_squared_error

In [124]:
mean_squared_error(y_train, y_train_preds)**0.5

0.2648944171413865

#### Increase the number of trees

In [125]:
learner_rf = RandomForestRegressor(random_state=42, n_estimators=20)

In [126]:
%%time
learner_rf.fit(X_train_bow, y_train)

Wall time: 2min 47s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [127]:
y_train_preds = learner_rf.predict(X_train_bow)

In [130]:
mean_squared_error(y_train, y_train_preds)**0.5

0.25087955564587494

### Hyper-parameter tuning

"class_weights" was one of the many hyperparameters to tune for the SVM.  

Let's find the best hyper-parameters for the SVM classifier

In [131]:
from sklearn.model_selection import GridSearchCV

In [134]:
?RandomForestRegressor

Instantiate the learner with a random state

In [142]:
learner_rf = RandomForestRegressor(random_state=42)

In [139]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_features': [500, "sqrt", "log2", "auto"],
    'max_depth': [10, 15, 20, 25]
}

In [143]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = learner_rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 1, scoring = "neg_mean_squared_error" )

In [144]:
grid_search.fit(X_train_bow, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.6min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_features': [500, 'sqrt', 'log2', 'auto'], 'max_depth': [10, 15, 20, 25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [145]:
grid_search.grid_scores_



[mean: -0.86027, std: 0.03287, params: {'max_depth': 10, 'max_features': 500},
 mean: -1.17046, std: 0.04698, params: {'max_depth': 10, 'max_features': 'sqrt'},
 mean: -1.46015, std: 0.03526, params: {'max_depth': 10, 'max_features': 'log2'},
 mean: -0.83019, std: 0.03600, params: {'max_depth': 10, 'max_features': 'auto'},
 mean: -0.67923, std: 0.03255, params: {'max_depth': 15, 'max_features': 500},
 mean: -1.01139, std: 0.03546, params: {'max_depth': 15, 'max_features': 'sqrt'},
 mean: -1.38664, std: 0.03447, params: {'max_depth': 15, 'max_features': 'log2'},
 mean: -0.68236, std: 0.03287, params: {'max_depth': 15, 'max_features': 'auto'},
 mean: -0.56164, std: 0.02840, params: {'max_depth': 20, 'max_features': 500},
 mean: -0.89007, std: 0.03176, params: {'max_depth': 20, 'max_features': 'sqrt'},
 mean: -1.31277, std: 0.03576, params: {'max_depth': 20, 'max_features': 'log2'},
 mean: -0.58350, std: 0.02534, params: {'max_depth': 20, 'max_features': 'auto'},
 mean: -0.48085, std: 0.0

In [146]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
           max_features=500, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

### Using the best estimator to make predictions on the test set

In [150]:
y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [147]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [153]:
mean_squared_error(y_train, y_train_pred)**0.5

0.35787640577342056

In [155]:
mean_squared_error(y_test, y_test_pred)**0.5

0.6826219636703414

### Identifying mismatch cases

In [156]:
res_df = pd.DataFrame({'review':X_test, 'rating':y_test, 'rating_pred':y_test_pred})

In [160]:
res_df[(res_df.rating - res_df.rating_pred)>=2].shape

(11, 3)

In [159]:
res_df[(res_df.rating - res_df.rating_pred)>=2]

Unnamed: 0,review,rating,rating_pred
7277,life saviours serving excellent food worst tim...,5.0,1.595731
1818,value money ordered second time,5.0,2.192872
4771,not good,5.0,2.104202
19793,part review programme ordered bombay masala qu...,5.0,2.790877
13196,delicious food say vegetarian thought might sh...,5.0,2.949121
16510,may not polished serving packaging etc never b...,5.0,1.687134
14845,oh memories place first drink bangalore almost...,5.0,2.621843
15201,sauce not included,4.0,1.704264
21993,often order food love chicken biriyani also pa...,5.0,2.848742
27705,options would improvement long quality not com...,4.0,1.809524
