In [1]:
%%time
%load_ext autoreload
%autoreload 2

import heapq
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Default business to analyse; a later cell reassigns `biz_name`.
biz_name = 'Starbucks'

# Every review row, with all of its columns.
reviews = pd.read_csv('data/yelp_academic_dataset_review.csv')

# Only the business_id -> name mapping is needed from the business file, so
# parse just those two columns instead of loading the whole file and then
# discarding the rest. `usecols` ignores the order it is given, hence the
# explicit re-selection to keep the columns in this order.
biz_columns = ['business_id', 'name']
biz = pd.read_csv('data/yelp_academic_dataset_business.csv', usecols=biz_columns)[biz_columns]

# Attach the business name to each review. The inner join drops any review
# whose business_id has no match in `biz`.
reviews = pd.merge(reviews, biz, how='inner', on='business_id')

CPU times: user 43.2 s, sys: 11.1 s, total: 54.3 s
Wall time: 1min 1s


In [32]:
# Business whose reviews are selected below.
biz_name = "Five Guys Burgers and Fries"

# Keep only the review text and star rating for rows whose name matches exactly.
is_selected_biz = reviews['name'] == biz_name
biz_reviews = reviews.loc[is_selected_biz, ['text', 'stars']]

In [144]:
# Rows of `reviews` per business name, most frequent first (value_counts sorts descending by default).
# Counting by 'name' rather than 'business_id' pools every location that shares a name.
reviews['name'].value_counts()

Starbucks                                           15595
Hash House A Go Go                                   8351
McDonald's                                           6601
Chipotle Mexican Grill                               6544
Mon Ami Gabi                                         6414
Bacchanal Buffet                                     5715
Wicked Spoon                                         5216
Gordon Ramsay BurGR                                  5116
Earl of Sandwich                                     5044
Buffalo Wild Wings                                   4928
In-N-Out Burger                                      4437
Gangnam Asian BBQ Dining                             4120
Serendipity 3                                        3911
The Buffet                                           3822
Pita Jungle                                          3749
Egg Works                                            3746
Bachi Burger                                         3695
Dunkin' Donuts

In [98]:
# Fit a linear regression that predicts the star rating from TF-IDF weights of
# the 300 most frequent 1- to 3-grams (English stop words removed).
# NOTE(review): this trains on a sample of *all* reviews, not on `biz_reviews`
# -- confirm that is intended before reading the weights as per-business.
SAMPLE_SIZE = 100000
SAMPLE_SEED = 42

tfidf = TfidfVectorizer(stop_words='english', max_features=300, ngram_range=(1, 3))
model = LinearRegression()

# Seeded so every run draws the same rows (and so learns the same weights);
# capped at the frame length because `sample` raises when asked for more rows
# than exist.
large_sample = reviews.sample(n=min(SAMPLE_SIZE, len(reviews)), random_state=SAMPLE_SEED)
corpus = large_sample['text'].values
X = tfidf.fit_transform(corpus)

y = large_sample['stars'].values
# Last expression of the cell: displays the fitted estimator's repr.
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [99]:
# Indices of the N_TOP largest and the N_TOP smallest regression coefficients.
# max_indices runs from the largest weight downwards; min_indices runs from the
# smallest weight upwards.
N_TOP = 100
weight_at = model.coef_.__getitem__  # key function: coefficient index -> weight
max_indices = heapq.nlargest(N_TOP, range(len(model.coef_)), key=weight_at)
min_indices = heapq.nsmallest(N_TOP, range(len(model.coef_)), key=weight_at)

def print_summary(indices, vectorizer=None, estimator=None):
    """Print one ``term (index): weight`` line per feature index.

    Args:
        indices: Iterable of feature (column) indices, printed in the order given.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or the
            older ``get_feature_names()``. Defaults to the notebook-level ``tfidf``.
        estimator: Fitted model whose ``coef_`` is indexable by feature index.
            Defaults to the notebook-level ``model``.
    """
    if vectorizer is None:
        vectorizer = tfidf
    if estimator is None:
        estimator = model

    # `get_feature_names` was removed from recent scikit-learn releases in favour
    # of `get_feature_names_out`; prefer the new name and fall back to the old
    # one so older installs keep working.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    features = get_names()

    for i in indices:
        # Feature names are listed in column order, so `i` is already the
        # vocabulary index of the term -- no second lookup is needed.
        print('{} ({}): {:.2f}'.format(features[i], i, estimator.coef_[i]))

# Report: header, strongest positive terms, a divider, then the negative terms.
# NOTE(review): the header names `biz_name`, but the visible fit uses
# `large_sample` drawn from all reviews -- confirm the weights are meant to be
# read as specific to this business.
print('Business summary: ' + biz_name, end='\n\n')
print_summary(max_indices)
print('----------')
# min_indices starts at the most negative weight; flip it so that one is printed last.
print_summary(min_indices[::-1])

Business summary: Five Guys Burgers and Fries

amazing (7): 1.98
great (113): 1.83
best (19): 1.81
delicious (63): 1.70
excellent (85): 1.66
awesome (14): 1.66
perfect (191): 1.52
fantastic (90): 1.42
wonderful (292): 1.35
love (159): 1.34
happy (119): 1.28
favorite (93): 1.24
friendly (104): 1.23
definitely (62): 1.19
helpful (123): 1.18
loved (160): 1.18
highly (125): 1.18
able (4): 1.00
needed (173): 0.94
enjoyed (83): 0.76
vegas (274): 0.75
really good (208): 0.74
fast (92): 0.73
fresh (101): 0.72
right (216): 0.70
feel (94): 0.69
guys (116): 0.69
quick (205): 0.67
years (297): 0.65
yelp (298): 0.63
nice (175): 0.62
clean (40): 0.61
family (89): 0.61
super (246): 0.60
tasty (254): 0.58
fun (107): 0.57
try (268): 0.54
enjoy (82): 0.53
free (100): 0.52
house (132): 0.52
cool (48): 0.51
highly recommend (126): 0.51
new (174): 0.49
good (111): 0.47
sweet (249): 0.47
huge (133): 0.46
makes (163): 0.46
job (140): 0.45
ve (273): 0.44
dessert (64): 0.44
spicy (235): 0.44
usually (272): 0.4