# Yelp Data Challenge - Clustering - all

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")

In [2]:
df = pd.read_csv('all_33_date.csv')

In [3]:
df.head()

Unnamed: 0,business_id,user_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful
0,LkMtMHVetws5_7QfRjPtlg,Ud72j_rglYmXrL_O8zCdoA,LVH - Las Vegas Hotel & Casino,"Restaurants, Casinos, Food, Active Life, Night...",2.5,0.0,2005-01-24 22:21:29,0.0,Dm7Jh7tVp_97sOxEYH4DnA,4.0,"This is a little bit off the strip, but the vi...",0.0
1,ii8sAGBexBOJoYRFafF9XQ,Ud72j_rglYmXrL_O8zCdoA,Paris Las Vegas Hotel & Casino,"Restaurants, Hotels, Hotels & Travel, Event Pl...",3.0,0.0,2005-01-24 22:32:33,0.0,VwlljS4oxU4VyP2ODo9gwA,5.0,The buffet in this hotel is excellent! The set...,0.0
2,ZkGDCVKSdf8m76cnnalL-A,Ud72j_rglYmXrL_O8zCdoA,Le Village Buffet,"Sandwiches, French, Breakfast & Brunch, Buffet...",3.0,0.0,2005-01-26 17:50:53,0.0,Z54-ssb69Mx_lG1YtJ6N1g,5.0,Hella food there!! Cocktail shrimps are great!...,0.0
3,Wxxvi3LZbHNIDwJ-ZimtnA,NkVYcoaThHjOfTTgq6OdHQ,The Venetian Las Vegas,"Shopping Centers, Resorts, Arts & Entertainmen...",4.0,1.0,2005-04-22 20:07:39,2.0,0NmMH1xSUdI3pqed9WQQww,5.0,outstanding hotel. we had a really amazing sui...,2.0
4,6xgcHeLad-VaoTIQewK84A,G5LXEaxhQMF_BVjNHkVr7g,N9NE Steakhouse,"Seafood, Steakhouses, American (Traditional), ...",4.0,0.0,2005-04-27 23:38:05,0.0,VgBxQqaDgOHq9YRxtMUm3A,5.0,A very reliable source tells me this is the be...,0.0


In [4]:
df = df.dropna(axis=0)

## 1. Cluster the review text data for all the restaurants

In [9]:
# Take the values of the column that contains review text data, save to a variable named "documents"
df_positive = df[df['stars'] > 3.0]

documents = df_positive['text']

In [13]:
df_positive.head()

Unnamed: 0,business_id,user_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful
0,LkMtMHVetws5_7QfRjPtlg,Ud72j_rglYmXrL_O8zCdoA,LVH - Las Vegas Hotel & Casino,"Restaurants, Casinos, Food, Active Life, Night...",2.5,0.0,2005-01-24 22:21:29,0.0,Dm7Jh7tVp_97sOxEYH4DnA,4.0,"This is a little bit off the strip, but the vi...",0.0
1,ii8sAGBexBOJoYRFafF9XQ,Ud72j_rglYmXrL_O8zCdoA,Paris Las Vegas Hotel & Casino,"Restaurants, Hotels, Hotels & Travel, Event Pl...",3.0,0.0,2005-01-24 22:32:33,0.0,VwlljS4oxU4VyP2ODo9gwA,5.0,The buffet in this hotel is excellent! The set...,0.0
2,ZkGDCVKSdf8m76cnnalL-A,Ud72j_rglYmXrL_O8zCdoA,Le Village Buffet,"Sandwiches, French, Breakfast & Brunch, Buffet...",3.0,0.0,2005-01-26 17:50:53,0.0,Z54-ssb69Mx_lG1YtJ6N1g,5.0,Hella food there!! Cocktail shrimps are great!...,0.0
3,Wxxvi3LZbHNIDwJ-ZimtnA,NkVYcoaThHjOfTTgq6OdHQ,The Venetian Las Vegas,"Shopping Centers, Resorts, Arts & Entertainmen...",4.0,1.0,2005-04-22 20:07:39,2.0,0NmMH1xSUdI3pqed9WQQww,5.0,outstanding hotel. we had a really amazing sui...,2.0
4,6xgcHeLad-VaoTIQewK84A,G5LXEaxhQMF_BVjNHkVr7g,N9NE Steakhouse,"Seafood, Steakhouses, American (Traditional), ...",4.0,0.0,2005-04-27 23:38:05,0.0,VgBxQqaDgOHq9YRxtMUm3A,5.0,A very reliable source tells me this is the be...,0.0


In [14]:
# Make a column and take the values, save to a variable named "target"
target = df_positive['stars']

In [15]:
target.describe()

count    559211.000000
mean          4.576752
std           0.494074
min           4.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: stars, dtype: float64

In [16]:
# Create training dataset and test dataset
from sklearn.model_selection import train_test_split

In [17]:
# documents is your X, target is your y
# split the data to training set and test set for future classification problem
documents_train, documents_test, target_train, target_test = train_test_split(documents, target, test_size=0.2, random_state=0)

### Get NLP representation of the documents

#### Fit TfidfVectorizer with training data only, then tranform all the data to tf-idf

In [19]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Create TfidfVectorizer, and name it vectorizer, choose a reasonable max_features
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=1000)

In [21]:
# Train the model with training data
vectors = vectorizer.fit_transform(documents_train).todense()

In [22]:
# Get the vocab of tfidf
vocab = vectorizer.get_feature_names()

In [23]:
# Use the trained model to transform all the reviews
vectors_all = vectorizer.transform(documents)

### Cluster reviews with KMeans

#### Fit k-means clustering with the training vectors and apply it on all the data

In [27]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=20)

kmeans.fit(vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [28]:
kmeans_pred = kmeans.predict(vectors_all)
kmeans_pred

array([2, 1, 5, ..., 2, 8, 5], dtype=int32)

#### Inspect the centroids

In [29]:
print("cluster centers:")
print(kmeans.cluster_centers_)

cluster centers:
[[0.00142329 0.007551   0.00148276 ... 0.00047832 0.00321597 0.00767726]
 [0.00308988 0.00860644 0.00464374 ... 0.00234819 0.00050289 0.00109737]
 [0.00144265 0.00577244 0.00142172 ... 0.00116895 0.00331638 0.00681028]
 ...
 [0.00077451 0.00529494 0.0020158  ... 0.00126857 0.00345669 0.00629814]
 [0.00172746 0.00668523 0.00254762 ... 0.00367596 0.00275695 0.00414365]
 [0.00307907 0.01088014 0.00323054 ... 0.00188412 0.00483861 0.0076519 ]]


#### Find the top 10 features for each cluster

In [30]:
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]
print("top features for each cluster:")
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ", ".join(vocab[i] for i in centroid)))

top features for each cluster:
0: ramen, pho, broth, noodles, place, good, pork, bowl, service, like
1: room, hotel, rooms, stay, pool, casino, strip, stayed, nice, vegas
2: good, food, really, place, service, pretty, nice, like, back, great
3: place, food, like, delicious, get, back, go, one, time, best
4: steak, good, great, service, food, cooked, best, place, ordered, delicious
5: great, food, service, place, back, atmosphere, friendly, good, staff, awesome
6: thai, pad, food, curry, good, place, rice, spicy, chicken, restaurant
7: burger, fries, burgers, good, cheese, shake, place, great, food, like
8: always, food, great, place, love, service, good, get, friendly, go
9: love, place, food, great, good, service, go, delicious, friendly, staff
10: breakfast, eggs, good, place, great, pancakes, food, coffee, service, toast
11: sushi, rolls, roll, ayce, place, fish, great, good, fresh, eat
12: hour, happy, great, good, food, bar, drinks, place, menu, service
13: chicken, fried, good, r

#### Try different k

In [44]:
#kmeans_5 = KMeans(n_clusters=5)

#kmeans_5.fit(vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [45]:
#top_centroids_5 = kmeans_5.cluster_centers_.argsort()[:,-1:-11:-1]
#print("top features for each cluster:")
#for num, centroid in enumerate(top_centroids_5):
#    print("%d: %s" % (num, ", ".join(vocab[i] for i in centroid)))

top features for each cluster:
0: pizza, crust, good, great, place, best, ordered, cheese, vegas, get
1: great, food, service, place, good, back, amazing, friendly, awesome, staff
2: sushi, place, roll, great, ayce, rolls, fresh, service, good, fish
3: good, food, chicken, ordered, back, really, delicious, got, time, place
4: place, love, best, always, vegas, food, amazing, go, good, one


#### Print out the rating and review of a random sample of the reviews assigned to each cluster to get a sense of the cluster.

In [31]:
assigned_cluster = kmeans.transform(vectors).argmin(axis=1)

In [32]:
for i in range(kmeans.n_clusters):
    cluster = np.arange(0, vectors.shape[0])[assigned_cluster==i]
    sample_df = np.random.choice(cluster, 1, replace=False)
    print("cluster %d:" % i)
    for reviews in sample_df:
        print("    %s" % df.loc[reviews]['text'])

cluster 0:
    Omg I love this place! Best sushi bar in town- hands down! And buy two get one free on speciality rolls? My husband and I literally pay $25 and we're stuffed! It's amazing. 

My favorite is the coconut shrimp roll- so damn good! 

He's obsessed with the Hawaiian. 

The 3rd roll is always something we haven't tried yet and we have never been disappointed. 

Staff is always so accommodating and so welcoming. If you live on this side of town...you should definitely try it. You'll be pleasantly surprised. You'll see.
cluster 1:
    Food was ok. Service was ok. Everything was just ok. Not worth the price. That was not ok. Too many other places in LV.
cluster 2:
    I'm a local and I went to zingers thinking that it would be a quick, yummy place to get lunch. Oh man, was I wrong. None of the servers would talk to me, even though it wasn't that busy. Then, one of the ladies took my order and told me it would take 5 minutes. I had to use the restroom, so I went there and returne

## 3. Cluster restaurants from category information

In [45]:
# get column of category
category = df['categories']

In [46]:
# split the data into train data and test data
cate_train, cate_test = train_test_split(category, train_size=0.8, random_state=0)



In [47]:
# vectorize the categories
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=500)
vectors_ec1 = vectorizer.fit_transform(cate_train).todense()
vocab_ec1 = vectorizer.get_feature_names()

In [48]:
# use the trained model to transform all the data
vectors_all_ec1 = vectorizer.transform(category)

In [49]:
# kmeans clustering
from sklearn.cluster import KMeans

kmeans_ec1 = KMeans()
kmeans_ec1.fit(vectors_ec1)

kmeans_pred = kmeans_ec1.predict(vectors_all_ec1)

In [50]:
# inspect the centroids
print("cluster centers:")
print(kmeans_ec1.cluster_centers_)

cluster centers:
[[ 3.30356402e-16  9.55714727e-04 -5.42101086e-17 ... -7.96346496e-17
   1.30444737e-04 -1.81549654e-16]
 [ 2.82922557e-16  1.40425865e-15 -1.91415894e-16 ... -2.37711326e-17
  -5.32646843e-15 -2.14888871e-16]
 [ 2.27140355e-16  1.26938390e-15 -6.77084257e-17 ...  4.19044140e-17
   1.92203059e-04 -6.07695318e-17]
 ...
 [ 1.65123991e-16  1.12019768e-15  6.96870946e-17 ...  8.42967189e-17
  -9.89659743e-16  1.10425991e-16]
 [ 1.31676354e-16  1.63284897e-02  9.83913472e-17 ...  7.13947131e-17
   2.41993925e-16  1.09883890e-16]
 [ 2.45246531e-16  1.31318567e-15 -1.07932326e-16 ...  2.05320786e-17
  -3.94042438e-15 -1.10886777e-16]]


In [51]:
# find top 10 features
top_centroids = kmeans_ec1.cluster_centers_.argsort()[:,-1:-11:-1]
print("top features for each cluster:")
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ", ".join(vocab_ec1[i] for i in centroid)))

top features for each cluster:
0: breakfast, brunch, american, restaurants, traditional, food, sandwiches, new, buffets, burgers
1: japanese, sushi, bars, restaurants, fusion, asian, ramen, seafood, noodles, hawaiian
2: mexican, restaurants, bars, nightlife, fast, tacos, food, tex, mex, brunch
3: food, restaurants, fast, sandwiches, thai, burgers, bars, tea, barbeque, steakhouses
4: american, bars, new, traditional, nightlife, restaurants, burgers, steakhouses, seafood, wine
5: chinese, restaurants, asian, fusion, sum, dim, noodles, thai, cantonese, seafood
6: hotels, travel, casinos, event, entertainment, arts, planning, services, resorts, spas
7: pizza, italian, restaurants, bars, food, salad, sandwiches, nightlife, wings, chicken


In [52]:
# get sample of each cluster
assigned_cluster_ec1 = kmeans_ec1.transform(vectors_ec1).argmin(axis=1)

for i in range(kmeans_ec1.n_clusters):
    cluster = np.arange(0, vectors_ec1.shape[0])[assigned_cluster_ec1==i]
    sample_df_ec1 = np.random.choice(cluster, 5, replace=False)
    print("cluster %d:" % i)
    for categ in sample_df_ec1:
        print("    %s" % df.loc[categ]['categories'])

cluster 0:
    Greek, Restaurants, Mediterranean, Caterers, Burgers, Middle Eastern, Lebanese, Event Planning & Services
    American (New), Bars, Nightlife, Italian, Mediterranean, Restaurants
    Pizza, Restaurants
    Restaurants, Thai
    Food Delivery Services, Salad, Sandwiches, Soup, Food, Event Planning & Services, American (New), Restaurants, Caterers, Wraps
cluster 1:
    Restaurants, Vegetarian, Asian Fusion, Korean, Fast Food
    Fish & Chips, Pubs, Nightlife, Gastropubs, Bars, British, American (Traditional), Restaurants
    Diners, Cafes, American (Traditional), Burgers, Restaurants, Sandwiches
    Bars, American (New), Burgers, Asian Fusion, Food, Beer, Wine & Spirits, Restaurants, Nightlife
    Gluten-Free, Chinese, Sushi Bars, Asian Fusion, Restaurants
cluster 2:
    Food, Comfort Food, Coffee & Tea, American (Traditional), Hotels & Travel, Event Planning & Services, Arts & Entertainment, Casinos, Day Spas, Restaurants, Hotels, Breakfast & Brunch, Beauty & Spas
    Foo