In [1]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
import itertools
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.externals import joblib

In [2]:
#load files for analysis
beer = pd.read_csv('10k_filtered.csv')
beer.columns

Index(['Unnamed: 0', 'brewery_id', 'brewery_name', 'review_time',
       'review_overall', 'review_aroma', 'review_appearance',
       'review_profilename', 'beer_style', 'review_palate', 'review_taste',
       'beer_name', 'beer_abv', 'beer_beerid', 'main_style'],
      dtype='object')

In [3]:
beer.head()

Unnamed: 0.1,Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,main_style
0,0,10099,Dogfish Head Brewery,1266775164,4.0,4.5,4.5,DmanGTR,American Double / Imperial IPA,4.5,4.5,90 Minute IPA,9.0,2093,India Pale Ales
1,1,10099,Dogfish Head Brewery,1296628385,4.0,4.0,4.0,nickadams2,American Double / Imperial IPA,3.5,4.0,90 Minute IPA,9.0,2093,India Pale Ales
2,2,10099,Dogfish Head Brewery,1250389086,4.0,4.0,4.0,kegger22,American Double / Imperial IPA,4.0,4.5,90 Minute IPA,9.0,2093,India Pale Ales
3,3,10099,Dogfish Head Brewery,1230224468,5.0,5.0,5.0,fairway31533,American Double / Imperial IPA,5.0,5.0,90 Minute IPA,9.0,2093,India Pale Ales
4,4,10099,Dogfish Head Brewery,1229312994,4.0,4.5,4.0,Bung,American Double / Imperial IPA,4.0,4.0,90 Minute IPA,9.0,2093,India Pale Ales


In [3]:
len(beer.review_profilename.unique())

32141

# Data Cleaning 10 or more reviews

In [4]:
grouped = beer.groupby(['review_profilename','beer_name']).mean().round(2)['review_overall']
grouped.head(20)

review_profilename  beer_name                              
0110x011            10 Commandments                            3.5
                    15th Anniversary Wood Aged                 3.5
                    21st Amendment IPA                         4.5
                    90 Minute IPA                              5.0
                    Adam                                       4.0
                    AleSmith Decadence 2007 Imperial Porter    3.0
                    AleSmith IPA                               5.0
                    AleSmith My Bloody Valentine               5.0
                    AleSmith Old Numbskull                     4.0
                    AleSmith Speedway Stout                    4.5
                    AleSmith Speedway Stout - Barrel Aged      3.5
                    Allagash Odyssey                           4.5
                    Alpha King Pale Ale                        5.0
                    Alpha Klaus Christmas (Xmas) Porter        5.0
  

In [5]:
grouped = pd.DataFrame(grouped)
grouped = grouped.reset_index()
grouped.head()

Unnamed: 0,review_profilename,beer_name,review_overall
0,0110x011,10 Commandments,3.5
1,0110x011,15th Anniversary Wood Aged,3.5
2,0110x011,21st Amendment IPA,4.5
3,0110x011,90 Minute IPA,5.0
4,0110x011,Adam,4.0


In [6]:
user_reviews = grouped.pivot(index='review_profilename',columns='beer_name',values='review_overall')
#user_reviews = user_reviews.reset_index(drop = True)
user_reviews.head()

beer_name,"""400"" Ale","""Hop Obama"" Ale","""Old Yeltsin"" Imperial Stout","""Shabadoo"" Black & Tan Ale","""The Wind Cried Mari..."" Scottish Heather Ale","""True Blue"" Blueberry Ale",# 100,#'s Ale,#9,'t Gaverhopke Extra,...,Épluche-Culotte,Équinoxe Du Printemps,Ölsch,Ølfabrikken 100 Gram India Pale Ale,Ølfabrikken Abbey Ale (Special Reserve),Ølfabrikken Kloster Jul,Ølfabrikken Porter,Über Alt,Über Pils,ÜberSun (Imperial Summer Wheat Beer)
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0110x011,,,,,,,,,,,...,,,,,,,5.0,,,
01Ryan10,,,,,,,,,,,...,,,,,,,,,,
02maxima,,,,,,,,,,,...,,,,,,,,,,
03SVTCobra,,,,,,,,,,,...,,,,,,,,,,
04101Brewer,,,,,,,,,,,...,,,,,,,,,,


In [7]:
counted = user_reviews.count(axis=1)
counted = pd.DataFrame(counted)

In [8]:
ten_reviews = counted.loc[(counted[0]>1) & (counted[0]<15)]

In [9]:
len(ten_reviews)

13605

In [10]:
ten_reviews = ten_reviews.reset_index()

In [11]:
grouped_clean = pd.merge(ten_reviews, grouped, how = 'left', on = "review_profilename")

In [12]:
grouped_clean.head()

Unnamed: 0,review_profilename,0,beer_name,review_overall
0,02maxima,4,Allagash Black - Bourbon Barrel Aged,4.0
1,02maxima,4,Birra Moretti,3.0
2,02maxima,4,Brooklyn Black Chocolate Stout,4.5
3,02maxima,4,Maharaja,4.5
4,03SVTCobra,3,Lone Star Light,1.0


In [13]:
grouped_clean = grouped_clean.drop([0])

In [14]:
len(grouped_clean)

68475

In [15]:
user_reviews_6 = grouped_clean.pivot(index='review_profilename',columns='beer_name',values='review_overall')
#user_reviews = user_reviews.reset_index(drop = True)
user_reviews_6.head()

beer_name,"""400"" Ale","""Hop Obama"" Ale","""Old Yeltsin"" Imperial Stout","""Shabadoo"" Black & Tan Ale","""The Wind Cried Mari..."" Scottish Heather Ale",# 100,#'s Ale,#9,'t Gaverhopke Extra,'t Smisje BBBourgondier,...,Éphemère (Framboise),Éphémère (Apple),Éphémère (Cassis / Black Currant),Éphémère (Cranberry),Équinoxe Du Printemps,Ölsch,Ølfabrikken 100 Gram India Pale Ale,Ølfabrikken Porter,Über Pils,ÜberSun (Imperial Summer Wheat Beer)
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02maxima,,,,,,,,,,,...,,,,,,,,,,
03SVTCobra,,,,,,,,,,,...,,,,,,,,,,
04101Brewer,,,,,,,,,,,...,,,,,,,,,,
0beerguy0,,,,,,,,,,,...,,,,,,,,,,
0runkp0s,,,,,,,,,,,...,,,,,,,,,,


In [16]:
user_reviews_6.shape

(13605, 7508)

# Data Cleaning Dense Data

In [17]:
user_reviews = user_reviews_6

In [18]:
# Define Function to get the most rated movies
def get_most_rated_beers(user_reviews, max_number_of_beers):
    # 1- Count
    user_reviews = user_reviews.append(user_reviews.count(), ignore_index=True)
    # 2- sort
    user_reviews_sorted = user_reviews.sort_values(len(user_reviews)-1, axis=1, ascending=False)
    user_reviews_sorted = user_reviews_sorted.drop(user_reviews_sorted.tail(1).index)
    # 3- slice
    most_rated_beers = user_reviews_sorted.iloc[:, :max_number_of_beers]
    return most_rated_beers

In [19]:
def get_users_who_rate_the_most(user_reviews, max_number_of_beers):
    # Get most voting users
    # 1- Count
    user_reviews['counts'] = pd.Series(user_reviews.count(axis=1))
    # 2- Sort
    most_rated_beers_users = user_reviews.sort_values('counts', ascending=False)
    # 3- Slice
    most_rated_beers_users_selection = most_rated_beers_users.iloc[:max_number_of_beers, :]
    most_rated_beers_users_selection = most_rated_beers_users_selection.drop(['counts'], axis=1)
    return most_rated_beers_users_selection

In [20]:
# Define the sorting by rating function
# Define the sorting by rating function
def sort_by_rating_density(user_reviews, n_beers, n_users):
    most_rated_beers = get_most_rated_beers(user_reviews, n_beers)
    most_rated_beers = get_users_who_rate_the_most(most_rated_beers, n_users)
    return most_rated_beers
# choose the number of movies and users and sort
n_beers = 50
n_users = 30000
most_rated_beers_users_selection = sort_by_rating_density(user_reviews, n_beers, n_users)
# Print the result
#print(most_rated_beers_users_selection.head())

In [21]:
most_rated_beers_users_selection.head()

beer_name,90 Minute IPA,Old Rasputin Russian Imperial Stout,Pliny The Elder,Bell's Hopslam Ale,Sierra Nevada Pale Ale,Guinness Draught,Arrogant Bastard Ale,Duvel,Sierra Nevada Celebration Ale,Two Hearted Ale,...,Palo Santo Marron,Samuel Adams Octoberfest,Bud Light,#9,Pumking,Bourbon County Brand Stout,Ayinger Celebrator Doppelbock,Tröegs Nugget Nectar,Orval Trappist Ale,Punkin Ale
7228,,,5.0,4.0,,,,,,,...,,,5.0,,,,,,,
7149,4.0,,,,5.0,,4.5,4.5,4.5,,...,,,,,,,,,,3.25
12505,,,,,,,,4.5,,,...,,,,,5.0,,,,,
12943,4.0,,,4.0,,,,4.0,,4.5,...,,,,,,,,4.0,,
9947,,,,,,,3.5,,,,...,,,,,,,,,,


In [22]:
len(most_rated_beers_users_selection)

13605

In [23]:
most_rated_beers_users_selection.to_csv('test4.csv')

In [24]:
# Conversion to sparse csr matrix
sparse_ratings_dense = csr_matrix(pd.SparseDataFrame(most_rated_beers_users_selection).to_coo())

In [25]:
sparse_ratings_dense.shape

(13605, 50)

# Kmeans

In [None]:
Sum_of_squared_distances = []
K = range(1,35)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(sparse_ratings_dense)
    Sum_of_squared_distances.append(km.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [26]:
# Create a kmeans model using k = 12
from sklearn.cluster import KMeans
model = KMeans(n_clusters=8, n_init = 5)

# Fit the model to the data
model.fit(sparse_ratings_dense)

# Use the data to predict the clusters
# save the predictions as `predicted_clusters`
predicted_clusters = model.predict(sparse_ratings_dense)

In [27]:
joblib.dump(model, 'model.pkl')  
model_loaded = joblib.load('model.pkl')

model_loaded

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=5, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [28]:
#load files for analysis
base = pd.read_csv("test7.csv", index_col='review_profilename')
test_sparse = csr_matrix(pd.SparseDataFrame(base).to_coo())

In [30]:
model_loaded.predict(test_sparse)

array([3])