Created by Tomer Danon

This notebook walks through the exploratory data analysis and algorithms for the beer recommendation model.

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import pickle


In [None]:
# Load the two pickled inputs: the review table (`main`) and the per-user table (`users`).
# NOTE: unpickling executes arbitrary code, so only load pickles from a trusted source.
main, users = (pd.read_pickle(path) for path in ('./main.pickle', './users.pickle'))

### Exploratory Data Analysis

In [None]:
# Headline counts for the review table.
# nunique(dropna=False) is used so that, like len(unique()), a NaN counts as one value.
summary_rows = [
    ('Number of Reviews:   ', len(main)),
    ('Number of Users:     ', main['review_profilename'].nunique(dropna=False)),
    ('Number of Beers:     ', main['beer_beerid'].nunique(dropna=False)),
    ('Number of Styles:    ', main['beer_style'].nunique(dropna=False)),
    ('Number of Breweries: ', main['brewery_name'].nunique(dropna=False)),
]
for label, value in summary_rows:
    print(label, value)


In [None]:
# One histogram per attribute, laid out on a 3x2 grid (filled row by row).
panels = [
    ("Appearance Rating", 'review_appearance'),
    ("Aroma Rating", 'review_aroma'),
    ("Palate Rating", 'review_palate'),
    ("Taste Rating", 'review_taste'),
    ("Overall Rating", 'review_overall'),
    ("Alcohol by Volume", 'beer_abv'),
]

fig = make_subplots(rows=3, cols=2,
                    subplot_titles=tuple(title for title, _ in panels))

for position, (_, column) in enumerate(panels):
    # divmod maps 0..5 onto (row, col) pairs of a two-column grid; plotly is 1-based.
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(
        go.Histogram(x=main[column]),
        row=grid_row + 1, col=grid_col + 1,
    )

fig.update_layout(height=1000, width=600, showlegend=False)
fig.show()
# Persist both an interactive and a static copy of the figure.
fig.write_html('./atts_distributions.html')
fig.write_image('./atts_distributions.png')

In [None]:
# Pie chart of the 10 most-reviewed styles; every remaining style is folded
# into a single "<N> Other Styles" slice, where N is computed from the data
# (it used to be hard-coded to 94 and would silently go stale).

# Non-null beer_beerid count per style, largest first.
styles = main.groupby('beer_style')['beer_beerid'].count().sort_values(ascending=False)
top10 = styles.iloc[:10]
other_styles = styles.iloc[10:]

pie = top10
if len(other_styles) > 0:
    # Only add the catch-all slice when there is something to put in it.
    other_label = f'{len(other_styles)} Other Styles'
    pie = pd.concat([top10, pd.Series({other_label: other_styles.sum()})])

fig = px.pie(
    pie,
    values=pie.values,
    names=pie.index,
    title="Top 10 Styles")
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(height=1000, width=1000)
fig.show()
# Persist both an interactive and a static copy of the figure.
fig.write_html('./top10styles.html')
fig.write_image('./top10styles.png')

In [None]:
# Rows (i.e. reviews) per style, largest first.
# Note this is the non-null count of the first remaining column, not the
# number of distinct beers in each style.
per_style_counts = main.groupby('beer_style').count()
per_style_counts.iloc[:, 0].sort_values(ascending=False)


In [None]:
# Most reviewed breweries: rows (reviews) per brewery, largest first,
# taken as the non-null count of the first remaining column.
per_brewery_counts = main.groupby('brewery_name').count()
per_brewery_counts.iloc[:, 0].sort_values(ascending=False)

In [None]:
# The 20 users with the largest `num_reviews` values.
reviews_by_user = users['num_reviews'].sort_values(ascending=False)
reviews_by_user.head(20)

In [None]:
# The 10 beers with the most reviews (non-null `brewery_id` count per beer).
beer_keys = ['beer_beerid', 'beer_name', 'brewery_name', 'beer_style']
main.groupby(beer_keys)['brewery_id'].count().sort_values(ascending=False).head(10)

In [None]:
# Best rated beers: the 50 highest mean `review_overall` values per beer.
# The column is selected *before* averaging: calling .mean() on the whole
# grouped frame tries to average the non-numeric columns too, which raises
# TypeError on pandas >= 2.0 (and wastes work on older versions).
(
    main.groupby(['beer_beerid', 'beer_name', 'brewery_name', 'beer_style'])['review_overall']
    .mean()
    .sort_values(ascending=False)
    .head(50)
)

In [None]:
# Peek at the first five rows of the per-user table.
users.head(n=5)

### Recommend beers of specific styles

In [None]:
# Every distinct beer style in the data; pick from these when calling recStyle.
main['beer_style'].unique()

In [None]:
def recStyle(styles, rating=4.25, topX=10):
    '''
    Recommend beers in the requested styles, based on like-minded reviewers.

    A user joins the "cluster" when, for every requested style, the value
    stored for that style in the second column of the global `users` frame
    is at least `rating`. (That column is assumed to map
    style -> the user's rating for the style -- TODO confirm against the
    code that builds users.pickle.) The cluster members' own beers in the
    requested styles that they rated `rating` or higher are then pooled,
    averaged per beer, and the best `topX` per style are returned.

    Prints the number of users in the cluster and their names.

    Parameters
    ----------
    styles : str or iterable of str
        Style name(s) to recommend from. A bare string is treated as a
        single style rather than being split into characters.
    rating : float, default 4.25
        Minimum rating, used both to select cluster users and to select
        the beers they contribute.
    topX : int, default 10
        Maximum number of recommendations per style.

    Returns
    -------
    pandas.DataFrame
        Indexed by `beer_beerid`, with columns `beer_name`, `brewery_name`,
        `beer_style` and `review_overall` (mean rating among the cluster,
        rounded to 2 decimals). Rows are grouped by style (alphabetical)
        and sorted best-first within each style. The frame is empty, with
        the same columns, when no user or beer qualifies.
    '''
    if isinstance(styles, str):
        styles = [styles]
    styles = set(styles)

    # Layout of each entry in users['user_beers']: beer name -> these 4 fields.
    beer_fields = ['beer_beerid', 'brewery_name', 'beer_style', 'review_overall']
    group_keys = ['beer_beerid', 'beer_name', 'brewery_name', 'beer_style']
    no_recommendations = (
        pd.DataFrame(columns=group_keys[1:] + ['review_overall'])
        .rename_axis('beer_beerid')
    )

    # Users who rated *all* requested styles at `rating` or higher.
    # The per-style ratings live in the second column; it is read by
    # position (iloc) because integer keys on a label-indexed Series are
    # deprecated positional fallbacks in recent pandas.
    style_ratings = users.iloc[:, 1]
    cluster_users = [
        name
        for name, rated in style_ratings.items()
        if styles and all(s in rated and rated[s] >= rating for s in styles)
    ]

    print("Number of users in cluster:", len(cluster_users))
    print("Users in cluster:\n", cluster_users)

    if not cluster_users:
        return no_recommendations

    # Collect, per cluster user, the beers in the requested styles that the
    # user rated highly. Frames are gathered in a list and concatenated once
    # (growing a DataFrame inside the loop is quadratic).
    wanted_styles = list(styles)
    liked = []
    for user in cluster_users:
        user_beers = users.loc[user, 'user_beers']
        beers = pd.DataFrame(user_beers, index=beer_fields).transpose()
        # The transpose leaves every column as object dtype; make the
        # rating numeric so comparison, mean and round behave predictably.
        beers['review_overall'] = pd.to_numeric(beers['review_overall'])
        keep = beers[beers['beer_style'].isin(wanted_styles)
                     & (beers['review_overall'] >= rating)]
        if not keep.empty:
            liked.append(keep)

    if not liked:
        return no_recommendations

    pooled = pd.concat(liked)
    pooled['beer_name'] = pooled.index  # beer names are the row labels

    # Average rating per beer across the cluster, best first.
    ranked = (
        pooled.groupby(group_keys)['review_overall']
        .mean()
        .round(2)
        .to_frame()
        .reset_index(group_keys[1:])
        .sort_values('review_overall', ascending=False)
    )

    # Top `topX` per style; styles are visited in sorted order so the result
    # does not depend on set iteration order.
    per_style = [ranked[ranked['beer_style'] == s].iloc[:topX] for s in sorted(styles)]
    per_style = [part for part in per_style if not part.empty]
    if not per_style:
        return no_recommendations
    return pd.concat(per_style)


In [None]:
# Working example: three preferred styles, default rating threshold and top-10.
preferred_styles = {
    'Saison / Farmhouse Ale',
    'American IPA',
    'American Double / Imperial IPA',
}
recStyle(preferred_styles)

# Try your own from the list above
# recStyle({})

### Recommend beers based on specific beers

In [None]:
# The full list of beers is 66k+ long.
# Below is a short list of beers to choose from: the beer names ranked
# 51st-150th by number of (non-null) overall reviews.
review_counts = main.groupby(['beer_name'])['review_overall'].count()
someBeers = review_counts.sort_values(ascending=False).iloc[50:150]
print(someBeers.index)

In [None]:
def recBeer(beers, rating=4.25, topX=10):
    '''
    Recommend beers based on a set of beers the new user already likes.

    The styles of the given beers are looked up in the global `main` frame
    (first matching row per beer name). A user joins the "cluster" when
    their `user_beers` entry in the global `users` frame contains every one
    of the given beers. The cluster members' beers in those styles that
    they rated `rating` or higher are pooled, averaged per beer, and the
    best `topX` per style are returned.

    NOTE(review): cluster membership only requires that a user *reviewed*
    all the given beers; the `rating` threshold is applied to the beers
    being suggested, not to the input beers. Confirm this is intended.

    Prints the number of users in the cluster and their names.

    Parameters
    ----------
    beers : str or iterable of str
        Beer name(s) as they appear in `main.beer_name`. A bare string is
        treated as a single beer name.
    rating : float, default 4.25
        Minimum rating a cluster user must have given a beer for it to be
        considered as a suggestion.
    topX : int, default 10
        Maximum number of recommendations per style.

    Returns
    -------
    pandas.DataFrame
        Indexed by `beer_beerid`, with columns `beer_name`, `brewery_name`,
        `beer_style` and `review_overall` (mean rating among the cluster,
        rounded to 2 decimals). Rows are grouped by style (alphabetical)
        and sorted best-first within each style. The frame is empty, with
        the same columns, when no user or beer qualifies.

    Raises
    ------
    IndexError
        If any of the given beer names does not occur in `main.beer_name`.
    '''
    if isinstance(beers, str):
        beers = [beers]
    wanted_beers = set(beers)

    # Layout of each entry in users['user_beers']: beer name -> these 4 fields.
    beer_fields = ['beer_beerid', 'brewery_name', 'beer_style', 'review_overall']
    group_keys = ['beer_beerid', 'beer_name', 'brewery_name', 'beer_style']
    no_recommendations = (
        pd.DataFrame(columns=group_keys[1:] + ['review_overall'])
        .rename_axis('beer_beerid')
    )

    # Resolve each beer to its style in a single pass over `main`;
    # drop_duplicates keeps the first row per name.
    matches = main.loc[main['beer_name'].isin(list(wanted_beers)),
                       ['beer_name', 'beer_style']].drop_duplicates('beer_name')
    unknown = wanted_beers - set(matches['beer_name'])
    if unknown:
        # Same exception type a failed positional lookup would produce, but
        # with a message that says which names are wrong.
        raise IndexError("Unknown beer name(s): " + ", ".join(sorted(map(str, unknown))))
    styles = set(matches['beer_style'])

    # Users whose reviewed beers include every one of the given beers.
    cluster_users = [
        name
        for name, reviewed in users['user_beers'].items()
        if all(beer in reviewed for beer in wanted_beers)
    ]

    print("Number of users in cluster:", len(cluster_users))
    print("Users in cluster:\n", cluster_users)

    if not cluster_users or not styles:
        return no_recommendations

    # Collect, per cluster user, the beers in the matching styles that the
    # user rated highly. Frames are gathered in a list and concatenated once
    # (growing a DataFrame inside the loop is quadratic).
    wanted_styles = list(styles)
    liked = []
    for user in cluster_users:
        user_beers = users.loc[user, 'user_beers']
        candidates = pd.DataFrame(user_beers, index=beer_fields).transpose()
        # The transpose leaves every column as object dtype; make the
        # rating numeric so comparison, mean and round behave predictably.
        candidates['review_overall'] = pd.to_numeric(candidates['review_overall'])
        keep = candidates[candidates['beer_style'].isin(wanted_styles)
                          & (candidates['review_overall'] >= rating)]
        if not keep.empty:
            liked.append(keep)

    if not liked:
        return no_recommendations

    pooled = pd.concat(liked)
    pooled['beer_name'] = pooled.index  # beer names are the row labels

    # Average rating per beer across the cluster, best first.
    ranked = (
        pooled.groupby(group_keys)['review_overall']
        .mean()
        .round(2)
        .to_frame()
        .reset_index(group_keys[1:])
        .sort_values('review_overall', ascending=False)
    )

    # Top `topX` per style; styles are visited in sorted order so the result
    # does not depend on set iteration order.
    per_style = [ranked[ranked['beer_style'] == s].iloc[:topX] for s in sorted(styles)]
    per_style = [part for part in per_style if not part.empty]
    if not per_style:
        return no_recommendations
    return pd.concat(per_style)



In [None]:
# Example: four liked beers, default rating threshold and top-10.
liked_beers = {
    'Pliny The Elder',
    'Heady Topper',
    'Drie Fonteinen Oude Geuze',
    '90 Minute IPA',
}
recBeer(liked_beers)

# Try your own from the list above
# recBeer({})