In [1]:
import pandas as pd

# Location of the preprocessed beer-review dataset (a pickle file), given
# relative to the directory the notebook is run from.
PROCESSED_DATASET_PATH = "../data/beer-processed.pkl"

In [2]:
# Load the preprocessed reviews into a DataFrame.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
df = pd.read_pickle(PROCESSED_DATASET_PATH)

# Preview the first rows to check that the load worked.
df.head()

Unnamed: 0,brewery_id,brewery_name,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,review_year,review_month
0,10325,vecchio birraio,1.5,2.0,2.5,stcules,hefeweizen,1.5,1.5,sausa weizen,5.0,47986,217,2
1,10325,vecchio birraio,3.0,2.5,3.0,stcules,english strong ale,3.0,3.0,red moon,6.2,48213,217,3
2,10325,vecchio birraio,3.0,2.5,3.0,stcules,foreign / export stout,3.0,3.0,black horse black beer,6.5,48215,217,3
3,10325,vecchio birraio,3.0,3.0,3.5,stcules,german pilsener,2.5,3.0,sausa pils,5.0,47969,217,2
4,1075,caldera brewing company,4.0,4.5,4.0,johnmichaelsen,american double / imperial ipa,4.0,4.5,cauldron dipa,7.7,64883,218,12


In [3]:
# Print the index range, column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1518058 entries, 0 to 1586613
Data columns (total 14 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1518058 non-null  int16  
 1   brewery_name        1518058 non-null  object 
 2   review_overall      1518058 non-null  float16
 3   review_aroma        1518058 non-null  float16
 4   review_appearance   1518058 non-null  float16
 5   review_profilename  1518058 non-null  object 
 6   beer_style          1518058 non-null  object 
 7   review_palate       1518058 non-null  float16
 8   review_taste        1518058 non-null  float16
 9   beer_name           1518058 non-null  object 
 10  beer_abv            1518058 non-null  float64
 11  beer_beerid         1518058 non-null  uint32 
 12  review_year         1518058 non-null  uint8  
 13  review_month        1518058 non-null  uint8  
dtypes: float16(5), float64(1), int16(1), object(4), uint32(1), uint8(2

## Question #1: Which brewery produces the strongest beers by ABV?
Note: We have removed entries that report ABV > 20% as unrealistic.

In [4]:
# Ten highest ABV values in the data, one row per distinct
# (brewery, ABV) pair, shown with the brewery name as the index.
(
    df[["brewery_name", "beer_abv"]]
    .sort_values("beer_abv", ascending=False)
    .drop_duplicates()
    .head(10)
    .set_index("brewery_name")
)

Unnamed: 0_level_0,beer_abv
brewery_name,Unnamed: 1_level_1
the bruery,19.5
sonoran brewing company,19.5
bfm brasserie des franches-montagnes,19.5
sonoran brewing company,19.37
mikkeller aps,19.2
boquébière,19.2
the bruery,18.5
mikkeller aps,18.5
sherbrooke liquor store,18.5
sigtuna brygghus,18.5


In [5]:
# Mean ABV and number of ABV entries per brewery, strongest average first.
# NOTE: every row of `df` is a review, so the mean is weighted by how often
# each beer was reviewed, and "count" is a number of reviews, not of beers.
(
    df[["brewery_name", "beer_abv"]]
    .groupby("brewery_name")
    .agg(["mean", "count"])
    .sort_values(("beer_abv", "mean"), ascending=False)
    .head(10)
)

Unnamed: 0_level_0,beer_abv,beer_abv
Unnamed: 0_level_1,mean,count
brewery_name,Unnamed: 1_level_2,Unnamed: 2_level_2
shoes brewery,15.2,2
rome brewing company,13.84,5
hurlimann brewery,13.75,18
schorschbräu,13.366667,27
alt-oberurseler brauhaus,13.2,1
rascal creek brewing co.,13.0,1
monks porter house,12.466667,3
brasserie grain d' orge (brasserie jeanne d'arc sa),12.44586,314
tugboat brewing company,12.1875,8
rinkuki&#371; aluas darykla,12.0,11


Apparently, it is the "Shoes Brewery".

In my opinion, the first table only tells us which breweries have produced the single most alcoholic beers. However, I understand the question as asking for the brewery whose beers are strong on average.
From the second list, we can see it is the "Shoes Brewery".
Still, I would argue that it could just as well be "Hurlimann", "Schorschbräu" or "Brasserie Grain d'Orge": their averages are based on many more reviews (18, 27 and 314, versus only 2 for "Shoes Brewery") and are therefore statistically more reliable.

## Question #2. If you had to pick 3 beers to recommend to someone, how would you approach the problem?

The best approach would be to create a recommender system.
Unfortunately, we know very little about the users apart from how they graded the beers - we don't have any vector representations (features) of them. So it is hard to get started.
In addition, for a new user we don't know anything about their preferences, which adds to the difficulty (it's not impossible, just challenging).

Another, "cheap" approach will be to look at the correlations.
Mainly, similar users should grade similar beers... similarly.

Here, we choose to keep only users who submitted more than 1000 reviews.
We also choose beers that received more than 1000 reviews.
Why?
* Because single opinions don't mean much statistically... we may have a beer that scores 5/5, but if only one person drank it, it doesn't tell much.
* Because I will be pivoting the table, and the full table won't fit into memory. (I could convert it to a sparse representation, but that's beyond the scope of this notebook.)

In [38]:
# A reviewer or a beer is kept only if it has MORE than this many reviews.
MIN_REVIEW_COUNT = 1000


def frequent_values(values: pd.Series, min_count: int) -> list:
    """Return the distinct entries of ``values`` occurring more than ``min_count`` times.

    Parameters
    ----------
    values : pd.Series
        Column whose entries are counted.
    min_count : int
        Exclusive lower bound on the number of occurrences.

    Returns
    -------
    list
        The qualifying entries, ordered from most to least frequent (the
        order produced by ``Series.value_counts``).
    """
    counts = values.value_counts()
    # Filter the counts Series directly rather than going through
    # to_frame()/rename()/reset_index(): the label of the counts column and of
    # the reset index differ between pandas versions, so code that relies on
    # the labels "<column name>" and "index" breaks on pandas >= 2.0.
    return counts[counts > min_count].index.tolist()


profilenames = frequent_values(df["review_profilename"], MIN_REVIEW_COUNT)

# NOTE(review): beers are identified by name only. The data contains generic
# names such as "porter" and "esb", so reviews of different beers may be
# pooled here; counting by beer_beerid may be more accurate - confirm.
beernames = frequent_values(df["beer_name"], MIN_REVIEW_COUNT)

print(f"#Profiles: {len(profilenames)}, #beers: {len(beernames)}")

#Profiles: 223, #beers: 209


In [39]:
# The five rating aspects recorded for every review.
review_columns = [
    'review_overall', 'review_aroma', 'review_appearance',
    'review_palate', 'review_taste',
]

# Keep only the rows written by the reviewers listed in `profilenames`, and
# only the reviewer, the beer name and the rating columns.
reviews = df.loc[
    df["review_profilename"].isin(profilenames),
    ["review_profilename", "beer_name", *review_columns],
]

reviews

Unnamed: 0,review_profilename,beer_name,review_overall,review_aroma,review_appearance,review_palate,review_taste
0,stcules,sausa weizen,1.5,2.0,2.5,1.5,1.5
1,stcules,red moon,3.0,2.5,3.0,3.0,3.0
2,stcules,black horse black beer,3.0,2.5,3.0,3.0,3.0
3,stcules,sausa pils,3.0,3.0,3.5,2.5,3.0
4,johnmichaelsen,cauldron dipa,4.0,4.5,4.0,4.0,4.5
...,...,...,...,...,...,...,...
1586577,glid02,o'defiant stout,4.5,4.0,4.0,4.5,4.0
1586578,weswes,o'defiant stout,4.5,4.0,4.0,4.0,4.5
1586585,tgbljb,big thumper ale,3.5,3.5,3.5,3.5,3.5
1586592,rblwthacoz,irish amber,4.5,3.5,4.0,4.0,4.5


In [64]:
# Pairwise correlation between the rating aspects.
# The rating columns are selected explicitly because `reviews` also holds the
# string columns review_profilename and beer_name: DataFrame.corr() silently
# drops non-numeric columns only on pandas < 2.0 and raises on newer versions.
reviews[review_columns].corr()

Unnamed: 0,review_overall,review_aroma,review_appearance,review_palate,review_taste
review_overall,1.0,0.670401,0.518278,0.739869,0.818578
review_aroma,0.670401,1.0,0.559038,0.655634,0.753233
review_appearance,0.518278,0.559038,1.0,0.571777,0.549069
review_palate,0.739869,0.655634,0.571777,1.0,0.760125
review_taste,0.818578,0.753233,0.549069,0.760125,1.0


In [68]:
# Reviewer whose ratings are inspected.
user = "rblwthacoz"

# All rows of `reviews` written by that reviewer.
u = reviews[reviews["review_profilename"] == user]

u

Unnamed: 0,review_profilename,beer_name,review_overall,review_aroma,review_appearance,review_palate,review_taste
2505,rblwthacoz,orgemont brune,3.5,3.5,4.0,3.5,3.5
2506,rblwthacoz,orgemont bière des moissons,4.0,3.0,4.0,3.5,4.5
2508,rblwthacoz,orgemont triple,3.0,3.5,4.0,3.5,2.5
2509,rblwthacoz,orgemont bière rèmes speciale au marc de champ...,4.0,3.5,4.0,4.0,4.0
3013,rblwthacoz,wild frog wheat ale,4.0,4.5,3.5,3.5,4.0
...,...,...,...,...,...,...,...
1586363,rblwthacoz,esb,4.5,4.0,4.0,4.0,4.5
1586392,rblwthacoz,little thumper ale,4.0,4.0,4.0,4.0,4.5
1586512,rblwthacoz,four horsemen #2: war,4.5,4.0,4.5,4.0,4.5
1586528,rblwthacoz,porter,4.5,4.0,4.5,4.0,4.0


In [32]:
# Show the reviewers that passed the review-count filter.
# NOTE(review): this dumps the entire list, and the cell's execution count (32)
# is lower than that of the cells above it, i.e. it was run out of order;
# consider showing a slice and re-running the notebook top to bottom.
profilenames

['northyorksammy',
 'mikesgroove',
 'buckeyenation',
 'thorpe429',
 'brentk56',
 'womencantsail',
 'chaingangguy',
 'phyl21ca',
 'nerofiddled',
 'oberon',
 'weswes',
 'feloniousmonk',
 'akorsak',
 'gueuzedude',
 'beerchitect',
 'jwc215',
 'themaniacalone',
 'russpowell',
 'wasatch',
 'zeff80',
 'tempest',
 'drabmuh',
 'mora2000',
 'viggo',
 'wvbeergeek',
 'gavage',
 'halcyondays',
 'wl0307',
 'jdhilt',
 'glid02',
 'weeare138',
 'ppoitras',
 'drjay',
 'gusler',
 'billolick',
 'jason',
 'zorro',
 'emerge077',
 'tmoney2591',
 'knapp85',
 'smcolw',
 'bighuge',
 'barleywinefiend',
 'rhoadsrage',
 'masterski',
 'johnmichaelsen',
 'kegatron',
 'georgiabeer',
 'biegaman',
 'beerthulhu',
 'derek',
 'biboergosum',
 'rblwthacoz',
 'jrallen34',
 'jpm30',
 'royalt',
 'tavernjef',
 'thelongbeachbum',
 'francisweizen',
 'corby112',
 'woodychandler',
 'gford217',
 'doublej',
 'turdfurgison',
 'mdfb79',
 'plaid75',
 'metter98',
 'slatetank',
 'tpd975',
 'beerandraiderfan',
 'dogbrick',
 'mothman',
 'cc