In [1]:
import pandas as pd

In [2]:
# Load the raw BeerAdvocate review dump from the scraper's data directory.
raw_reviews_path = '../scraper/src/data/beeradvocate_raw.csv'
df = pd.read_csv(raw_reviews_path)

In [3]:
# Work on an independent (deep) copy so the frame loaded from disk stays as read.
beeradv_df = df.copy(deep=True)

In [4]:
# Print column names, non-null counts and dtypes of the working copy.
beeradv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586615 entries, 0 to 1586614
Data columns (total 15 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Unnamed: 0               1586615 non-null  int64  
 1   review_appearance_score  1586614 non-null  float64
 2   beer_style               1586614 non-null  object 
 3   review_palate_score      1586614 non-null  float64
 4   review_taste_score       1586614 non-null  float64
 5   beer_name                1586614 non-null  object 
 6   review_time              1586614 non-null  float64
 7   user_gender              637221 non-null   object 
 8   user_birthday            327596 non-null   float64
 9   beer_ABV                 1518829 non-null  float64
 10  beer_id                  1586614 non-null  object 
 11  review_overall_score     1586614 non-null  float64
 12  review_text              1586614 non-null  object 
 13  review_user              1586614 non-null 

In [5]:
import re 

def scale(series):
    """Rescale a numeric Series so that its largest value maps to 10.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``series / series.max() * 10``, carrying the same index and name as
        the input. Missing values stay missing.
    """
    max_value = series.max()
    # Vectorized arithmetic yields the same values as mapping a lambda over
    # every element, without one Python-level call per row.
    return series / max_value * 10

# Discard reviews that have no beer_id.
beeradv_df.dropna(subset=['beer_id'], inplace=True)

# beer_id is read as text; keep the last run of digits in it as an integer.
beeradv_df['beer_id'] = beeradv_df['beer_id'].map(
    lambda raw_id: int(re.findall(r"\d+", raw_id)[-1])
)

# Rescale each review score column with scale().
for score_column in (
    'review_appearance_score',
    'review_palate_score',
    'review_taste_score',
    'review_overall_score',
):
    beeradv_df[score_column] = scale(beeradv_df[score_column])

beeradv_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1586614 entries, 0 to 1586613
Data columns (total 15 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   Unnamed: 0               1586614 non-null  int64  
 1   review_appearance_score  1586614 non-null  float64
 2   beer_style               1586614 non-null  object 
 3   review_palate_score      1586614 non-null  float64
 4   review_taste_score       1586614 non-null  float64
 5   beer_name                1586614 non-null  object 
 6   review_time              1586614 non-null  float64
 7   user_gender              637221 non-null   object 
 8   user_birthday            327596 non-null   float64
 9   beer_ABV                 1518829 non-null  float64
 10  beer_id                  1586614 non-null  int64  
 11  review_overall_score     1586614 non-null  float64
 12  review_text              1586614 non-null  object 
 13  review_user              1586614 non-null 

In [6]:
# Rename the first column (read from the CSV as 'Unnamed: 0') to review_id.
# It stays a regular column rather than the index: the per-beer review count
# computed after grouping is read from the review_id column.
# The former `beeradv_df.set_index('review_id');` statement was dropped because
# its result was never assigned, so it only built and discarded a re-indexed copy.
beeradv_df.columns = ['review_id'] + list(beeradv_df.columns[1:])

In [7]:
# Group the reviews by beer so per-beer aggregates can be computed.
grouped_beers = beeradv_df.groupby(by=['beer_id'])

In [8]:
# One Series per output column, each indexed by beer_id.
# Every aggregate is computed on just the column it needs. Calling
# grouped_beers.mean() / .std() / .count() / .first() on the whole frame
# re-aggregates all columns for every dict entry, and on pandas >= 2.0
# (where numeric_only defaults to False) mean()/std() raise TypeError on the
# text columns.
distinct_beers = {
    # First non-null value per beer, with surrounding quote characters removed.
    'beer_name': grouped_beers['beer_name'].first().str.strip("'\""),
    # Number of non-null review ids, i.e. reviews per beer.
    'n_reviews': grouped_beers['review_id'].count(),
    # NaN when none of a beer's reviews carries an ABV value.
    'beer_abv': grouped_beers['beer_ABV'].mean(),
    'beer_style': grouped_beers['beer_style'].first().str.strip("'\""),
    # Sample standard deviations are NaN for beers with a single review.
    'appearance_mean': grouped_beers['review_appearance_score'].mean(),
    'appearance_std': grouped_beers['review_appearance_score'].std(),
    'palate_mean': grouped_beers['review_palate_score'].mean(),
    'palate_std': grouped_beers['review_palate_score'].std(),
    'taste_mean': grouped_beers['review_taste_score'].mean(),
    'taste_std': grouped_beers['review_taste_score'].std(),
    'overall_mean': grouped_beers['review_overall_score'].mean(),
    'overall_std': grouped_beers['review_overall_score'].std(),
}

In [9]:
# Assemble the per-beer aggregates into one frame (one row per beer_id)
# and print its column summary.
distinct_beers_df = pd.DataFrame(data=distinct_beers)
distinct_beers_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66055 entries, 3 to 77317
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   beer_name        66055 non-null  object 
 1   n_reviews        66055 non-null  int64  
 2   beer_abv         49012 non-null  float64
 3   beer_style       66055 non-null  object 
 4   appearance_mean  66055 non-null  float64
 5   appearance_std   42315 non-null  float64
 6   palate_mean      66055 non-null  float64
 7   palate_std       42315 non-null  float64
 8   taste_mean       66055 non-null  float64
 9   taste_std        42315 non-null  float64
 10  overall_mean     66055 non-null  float64
 11  overall_std      42315 non-null  float64
dtypes: float64(9), int64(1), object(2)
memory usage: 6.6+ MB


In [10]:
# Show the last five beers (highest beer_id values) as a sanity check.
distinct_beers_df.tail(n=5)

Unnamed: 0_level_0,beer_name,n_reviews,beer_abv,beer_style,appearance_mean,appearance_std,palate_mean,palate_std,taste_mean,taste_std,overall_mean,overall_std
beer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
77313,Aass Gourmet Pale Ale,1,4.7,American Blonde Ale,8.0,,6.0,,6.0,,6.0,
77314,Betty Brown Norwegian Brwon Ale,1,4.7,American Brown Ale,8.0,,6.0,,8.0,,7.0,
77315,Icelandic White Beer,1,5.2,Witbier,6.0,,6.0,,7.0,,7.0,
77316,Crank Yanker IPA,1,7.8,American IPA,7.0,,7.0,,7.0,,7.0,
77317,Belgo Sutra,1,,Quadrupel (Quad),8.0,,7.0,,8.0,,7.0,


In [11]:
# Keep only the beers that have a mean ABV value.
mask = ~distinct_beers_df['beer_abv'].isna()
distinct_beers_df = distinct_beers_df[mask]

In [12]:
# Remove duplicate names: when several beer_ids share a beer_name, keep only
# the first one in the frame's current order. This replaces a
# groupby / first / index-intersection round trip that selected the same rows.
# NOTE(review): beer_name has no missing values in the summary printed above;
# a missing name would be kept once here, whereas the groupby dropped it.
mask = ~distinct_beers_df['beer_name'].duplicated(keep='first')
distinct_beers_df = distinct_beers_df.loc[mask]

In [13]:
# Summary statistics of the numeric columns, each value rendered as a string
# with two decimals.
summary_stats = distinct_beers_df.describe()
summary_stats.apply(
    lambda column: column.apply(lambda value: format(value, '.2f'))
)

Unnamed: 0,n_reviews,beer_abv,appearance_mean,appearance_std,palate_mean,palate_std,taste_mean,taste_std,overall_mean,overall_std
count,44083.0,44083.0,44083.0,30341.0,44083.0,30341.0,44083.0,30341.0,44083.0,30341.0
mean,33.24,6.29,7.38,0.88,7.16,0.99,7.22,1.0,7.31,1.08
std,133.86,2.08,1.02,0.47,1.16,0.52,1.26,0.55,1.24,0.58
min,1.0,0.01,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0
25%,1.0,5.0,7.0,0.67,6.6,0.71,6.67,0.71,6.83,0.71
50%,3.0,5.7,7.57,0.87,7.33,0.99,7.44,0.99,7.5,1.05
75%,11.0,7.3,8.0,1.12,8.0,1.26,8.0,1.29,8.0,1.41
max,3290.0,57.7,10.0,4.24,10.0,4.95,10.0,5.66,10.0,5.66


In [14]:
# Persist the cleaned per-beer table next to the raw dump.
clean_beers_path = '../scraper/src/data/beeradvocate.csv'
distinct_beers_df.to_csv(clean_beers_path)