In [None]:
import re
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Show at most 10 rows when a DataFrame or Series is displayed.
pd.options.display.max_rows = 10
# Seaborn 'ticks' style with the larger 'talk' plotting context.
sns.set(style='ticks', context='talk')
# Default matplotlib figure size as (width, height).
plt.rcParams['figure.figsize'] = (12, 6)

We'll use the same dataset of beer reviews.

In [None]:
# Load the gzip-compressed beer reviews, parsing the `time` column as datetimes.
df = pd.read_csv('data/beer_subset.csv.gz', parse_dates=['time'], compression='gzip')
# The five review-score columns; reused by later cells.
review_cols = ['review_appearance', 'review_aroma', 'review_overall',
               'review_palate', 'review_taste']
df.head()

In [None]:
# Count how often each score occurs, split by review column.
# `stack` reshapes the review columns to long form (one row per individual score);
# `reset_index(level=1)` moves the stacked column labels into a regular column
# (presumably named 'level_1' because that index level is unnamed), which is
# renamed to 'kind', while the values column `0` is renamed to 'stars'.
fig, ax = plt.subplots(figsize=(5, 10))
sns.countplot(hue='kind', y='stars', data=(df[review_cols]
                                           .stack()
                                           .reset_index(level=1)
                                           .rename(columns={'level_1': 'kind',
                                                            0: 'stars',})),
              ax=ax, order=np.arange(0, 5.5, .5))
sns.despine()

# Groupby

Groupby is a fundamental operation to pandas and data analysis.

The components of a groupby operation are to

1. Split a table into groups
2. Apply a function to each group
3. Combine the results

In pandas the first step looks like

```python
df.groupby( grouper )
```

`grouper` can be many things

- Series (or string indicating a column in `df`)
- function (to be applied on the index)
- dict : groups by *values*
- `level=[ names of levels in a MultiIndex ]`

In [None]:
gr = df.groupby('beer_style')
gr

Haven't really done anything yet. Just some book-keeping to figure out which **keys** go with which **rows**. Keys are the things we've grouped by (each `beer_style` in this case).

The last two steps, apply and combine, are just:

In [None]:
# Restrict the aggregation to numeric columns explicitly: recent pandas versions
# raise on non-numeric columns instead of silently dropping them.
gr.agg('mean', numeric_only=True)

This says apply the `mean` function to each column. Non-numeric columns (nuisance columns) are excluded. We can also select a subset of columns to perform the aggregation on.

In [None]:
gr[review_cols].agg('mean')

`.` attribute lookup works as well.

In [None]:
gr.abv.agg('mean')

Certain operations are attached directly to the `GroupBy` object, letting you bypass the `.agg` part

In [None]:
gr.abv.mean()

Exercise: Find the `beer_style`s with the greatest variance in `abv`.

- hint: `.std` calculates the standard deviation, and is available on `GroupBy` objects like `gr.abv`.
- hint: use `.sort_values` to sort a Series by the values

In [None]:
# your code goes here

In [None]:
%load -r 15:17 solutions/solutions_groupby.py

Now we'll run the gamut on a bunch of grouper / apply combinations.
Keep sight of the target though: split, apply, combine.

- Grouper: Controls the output index
    * 1-D -> `Index`
    * 2+ -> `MultiIndex`
- Subject (Groupee): Controls the output data values
    * 1-D -> `Series` (or DataFrame if multiple aggregations)
    * 2+ -> `DataFrame`
- Aggregation: Controls the output columns
    * 1-D -> `Index` in the columns
    * 2+ -> `MultiIndex` in the columns (or a 1-D `Index` if the groupee is 1-D)

Multiple Aggregations on one column

In [None]:
# Use string aggregation names (as the multi-column cells below do) rather than
# NumPy callables: pandas resolves both to its own groupby mean/std, but passing
# the callables is deprecated in recent pandas versions.
gr['review_aroma'].agg(['mean', 'std', 'count']).head()

Single Aggregation on multiple columns

In [None]:
gr[review_cols].mean()

Multiple aggregations on multiple columns

In [None]:
gr[review_cols].agg(['mean', 'count', 'std'])

Hierarchical Indexes in the columns can be awkward to work with, so I'll usually
move a level to the Index with `.stack`.

In [None]:
gr[review_cols].agg(['mean', 'count', 'std']).stack(level=0)

You can group by **levels** of a MultiIndex.

In [None]:
multi = gr[review_cols].agg(['mean', 'count', 'std']).stack(level=0)
multi.head()

In [None]:
multi.groupby(level='beer_style')['mean'].agg(['min', 'max'])

Group by **multiple** columns

In [None]:
df.groupby(['brewer_id', 'beer_style']).review_overall.mean()

In [None]:
df.groupby(['brewer_id', 'beer_style'])[review_cols].mean()

### Exercise: Plot the relationship between review length (the `text` column) and average `review_overall`.

Hint: Break the problem into pieces:

- Find the **len**gth of each review (remember the `df.text.str` namespace?)
- Group by that Series of review lengths
- Find the `mean` `review_overall` by review length
- I used `style='k.'` in the plot 

In [None]:
# Your solution

In [None]:
%load -r 1:5 solutions/solutions_groupby.py

Bonus exercise:

- Try grouping by the number of words.
- Try grouping by the number of sentences.

Remember that `str.count` accepts a regular expression.

Don't worry too much about these, especially if you don't remember the syntax
for regular expressions (I never can). Just jump to the next exercise.

In [None]:
# Your code goes here

In [None]:
%load -r 18:20 solutions/solutions_groupby.py

### Exercise: Which **brewer** (`brewer_id`) has the largest gap between the min and max `review_overall` for two of their beers?

Hint: You'll need to do this in two steps.

1. Find the average `review_overall` by brewer and beername.
2. Find the difference between the max and min by brewer (remember `.groupby(level=)`)

In [None]:
# Your code goes here. You've got this!

In [None]:
%load -r 6:13 solutions/solutions_groupby.py

In [None]:
# Show for those with counts > 20ish

Create our own "kind" of beer, which aggregates `style`.

In [None]:
style = df.beer_style.str.lower()
style.head()

In [None]:
# Substrings used to bucket the lower-cased `beer_style` names into coarser kinds.
# Spelling matters: each entry is matched literally against the style name, so
# 'bitter' and 'malt liquor' must be spelled the way the style names spell them.
kinds = ['ipa', 'apa', 'amber ale', 'rye', 'scotch', 'stout', 'barleywine', 'porter', 'brown ale', 'lager', 'pilsner',
         'tripel', 'bitter', 'farmhouse', 'malt liquor', 'rice']

In [None]:
# One alternation of named capture groups, one group per entry in `kinds`
# (e.g. `(?P<amber_ale>amber ale)`); spaces in a kind become underscores in
# the group name.
expr = '|'.join(
    '(?P<{name}>{pat})'.format(name=label.replace(' ', '_'), pat=label)
    for label in kinds
)
expr

In [None]:
# Map each lower-cased style name to one of the coarse `kinds`, or to 'other'
# when nothing in `expr` matches.
# NOTE(review): `Series.replace` with a dict and no `regex=True` swaps whole values,
# so these two entries presumably only affect styles named exactly
# 'india pale ale' / 'american pale ale' — confirm that is the intent.
beer_kind = (style.replace({'india pale ale': 'ipa',
                            'american pale ale': 'apa'})
            # `extract` yields one column per named group in `expr`; after filling the
            # misses with '', the row-wise sum concatenates the strings in each row.
            .str.extract(expr, expand=False).fillna('').sum(1)
            .str.lower().replace('', 'other'))
beer_kind.head()

In [None]:
df.groupby(['brewer_id', beer_kind]).review_overall.mean()

In [None]:
df.groupby(['brewer_id', beer_kind]).beer_id.nunique().unstack(1).fillna(0)

### Exercise: Which Brewers have the most different `kinds` of beer?

Hint: we used `df.profile_name.nunique()` to find the number of different profile names.
What are we grouping, and what is our grouper?

In [None]:
# Your solution

In [None]:
%load -r 27:29 solutions/solutions_groupby.py

### Exercise: Which kinds of beer have the most brewers?

In [None]:
# Your solution

In [None]:
%load -r 30:32 solutions/solutions_groupby.py

We've seen a lot of permutations among number of groupers, number of columns to aggregate, and number of aggregators.
In fact, the `.agg`, which returns one row per group, is just one kind of way to combine the results. The three ways are

- `agg`: one row per group
- `transform`: identically shaped output as input
- `apply`: anything goes


# Transform

Combined Series / DataFrame is the same shape as the input. For example, say you want to standardize the reviews by subtracting the mean.

In [None]:
def de_mean(reviews):
    """Center `reviews` by subtracting its own mean from every value."""
    return reviews - reviews.mean()

In [None]:
de_mean(df.review_overall)

We can do this at the *person* level with `groupby` and `transform`.

In [None]:
df['review_overall_demeaned'] = df.groupby('profile_name').review_overall.transform(de_mean)

In [None]:
df.head()

# Apply

So there's `gr.agg`, `gr.transform`, and finally `gr.apply`. We're going to skip apply for now. I have an example in a later notebook.

# Resample

Resample is a special kind of groupby operation for when you have a `DatetimeIndex`.

In [None]:
review_times = df.time.value_counts().sort_index()
review_times

In [None]:
review_times.index

The number of reviews within a given second isn't that interesting.

In [None]:
review_times.plot()

Right now the frequency is way too high to be meaningful. `resample` lets you adjust the frequency.

In [None]:
review_times.resample("3H").mean().plot()

# Aside: Beer Recommender

See [Harvard CS109](https://github.com/cs109/content) for a more complete example (with chocolate instead of beer).

One place where transform comes in handy is as a preprocessing step for any kind of recommender. In some sense, the raw score I assign a beer is less important than the score relative to *my* mean.

In [None]:
deduped = df[['beer_id', 'profile_name', 'review_overall']].drop_duplicates()
deduped.head()

In [None]:
# Number of (deduplicated) reviews per user.
user_counts = deduped.profile_name.value_counts()
# Keep the users whose review count is strictly above the 75th percentile of counts.
top_users = user_counts[user_counts > user_counts.quantile(.75)].index

In [None]:
# Number of (deduplicated) reviews per beer.
beer_counts = deduped.beer_id.value_counts()
# Keep the beers whose review count is strictly above the 90th percentile of counts.
top_beers = beer_counts[beer_counts > beer_counts.quantile(.9)].index

In [None]:
# Reviews of a top beer written by a top user; `@name` in the query string
# refers to the Python variable of that name.
top = deduped.query('beer_id in @top_beers and profile_name in @top_users')
# Average overall score per user and per beer within that subset.
user_means = top.groupby('profile_name').review_overall.mean()
beer_means = top.groupby('beer_id').review_overall.mean()


In [None]:
# Side-by-side normalized histograms of the per-user and per-beer average scores,
# sharing both axes so the two distributions are directly comparable.
fig, axes = plt.subplots(figsize=(16, 4), ncols=2, sharey=True, sharex=True)

# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases in favour of
# `histplot`/`displot` — confirm which seaborn version this notebook targets.
sns.distplot(user_means, kde=False, ax=axes[0], color='k', norm_hist=True, hist_kws={'alpha': 1})
sns.distplot(beer_means, kde=False, ax=axes[1], color='k', norm_hist=True, hist_kws={'alpha': 1})
axes[0].set_title("User Averages")
axes[1].set_title("Beer Averages")

In [None]:
s = top.set_index(['beer_id', 'profile_name']).review_overall.sort_index()
s.head()

###  `de_mean` the scores in `s`

In [None]:
standardized = s.groupby(level='profile_name').transform(de_mean)
standardized.head()

In [None]:
from scipy.stats import pearsonr

In [None]:
def pearson_sim(reviews_1, reviews_2, reg=2):
    """
    (Regularized) Pearson correlation coefficient between two sets
    of reviews for two beers, made by a common subset of reviewers.

    Parameters
    ----------
    reviews_1, reviews_2 : array-like
        Scores for the two beers. They should have the same index (the
        `profile_name`s of people who reviewed both beers), so that
        position ``k`` in both refers to the same reviewer.
    reg : number, default 2
        Regularization strength. The raw correlation is multiplied by
        ``n_common / (n_common + reg)``, which shrinks it toward zero when
        only a few reviewers are shared.

    Returns
    -------
    similarity : number
        The shrunken correlation. ``0`` when fewer than two common reviews
        are available, since the correlation is undefined there.
    n_common : int
        Number of common reviews, taken as ``len(reviews_1)``.
    """
    n_common = len(reviews_1)
    if n_common < 2:
        # A correlation needs at least two paired observations; `pearsonr`
        # rejects shorter input, so treat these pairs as "no evidence".
        similarity = 0
    else:
        rho = pearsonr(reviews_1, reviews_2)[0]
        similarity = (n_common * rho) / (n_common + reg)  # regularization if few reviews
    return similarity, n_common

In [None]:
def beer_similarity(standardized, beer_1, beer_2, simfunc=pearson_sim, **simfunc_kwargs):
    """
    Compute the similarity between two beers.

    Parameters
    ----------
    standardized : Series
        Review scores indexed by a (`beer_id`, `profile_name`) MultiIndex.
    beer_1, beer_2 : labels from the `beer_id` level
        The two beers to compare.
    simfunc : callable, default `pearson_sim`
        Called as ``simfunc(reviews_1, reviews_2, **simfunc_kwargs)`` with the two
        beers' scores restricted to their common reviewers; must return a
        ``(similarity, n_common)`` pair.
    **simfunc_kwargs
        Forwarded unchanged to `simfunc`.

    Returns
    -------
    (similarity, n_common) as returned by `simfunc`.
    """
    # Scores for each beer, indexed by reviewer.
    reviews_1 = standardized.loc[beer_1]
    reviews_2 = standardized.loc[beer_2]

    # Reviewers who scored both beers. `Index.intersection` is used instead of
    # `&`, which newer pandas versions no longer treat as a set operation.
    common_idx = reviews_1.index.intersection(reviews_2.index)

    # Selecting both beers with the same labels keeps the two score vectors
    # aligned reviewer by reviewer; with no shared reviewers both are empty
    # and `simfunc` decides what to return.
    rho, n_common = simfunc(reviews_1.loc[common_idx], reviews_2.loc[common_idx], **simfunc_kwargs)
    return rho, n_common

In [None]:
beer_ids = s.index.levels[0]
len(beer_ids)

In [None]:
beer_similarity(standardized, beer_ids[0], beer_ids[10])

In [None]:
%%time
sims = []

for i, beer_1 in enumerate(beer_ids):
    for j, beer_2 in enumerate(beer_ids):
        if j >= i:
            continue
        sim, n_common = beer_similarity(s, beer_1, beer_2)
        sims.append((beer_1, beer_2, sim, n_common))
        print((i, j), end='\r')
        
sim = pd.DataFrame(sims, columns=['beer_1', 'beer_2', 'score', 'n_common'])
sim.to_csv('beer_subset_similarity.csv', index=False)

In [None]:
sim = pd.read_csv('beer_subset_similarity.csv.gz')
sim.head()

In [None]:
sns.kdeplot(sim[sim.score != 0].dropna().score)

In [None]:
sim = sim.set_index(['beer_1', 'beer_2']).score

In [None]:
sim.loc[21690].nlargest(5)

In [None]:
m = pd.DataFrame({"a": [1], "b": [1.]})
m

In [None]:
# NOTE(review): `_translate` is a private Styler method, so its signature and
# output layout may differ between pandas versions — confirm before relying on it.
m.style._translate()['body'][0][1]['display_value']

In [None]:
# NOTE(review): `_translate` is a private Styler method, so its signature and
# output layout may differ between pandas versions — confirm before relying on it.
m.style._translate()['body'][0][2]['display_value']

In [None]:
m.style