In [None]:
import os 
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.preprocessing
import sklearn.metrics

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
import seaborn as sns

import pymc3 as pm
import arviz as az

import voteestimator
import top2000analysis

## Read

In [None]:
# Build the three frames used throughout the notebook from the same 'Data' location.
# NOTE(review): 'Data' is presumably a folder relative to the notebook — confirm in top2000analysis.
a = top2000analysis.AnalysisSetCreator()
df_song = a.create_analysis_set('Data')
df_artist = a.create_artist_set('Data')
df = a.create_full_feature_set('Data')

In [None]:
# Summary statistics of the 'Boost' column of the artist frame.
df_artist['Boost'].describe()

In [None]:
# Artists whose 'Boost' is below 1, shown with their name and 'LogPopularityNorm'.
df_artist.query('Boost < 1')[['Name', 'Boost', 'LogPopularityNorm']]

In [None]:
# NOTE(review): this calls the private helper `_combine_data`; presumably it returns the
# combined raw list the analysis sets are derived from — confirm in top2000analysis.
full_list = a._combine_data('Data')
# Count distinct SongID and ArtistID values in that list.
print(f"In its history, the Top 2000 has seen {full_list['SongID'].nunique()} songs and {full_list['ArtistID'].nunique()} artists")

In [None]:
# Summary statistics of the 'Boost' column of the song frame.
df_song['Boost'].describe()

In [None]:
def calculate_new_position(old_position, votesmodel, boost=1.75):
    """Find the position an entry reaches when its votes are multiplied by ``boost``.

    Starting at ``old_position`` the list is walked upwards (towards position 1)
    one place at a time, until the vote share at that place is at least
    ``boost`` times the vote share at ``old_position``.

    Parameters
    ----------
    old_position : int
        Current position in the list; 1 is the top.
    votesmodel : object
        Any object exposing ``percentage_of_votes(position)``.
    boost : float, default 1.75
        Factor by which the votes at ``old_position`` are multiplied.

    Returns
    -------
    int
        The new position, never smaller than 1.
    """
    old_votes = votesmodel.percentage_of_votes(old_position)
    target_votes = boost * old_votes
    new_votes = old_votes
    new_position = old_position
    # Stop at position 1: there is no position 0, so the votes model must never
    # be asked for it (the previous `>= 1` condition stepped down to 0 first).
    while new_votes < target_votes and new_position > 1:
        new_position -= 1
        new_votes = votesmodel.percentage_of_votes(new_position)
    return max(new_position, 1)

In [None]:
# For every position 1..2000: the position reached with the default boost of
# calculate_new_position, and the ratio of new to old position.
positioning = (pd.DataFrame({'OldPosition': range(1, 2001)})
               .assign(NewPosition = lambda df: [calculate_new_position(p, a.votesmodel) for p in df['OldPosition']],
                      FactorRanking = lambda df: df['NewPosition'] / df['OldPosition']))

In [None]:
# Ratio of new to old position as a function of the old position.
positioning.plot(x='OldPosition', y='FactorRanking')

## Create analysis set

## Univariate analysis
* Newsworthiness of artist passing
  * Age when passing away
  * Dutch nationality
  * Popularity
  * Year
  * Recency of hits
* Days to stemperiode (the voting period)

* Song
  * Popularity within artist
  * Recency of song within artist
  
### Newsworthiness

#### Artist age

In [None]:
def plot_with_trend(df, column, logy=False):
    """Scatter the boost against one column and overlay a fitted linear trend.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``column`` and the target column ('LogBoost' when
        ``logy`` is true, otherwise 'Boost').
    column : str
        Name of the column used as x-axis and as the single regressor.
    logy : bool, default False
        Selects 'LogBoost' instead of 'Boost' as the y-variable.

    Returns
    -------
    The matplotlib Axes holding the scatter and the trend line.
    """
    if logy:
        target = 'LogBoost'
    else:
        target = 'Boost'

    regressor = df[[column]]
    trend_model = LinearRegression()
    trend_model.fit(regressor, df[target])
    trend = trend_model.predict(regressor)

    ax = df.plot.scatter(x=column, y=target, label='Passed away artists', c='grey')
    ax.plot(df[column], trend, 'k', label='Trend')
    return ax

In [None]:
# 'LogBoost' against 'AgePassing' with a linear trend line.
ax = plot_with_trend(df_artist, 'AgePassing', logy=True)

#### Year

In [None]:
# 'Boost' (not logged) against 'JaarTop2000' with a linear trend; saved as YearEffect.jpg.
ax = plot_with_trend(df_artist, 'JaarTop2000', logy=False)
# Only place x ticks on integer values.
ax.xaxis.set_major_locator(mtick.MaxNLocator(integer=True))
# Cap the y-axis at 6; points above that are cut off.
ax.set_ylim(None, 6)
plt.legend()
plt.tight_layout()
ax.set_frame_on(False)
plt.gcf().savefig('YearEffect.jpg')

#### Nationality

In [None]:
# 'LogBoost' per value of 'IsDutch'; the result is assigned to `_` to suppress the cell output.
_ = sns.catplot(x='IsDutch', y='LogBoost', data=df_artist)

#### Popularity

In [None]:
# 'LogBoost' against 'PctVotes' with a linear trend line.
ax = plot_with_trend(df_artist, 'PctVotes', logy=True)

In [None]:
# 'LogBoost' against 'LogPopularity' with a linear trend line.
plot_with_trend(df_artist, 'LogPopularity', logy=True)

In [None]:
# Centre 'LogPopularity' on its median so that 0 on the x-axis is the median artist.
# NOTE(review): the `Boost` assignment returns the existing column unchanged.
dftest = df_artist.assign(Popularity = lambda df: df['LogPopularity'].sub(df['LogPopularity'].median()),
                            Boost = lambda df: df['Boost'])

# Linear trend of 'Boost' on the centred popularity, drawn over the scatter (y-axis capped at 6).
preds = LinearRegression().fit(dftest[['Popularity']], dftest['Boost']).predict(dftest[['Popularity']])
ax = dftest.plot(x='Popularity', y='Boost', kind='scatter', c='grey', ylim=(None, 6))
ax.plot(dftest[['Popularity']], preds, 'k', label='trend')
plt.legend()
plt.tight_layout()
plt.gcf().savefig('PopularityEffect.jpg')

#### Recency

In [None]:
# 'LogBoost' against 'DaysToStemperiode', restricted to rows with DaysToStemperiode > -365.
ax = plot_with_trend(df_artist.query('DaysToStemperiode > -365'), 'DaysToStemperiode', logy=True)

In [None]:
# Clip 'DaysToStemperiode' at -365, cut it into 6 equal-width bins (integer codes 0..5),
# then relabel bin i with the i-th value of range(-333, 0, 60), i.e. -333, -273, ..., -33.
recency_buckets = (pd.cut(df_artist['DaysToStemperiode'].clip(lower=-365), 6, labels=False, retbins=False, right=False)
                  .map({i: v for i, v in enumerate(range(-333, 0, 60))}))

In [None]:
# Mean, standard error, standard deviation and count of 'LogBoost' per recency bucket.
df_artist.groupby(recency_buckets)['LogBoost'].agg(['mean', 'sem', 'std', 'count'])

In [None]:
# Same aggregation as the table in the previous cell, plotted as the mean with the standard error as error bars.
ax = df_artist.groupby(recency_buckets)['LogBoost'].agg(['mean', 'sem', 'std', 'count']).plot(y='mean', yerr='sem')
plt.xlim(-365, 0)
plt.show()

### Recency of last hit

In [None]:
# 'LogBoost' against 'YearsSinceLastHit' with a linear trend line.
ax = plot_with_trend(df_artist, 'YearsSinceLastHit', logy=True)

### Song

In [None]:
# List the columns of `df` whose name contains 'Nr'.
df.filter(like='Nr').columns.tolist()

In [None]:
# Summary statistics of 'SongRelativeBoost' over all rows.
df['SongRelativeBoost'].describe()

In [None]:
# Same summary, excluding André Hazes and rows with NrsBeforeDeath <= 1.
df.query('NameArtist != "André Hazes" & NrsBeforeDeath > 1')['SongRelativeBoost'].describe()

In [None]:
# The 10 rows with the highest 'SongRelativeBoost'.
df.nlargest(10, 'SongRelativeBoost')

In [None]:
# The 10 rows with the lowest 'SongRelativeBoost'.
df.nsmallest(10, 'SongRelativeBoost')

In [None]:
# Summary statistics of the artist-level 'Boost' (same expression as near the top of the notebook).
df_artist['Boost'].describe()

In [None]:
# 'SongRelativeBoost' summary excluding André Hazes and rows with NrsBeforeDeath <= 1
# (identical to the expression a few cells earlier).
df.query('NameArtist != "André Hazes" & NrsBeforeDeath > 1')['SongRelativeBoost'].describe()

In [None]:
# Distribution of 'SongRelativeBoost' per value of 'NrsBeforeDeath'; y-axis limited to 0..2.
ax = sns.violinplot(x='NrsBeforeDeath', y='SongRelativeBoost', data=df)
ax.set_ylim(0,2)

### Solo song

In [None]:
# Mean, standard deviation, standard error and count of 'LogBoost' per value of 'NrArtists'.
df.groupby(['NrArtists'])['LogBoost'].agg(['mean', 'std', 'sem', 'count'])

In [None]:
# Same aggregation of 'LogBoost', grouped by 'MultiplePerformers'.
df.groupby(['MultiplePerformers'])['LogBoost'].agg(['mean', 'std', 'sem', 'count'])

### Popularity within artist oeuvre

In [None]:
# 'LogBoost' against 'PopularityWithinArtist' with a linear trend line.
ax = plot_with_trend(df, 'PopularityWithinArtist', logy=True)

In [None]:
# 'LogBoost' against 'LogSongPopularityWithinArtist' with a linear trend line.
ax = plot_with_trend(df, 'LogSongPopularityWithinArtist', logy=True)

### Recency within artist

In [None]:
# 'LogBoost' against 'RecencyWithinArtist', restricted to rows with NrsBeforeDeath > 2.
ax = plot_with_trend(df.query('NrsBeforeDeath > 2'), 'RecencyWithinArtist', logy=True)

In [None]:
def plot_waterfall(data, color=None, buildup=False, **kwargs):
    '''
    Plot a buildup or builddown waterfall chart from data
    This function was adapted from https://pbpython.com/waterfall-chart.html

    The input is left untouched: the "Total" row is added to a copy, so calling
    the function twice on the same Series gives the same chart.

    Parameters
    ----------
    data: pd.Series to be shown as waterfall
    color: optionally give color as a sequence with one entry per bar (to highlight some bars);
           'gray' is appended for the "Total" bar
    buildup: False (default) for builddown, True for buildup
    **kwargs: passed on to the pandas ``plot`` call

    Returns
    -------
    ax: Axis object
    data: a copy of the data, including a "Total"-row
    blank: the size of the blank space before each bar
    '''
    #TODO: add connecting lines
    # Work on a copy so the caller's Series does not silently gain a 'Total' row
    data = data.copy()
    if color is None:
        color = ['lightgray'] * len(data)

    # Each bar starts where the running total of the preceding bars ended
    blank = data.cumsum().shift(1).fillna(0)
    total = data.sum()
    data.loc['Total'] = total
    blank.loc['Total'] = 0
    # list(...) accepts tuples and other sequences and never modifies the caller's list
    color = list(color) + ['gray']

    if buildup:
        data = data[::-1]
        blank = blank[::-1]
        color = color[::-1]

    ax = data.plot(kind='barh', stacked=True, left=blank, color=color, **kwargs)

    return ax, data, blank

# Share of artists per boost band (bands ordered from highest to lowest), shown as a waterfall.
data = pd.cut(df_artist['Boost'], [0, 1, 1.25, 1.5, 2.5, np.inf],
               labels=['No boost',
                       'Up to 25% more votes',
                       'Up to 50% more votes',
                       '1.5 - 2.5 x\nas many votes',
                       'More than 2.5\nx as many votes']).value_counts(normalize=True).sort_index(ascending=False)
# Convert the categorical index to plain strings so that a 'Total' label can be added to it.
data.index = data.index.astype(str)
# NOTE(review): six colours are passed for five bands, and plot_waterfall appends another 'gray'
# for its Total row — confirm that the surplus entry is intended.
ax, _, _ = plot_waterfall(data, color=['purple', 'purple', 'purple', 'purple', 'lightgray', 'gray'])
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
ax.set_frame_on(False)
plt.tight_layout()
plt.gcf().savefig('BoostDistribution.jpg')

In [None]:
# The 10 artists with the highest 'Boost'.
df_artist.nlargest(10, 'Boost')

In [None]:
# Summary statistics of the artist-level 'Boost' (repeated next to the distribution figure).
df_artist['Boost'].describe()

In [None]:
# Fraction of artists (first row per 'NameArtist') whose 'PassingTooEarly' equals 0.
(df.drop_duplicates('NameArtist')['PassingTooEarly'] == 0).mean()

## Multivariate

In [None]:
RANDOM_SEED = 42
def model_factory(X, y):
    """Build the multilevel (non-centered) PyMC3 model for the observed values ``y``.

    Every row of ``X`` is one observation; rows sharing an 'ArtistID' share one
    artist-level intercept ``a_artist``, whose expected value ``mu_artist`` is a
    function of the artist-level columns.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain 'ArtistID', the artist-level columns 'DaysToStemperiode',
        'LogPopularityNorm', 'JarenGeleden', 'PassingTooEarly', 'IsDutchArtist' and the
        observation-level columns 'MultiplePerformers', 'LogSongPopularityWithinArtist'.
    y : array-like
        Observed values, one per row of ``X`` (the notebook passes 'LogBoost').

    Returns
    -------
    pm.Model
        The model; nothing is sampled here.
    """
    # Map every ArtistID to a consecutive integer in order of first appearance
    passed_away_artists = X['ArtistID'].unique()
    artist_lookup = dict(zip(passed_away_artists, range(len(passed_away_artists))))
    artist_vals = X['ArtistID'].replace(artist_lookup).values
    # One row per artist, ordered by model index, to read the artist-level columns from.
    # NOTE(review): assumes the artist-level columns are constant within an artist — confirm.
    artist_model = (X.assign(ArtistIDModel = lambda df: X['ArtistID'].map(artist_lookup),
                             )
                .sort_values('ArtistIDModel')
                .drop_duplicates(['ArtistIDModel'])
               )
    
    
    coords = {"obs_id": np.arange(X.shape[0]),
              'Artist': range(len(passed_away_artists))
         }
    with pm.Model(coords=coords) as model:
        # Artist-level data (one value per artist) and the index linking observations to artists
        artist_idx = pm.Data("artist_idx", artist_vals, dims="obs_id")
        days_to_stemperiode = pm.Data('days_to_stemperiode', artist_model['DaysToStemperiode'], dims='Artist')
        logpopularity = pm.Data('logpopularity', artist_model['LogPopularityNorm'], dims='Artist')
        jaren_geleden = pm.Data("jaren_geleden", artist_model['JarenGeleden'], dims='Artist')
        passing_too_early = pm.Data('passing_too_early', artist_model['PassingTooEarly'], dims='Artist')
        is_dutch = pm.Data('is_dutch', artist_model['IsDutchArtist'], dims='Artist')

        # Observation-level data (one value per row of X)
        multiple_performers = pm.Data('multiple_performers', X['MultiplePerformers'], dims="obs_id")
        popularity_within_oeuvre = pm.Data('popularity_within_oeuvre', X['LogSongPopularityWithinArtist'], dims="obs_id")

        # Hyperpriors:
        a = pm.Normal("a", mu=0, sigma=2.0)
        sigma_a = pm.Exponential("sigma_a", 1.0)

        recency_effect_exponent = pm.Normal('recency_effect_exponent', mu=-1.5,sigma=1)
        max_recency_effect = pm.Normal('max_recency_effect', mu=2, sigma=2)
        effect_popularity = pm.Normal('effect_popularity', mu=0.5, sigma=2)
        history_effect = pm.Normal('history_effect', mu=0, sigma=0.03)
        age_passing_effect = pm.Normal('age_passing_effect', mu=0.01, sigma=0.05)
        is_dutch_effect = pm.Normal('is_dutch_effect', mu=0, sigma=2)

        # Expected value per artist:
        mu_artist = (a
                     + logpopularity * effect_popularity
                     # The correction of subtracting the minimum value breaks the degeneracy between _a_ and the recency effect
                     # It is important for two reasons:
                     # 1. It makes sampling much more stable
                     # 2. It allows for much easier interpretation of the recency effect
                     + (np.exp(10**recency_effect_exponent * days_to_stemperiode)- np.exp(10**recency_effect_exponent * -365))* max_recency_effect
                     + jaren_geleden * history_effect
                     + passing_too_early * age_passing_effect
                     + is_dutch * is_dutch_effect
                    )

        # This is the non-centered version of the model for a much more stable sampling
        # See https://twiecki.io/blog/2017/02/08/bayesian-hierchical-non-centered/ for more information
        # By making mu_artist and a_artist a pm.Deterministic, we can still access them via the InferenceData, but it is not strictly necessary
        mu_artist = pm.Deterministic("mu_artist", mu_artist, dims="Artist")
        za_artist = pm.Normal("za_artist", mu=0.0, sigma=1.0, dims='Artist')
        a_artist = pm.Deterministic("a_artist", mu_artist + za_artist * sigma_a, dims="Artist")
        sharing_effect = pm.Normal('sharing_effect', mu=0, sigma=2.0)
        within_oeuvre_effect = pm.Normal('within_oeuvre_effect', mu=0, sigma=2.0)
        # Expected value per observation: the artist intercept plus the song-level effects
        theta = (a_artist[artist_idx]
                 + multiple_performers * sharing_effect
                 + popularity_within_oeuvre * within_oeuvre_effect
                )
        # Model error:
        sigma = pm.Exponential("sigma", 1.0)

        y_like = pm.Normal("y_like", theta, sigma=sigma, observed=y, dims="obs_id")

        return model

In [None]:
# Build the model on the full feature set (with the target column removed from X),
# show its graph and draw 10000 samples after 3000 tuning steps.
with model_factory(X=df.drop(columns='LogBoost'),
                   y=df['LogBoost'],
                   ) as multilevel_noncentered_model:
    display(pm.model_to_graphviz(multilevel_noncentered_model))
    multilevel_noncentered_model_idata = pm.sample(10000, tune=3000, return_inferencedata=True, random_seed=RANDOM_SEED, target_accept=0.95)

In [None]:
# Posterior summary of all variables except the per-artist ones (the '~' prefix excludes a variable).
az.summary(multilevel_noncentered_model_idata, var_names=['~za_artist', '~a_artist', '~mu_artist'], round_to=3)

In [None]:
az.rcParams['plot.max_subplots'] = 100  # Since we have many parameters, the number of subplots is larger than the default - allow az to take more time plotting
# Variables to leave out of the diagnostic plots (the '~' prefix excludes a variable);
# also used by the posterior and autocorrelation plots further down.
var_names = [
            '~a_artist',
            '~sigma_a',
            '~sigma',
            '~za_artist',
            '~mu_artist'
            ]
# Call ArviZ directly, like the other diagnostic cells do, instead of going through
# the deprecated `pm.pairplot` alias; the keyword arguments are ArviZ's own.
_ = az.plot_pair(multilevel_noncentered_model_idata,
                 var_names=var_names, marginals=True,
                 divergences=True, kind=['scatter', 'kde'],
                 figsize=(30, 30), scatter_kwargs={'alpha': 0.06})

In [None]:
# Draw from the prior (seeded) to compare against the posterior further down.
with multilevel_noncentered_model:
    prior_checks = pm.sample_prior_predictive(random_seed=RANDOM_SEED)

In [None]:
# Wrap the prior draws (everything except 'y_like') in an InferenceData 'prior' group.
# NOTE(review): every array is transposed first, presumably to fit the axis layout
# az.from_dict expects — confirm against the ArviZ documentation.
multi_prior = az.from_dict(prior={k: v.T for k, v in prior_checks.items() if k != 'y_like'})

In [None]:
# Add the prior group to the fitted InferenceData (modifies it in place).
multilevel_noncentered_model_idata.extend(multi_prior)

In [None]:
# Prior against posterior distribution for every non-artist-level parameter.
az.plot_dist_comparison(multilevel_noncentered_model_idata, var_names=['a', 'recency_effect_exponent', 'max_recency_effect',
                                         'effect_popularity', 'history_effect', 'age_passing_effect',
                                         'is_dutch_effect', 'sharing_effect', 'within_oeuvre_effect', 'sigma', 'sigma_a'])
plt.show()

In [None]:
# Posterior densities with 0 as reference value, for the variables selected by `var_names`.
axes = az.plot_posterior(multilevel_noncentered_model_idata, ref_val=0, var_names=var_names)

In [None]:
# Autocorrelation up to lag 20, chains combined, for the variables selected by `var_names`.
az.plot_autocorr(multilevel_noncentered_model_idata, var_names=var_names, combined=True, max_lag=20)
plt.show()

In [None]:
def plot_correlations(df):
    """Draw the correlation matrix of ``df`` as an annotated heatmap.

    The colour scale is fixed to the range -0.8 .. 0.8 and the cells are labelled
    as percentages with one decimal. Returns the Axes the heatmap was drawn on.
    """
    correlations = df.corr()
    _, heatmap_ax = plt.subplots(figsize=(12, 12))
    return sns.heatmap(correlations,
                       cmap='RdBu_r',
                       vmin=-0.8,
                       vmax=0.8,
                       annot=True,
                       fmt='.1%',
                       ax=heatmap_ax,
                       cbar=False)

In [None]:
# pd.factorize returns integer codes together with the unique values they index into,
# so code i always belongs to artistnames[i]. Build the lookup from that directly rather
# than zipping against set(codes), whose iteration order is not guaranteed.
# NOTE(review): the lookup is used to label the model's 'Artist' coordinate, which is built
# from the first-appearance order of 'ArtistID' — confirm both orders agree.
codes, artistnames = pd.factorize(df['NameArtist'])
mapping = dict(enumerate(artistnames))

In [None]:
# Summary of the posterior of 'za_artist' per artist, relabelled with artist names and
# displayed in order of the posterior mean.
artist_magic = multilevel_noncentered_model_idata.posterior.to_dataframe().groupby('Artist')['za_artist'].describe()
artist_magic.index = artist_magic.index.map(mapping)
artist_magic.sort_values('mean')

In [None]:
# Flatten the posterior to a frame, drop the 'Artist' index level and keep the first row for
# every remaining index value, then show how the columns correlate.
parameters = multilevel_noncentered_model_idata.posterior.to_dataframe().droplevel('Artist').loc[lambda x: ~x.index.duplicated()]
plot_correlations(parameters)

## Prediction

In [None]:
%%time
# Posterior predictive draws on the training data: once from the full posterior and once from
# a posterior with the artist-level variables removed.
# NOTE(review): presumably the removed variables are then drawn from their priors — confirm
# against the PyMC3 documentation of fast_sample_posterior_predictive.
with multilevel_noncentered_model:
    ppc = pm.fast_sample_posterior_predictive(multilevel_noncentered_model_idata)
    ppc_no_artist = pm.fast_sample_posterior_predictive(multilevel_noncentered_model_idata.posterior.drop_vars(['mu_artist', 'a_artist', 'za_artist']))
    
# One row per observation (indexed like `df`), one column per draw.
predictions = pd.DataFrame(ppc['y_like'].T, index=df.index)
predictions_no_artist = pd.DataFrame(ppc_no_artist['y_like'].T, index=df.index)

In [None]:
# NOTE!! It only works with at least 2 different artists (which is far from perfect... but it is what it is)
df_new_data = df.tail(2).copy()
df_new_data['JarenGeleden'] = 100

# Second comes the hold out data posterior predictive
with model_factory(X=df_new_data,
                   y=df_new_data['LogBoost'],
                   ) as test_model:
    # For newly passed artists, we do not know what za_artist should be
    ppc_new_data = pm.fast_sample_posterior_predictive(multilevel_noncentered_model_idata.posterior.drop_vars(['mu_artist', 'a_artist', 'za_artist']),
                                         var_names=['y_like'],
                                        )

## Understanding the recency effect

In [None]:
# For every row of `parameters`, evaluate the recency term of the model (same expression as in
# model_factory) on the days -365 .. -1.
outcomes = {}
days = np.arange(-365, 0)
for i, row in parameters.iterrows():
    recencyeffect = (np.exp(10**row['recency_effect_exponent'] * days) - np.exp(10**row['recency_effect_exponent'] * -365))* row['max_recency_effect']
    outcomes[i] = recencyeffect

In [None]:
# One row per day (-365 .. -1), one column per row of `parameters`; values are exponentiated element-wise.
df_recencyeffect = pd.DataFrame(outcomes, index=np.arange(-365, 0)).apply(np.exp)

In [None]:
%%time
fig, ax = plt.subplots()
# Median effect per day over all columns.
df_recencyeffect.median(axis='columns').plot(ax=ax, lw=3, c='darkblue', label='Estimated effect')
# Observed artists, with their boost expressed relative to the median boost.
(df_artist.assign(RecencyEffect = lambda df: df['Boost']/df['Boost'].median())
          .plot(x='DaysToStemperiode', y='RecencyEffect', kind='scatter', ax=ax, c='k', alpha=0.6, label='Passed away artists')
)
plt.legend()
# 1500 randomly chosen columns drawn very faintly to show the spread (added after the legend call).
df_recencyeffect.sample(1500, axis='columns').plot(c='grey', alpha=0.01, legend=False, ax=ax)

ax.set_xlabel('Days until end of voting')
ax.set_ylabel('Extra boost')

In [None]:
def convert_to_date(days_from_date, reference_date='2020-12-07'):
    """Turn day offsets into calendar dates relative to ``reference_date``.

    Parameters
    ----------
    days_from_date : iterable of numbers
        Offsets in days; negative values lie before the reference date.
    reference_date : str, default '2020-12-07'
        Anything ``pd.Timestamp`` accepts.

    Returns
    -------
    pd.Series
        The dates, indexed by the original offsets.
    """
    anchor = pd.Timestamp(reference_date)
    offsets = pd.Series([pd.Timedelta(n_days, 'days') for n_days in days_from_date],
                        index=days_from_date)
    return anchor + offsets

# Median effect per day, with the day offsets converted to dates (default reference date of convert_to_date).
ax =  (df_recencyeffect.median(axis='columns').to_frame()
     .assign(Date = lambda df: convert_to_date(df.index))
     .plot(x='Date', y=0, lw=3, c='darkblue', label='Estimated effect')
    )

# One tick per month, labelled with the abbreviated month name.
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
plt.ylabel('Extra boost')
plt.xlabel('Date of passing')
plt.yticks([1, 1.5, 2, 2.5])
plt.savefig('recency_effect.jpg')

In [None]:
# Days to tabulate: coarse steps far from the voting period, finer steps close to it.
days_to_use = [-365,
               *list(range(-350, -50, 50)),
               *list(range(-70, -10, 7)),
               *list(range(-10, 0, 2)),
              ]
# Quantiles across the columns of df_recencyeffect for the selected days.
df_recencyeffect.quantile([0.025, 0.16, 0.5, 0.84, 0.975], axis='columns').loc[:, days_to_use]

## Understanding prediction quality

In [None]:
# Explainer built from the column-wise median of `parameters` and the fitted InferenceData.
b = top2000analysis.BoostExplainer(parameters.median(), multilevel_noncentered_model_idata)

In [None]:
# Per observation: the 16%, 50% and 84% quantiles of the predictive draws next to the observed
# 'LogBoost'; the median column is renamed to 'yhat'.
predictions_agg = (pd.concat([predictions.quantile([0.16, 0.5, 0.84], axis='columns').transpose(), df['LogBoost']], axis='columns')
                   .rename(columns={0.5: 'yhat'})
                  )

In [None]:
%matplotlib inline
# Observed 'LogBoost' against the predicted median, with the diagonal y = x as reference.
ax = predictions_agg.plot(x='yhat', y='LogBoost', kind='scatter')
valmin, valmax = predictions_agg[['LogBoost', 'yhat']].min().min(), predictions_agg[['LogBoost', 'yhat']].max().max()
ax.plot([valmin, valmax], [valmin, valmax], 'k--')

In [None]:
# Exponentiate every column of predictions_agg; 'LogBoost' is renamed to 'Boost' accordingly.
predictions_exp = predictions_agg.apply(np.exp).rename(columns={'LogBoost': 'Boost'})

In [None]:
# Exponentiated predictions ordered by the observed 'Boost'.
predictions_exp.sort_values('Boost')

In [None]:
# Figure 1: all observations, with the 16%-84% interval of the prediction as horizontal error bar.
plt.errorbar(predictions_exp['yhat'], predictions_exp['Boost'],
             xerr=[predictions_exp['yhat'].sub(predictions_exp[0.16]), predictions_exp[0.84].sub(predictions_exp['yhat'])],
             ls=' ', marker='o', alpha=0.6, ms=4)
# The mathtext parts are raw strings so that `\i` and `\/` are not read as (invalid) escape
# sequences; only the line break is a regular escape. The resulting text is unchanged.
plt.gca().annotate(r'$\it{Zij\/gelooft\/in\/mij}$' '\n by ' r'$\it{André\/Hazes}$', (2.123288,17.81695), (2.5, 14),
                   arrowprops=dict(arrowstyle="->", connectionstyle= "angle3,angleA=0,angleB=90"),
)
plt.plot([0, 8], [0, 8], 'k--')
plt.ylabel('Boost in practice')
plt.xlabel('Predicted boost')
plt.savefig('Allboosts_compared.jpg')
plt.show()
# Figure 2: the same data, limited to 0 .. 11.5 on both axes (the annotated outlier falls outside).
plt.errorbar(predictions_exp['yhat'], predictions_exp['Boost'],
             xerr=[predictions_exp['yhat'].sub(predictions_exp[0.16]), predictions_exp[0.84].sub(predictions_exp['yhat'])],
             ls=' ', marker='o', alpha=0.2, ms=4)
plt.plot([0, 11], [0, 11], 'k--')
plt.ylabel('Boost in practice')
plt.xlabel('Predicted boost')
plt.ylim(0, 11.5)
plt.xlim(0, 11.5)
plt.savefig('boostcompared.jpg')

In [None]:
# Per observation, three predictions of 'LogBoost' and their (absolute) errors:
#   yhat               - median of the full posterior predictive draws
#   yhat_no_artist     - median of the draws made without the artist-level variables
#   yhat_median_params - log of the product of the effect sizes reported by the BoostExplainer `b`
# improvement_artisteffect is positive where including the artist-level variables lowers the absolute error.
posterior = (df
              .assign(yhat = predictions.median(axis='columns'),
                      error = lambda df: df['yhat'].sub(df['LogBoost']),
                      abserror = lambda df: df['error'].abs(),
                      yhat_no_artist = predictions_no_artist.median(axis='columns'),
                      error_without_artisteffect = lambda df: df['yhat_no_artist'].sub(df['LogBoost']),
                      abserror_without_artisteffect = lambda df: df['error_without_artisteffect'].abs(),
                      improvement_artisteffect = lambda df: df['abserror_without_artisteffect'].sub(df['abserror']),
                      yhat_median_params = lambda df: np.log([b.all_effects(i)['EffectSize'].prod() for i in range(len(df))]),
                      error_median_params = lambda df: df['yhat_median_params'].sub(df['LogBoost']),
                      abserror_median_params = lambda df: df['error_median_params'].abs(),
                     )
            )

In [None]:
# Summary of the reduction in absolute error obtained by including the artist-level variables.
posterior['improvement_artisteffect'].describe()

In [None]:
# Correlations between the observed 'LogBoost' and the three predictions.
posterior[['LogBoost', 'yhat', 'yhat_no_artist', 'yhat_median_params']].corr()

In [None]:
# Baseline: mean absolute deviation of 'LogBoost' from its own median.
posterior['LogBoost'].sub(posterior['LogBoost'].median()).abs().mean()

In [None]:
# Mean absolute error of the full posterior predictive median.
posterior['abserror'].mean()

In [None]:
# Mean absolute error of the prediction made without the artist-level variables.
posterior['abserror_without_artisteffect'].mean()

In [None]:
# Mean absolute error of the prediction based on the median parameters.
posterior['abserror_median_params'].mean()

In [None]:
# Summary of the absolute error of the prediction based on the median parameters.
posterior['abserror_median_params'].describe()

In [None]:
# Mean signed error (prediction minus observed) of the prediction without the artist-level variables.
posterior['error_without_artisteffect'].mean()

In [None]:
# The 10 observations with the largest absolute error.
posterior.nlargest(10, 'abserror')

In [None]:
# Add the predictions for the stand-in new data to the fitted InferenceData (in place).
# NOTE(review): re-running this cell calls az.concat on already-extended data — confirm it is safe to repeat.
az.concat(multilevel_noncentered_model_idata, 
          az.from_pymc3_predictions(ppc_new_data, model=test_model), inplace=True)

In [None]:
# Leave-one-out cross-validation with pointwise results, and a plot of the resulting k-hat values.
loo_multilevel = az.loo(multilevel_noncentered_model_idata, pointwise=True)
az.plot_khat(loo_multilevel, show_bins=True)

## Finding representative samples

In [None]:
# Columns on which two songs are compared when looking for a contrasting pair.
variables = ['JarenGeleden',
             'LogPopularityNorm',
             'IsDutchArtist',
             'PassingTooEarly',
             'DaysEffect',
             'LogSongPopularityWithinArtist',
             'MultiplePerformers',          
             'LogBoost']
mm = sklearn.preprocessing.MinMaxScaler()

# The scaler is fitted on all observations, so distances are measured relative to the full range.
# 'DaysEffect' is the median recency effect looked up for the (clipped) day of passing.
all_data = (posterior
            .assign(DaysEffect = lambda df: df_recencyeffect.median(axis='columns').loc[df['DaysToStemperiode'].clip(lower=-365)].tolist())
            .filter(variables)
            )
mm.fit(all_data)

# Candidates: well-predicted songs (abserror < 0.15) with LogPopularityNorm > -0.3 and
# DaysToStemperiode < -2.
data = (posterior
        .query('abserror < 0.15 & LogPopularityNorm > -0.3 & DaysToStemperiode < -2')
        .assign(DaysEffect = lambda df: df_recencyeffect.median(axis='columns').loc[df['DaysToStemperiode'].clip(lower=-365)].tolist())
        .filter(variables)
        .drop(
        [10036, # Never be clever is technically performed also by Herman Broods band, but it's considered to be performed by a single artist
        ]
        )
       )
data.head()

In [None]:
# Manhattan (Minkowski p=1) distance between every pair of candidates on the scaled columns.
normalized_data = mm.transform(data)
dists = sklearn.metrics.pairwise_distances(normalized_data, metric='minkowski', p=1)
# Flat positions into `dists`, ordered from the largest distance to the smallest.
inds = np.argsort(dists, axis=None)[::-1]

In [None]:
boosting_effects = ['LogBoost', 'yhat', 'error','yhat_no_artist', 'error_without_artisteffect']
# 'DaysEffect' only exists in `data`, not in `df`, so it is dropped before selecting columns from `df`.
variables = [v for v in variables if v != 'DaysEffect']
# Use step size of 2 because of symmetry: each pair is present twice
for ind in inds[:10:2]:
    # Convert the flat position back to the (row, column) pair in the distance matrix.
    x, y = divmod(ind, len(data))
    print(f'Distance: {dists[x, y]:.3f}; ({x}, {y})')
    # NOTE(review): `ind` is reused here for the index labels of the two songs.
    ind = data.iloc[[x, y]].index

    # NOTE(review): this frame does not depend on the loop variable and is rebuilt on every iteration.
    new_df = pd.concat([df.assign(pos = range(len(df)))[['NameSong', 'Title', 'LogBoost', 'BoostSong', 'DaysToStemperiode', 'pos'] + variables],
                        posterior[boosting_effects]], axis='columns')
    display(new_df.loc[ind])

## Figuring out the boost for two songs

In [None]:
def preprocess(df, include_difference):
    """Add the running product of the effects and its step-by-step change.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'EffectSize'. It is not modified.
    include_difference : bool
        Accepted for call compatibility; not used by the computation.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with two extra columns: 'TotalEffect', the cumulative product
        of 'EffectSize', and 'Diff', the change of 'TotalEffect' with respect to the
        previous row (for the first row: 'TotalEffect' itself).
    """
    running_product = df['EffectSize'].cumprod()
    step_change = running_product.diff().fillna(running_product)
    return df.assign(TotalEffect=running_product, Diff=step_change)

In [None]:
def find_colours(data, mask_from, including_difference):
    """Return the bar colours for a waterfall chart.

    Parameters
    ----------
    data : sized
        Only ``len(data)`` is used: the number of bars in the chart.
    mask_from : int or None
        Number of bars that are revealed; the last revealed bar is highlighted in
        purple. ``None`` means all bars are revealed. Values larger than
        ``len(data)`` are treated as ``len(data)``.
    including_difference : bool
        Only used when ``mask_from`` is None: the last three bars are then coloured
        purple, red, purple.

    Returns
    -------
    list of str
        Colour names, starting with the first bar.
    """
    n_bars = len(data)
    if mask_from is None:
        if including_difference:
            return ['gray'] + (n_bars - 4) * ['lightgray'] + ['purple', 'red', 'purple']
        mask_from = n_bars
    # A mask beyond the last bar used to fall through every branch and fail on the
    # unassigned result; treat it as "everything revealed" instead.
    mask_from = min(mask_from, n_bars)
    if mask_from == 1:
        return ['purple']
    return ['gray'] + (mask_from - 2) * ['lightgray'] + ['purple']
    
def plot_waterfall(effects, mask_from, including_difference=False, ax=None, horizontal=False):
    """Draw a waterfall chart of multiplicative effects, one labelled bar per effect.

    NOTE: this definition replaces the earlier ``plot_waterfall(data, color, buildup)``
    defined higher up in this notebook; every call below this cell gets this version.

    Parameters
    ----------
    effects : pd.DataFrame
        Needs the columns 'EffectSize', 'TotalEffect' and 'Diff' (as added by
        ``preprocess``). It is copied, not modified.
        NOTE(review): rows are addressed with keys such as ('Total', '') and
        ('Prediction', ''), so a two-level row index is presumably expected — confirm
        against the frame returned by BoostExplainer.all_effects.
    mask_from : int or None
        Position from which rows are blanked out, so that only the first ``mask_from``
        bars are drawn. With ``None`` nothing is blanked and a 'Total' row is appended.
    including_difference : bool, default False
        When true, the ('Prediction', '') bar is drawn from zero with the height of its
        'TotalEffect', and the tick labels get two extra entries.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure is created when omitted.
    horizontal : bool, default False
        Draw horizontal instead of vertical bars.

    Returns
    -------
    The matplotlib Axes that was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()
    data = effects.copy()
    #Store data and create a blank series to use for the waterfall
    total = data['Diff'].sum()

    if mask_from is not None:
        data.iloc[mask_from:] = None
    blank = data['Diff'].cumsum().shift(1).fillna(0)

    #Get the net total number for the final element in the waterfall
    if mask_from is None:
        data.loc[("Total", ''), 'EffectSize'] = 1
        data.loc[("Total", ''), 'Diff'] = total
        data.loc[("Total", ''), 'TotalEffect'] = total
        blank.loc["Total"] = total # This is only to get the steps right - it will later correctly be set to 0

    #The steps graphically show the levels as well as used for label placement
    step = blank.reset_index(drop=True).repeat(3).shift(-1)
    step[1::3] = np.nan
    if mask_from is None:
        blank.loc["Total"] = 0
    else:
        step.iloc[mask_from * 3:] = None
    
    if including_difference:
        blank.loc[('Prediction', '')] = 0
        data.loc[('Prediction', ''), 'Diff'] = data.loc[('Prediction', ''), 'TotalEffect']

    #Plot and label
    colours = find_colours(data, mask_from, including_difference)
    kind = 'bar' if not horizontal else 'barh'
    # pandas uses `bottom` for vertical and `left` for horizontal bars, so both are passed
    ax = data['Diff'].plot(kind=kind,
                           stacked=True,
                           bottom=blank,
                           left=blank,
                           legend=None,
                           color=colours,
                           ax=ax)
    if horizontal:
        ax.plot(step.values, step.index,'k', linewidth=1)
    else:
        ax.plot(step.index, step.values,'k', linewidth=1)

    #Get an offset so labels don't sit right on top of the bar
    # The offsets scale with the largest running total that is shown, as in the recipe this
    # chart was adapted from. (Before, the builtin function `max` itself was divided, which
    # raises a TypeError.)
    label_scale = data['TotalEffect'].max()
    neg_offset = label_scale / 25
    pos_offset = label_scale / 50
    
    #Start label loop
    for loop, (index, row) in enumerate(data.iterrows()):
        y = row['TotalEffect']
        # Masked-out bars have no value and therefore get no label
        if pd.isna(y):
            continue
        # Determine if we want a neg or pos offset
        if row['Diff'] >= 0:
            y += pos_offset
            va = 'bottom'
            ha = 'left'
        else:
            y -= neg_offset
            va = 'top'
            ha = 'right'
        if index not in [('Prediction', ''), ('Total', '')]:
            label = f'x {row["EffectSize"]:.2f}'
        else:
            label = ''
        # The first bar has no running total to report yet
        if loop > 0:
            label += f'\n= {row["TotalEffect"] : .2f}'
        if horizontal:
            ax.annotate(label, (y, loop), va="center", ha=ha, fontsize=11)
        else:
            ax.annotate(label, (loop, y), ha="center", va=va, fontsize=11)

    #Scale up the axis so there is room for the labels
    if horizontal:
        ax.axvline(1, c='k', ls='--')
        ax.set_xlim(0, 3)
    else:
        ax.axhline(1, c='k', ls='--')
        ax.set_ylim(0, 3)
    
    labels = ['Base',
                'Historical\neffect',
                'Popularity\neffect',
                'Artist is\nDutch',
                'Artist\ndied young',
                'Timing of\ndeath',
                'Artist\ndeviation',
                'Song popularity\nwithin artist oeuvre',
                'Multiple\nperformers',
                'Prediction'
                ]
    if including_difference:
        labels += ['Difference from\nactual boost', 'Actual boost']
    if horizontal:
        ax.set_yticklabels(labels)
        ax.invert_yaxis()
    else:
        ax.set_xticklabels(labels)
    ax.tick_params(axis='both', which='major', labelsize=12)
    if horizontal:
        ax.set_xlabel('Boost', fontsize=14)
        ax.set_ylabel('Effects', fontsize=14)
    else:
        ax.set_ylabel('Boost', fontsize=14)
        ax.set_xlabel('Effects', fontsize=14)
    return ax

In [None]:
def calculate_placement(series):
    """Rank the values of ``series`` from high to low.

    Returns a DataFrame, indexed like ``series``, with two columns:
    'absolute' - the descending rank, converted to int (so the .5 of an averaged
                 tie is dropped), and
    'percent'  - the same descending rank expressed as a fraction of the number of values.
    """
    descending_rank = series.rank(pct=False, ascending=False)
    descending_share = series.rank(pct=True, ascending=False)
    return pd.DataFrame({'absolute': descending_rank.astype(int),
                         'percent': descending_share})

# Estimate a vote share for every list entry from its rank (adds a column to full_list in place).
full_list['PctVotes'] = full_list['Rank'].apply(voteestimator.MeindertsmaVotesEstimator().percentage_of_votes)
# Total estimated vote share per artist per year.
votes_per_artist_per_year = full_list.groupby(['Name', 'Year'])[['PctVotes']].sum()
# Placement of each artist within a year, shown for the two artists discussed below.
# NOTE(review): the shape of the index returned by groupby(...).apply depends on the pandas
# version — confirm that the (name, year) lookup still works after an upgrade.
votes_per_artist_per_year.groupby('Year')['PctVotes'].apply(calculate_placement).loc[[('David Bowie', 2015), ('André Hazes', 2003)]]

In [None]:
# focus = None: nothing is masked, all bars are shown.
focus = None
# Positional index of the first song (the figure title names it 'Eenzame Kerst').
song_pos = 76
explanation = b.all_effects(song_pos)
data_eenzamekerst = explanation.pipe(preprocess, False)
data_eenzamekerst_incl_diff = b.all_effects(song_pos, True, multilevel_noncentered_model_idata).pipe(preprocess, True)

# Positional index of the second song (the figure title names it 'Under Pressure').
song_pos = 1
explanation = b.all_effects(song_pos)
data_underpressure = explanation.pipe(preprocess, False)
data_underpressure_incl_diff = b.all_effects(song_pos, True, multilevel_noncentered_model_idata).pipe(preprocess, True)

# Both songs side by side as horizontal waterfalls, including the difference with the actual boost.
fig, subplots = plt.subplots(1, 2, figsize=(11, 7))
ax = plot_waterfall(data_eenzamekerst_incl_diff, focus, True, subplots[0], True)
ax.set_title('Eenzame Kerst by André Hazes')
# Hide the last x tick label of the left panel (and the first of the right panel below).
plt.setp(ax.get_xticklabels()[-1], visible=False)

ax2 = plot_waterfall(data_underpressure_incl_diff, focus, True, subplots[1], True)
ax2.set_title('Under Pressure by David Bowie & Queen')
# The right panel shares the effect labels of the left panel, so its own are removed.
ax2.set_yticklabels([])
ax2.set_ylabel('')
plt.setp(ax2.get_xticklabels()[0], visible=False)
plt.subplots_adjust(wspace=1e-2)

plt.tight_layout()
plt.savefig(f'EffectsEKUP_incl_diff_horizontal.jpg')
plt.close(fig)

In [None]:
# Waterfall of the first song with only the first bar revealed (mask_from=1); saved, not shown.
fig, ax = plt.subplots(figsize=(6, 7))
plot_waterfall(data_eenzamekerst, 1, False, horizontal=True, ax=ax)
plt.tight_layout()
plt.savefig(f'Effects_base_horizontal.jpg')
plt.close(fig)

In [None]:
# One figure per number of revealed bars (2, 3, ..., all rows), followed by one with nothing
# masked (focus=None); each is saved under a file name containing the focus value.
for focus in list(range(2, len(data_eenzamekerst) + 1)) + [None]:
    fig, subplots = plt.subplots(1, 2, figsize=(11, 7))
    
    ax = plot_waterfall(data_eenzamekerst, focus, False, subplots[0], True)
    ax.set_title('Eenzame Kerst by André Hazes')
    plt.setp(ax.get_xticklabels()[-1], visible=False)

    ax2 = plot_waterfall(data_underpressure, focus, False, subplots[1], True)
    ax2.set_title('Under Pressure by David Bowie & Queen')
    # The right panel shares the effect labels of the left panel, so its own are removed.
    ax2.set_yticklabels([])
    ax2.set_ylabel('')
    plt.setp(ax2.get_xticklabels()[0], visible=False)
    
    plt.subplots_adjust(wspace=1e-2)
    plt.tight_layout()
    plt.savefig(f'EffectsEKUP_focus_{focus}_horizontal.jpg')
    plt.close(fig) # Do not show