In [None]:
import os 
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.preprocessing
import sklearn.metrics

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

import voteestimator

## Read

In [None]:
class AnalysisSetCreator:
    """
    Build the song-level and artist-level analysis sets for artists that passed away.

    The raw inputs are four parquet files (``notering``, ``song``, ``songartist``,
    ``artist``) plus an Excel sheet with the end of each voting period. Ranks are
    converted into an estimated share of the votes by the selected votes model.

    Parameters
    ----------
    votesmodel: str
        Key of the votes estimator to use: 'Meindertsma' (default) or 'Exponential'.
        An unknown key raises KeyError.
    """

    def __init__(self, votesmodel='Meindertsma'):

        votesmodels = {'Meindertsma': voteestimator.MeindertsmaVotesEstimator(),
                      'Exponential': voteestimator.ExponentialVotesEstimator()
                      }
        self.votesmodel = votesmodels[votesmodel]

    def _combine_data(self, filefolder):
        """
        Read the parquet files in `filefolder` and merge them into one row per
        (notering, song, artist) combination.

        Side effect: stores the raw frames on the instance as ``self.notering``,
        ``self.song``, ``self.songartist`` and ``self.artist`` (the latter already
        enriched by `_artist_features`); other methods rely on these attributes.
        """
        self.notering = pd.read_parquet(os.path.join(filefolder, 'notering.parquet'))
        self.song = pd.read_parquet(os.path.join(filefolder, 'song.parquet'))
        self.songartist = pd.read_parquet(os.path.join(filefolder, 'songartist.parquet'))
        self.artist = (pd.read_parquet(os.path.join(filefolder, 'artist.parquet')) # TODO: This should not happen here
                          .pipe(self._artist_features)
                        )

        df = (self.notering.merge(self.song, left_on='SongID', right_index=True)
                           .merge(self.songartist.reset_index())
                           .merge(self.artist, left_on='ArtistID', right_index=True, suffixes=('Song', 'Artist'))
             )
        return df

    def _read_stemperiodes(self, path=os.path.join('Data', 'EindeStemperiode.xlsx')):
        """
        Read the end dates of the voting periods, sorted chronologically.

        Rows without an end date and the 'Bron' column are dropped.
        NOTE(review): the default path is fixed to the 'Data' folder and does not
        follow the `filefolder` passed to the public methods - confirm this is intended.
        """
        einde_stemperiode = (pd.read_excel(path, engine='openpyxl')  # openpyxl does support xlsx
                               .dropna(subset=['EindeStemperiode'])
                               .drop(columns=['Bron'])
                               .sort_values('EindeStemperiode')
                            )
        return einde_stemperiode


    def _check_passed_away_during_top2000(self, df, top2000_stemperiodes):
        """
        Add the boolean column 'IsOverleden': True when the artist died at least
        365 days after the end of the first voting period. Artists without a date
        of death compare as False.
        """
        first_stemperiode = top2000_stemperiodes['EindeStemperiode'].min()
        relevant_date_of_death = first_stemperiode + pd.Timedelta('365 days')
        # Return a new frame instead of writing the column into the caller's frame.
        return df.assign(IsOverleden=df['Overlijdensdatum'].ge(relevant_date_of_death))

    def _find_next_top2000_after_death(self, df, top2000_stemperiodes):
        """
        For artists flagged 'IsOverleden', attach the columns of the first voting
        period that ended on or after their date of death.

        Artists that are not flagged are passed through unchanged, so they have no
        values in the voting-period columns after the concat.
        """
        not_passed_away_during_top_2000 = df[~df['IsOverleden']].copy()
        # merge_asof requires the left frame to be sorted on the key.
        passed_away_during_top2000 = (df.loc[df['IsOverleden']]
                                      .sort_values('Overlijdensdatum')
                                      .reset_index()
                                     )

        # reset_index/set_index assumes the artist frame is indexed by a level named 'ArtistID'.
        passed_away_during_top2000 = (pd.merge_asof(passed_away_during_top2000, top2000_stemperiodes,
                                                   left_on='Overlijdensdatum', right_on='EindeStemperiode', direction='forward')
                                     .set_index('ArtistID')
                                     )
        df = pd.concat([not_passed_away_during_top_2000, passed_away_during_top2000], sort=False)
        return df


    def _artist_features(self, df):
        """
        Derive the artist-level features:

        * IsOverleden and the columns of the first voting period after death
        * AgePassing: age at death in years (days / 365.25)
        * PassingTooEarly: number of years the artist died before the age of 80 (0 when older)
        * IsDutch cast to int
        """
        einde_stemperiode = self._read_stemperiodes()
        df = (df
                .pipe(self._check_passed_away_during_top2000, einde_stemperiode)
                .pipe(self._find_next_top2000_after_death, einde_stemperiode)
                .assign(AgePassing = lambda df: df['Overlijdensdatum'].sub(df['Geboortedatum']).dt.days / 365.25,
                        PassingTooEarly = lambda df: df['AgePassing'].sub(80).mul(-1).clip(lower=0),
                        IsDutch = lambda df: df['IsDutch'].astype(int),
                        )
             )
        return df

    def _rank_features(self, df):
        """Add 'PctVotes': the votes model's estimated share of votes for each 'Rank'."""
        return df.assign(PctVotes = lambda df: df['Rank'].apply(self.votesmodel.percentage_of_votes))


    def _normalize_by_years_before_death(self, df, years_to_normalize=2):
        """
        Add 'PctVotesBeforeDeath': the mean share of votes of a song over the
        `years_to_normalize` editions preceding the first edition after the artist's death.

        A song that already existed in an edition but has no PctVotes for it is
        counted with ``votesmodel.lower_than_2000``. Editions before the song was
        made, or after the last observed year, are left out of the mean.

        Raises KeyError when one of the offsets -years_to_normalize..-1 does not
        occur in 'YearsSinceOverlijden'.
        """
        # Full grid of (song, years since death) for songs of deceased artists, so that
        # editions in which a song was not listed still get a row.
        mi = pd.MultiIndex.from_product([df.query('IsOverleden')['SongID'].unique(),
                                         df.query('IsOverleden')['YearsSinceOverlijden'].unique(),],
                                        names=['SongID', 'YearsSinceOverlijden'])
        # NOTE(review): the joins below rely on the index level names of self.songartist,
        # self.artist and self.song matching 'SongID' / 'ArtistID' - confirm against the parquet files.
        votes_before_death = (pd.DataFrame(index=mi)
                              .join(self.songartist)
                              .join(df.set_index(['SongID', 'YearsSinceOverlijden', 'ArtistID'])[['Year', 'PctVotes']])
                              .join(self.artist[['JaarTop2000']])
                              .join(self.song[['YearMade']])
                              .assign(YearTop2000 = lambda df: df['JaarTop2000'].add(df.index.get_level_values('YearsSinceOverlijden')),
                                      PctVotes = lambda df: np.where(df['YearTop2000'].gt(df['YearMade']) & df['YearTop2000'].le(df['Year'].max()),
                                                             df['PctVotes'].fillna(self.votesmodel.lower_than_2000), np.nan)
                                     )
                             ['PctVotes']
                             .unstack('YearsSinceOverlijden')
                             .loc[:, range(-years_to_normalize, 0)]
                             .mean(axis='columns')
                             .rename('PctVotesBeforeDeath')
                             .reset_index()
                             )

        # Merges on the columns both frames have in common.
        df = df.merge(votes_before_death, how='left')
        return df


    def _song_features(self, df):
        """
        Add 'NrArtists' (number of rows per song and year), 'YearsSinceOverlijden'
        (edition year minus 'JaarTop2000') and 'PctVotesBeforeDeath'.
        """
        df = (df.assign(NrArtists = lambda df: df.groupby(['SongID', 'Year'])['Rank'].transform('count'),
                        YearsSinceOverlijden = lambda df: df['Year'].sub(df['JaarTop2000']),
                       )
                .pipe(self._normalize_by_years_before_death)
             )
        return df

    def _song_features_after_passing(self, df):
        """
        Add the song-level features that are defined relative to the artist's other songs.

        * NrsBeforeDeath: number of rows of the artist in `df`
        * PopularityWithinArtist: PctVotesBeforeDeath divided by the artist's mean
        * LogSongPopularityWithinArtist: log10 of the above
        * RecencyWithinArtist: YearMade scaled to 0..1 within the artist
          (NaN when all of the artist's songs share one year)
        * YearsBeforeDeath, Boost, MultiplePerformers, JarenGeleden
        """
        # `transform` keeps the original row index. A transform-like `groupby.apply`
        # prepends the group key to the index on pandas >= 2.0, which no longer lines
        # up with `df` inside `assign`.
        by_artist = df.groupby('ArtistID')
        mean_votes = by_artist['PctVotesBeforeDeath'].transform('mean')
        first_year = by_artist['YearMade'].transform('min')
        last_year = by_artist['YearMade'].transform('max')

        df = (df.assign(NrsBeforeDeath = by_artist['ArtistID'].transform('count'),
                        PopularityWithinArtist = df['PctVotesBeforeDeath'].div(mean_votes),
                        LogSongPopularityWithinArtist = lambda df: np.log10(df['PopularityWithinArtist']),
                        RecencyWithinArtist = df['YearMade'].sub(first_year).div(last_year.sub(first_year)),
                        YearsBeforeDeath = lambda df: df['YearMade'].sub(df['JaarTop2000']),
                        Boost = lambda df: df['PctVotes'].div(df['PctVotesBeforeDeath']),
                        MultiplePerformers = lambda df: df['NrArtists'].gt(1).astype(int),
                        JarenGeleden = lambda df: df['JaarTop2000'].sub(df['JaarTop2000'].max()),
                        )
             )
        return df

    def _aggregate_artists(self, df):
        """
        Aggregate a song-level analysis set to one row per artist.

        Requires ``self.artist``, which is set by `_combine_data`.
        """
        df_artist = (df.groupby('ArtistID')
                        .agg({'PctVotes': 'sum',
                              'PctVotesBeforeDeath': 'sum',
                              # NOTE(review): 'last' is the most recent song only if the rows
                              # are ordered by YearMade within an artist - confirm.
                               'YearMade': 'last'
                            }
                            )
                        .join(self.artist[['Name', 'IsDutch', 'AgePassing', 'JaarTop2000', 'Overlijdensdatum', 'EindeStemperiode']])
                        .assign(DaysToStemperiode = lambda df: df['Overlijdensdatum'].sub(df['EindeStemperiode']).dt.days,
                                YearsSinceLastHit = lambda df: df['JaarTop2000'].sub(df['YearMade']),
                                LogPopularity = lambda df: np.log10(df['PctVotesBeforeDeath']),
                                LogPopularityNorm = lambda df: df['LogPopularity'].sub(df['LogPopularity'].median()),
                                Boost = lambda df: df['PctVotes'].div(df['PctVotesBeforeDeath']),
                                LogBoost = lambda df: np.log(df['Boost']),
                                )
                    )
        return df_artist

    def create_analysis_set(self, filefolder):
        """
        Create the song-level analysis set from the files in `filefolder`.

        Only the rows of the first edition after the artist's death are kept
        (YearsSinceOverlijden == 0), and only songs whose PctVotesBeforeDeath is
        above ``votesmodel.lower_than_2000``.
        """
        df = (self._combine_data(filefolder)
                  .pipe(self._rank_features)
                  .pipe(self._song_features)
                  .query('YearsSinceOverlijden == 0')
                  .query(f'PctVotesBeforeDeath > {self.votesmodel.lower_than_2000}')
                  .pipe(self._song_features_after_passing)
             )
        return df

    def create_artist_set(self, filefolder):
        """Create the artist-level analysis set (one row per artist) from the files in `filefolder`."""
        return self._aggregate_artists(self.create_analysis_set(filefolder))

    def create_full_feature_set(self, filefolder):
        """
        Create the song-level set enriched with the artist-level features.

        Columns that exist in both sets get the suffix 'Song' or 'Artist'.
        """
        df = self.create_analysis_set(filefolder)
        # Aggregate the frame we already have instead of re-reading and re-processing all files.
        df_artist = self._aggregate_artists(df)
        full_set = (df.merge(df_artist, left_on='ArtistID', right_index=True, suffixes=('Song', 'Artist'))
                      .assign(
                              SongRelativeBoost = lambda df: df['BoostSong'].div(df['BoostArtist']),
                              LogRelativeBoost = lambda df: np.log2(df['SongRelativeBoost']),
                              LogBoost = lambda df: np.log(df['BoostSong']),
                             )
           )
        return full_set

# Build the three frames used throughout the notebook from the files in ./Data:
#   df_song   - one row per song in the first edition after the artist's death
#   df_artist - one row per artist
#   df        - song rows enriched with the artist-level features
# Each create_* call re-reads and re-processes the source files.
a = AnalysisSetCreator()
df_song = a.create_analysis_set('Data')
df_artist = a.create_artist_set('Data')
df = a.create_full_feature_set('Data')

In [None]:
df_artist['Boost'].describe()

In [None]:
full_list = a._combine_data('Data')
print(f"In its history, the Top 2000 has seen {full_list['SongID'].nunique()} songs and {full_list['ArtistID'].nunique()} artists")

In [None]:
df_song['Boost'].describe()

In [None]:
def calculate_new_position(old_position, votesmodel, boost=1.75):
    """
    Find the rank a song reaches when its share of the votes is multiplied by `boost`.

    Parameters
    ----------
    old_position: int
        Current rank of the song (1 is the top of the list).
    votesmodel: object with a `percentage_of_votes(rank)` method
    boost: float
        Multiplier applied to the share of votes at `old_position`.

    Returns
    -------
    int: the first rank, walking up from `old_position`, whose share of votes is at
    least `boost` times the original share; 1 when no rank satisfies that.
    """
    old_votes = votesmodel.percentage_of_votes(old_position)
    target_votes = boost * old_votes
    new_votes = old_votes
    new_position = old_position
    # Stop once rank 1 is reached: rank 0 does not exist, so the votes model must
    # never be evaluated there.
    while new_votes < target_votes and new_position > 1:
        new_position -= 1
        new_votes = votesmodel.percentage_of_votes(new_position)
    return max(new_position, 1)

In [None]:
positioning = (pd.DataFrame({'OldPosition': range(1, 2001)})
               .assign(NewPosition = lambda df: [calculate_new_position(p, a.votesmodel) for p in df['OldPosition']],
                      FactorRanking = lambda df: df['NewPosition'] / df['OldPosition']))

In [None]:
positioning.plot(x='OldPosition', y='FactorRanking')

## Create analysis set

## Univariate analysis
* Newsworthiness of artist passing
  * Age when passing away
  * Dutch nationality
  * Popularity
  * Year
  * Recency of hits
* Days to stemperiode

* Song
  * Popularity within artist
  * Recency of song within artist
  
### Newsworthiness

#### Artist age

In [None]:
def plot_with_trend(df, column, logy=False):
    """
    Scatter the boost against `column` and overlay a least-squares trend line.

    Parameters
    ----------
    df: pd.DataFrame with `column` and a 'Boost' (or 'LogBoost') column
    column: str, name of the column on the x-axis
    logy: bool, plot and fit 'LogBoost' instead of 'Boost'

    Returns
    -------
    ax: the matplotlib Axes holding the scatter and the trend line
    """
    ycolname = 'LogBoost' if logy else 'Boost'
    # The scatter plot silently skips missing values, but LinearRegression refuses
    # them - fit and plot on the complete rows only.
    complete = df.dropna(subset=[column, ycolname])
    preds = LinearRegression().fit(complete[[column]], complete[ycolname]).predict(complete[[column]])
    ax = complete.plot(x=column, y=ycolname, kind='scatter', label='Passed away artists', c='grey')
    ax.plot(complete[column], preds, 'k', label='Trend')
    return ax

In [None]:
ax = plot_with_trend(df_artist, 'AgePassing', logy=True)

#### Jaar

In [None]:
ax = plot_with_trend(df_artist, 'JaarTop2000', logy=False)
ax.xaxis.set_major_locator(mtick.MaxNLocator(integer=True))
ax.set_ylim(None, 6)
plt.legend()
plt.tight_layout()
ax.set_frame_on(False)
plt.gcf().savefig('YearEffect.jpg')

#### Nationality

In [None]:
_ = sns.catplot(x='IsDutch', y='LogBoost', data=df_artist)

#### Popularity

In [None]:
ax = plot_with_trend(df_artist, 'PctVotes', logy=True)

In [None]:
plot_with_trend(df_artist, 'LogPopularity', logy=True)

In [None]:
dftest = df_artist.assign(Popularity = lambda df: df['LogPopularity'].sub(df['LogPopularity'].median()),
                            Boost = lambda df: df['Boost'])

preds = LinearRegression().fit(dftest[['Popularity']], dftest['Boost']).predict(dftest[['Popularity']])
ax = dftest.plot(x='Popularity', y='Boost', kind='scatter', c='grey', ylim=(None, 6))
ax.plot(dftest[['Popularity']], preds, 'k', label='trend')
plt.legend()
plt.tight_layout()
plt.gcf().savefig('PopularityEffect.jpg')

#### Recency

In [None]:
ax = plot_with_trend(df_artist.query('DaysToStemperiode > -365'), 'DaysToStemperiode', logy=True)

In [None]:
recency_buckets = (pd.cut(df_artist['DaysToStemperiode'].clip(lower=-365), 6, labels=False, retbins=False, right=False)
                  .map({i: v for i, v in enumerate(range(-333, 0, 60))}))

In [None]:
df_artist.groupby(recency_buckets)['LogBoost'].agg(['mean', 'sem', 'std', 'count'])

In [None]:
ax = df_artist.groupby(recency_buckets)['LogBoost'].agg(['mean', 'sem', 'std', 'count']).plot(y='mean', yerr='sem')
plt.xlim(-365, 0)
plt.show()

### Recency of last hit

In [None]:
ax = plot_with_trend(df_artist, 'YearsSinceLastHit', logy=True)

### Song

### Solo song

In [None]:
df.groupby(['NrArtists'])['LogBoost'].agg(['mean', 'std', 'sem', 'count'])

In [None]:
df.groupby(['MultiplePerformers'])['LogBoost'].agg(['mean', 'std', 'sem', 'count'])

### Popularity within artist oeuvre

In [None]:
ax = plot_with_trend(df, 'PopularityWithinArtist', logy=True)

In [None]:
ax = plot_with_trend(df, 'LogSongPopularityWithinArtist', logy=True)

### Recency within artist

In [None]:
ax = plot_with_trend(df.query('NrsBeforeDeath > 2'), 'RecencyWithinArtist', logy=True)

In [None]:
def plot_waterfall(data, color=None, buildup=False, **kwargs):
    '''
    Plot a buildup or builddown waterfall chart from data
    This function was adapted from https://pbpython.com/waterfall-chart.html

    The input Series is left untouched; the "Total" row is added to a copy.

    Parameters
    ----------
    data: pd.Series to be shown as waterfall
    color: optionally give color as a list for each bar (to highlight some bars)
    buildup: False (default) for builddown, True for buildup
    **kwargs: passed on to `pd.Series.plot`

    Returns
    -------
    ax: Axis object
    data: the data, including a "total"-row
    blank: the size of the blank space before each bar
    '''
    #TODO: add connecting lines
    # Copy first: appending the "Total" row must not leak into the caller's Series.
    data = data.copy()
    if color is None:
        color = ['lightgray'] * len(data)

    # Every bar starts where the running total of the previous bars ended.
    blank = data.cumsum().shift(1).fillna(0)
    data.loc['Total'] = data.sum()
    blank.loc['Total'] = 0
    color = list(color) + ['gray']

    if buildup:
        data = data[::-1]
        blank = blank[::-1]
        color = color[::-1]

    ax = data.plot(kind='barh', stacked=True, left=blank, color=color, **kwargs)

    return ax, data, blank

data = pd.cut(df_artist['Boost'], [0, 1, 1.5, 2.5, np.inf],
               labels=['No boost',
                       'Up to 50% more votes',
                       '1.5 - 2.5 x\nas many votes',
                       'More than 2.5\nx as many votes']).value_counts(normalize=True).sort_index(ascending=False)
data.index = data.index.astype(str)
ax, _, _ = plot_waterfall(data, color=['purple', 'purple', 'purple', 'lightgrey'])
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))
ax.set_frame_on(False)
plt.tight_layout()
plt.gcf().savefig('BoostDistribution.jpg')

In [None]:
df_artist.nlargest(10, 'Boost')

In [None]:
df_artist['Boost'].describe()

## Multivariate

In [None]:
import pymc3 as pm
import arviz as az

In [None]:
RANDOM_SEED = 42
def model_factory(X, y):
    """
    Build the non-centered multilevel model of the (log) boost per song.

    Every artist gets its own intercept `a_artist`, drawn around an expected value
    `mu_artist` that is a function of the artist-level features; song-level features
    are added on top of the artist intercept.

    Parameters
    ----------
    X: pd.DataFrame, one row per song, with the columns ArtistID, DaysToStemperiode,
       LogPopularityNorm, JarenGeleden, PassingTooEarly, IsDutchArtist,
       MultiplePerformers and LogSongPopularityWithinArtist
    y: the observed outcome per row of X

    Returns
    -------
    model: pm.Model (not yet sampled)

    Notes
    -----
    The order in which the random variables are declared is kept fixed, because it
    may influence the draws for a given random seed.
    `artist_model` (one row per artist, in model order) is local to this function;
    code outside that needs the per-artist frame has to derive it itself.
    """
    passed_away_artists = X['ArtistID'].unique()
    # Artists are numbered 0..n-1 in order of first appearance in X.
    artist_lookup = dict(zip(passed_away_artists, range(len(passed_away_artists))))
    # `map` is the direct value -> code lookup, the same call that builds ArtistIDModel below.
    artist_vals = X['ArtistID'].map(artist_lookup).values
    # One row per artist, ordered by model index, to feed the artist-level data containers.
    artist_model = (X.assign(ArtistIDModel = lambda df: X['ArtistID'].map(artist_lookup),
                             )
                .sort_values('ArtistIDModel')
                .drop_duplicates(['ArtistIDModel'])
               )
    
    
    coords = {"obs_id": np.arange(X.shape[0]),
              'Artist': range(len(passed_away_artists))
         }
    with pm.Model(coords=coords) as model:
        artist_idx = pm.Data("artist_idx", artist_vals, dims="obs_id")
        days_to_stemperiode = pm.Data('days_to_stemperiode', artist_model['DaysToStemperiode'], dims='Artist')
        logpopularity = pm.Data('logpopularity', artist_model['LogPopularityNorm'], dims='Artist')
        jaren_geleden = pm.Data("jaren_geleden", artist_model['JarenGeleden'], dims='Artist')
        passing_too_early = pm.Data('passing_too_early', artist_model['PassingTooEarly'], dims='Artist')
        is_dutch = pm.Data('is_dutch', artist_model['IsDutchArtist'], dims='Artist')

        multiple_performers = pm.Data('multiple_performers', X['MultiplePerformers'], dims="obs_id")
        popularity_within_oeuvre = pm.Data('popularity_within_oeuvre', X['LogSongPopularityWithinArtist'], dims="obs_id")

        # Hyperpriors:
        a = pm.Normal("a", mu=0, sigma=2.0)
        sigma_a = pm.Exponential("sigma_a", 1.0)

        recency_effect_exponent = pm.Normal('recency_effect_exponent', mu=-1.5,sigma=1)
        max_recency_effect = pm.Normal('max_recency_effect', mu=2, sigma=2)
        effect_popularity = pm.Normal('effect_popularity', mu=0, sigma=2)
        history_effect = pm.Normal('history_effect', mu=0, sigma=1)
        age_passing_effect = pm.Normal('age_passing_effect', mu=0, sigma=1)
        is_dutch_effect = pm.Normal('is_dutch_effect', mu=0, sigma=2)

        # Expected value per artist:
        mu_artist = (a
                     + logpopularity * effect_popularity
                     # The correction of subtracting the value at -365 days is important for two reasons:
                     # 1. It pins the recency term to 0 at -365 days, which breaks the degeneracy with _a_ and makes sampling much more stable
                     # 2. It allows for much easier interpretation
                     + (np.exp(10**recency_effect_exponent * days_to_stemperiode)- np.exp(10**recency_effect_exponent * -365))* max_recency_effect
                     + jaren_geleden * history_effect
                     + passing_too_early * age_passing_effect
                     + is_dutch * is_dutch_effect
                    )

        # This is the non-centered version of the model for a much more stable sampling
        # See https://twiecki.io/blog/2017/02/08/bayesian-hierchical-non-centered/ for more information
        mu_artist = pm.Deterministic("mu_artist", mu_artist, dims="Artist")
        za_artist = pm.Normal("za_artist", mu=0.0, sigma=1.0, dims='Artist')
        a_artist = pm.Deterministic("a_artist", mu_artist + za_artist * sigma_a, dims="Artist")
        sharing_effect = pm.Normal('sharing_effect', mu=0, sigma=2.0)
        within_oeuvre_effect = pm.Normal('within_oeuvre_effect', mu=0, sigma=2.0)
        # Expected value per song: the intercept of its artist plus the song-level effects.
        theta = (a_artist[artist_idx]
                 + multiple_performers * sharing_effect
                 + popularity_within_oeuvre * within_oeuvre_effect
                )
        # Model error:
        sigma = pm.Exponential("sigma", 1.0)

        y_like = pm.Normal("y_like", theta, sigma=sigma, observed=y, dims="obs_id")

        return model

In [None]:
with model_factory(X=df.drop(columns='LogBoost'),
                   y=df['LogBoost'],
                   ) as multilevel_noncentered_model:
    display(pm.model_to_graphviz(multilevel_noncentered_model))
    multilevel_noncentered_model_idata = pm.sample(10000, tune=2000, return_inferencedata=True, random_seed=RANDOM_SEED, target_accept=0.95)

In [None]:
# with pm.Model(coords=coords) as unpooled_model:
#     jaren_geleden = pm.Data("jaren_geleden", df['JarenGeleden'], dims='obs_id')
#     logpopularity = pm.Data('logpopularity', df['LogPopularityNorm'], dims='obs_id')
#     days_to_stemperiode = pm.Data('days_to_stemperiode', df['DaysToStemperiode'], dims='obs_id')
#     age_passing = pm.Data('age_passing', df['AgePassingArtist'], dims='obs_id')
    
#     multiple_performers = pm.Data('multiple_performers', df['MultiplePerformers'], dims="obs_id")
#     popularity_within_oeuvre = pm.Data('popularity_within_oeuvre', df['LogSongPopularityWithinArtist'], dims="obs_id")
    
#     # Hyperpriors:
#     a = pm.Normal("a", mu=0, sigma=10.0)
#     recency_effect_exponent = pm.Normal('recency_effect_exponent', mu=-1.5,sigma=1)
#     max_recency_effect = pm.Normal('max_recency_effect', 2, )
#     effect_popularity = pm.Normal('effect_popularity', mu=0, sigma=10)
#     history_effect = pm.Normal('history_effect', mu=0, sigma=10)
#     age_passing_effect = pm.Normal('age_passing_effect', mu=0, sigma=10)
    
#     # Expected value per artist:
#     mu_artist = (a
#                  + effect_popularity * logpopularity
#                  + np.exp(10**recency_effect_exponent * days_to_stemperiode) * max_recency_effect
#                  + jaren_geleden * history_effect
#                  + age_passing * age_passing_effect
#                 )
    
#     sharing_effect = pm.Normal('sharing_effect', mu=0, sigma=10.0)
#     within_oeuvre_effect = pm.Normal('within_oeuvre_effect', mu=0, sigma=10.0)
#     theta = (mu_artist
#              + multiple_performers * sharing_effect
#              + within_oeuvre_effect * popularity_within_oeuvre
#             )
#     # Model error:
#     sigma = pm.Exponential("sigma", 1.0)

#     y = pm.Normal("y", theta, sigma=sigma, observed=df['LogBoost'], dims="obs_id")
# pm.model_to_graphviz(unpooled_model)

In [None]:
# RANDOM_SEED = 42
# with unpooled_model:
#     unpooled_model_idata = pm.sample(5000, tune=3000, return_inferencedata=True, random_seed=RANDOM_SEED, target_accept=0.95)


In [None]:
az.summary(multilevel_noncentered_model_idata, var_names=['~za_artist', '~a_artist', '~mu_artist'], round_to=3)

In [None]:
az.rcParams['plot.max_subplots'] = 100  # Since we have many parameters, the number of subplots is larger than the default - allow az to take more time plotting
# A leading '~' excludes a variable: the per-artist and scale parameters are left out of the plots.
var_names = [
            '~a_artist',
            '~sigma_a',
            '~sigma',
            '~za_artist',
            '~mu_artist'
            ]
# Call ArviZ directly, like the other diagnostics in this notebook, instead of the
# deprecated `pm.pairplot` alias.
_ = az.plot_pair(multilevel_noncentered_model_idata,
                 var_names=var_names, marginals=True,
                 divergences=True, kind=['scatter', 'kde'],
                 figsize=(30, 30), scatter_kwargs={'alpha': 0.06})

In [None]:
_ = az.plot_trace(multilevel_noncentered_model_idata, compact=True)

In [None]:
az.plot_autocorr(multilevel_noncentered_model_idata, var_names=var_names, combined=True, max_lag=20)
plt.show()

In [None]:
def plot_correlations(df):
    """
    Draw the pairwise correlations of the columns of `df` as an annotated heatmap.

    The colour scale is fixed to the range -0.8..0.8 and each cell is labelled
    with the correlation as a percentage.

    Returns
    -------
    The Axes the heatmap was drawn on.
    """
    _, heatmap_ax = plt.subplots(figsize=(12, 12))
    correlations = df.corr()
    heatmap_options = dict(cmap='RdBu_r', vmin=-0.8, vmax=0.8, annot=True, fmt='.1%', cbar=False)
    return sns.heatmap(correlations, ax=heatmap_ax, **heatmap_options)

In [None]:
parameters = multilevel_noncentered_model_idata.posterior.to_dataframe().droplevel('Artist').loc[lambda x: ~x.index.duplicated()]
plot_correlations(parameters)

In [None]:
# One row per artist, in order of first appearance in `df`. This is the order in which
# model_factory numbers the 'Artist' dimension; its own per-artist frame is local to
# that function and therefore not available here.
artist_model = df.drop_duplicates('ArtistID')

fig, ax = plt.subplots()
multilevel_noncentered_model_idata.posterior = multilevel_noncentered_model_idata.posterior.assign_coords({"N_artist": ("Artist", artist_model['NrsBeforeDeath'])})
# plot means
multilevel_noncentered_model_idata.posterior.mean(dim=("chain", "draw")).plot.scatter(
    x="N_artist", y="a_artist", ax=ax, alpha=0.9
)
ax.axhline(
    multilevel_noncentered_model_idata.posterior['a'].median(),
    alpha=0.4,
    ls="--",
    label="Est. population mean",
)

# plot hdi
hdi = az.hdi(multilevel_noncentered_model_idata)['a_artist']
ax.vlines(artist_model['NrsBeforeDeath'], hdi.sel(hdi="lower"), hdi.sel(hdi="higher"), color="orange", alpha=0.5)

ax.set(
    xlabel="Nr songs in Top 2000 by artist before death",
    ylabel="Artist boost",
)
plt.legend()


## Prediction

In [None]:
%%time
with train_model:
    ppc = pm.fast_sample_posterior_predictive(multilevel_noncentered_model_idata)

with train_model:
    ppc_no_artist = pm.fast_sample_posterior_predictive(multilevel_noncentered_model_idata.posterior.drop_vars(['mu_artist', 'a_artist', 'za_artist']))
predictions = pd.DataFrame(ppc['y_like'].T, index=df.index)
predictions_no_artist = pd.DataFrame(ppc_no_artist['y_like'].T, index=df.index)

In [None]:
# NOTE!! It only works with at least 2 different artists (which is far from perfect... but it is what it is)
df_new_data = df.tail(2).copy()
df_new_data['JarenGeleden'] = 100

# Second comes the hold out data posterior predictive
with model_factory(X=df_new_data,
                   y=df_new_data['LogBoost'],
                   ) as test_model:
    # For newly passed artists, we do not know what za_artist should be
    ppc = pm.fast_sample_posterior_predictive(multilevel_noncentered_model_idata.posterior.drop_vars(['mu_artist', 'a_artist', 'za_artist']),
                                         var_names=['y_like'],
                                        )
#     plt.figure()
#     plt.hist(ppc['y_like'], 30)
#     plt.axvline(new_site_Y, linestyle='--', color='r')

## Understanding the recency effect

In [None]:
# Recency curve (in log space) over the last year before the end of voting, for
# every posterior parameter set. Same formula as in the model: the term is pinned
# to 0 at -365 days.
days = np.arange(-365, 0)
decay_rate = 10 ** parameters['recency_effect_exponent'].to_numpy()
# Shape (n_days, n_parameter_sets); computed in one go instead of row by row.
recency_curves = ((np.exp(np.outer(days, decay_rate)) - np.exp(decay_rate * -365))
                  * parameters['max_recency_effect'].to_numpy())
# Keyed by the index of `parameters`, one curve per parameter set.
outcomes = dict(zip(parameters.index, recency_curves.T))

In [None]:
df_recencyeffect = pd.DataFrame(outcomes, index=np.arange(-365, 0)).apply(np.exp)

In [None]:
%%time
# Every column contains one parameter set
ax = df_recencyeffect.sample(1500, axis='columns').plot(c='grey', alpha=0.01, legend=False)
ax = df_recencyeffect.median(axis='columns').plot(ax=ax, lw=3, c='darkblue')
plt.show()

In [None]:
ax = df_recencyeffect.median(axis='columns').plot(lw=3, c='darkblue', label='Estimated effect')
(df_artist.assign(RecencyEffect = lambda df: df['Boost']/df['Boost'].median())
          .plot(x='DaysToStemperiode', y='RecencyEffect', kind='scatter', ax=ax, c='grey', label='Passed away artists')
)
ax.set_xlabel('Days until end of voting')
ax.set_ylabel('Extra boost')
plt.legend()
plt.tight_layout()
plt.gcf().savefig('RecencyEffect.jpg')

In [None]:
days_to_use = [-365] + list(range(-350, -50, 50)) + list(range(-70, -10, 7)) + list(range(-10, 0, 2))
df_recencyeffect.quantile([0.025, 0.16, 0.5, 0.84, 0.975], axis='columns').loc[:, days_to_use]

## Understanding prediction quality

In [None]:
with multilevel_noncentered_model:
    # The observed variable of the model is called 'y_like'
    ppc = pm.fast_sample_posterior_predictive(
                multilevel_noncentered_model_idata, random_seed=RANDOM_SEED, var_names=['y_like', 'za_artist']
                )

predictions = pd.DataFrame(ppc['y_like'].T, index=df.index)
# One row per artist, in order of first appearance in `df` - the order in which
# model_factory numbers the 'Artist' dimension.
artist_model = df.drop_duplicates('ArtistID')
za_artist = pd.DataFrame(ppc['za_artist'].T, index=artist_model.index) # The artist magic

artist_outcomes = pd.concat([artist_model, za_artist.median(axis='columns').rename('ArtistEffect')], axis='columns')

In [None]:
predictions_agg = (pd.concat([predictions.quantile([0.16, 0.5, 0.84], axis='columns').transpose(), df['LogBoost']], axis='columns')
                   .rename(columns={0.5: 'yhat'})
                  )

In [None]:
%matplotlib inline
ax = predictions_agg.plot(x='yhat', y='LogBoost', kind='scatter')
valmin, valmax = predictions_agg[['LogBoost', 'yhat']].min().min(), predictions_agg[['LogBoost', 'yhat']].max().max()
ax.plot([valmin, valmax], [valmin, valmax], 'k--')



In [None]:
# `predictions_agg` already holds the observed LogBoost and the median prediction (yhat);
# `posterior` is only created further down in the notebook.
predictions_agg[['LogBoost', 'yhat']].corr()

In [None]:
predictions_exp = predictions_agg.apply(np.exp).rename(columns={'LogBoost': 'Boost'})

In [None]:
def plot_predicted_vs_actual_boost(predictions_exp, axis_limit=None):
    """
    Plot the observed boost against the predicted boost with the 16%-84% prediction
    interval as horizontal error bars, plus the diagonal of perfect predictions.

    Parameters
    ----------
    predictions_exp: pd.DataFrame with the columns 'yhat', 'Boost', 0.16 and 0.84
    axis_limit: optional upper limit for both axes (both then start at 0);
        None keeps the automatic limits
    """
    plt.errorbar(predictions_exp['yhat'], predictions_exp['Boost'],
                 xerr=[predictions_exp['yhat'].sub(predictions_exp[0.16]), predictions_exp[0.84].sub(predictions_exp['yhat'])],
                 ls=' ', marker='o', alpha=0.2, ms=4)
    plt.plot([0, 8], [0, 8], 'k--')
    plt.ylabel('Boost')
    plt.xlabel('Predicted boost')
    if axis_limit is not None:
        plt.ylim(0, axis_limit)
        plt.xlim(0, axis_limit)
    plt.show()


# Full range first, then zoomed in on boosts up to 8
plot_predicted_vs_actual_boost(predictions_exp)
plot_predicted_vs_actual_boost(predictions_exp, axis_limit=8)

In [None]:
posterior = (df.reset_index().merge(artist_outcomes[['ArtistID', 'ArtistEffect']]).set_index('index')
              .assign(yhat = predictions.median(axis='columns'),
                     error = lambda df: df['yhat'].sub(df['LogBoost']),
                     abserror = lambda df: df['error'].abs(),
                     error_without_artisteffect = lambda df: df['error'].sub(df['ArtistEffect']),
                     abserror_without_artisteffect = lambda df: df['error_without_artisteffect'].abs(),                   
                     )
            )

In [None]:
posterior['LogBoost'].sub(posterior['LogBoost'].median()).abs().mean()

In [None]:
posterior['abserror'].mean()

In [None]:
posterior['abserror_without_artisteffect'].mean()

In [None]:
posterior.nlargest(10, 'abserror')

In [None]:
artist_outcomes.nlargest(25, 'ArtistEffect')[['NameArtist', 'ArtistEffect']]

In [None]:
artist_outcomes.nsmallest(5, 'ArtistEffect')[['NameArtist', 'ArtistEffect']]

## Finding representative samples

In [None]:
variables = [
             'DaysEffect',
            'LogPopularityNorm',
            'JarenGeleden',
            'MultiplePerformers',
            'LogSongPopularityWithinArtist',
            'PassingTooEarly',
            'IsDutchArtist',
             'LogBoost']
mm = sklearn.preprocessing.MinMaxScaler()

all_data = (posterior
            .assign(DaysEffect = lambda df: np.exp(0.007 * df['DaysToStemperiode']))
            .filter(variables)
            )
mm.fit(all_data)

data = (posterior
        .query('abserror < 0.15 & JarenGeleden > -19 & -0.15 < ArtistEffect < 0.15')
        .assign(DaysEffect = lambda df: np.exp(0.007 * df['DaysToStemperiode']))
        .filter(variables)
       )
data.head()

In [None]:
normalized_data = mm.transform(data)
dists = sklearn.metrics.pairwise_distances(normalized_data, metric='manhattan')
inds = np.argsort(dists, axis=None)[::-1]

In [None]:
boosting_effects = ['LogBoost', 'yhat', 'error', 'error_without_artisteffect']
variables = [v for v in variables if v != 'DaysEffect']

# The overview does not change between pairs, so build it once.
overview = pd.concat([df[['NameSong', 'Title', 'LogBoost', 'BoostSong', 'DaysToStemperiode'] + variables], posterior[['ArtistEffect']  + boosting_effects]], axis='columns')

# `dists` is symmetric, so every pair occurs twice in `inds`; taking every second
# entry is meant to skip the mirrored duplicate.
for flat_position in inds[:10:2]:

    x, y = divmod(flat_position, len(data))
    print(f'Distance: {dists[x, y]:.3f} {dists[y, x]:.3f}')
    pair_labels = data.iloc[[x, y]].index
    display(overview.loc[pair_labels])

## Figuring out the boost for two songs

We compare Avicii's "Hey Brother" with Syreeta's "With You I Am Born Again".

In [None]:
class BoostExplainer:
    """
    Break the predicted boost of a song down into the multiplicative effect of each
    model term.

    Parameters
    ----------
    parameters: mapping (e.g. pd.Series) from model parameter name to its value
    df: pd.DataFrame with one row per song and the feature columns used by the model
    """

    def __init__(self, parameters, df):
        self.parameters = parameters
        self.df = df

    def _get_song(self, song):
        """
        Look up a single song row by title (str) or by index label (int).

        Raises
        ------
        KeyError: no song with that title / index label
        ValueError: the title matches more than one row
        TypeError: `song` is neither str nor int
        """
        if isinstance(song, str):
            # Compare directly instead of building a query string, so titles that
            # contain quotes can be looked up as well.
            matches = self.df.loc[self.df['Title'] == song]
            if len(matches) == 0:
                raise KeyError(f'No song with title {song!r}')
            if len(matches) > 1:
                raise ValueError(f'Title {song!r} matches {len(matches)} rows; use the index instead')
            song = matches.iloc[0]
        elif isinstance(song, (int, np.integer)):  # numpy integers come out of pandas indexes
            song = self.df.loc[song]
        else:
            raise TypeError(f'`song` must be str or int, not {type(song)} ')
        return song

    def _calculate_artist_effect(self, song):
        """
        Multiplicative effect of every artist-level term of the model for `song`.

        Returns a dict from effect name to factor; the insertion order is the order
        in which `explain` prints them.
        """
        base_boost = np.exp(self.parameters['a'])
        history_effect = np.exp(self.parameters['history_effect'] * song['JarenGeleden'] )
        # Same recency term as in the model: pinned to 0 (factor 1) at -365 days.
        recency_effect = np.exp((np.exp(10**self.parameters['recency_effect_exponent'] * song['DaysToStemperiode'])
                                 - np.exp(10**self.parameters['recency_effect_exponent'] * -365))
                                * self.parameters['max_recency_effect'])
        popularity_effect = np.exp(self.parameters['effect_popularity'] * song['LogPopularityNorm'])
        dutch_effect = np.exp(self.parameters['is_dutch_effect'] * song['IsDutchArtist'])
        age_effect = np.exp(self.parameters['age_passing_effect'] * song['PassingTooEarly'])
        effects = {
                    'Base': base_boost,
                    'History': history_effect,
                    'Popularity': popularity_effect,
                    'Dutch': dutch_effect,
                    'PassingTooEarly': age_effect,
                    'Recency': recency_effect,
                    }
        return effects

    def _calculate_song_effects(self, song):
        """Multiplicative effect of every song-level term of the model for `song`."""
        oeuvre_effect = np.exp(self.parameters['within_oeuvre_effect'] * song['LogSongPopularityWithinArtist'])
        sharing_effect = np.exp(self.parameters['sharing_effect'] * song['MultiplePerformers'])
        effects = {
                    'WithinOeuvrePopularity': oeuvre_effect,
                    'MultiplePerformers': sharing_effect
                    }
        return effects

    def _print_effects(self, effects, starting_point=1):
        """Print the cumulative effect after each term, starting from `starting_point`."""
        effect = starting_point
        for name, size in effects.items():
            effect *= size
            print(f'The effect is {effect:.2f} after {name} - distinct effect: {size:.2f}')

    def explain(self, song):
        """
        Explain the boost of a song

        Ignores the artist specific boost, which we cannot know beforehand

        Parameters
        ----------
        song: int or str
            int: the index of the song in df
            str: the title of the song (must be unique)
        """
        song = self._get_song(song)
        print('\033[1m' + f"{song.loc['Title']} by {song.loc['NameArtist']}" + '\033[0m')  # Using bold face
        effects_artist = self._calculate_artist_effect(song)
        total_artist_effect = np.prod(list(effects_artist.values()))
        effects_song = self._calculate_song_effects(song)
        self._print_effects(effects_artist)
        print('-')
        # The song-level effects build on top of the combined artist-level effect.
        self._print_effects(effects_song, total_artist_effect)

In [None]:
b = BoostExplainer(parameters.median(), df)
b.explain('Hey Brother')

In [None]:
b.explain('With You I Am Born Again')

In [None]:
b.explain(1777)