In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

import micplot

# Introduction

# Collecting the data

In [None]:
# Load the cached playlist export when it exists; otherwise download the
# period once and cache it for later runs.
# The path uses a forward slash: the former 'Data\S...' literal relied on an
# invalid escape sequence and only resolved to a sub-directory on Windows.
playlist_csv = 'Data/SkyRadio2022.csv'
try:
    df = pd.read_csv(playlist_csv, parse_dates=['Date'])
except FileNotFoundError:
    # Imported here, so these modules are only needed when no cache is found.
    import os
    import playlistdownloader
    # NOTE(review): the cached path parses `Date` as datetime; confirm the
    # downloader returns `Date` with the same dtype.
    df = playlistdownloader.Playlistdownloader().get_playlist_period('2022-11-01', '2022-12-26')
    # Create the cache directory first so writing the export cannot fail on it.
    os.makedirs(os.path.dirname(playlist_csv), exist_ok=True)
    df.to_csv(playlist_csv, index=False)

# Data preparation

In [None]:
def combine_date_and_time(df):
    """
    Merge the separate Date and Time columns into one Datetime column.

    Date is rendered as ``YYYY-MM-DD`` text, joined to Time with a single
    space and parsed with the format ``%Y-%m-%d %H:%M``. A new frame is
    returned that holds Datetime instead of Date and Time.
    """
    date_text = df["Date"].dt.strftime("%Y-%m-%d")
    timestamps = pd.to_datetime(date_text + " " + df["Time"], format="%Y-%m-%d %H:%M")
    with_datetime = df.assign(Datetime=timestamps)
    return with_datetime.drop(columns=["Date", "Time"])

def add_hash(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Add column `Hash` to a DataFrame per row

    The hash is computed with `pd.util.hash_pandas_object`, so rows with equal
    values in `columns` get the same hash and the value is reproducible across
    kernel restarts (the built-in `hash` is salted per process for strings).

    Args:
        df (pd.DataFrame):
            The DataFrame on which to add the hash
        columns (list, optional):
            A list (or other sequence / Index) of columns on which the hash
            must be based. A single column name is also accepted. Defaults to
            using all columns; an empty sequence is treated the same way.

    Returns:
        pd.DataFrame: a new dataframe with extra column `Hash` (unsigned
        64-bit integers); the input frame is left untouched
    """
    if isinstance(columns, str):
        columns = [columns]
    # `not columns` is ambiguous for an Index/array, so test explicitly.
    if columns is None or len(columns) == 0:
        columns = df.columns
    columns = list(columns)

    # index=False: the hash must depend on the row values only, not on
    # where the row happens to sit in the frame.
    row_hashes = pd.util.hash_pandas_object(df[columns], index=False)
    return df.assign(Hash=row_hashes)

def add_if_christmassong(df: pd.DataFrame, startdate_christmasstation='2022-12-06') -> pd.DataFrame:
    """Add boolean column `ChristmasSong` telling whether each row's song is a Christmas song.

    A song (identified by `Hash`) is flagged when its most recent play is
    later than `startdate_christmasstation`, i.e. when it is still played
    during the time Sky Radio is the Christmas Station.

    Args:
        df (pd.DataFrame):
            The DataFrame for which to check the songs. Must contain the
            columns `Hash` and `Datetime`.
        startdate_christmasstation (str, optional):
            Date after which the radio station only plays Christmas songs.
            Defaults to '2022-12-06'.

    Returns:
        pd.DataFrame: a new dataframe with extra column `ChristmasSong`; every
        row of the same song carries the same value
    """
    # Use 'max' rather than 'last': 'last' returns the final row in frame
    # order, which is only the most recent play when the frame happens to be
    # sorted ascending by Datetime.
    latest_play = df.groupby('Hash')['Datetime'].transform('max')
    return df.assign(ChristmasSong=latest_play.gt(startdate_christmasstation))

In [None]:
# Prepare the analysis frame: build Datetime, derive a song id from
# (Title, Artist) and flag Christmas songs.
df = add_if_christmassong(
    add_hash(combine_date_and_time(df), ['Title', 'Artist'])
)

# Lookup table from song id (Hash) to its Title and Artist.
df_song = df.groupby('Hash')[['Title', 'Artist']].first()

# Analysis: when do the Christmas songs start playing?

In [None]:
# Daily summary: number of plays, number of distinct songs, play count of the
# most played song and the share of plays flagged as Christmas songs.
df_day = (
    df.groupby(pd.Grouper(key='Datetime', freq='D'))
      .agg(
          NSongs=pd.NamedAgg('Hash', 'count'),
          NUniqueSongs=pd.NamedAgg('Hash', 'nunique'),
          NMostPlayed=pd.NamedAgg('Hash', lambda s: s.value_counts().max()),
          PctChristmasSongs=pd.NamedAgg('ChristmasSong', 'mean'),
      )
      # Must use the column names produced by .agg above; the former
      # 'AantalNummers' / 'AantalUniekeNummers' do not exist and raised KeyError.
      .assign(AvgTimesPlayed=lambda day: day['NSongs'] / day['NUniqueSongs'])
)

# Share of Christmas songs per day, shown as a percentage and saved to disk.
ax = df_day['PctChristmasSongs'].plot()
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
ax.set_ylabel('Percentage Christmas Songs')
ax.set_xlabel('')
plt.savefig('Figures/PercentageChristmasSongs.jpg')

In [None]:
def filter_by_date_range(df, start_date, num_days):
    """Select the rows whose Datetime lies in the window starting at start_date.

    The window is half-open: it includes `start_date` itself and excludes the
    moment exactly `num_days` days later.
    """
    window_start = pd.to_datetime(start_date)
    window_end = window_start + pd.Timedelta(num_days, unit='d')
    played_at = df['Datetime']
    in_window = played_at.ge(window_start) & played_at.lt(window_end)
    return df[in_window]

def compare_unique_songs(df, days, first_start_date='2022-11-01', second_start_date='2022-12-01'):
    """Count the distinct songs in two windows of `days` days.

    Returns a Series keyed by (start date, days) with the number of unique
    `Hash` values played in the window that begins on each start date.
    """
    unique_counts = {}
    for period_start in (first_start_date, second_start_date):
        in_period = filter_by_date_range(df, period_start, days)
        unique_counts[(period_start, days)] = in_period['Hash'].nunique()
    return pd.Series(unique_counts)

# Unique-song counts for windows of 1, 7 and 20 days: one row per window
# length, one column per period.
df_nunique = (
    pd.concat([compare_unique_songs(df, d) for d in [1, 7, 20]])
    .unstack(level=0)
    .rename(
        columns={
            '2022-11-01': 'Pre-Christmas Period',
            '2022-12-01': 'Christmas Period',
        }
    )
)


# Analysis: not many new songs are played

In [None]:
# Bar chart of the number of different songs for the 1-day window only
# (row 1 of df_nunique), one bar per period; saved as a JPG.
vis = micplot.visualize(df_nunique.loc[[1]], plottype='vertical_bar')
vis.ax.set_xlabel('')
vis.ax.set_ylabel('Number of different songs\n on a single day')
# savefig writes pyplot's current figure — presumably the one micplot just drew.
plt.savefig('Figures/UniqueSongs1Day.jpg', bbox_inches='tight')

In [None]:
# Same chart extended with the 7-day window (rows 1 and 7 of df_nunique).
vis = micplot.visualize(df_nunique.loc[[1, 7]], plottype='vertical_bar')
vis.ax.set_xlabel('Days of listening')
vis.ax.set_ylabel('Number of different songs')
# savefig writes pyplot's current figure — presumably the one micplot just drew.
plt.savefig('Figures/UniqueSongs7Days.jpg', bbox_inches='tight')

In [None]:
# Full chart with all window lengths in df_nunique, annotated with a downward arrow.
vis = micplot.visualize(df_nunique, plottype='vertical_bar')
# Arrow from y=830 down to y=183 at x=2.15, labelled with the ratio.
# NOTE(review): 830 and 183 are hard-coded; presumably they are the 20-day
# unique-song counts of the two periods — confirm against df_nunique, and
# that x=2.15 still matches micplot's bar positions.
plt.arrow(2.15, 830, 0, - (830 - 183), head_width=0.1, head_length=40, color='k', length_includes_head=True)
plt.annotate('÷ 4.5', (2.18, 450))
vis.ax.set_xlabel('Days of listening')
vis.ax.set_ylabel('Number of different songs')
plt.savefig('Figures/UniqueSongs20Days.jpg', bbox_inches='tight')

In [None]:
# Number of plays per calendar day and song: a Series indexed by (day, Hash).
songs_per_day = (
    df.groupby([pd.Grouper(key='Datetime', freq='D'), 'Hash'])
      .size()
)

# Analysis: a few favorite songs are replayed every time

In [None]:
# Highest number of plays of a single song on a single day, for days up to 2022-11-19.
songs_per_day.loc[:'2022-11-19'].max()

In [None]:
# Day/song combinations that reach the overall maximum number of plays in one
# day, with title and artist attached.
(
    songs_per_day
    .loc[lambda plays: plays == plays.max()]
    .to_frame(name='#_played')
    .join(df_song)
)

In [None]:
# Same lookup restricted to 2022-11-01 .. 2022-11-20: the day/song
# combinations with the most plays in one day, with title and artist.
(
    songs_per_day
    .loc['2022-11-01':'2022-11-20']
    .loc[lambda plays: plays == plays.max()]
    .to_frame(name='#_played')
    .join(df_song)
)

In [None]:
# Songs played at least 5 times on every single day from 2022-12-01 onward:
# spread songs over columns (0 where a song was not played), take each
# song's minimum over the days and keep those with a minimum of 5 or more.
(
    songs_per_day
    .unstack(fill_value=0)
    .loc['2022-12-01':]
    .min()
    .loc[lambda fewest: fewest >= 5]
    .to_frame(name='min. # played every day')
    .join(df_song)
)

This wasn't the case before Christmas

In [None]:
# Songs played at least 3 times on every single day of 2022-11-01 .. 2022-11-20.
(
    songs_per_day
    .unstack(fill_value=0)
    .loc['2022-11-01':'2022-11-20']
    .min()
    .loc[lambda fewest: fewest >= 3]
    .to_frame(name='min. # played every day')
    .join(df_song)
)

# Analysis: These songs amount to a serious percentage of all songs

In [None]:
# Total number of plays per song from 2022-12-01 onward, most played first.
n_played_dec = (
    songs_per_day
    .loc['2022-12-01':]
    .groupby('Hash')
    .sum()
    .sort_values(ascending=False)
)

# Cumulative share of all plays, limited to the top songs whose running
# share is still below 25%.
n_played_dec.cumsum().div(n_played_dec.sum()).loc[lambda share: share < 0.25]

In [None]:
# The twelve most played songs from 2022-12-01 onward, with title and artist.
(
    n_played_dec
    .to_frame()
    .join(df_song)
    .head(12)
)