In [None]:
import datetime

import bs4
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

import micplot

# Introduction

# Collecting the data

In [None]:
def url_to_soup(url, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were a playlist.
    response.raise_for_status()
    return bs4.BeautifulSoup(response.text, 'html5lib')

def create_url(date):
    """Build the Muziekweb playlist URL for Sky Radio on the given day.

    ``date`` only needs ``day``, ``month`` and ``year`` attributes. They are
    written as ``D-M-YYYY`` without zero padding, and the URL asks for
    playlist entries 1 through 500.
    """
    query_date = '-'.join(str(part) for part in (date.day, date.month, date.year))
    return (
        'https://www.muziekweb.nl/Muziekweb/Radio/'
        f'?station=SkyRadio&date={query_date}&RangeStart=1&RangeEnd=500'
    )

def find_all_rows(soup):
    """Return the playlist entries of a parsed station page.

    Looks up the first ``<ul class="radio-playlist">`` in ``soup`` and returns
    all of its ``<li>`` descendants whose class is ``odd`` or ``even``.
    """
    playlist = soup.find('ul', attrs={'class': 'radio-playlist'})
    row_classes = ['odd', 'even']
    return playlist.find_all('li', class_=row_classes)

def row_to_dct(ele) -> dict:
    """Extract time, title and artist text from one playlist row element.

    Returns a dict with the keys ``'Time'``, ``'Title'`` and ``'Artist'``;
    each value is the whitespace-stripped text of the matching child element.
    """
    def cell_text(tag, **criteria):
        # Text of the first matching child, without surrounding whitespace.
        return ele.find(tag, **criteria).text.strip()

    return {
        'Time': cell_text('div', attrs={'class': "col-time"}),
        'Title': cell_text('span', class_=['cat-songtitle', 'col-songtitle']),
        'Artist': cell_text('div', attrs={'class': "col-performers"}),
    }

def json_to_df(json, date):
    """Turn one day's scraped playlist records into a DataFrame.

    Parameters
    ----------
    json : list of dict
        Records holding a ``'Time'`` entry with the time of day as text
        (the shape produced by ``row_to_dct``). May be empty.
    date : datetime-like
        The day the records belong to; anything with ``strftime``.

    Returns
    -------
    pandas.DataFrame
        The input columns with ``'Time'`` replaced by a ``'Datetime'`` column
        that combines ``date`` with each time of day. For empty input an
        empty frame with the columns ``Title``, ``Artist`` and ``Datetime``
        is returned, so a day without rows can still be concatenated.
    """
    df = pd.DataFrame(json)
    if df.empty and 'Time' not in df.columns:
        # No rows for this day: df['Time'] would raise KeyError, so hand back
        # an empty frame with the same columns a normal day produces.
        return pd.DataFrame({
            'Title': pd.Series(dtype='object'),
            'Artist': pd.Series(dtype='object'),
            'Datetime': pd.Series(dtype='datetime64[ns]'),
        })

    # The date part is identical for every row, so format it only once.
    day_prefix = date.strftime('%Y-%m-%d ')
    df['Datetime'] = [pd.Timestamp(day_prefix + time) for time in df['Time']]
    return df.drop(columns=['Time'])


In [None]:
# Scrape one playlist page per day and stack the per-day frames into `df`.
SCRAPE_START, SCRAPE_END = '2022-11-01', '2022-12-25'

dfs = []
for date in pd.date_range(SCRAPE_START, SCRAPE_END):
    rows = find_all_rows(url_to_soup(create_url(date)))
    records = [row_to_dct(row) for row in rows]
    dfs.append(json_to_df(records, date))

# Every per-day frame is numbered from 0; ignore_index gives the combined
# frame one unique label per row instead of repeating 0..n for each day.
df = pd.concat(dfs, ignore_index=True)

# Analysis: when do the Christmas songs start playing?

In [None]:
# Give every (Title, Artist) pair an integer id in the 'Hash' column.
# The built-in hash() of strings is salted per interpreter run, so it yields
# different ids after every kernel restart; group numbers depend only on the
# data, cannot collide, and avoid a slow row-wise apply.
df = df.assign(Hash=lambda d: d.groupby(['Title', 'Artist'], sort=False).ngroup())

STARTDATE_CHRISTMASSTATION = '2022-12-06'

# A song is flagged when its most recent play falls after the start date.
# max() is used instead of last() so the result does not depend on row order.
kerstnummer = (
    df.groupby('Hash')['Datetime']
    .max()
    .gt(pd.Timestamp(STARTDATE_CHRISTMASSTATION))
)
df = df.assign(Kerstnummer=df['Hash'].map(kerstnummer))

# Lookup table from song id back to title and artist.
df_song = df.groupby('Hash')[['Title', 'Artist']].first()

In [None]:
# Per-day summary: number of plays, number of distinct songs, play count of
# the most-played song, share of plays flagged as Christmas songs, and the
# average number of plays per distinct song.
daily_groups = df.groupby(pd.Grouper(key='Datetime', freq='D'))
df_day = daily_groups.agg(
    AantalNummers=('Hash', 'count'),
    AantalUniekeNummers=('Hash', 'nunique'),
    VaakstGedraaid=('Hash', lambda plays: plays.value_counts().max()),
    PctKerstnummers=('Kerstnummer', 'mean'),
).assign(
    GemiddeldKerenGedraaid=lambda day: day['AantalNummers'] / day['AantalUniekeNummers']
)

# Line chart of the daily share of Christmas songs, y axis shown as percent.
ax = df_day['PctKerstnummers'].plot()
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
ax.set(ylabel='Percentage kerstliedjes', xlabel='')
plt.savefig('Figures/PercentageChristmasSongs.jpg')

In [None]:
def filter_by_date_range(df, start_date, num_days):
    """Keep the rows whose 'Datetime' lies in a window of ``num_days`` days.

    The window starts at ``start_date`` (inclusive) and ends ``num_days``
    days later (exclusive).
    """
    window_start = pd.to_datetime(start_date)
    window_end = window_start + pd.Timedelta(days=num_days)
    stamps = df['Datetime']
    return df[stamps.ge(window_start) & stamps.lt(window_end)]

def compare_unique_songs(df, days, first_start_date='2022-11-01', second_start_date='2022-12-01'):
    """Count distinct songs in two equally long windows.

    For each of the two start dates, the rows of ``df`` within ``days`` days
    of that date are selected and the distinct 'Hash' values are counted.

    Returns a Series keyed by ``(start_date, days)`` holding both counts.
    """
    return pd.Series({
        (start, days): filter_by_date_range(df, start, days)['Hash'].nunique()
        for start in (first_start_date, second_start_date)
    })

# Distinct songs heard after 1, 7 and 20 days of listening, with one row per
# window length and one column per period.
counts_by_window = [compare_unique_songs(df, n_days) for n_days in (1, 7, 20)]
period_labels = {
    '2022-11-01': 'Pre-Christmas Period',
    '2022-12-01': 'Christmas Period',
}
df_nunique = pd.concat(counts_by_window).unstack(level=0).rename(columns=period_labels)


# Analysis: not many new songs are played

In [None]:
# Bar chart of distinct songs on a single day, both periods side by side.
vis = micplot.visualize(df_nunique.loc[[1]], plottype='vertical_bar')
vis.ax.set(xlabel='', ylabel='Number of different songs\n on a single day')
plt.savefig('Figures/UniqueSongs1Day.jpg', bbox_inches='tight')

In [None]:
# Same comparison for one day and for one week of listening.
vis = micplot.visualize(df_nunique.loc[[1, 7]], plottype='vertical_bar')
vis.ax.set(xlabel='Days of listening', ylabel='Number of different songs')
plt.savefig('Figures/UniqueSongs7Days.jpg', bbox_inches='tight')

In [None]:
# Full comparison (1, 7 and 20 days) with an arrow marking the gap between
# the two periods after 20 days of listening.
vis = micplot.visualize(df_nunique, plottype='vertical_bar')

# Read the arrow endpoints and the ratio from the data instead of hard-coding
# them, so the annotation stays correct when the scraped data changes.
counts_20_days = df_nunique.loc[20]
most, fewest = counts_20_days.max(), counts_20_days.min()
plt.arrow(2.15, most, 0, -(most - fewest),
          head_width=0.1, head_length=40, color='k', length_includes_head=True)
plt.annotate(f'÷ {most / fewest:.1f}', (2.18, (most + fewest) / 2), va='center')

vis.ax.set_xlabel('Days of listening')
vis.ax.set_ylabel('Number of different songs')
plt.savefig('Figures/UniqueSongs20Days.jpg', bbox_inches='tight')

In [None]:
# Number of plays per calendar day and song id (index levels: Datetime, Hash).
per_day = pd.Grouper(key='Datetime', freq='D')
songs_per_day = df.groupby([per_day, 'Hash']).size()

# Analysis: a few favorite songs are replayed every day

In [None]:
songs_per_day.loc[:'2022-11-19'].max()  # highest per-day play count of any single song, for days up to 2022-11-19

In [None]:
# Song/day combinations that reach the highest per-day play count, shown with
# title and artist.
peak_plays = songs_per_day.max()
songs_per_day[songs_per_day.eq(peak_plays)].to_frame(name='#_played').join(df_song)

In [None]:
# Songs played at least 5 times on every day from 1 December onwards.
# Days on which a song was not played count as 0 plays.
plays_by_day = songs_per_day.unstack(fill_value=0)
min_plays_dec = plays_by_day.loc['2022-12-01':].min()
min_plays_dec[min_plays_dec >= 5].to_frame(name='min. # played every day').join(df_song)

This wasn't the case before Christmas: in the first twenty days of November, no song was played at least five times on every single day.

In [None]:
# The same check for 1-20 November: songs played at least 5 times on every day.
# Days on which a song was not played count as 0 plays.
plays_per_song = songs_per_day.unstack(fill_value=0)
min_plays_nov = plays_per_song.loc['2022-11-01':'2022-11-20'].min()
min_plays_nov[min_plays_nov >= 5].to_frame(name='min. # played every day').join(df_song)

# Analysis: these songs account for a substantial share of all plays

In [None]:
# Total plays per song from 1 December onwards, most-played first.
december_plays = songs_per_day.loc['2022-12-01':]
n_played_dec = december_plays.groupby(level='Hash').sum().sort_values(ascending=False)

# Leading songs whose cumulative share of all December plays stays below 25%.
cumulative_share = n_played_dec.cumsum() / n_played_dec.sum()
cumulative_share[cumulative_share < 0.25]

In [None]:
# The twelve most-played songs since 1 December, with title and artist.
top_december = n_played_dec.iloc[:12]
top_december.to_frame().join(df_song)