In [None]:
# !pip install polars hvplot altair --upgrade
import polars as pl
import hvplot.polars
import os
import matplotlib as plt
import altair

def adjust_bpm(df_):
    """Add corrected-BPM and short-title columns, then order rows by '#'.

    Two columns are added to ``df_``:

    * ``new_bpm`` - ``BPM`` halved when it is 135 or more, doubled when it is
      50 or less, and left as-is otherwise (works around Spotify's BPM
      guesses that are off by a factor of two).
    * ``song`` - the first 18 characters of ``Song``.

    Returns the resulting frame sorted by the ``'#'`` column.
    """
    bpm = pl.col('BPM')

    # correctify spotify's bpm guessing
    corrected_bpm = (
        pl.when(bpm >= 135).then(bpm / 2)
        .when(bpm <= 50).then(bpm * 2)
        .otherwise(bpm)
    )
    short_title = pl.col('Song').str.slice(0, 18)

    with_extras = df_.with_columns(new_bpm=corrected_bpm, song=short_title)
    return with_extras.sort('#')

# West Coast Swing Playlist Analytics

This notebook stems from one of the fantastic talks at [Westie Wisdom](https://www.super-secret-moves.com/workshops/fuer-trainer-und-organizer), where Flo G. detailed how he went about analyzing his sets using [Spotify's various metrics](https://developer.spotify.com/documentation/web-api/reference/get-audio-features) provided for songs. It is an attempt at continuing down that analytic path to answer some questions:

* **What do various WCS DJ playlists/sets look like?** (from the good to the bad - focusing mainly on the ordered playlists)
* **What music is (un)common in my bubble?** (get spotify connections and map their playlists)

## What do various WCS DJ playlists/sets look like?

### Tools
* Playlist analyzer: https://www.chosic.com/spotify-playlist-analyzer/
* BPM tool: https://www.all8.com/tools/bpm.htm
* Aleš' Event/DJ Playlist Tracker: http://klopca.com/dance/west-coast-swing/music/2024events

I pasted the spotify playlist links into the [playlist analyzer](https://www.chosic.com/spotify-playlist-analyzer/), and downloaded the `.csv` at the bottom

### Graphs! 
(😔 Have to use Matplotlib for Github viewing)

A beginner DJ made some playlists that were too fast (first 3 graphs), but you can see that on the 3rd, with guidance, the set improved around halfway through. Mistakes are great, because you get to improve, and it gives us examples to work with!

The subsequent graphs are sets played during events by established DJs, and you can see the slow pulses and/or gradual slowing of BPM.

`Note:`
*If you see a really high or low spike that doesn't match the surrounding BPMs, it's likely because Spotify doesn't always get it right. I've tried to correct the worst ones, but there will still be some that escape.*

In [None]:
# Draw one Matplotlib line chart (through pandas) of corrected BPM per song
# for every ordered playlist export found in ../Playlists/.
for playlist_file in sorted(os.listdir('../Playlists/')):
    # Skip the unordered playlists and anything that is not a .csv export.
    if 'unordered' in playlist_file or not playlist_file.endswith('.csv'):
        continue

    (pl.read_csv(f'../Playlists/{playlist_file}')
     # adjust_bpm adds `new_bpm` (the worst of Spotify's BPM attempts
     # corrected) and `song` (titles truncated so loooong names don't kill
     # graph proportions), and sorts the rows by '#'. Reusing it keeps this
     # cell in step with the hvplot cell instead of duplicating the rules.
     .pipe(adjust_bpm)
     .to_pandas()
     # plot things
     .plot(x='song',
           y='new_bpm',
           kind='line',
           title=playlist_file,
           figsize=(16, 5),
           legend=True,
           rot=90,
           ylabel='BPM',
          )
    )

### Observations

With Spotify's bpm inaccuracy, you can actually see how clear/simple the beats are in some of the sets. If there's a lot of variation or jumps, it likely means the songs have some complex rhythms that are confusing Spotify. Some playlists above show a clear `/\/\/\/\/\` which might be a good indicator that the beats are clear enough to dance to for beginners!



### Graphing with `hvplot`

In [None]:
# Render one interactive hvplot line chart of corrected BPM per song for each
# ordered playlist export in ../Playlists/.
for csv_name in sorted(os.listdir('../Playlists/')):
    if 'unordered' in csv_name or '.csv' not in csv_name:
        continue

    playlist = pl.read_csv(f'../Playlists/{csv_name}').pipe(adjust_bpm)

    # plot things
    chart = playlist.hvplot(
        x='song',
        y=[
            'new_bpm',
            # 'Energy'
        ],
        width=1300,
        height=500,
        stacked=True,
        legend='top',
        title=csv_name,
    )

    # rotating song names
    display(chart.opts(xrotation=90))

## Prepping the data for analysis

This is where we're gonna go from 0-100 real quick. I'm doing a lot of modifications - with comments - to prep our dataset for things we'll want to do later. This involves:
* Adding playlist names
* Correctify Spotify bpm blunders
* Truncate long song names
* Show where a song is in each playlist

In [None]:
# Build `df`: one long table holding every song of every playlist export in
# ../Playlists/, annotated with the playlist it came from and where in that
# playlist it sits.
dfs = []

# NOTE: the loop variable is deliberately not `_`, which IPython uses for the
# last cell result.
for playlist_file in sorted(os.listdir('../Playlists/')):
    if not playlist_file.endswith('.csv'):
        continue

    df_ = pl.read_csv(f'../Playlists/{playlist_file}')
    n_songs = df_.height
    position = pl.col('#')

    dfs.append(
        df_
         # adds `new_bpm` (corrected Spotify BPM) and `song` (truncated
         # title) and sorts by '#', so neither is recomputed here
         .pipe(adjust_bpm)
         .with_columns(

             #set the playlist name
             playlist = pl.lit(playlist_file),

             #where the song is positioned in the playlist, e.g. "7/42"
             song_position_in_playlist = pl.concat_str([position, pl.lit('/'), pl.lit(n_songs)]),

             #where the song is positioned - beginning/middle/end
             #Exact integer thirds: comparing a float percentage against
             #33/34 and 66/67 left gaps (song 10 of 30 is 33.33%) that
             #produced a null label. The last branch is an explicit `when`
             #rather than `otherwise` so a null '#' still yields null.
             apprx_song_position_in_playlist = pl.when(position * 3 <= n_songs)
                                                  .then(pl.lit('beginning'))
                                                  .when(position * 3 <= 2 * n_songs)
                                                  .then(pl.lit('middle'))
                                                  .when(position * 3 > 2 * n_songs)
                                                  .then(pl.lit('end')),
          )

         #prevents the numbers from being summed when they shouldn't be
         .select(pl.all().cast(pl.String))
        )

df = pl.concat(dfs)
df

### What music is (un)common in my bubble?

In [None]:
#tbd

### What're the most and least common songs?

In [None]:
# One row per (Artist, Song): the sorted positions and playlists it appears
# in, plus the number of distinct playlists, most widely shared songs first.
tracked_columns = ['song_position_in_playlist', 'apprx_song_position_in_playlist', 'playlist']
unique_playlists = pl.col('playlist').list.unique()

(
    df
    .group_by(['Artist', 'Song'])
    .agg([pl.col(name).sort() for name in tracked_columns])
    .with_columns(
        unique_playlists.list.sort(),
        num_playlists=unique_playlists.list.len(),
    )
    .sort('num_playlists', descending=True)
)

### Which Artists are most played among everyone?

In [None]:
# Show up to 15 list items per table cell so the playlist lists stay readable.
pl.Config.set_fmt_table_cell_list_len(15)

# One row per Artist: sorted positions and playlists of all their songs, plus
# the number of distinct playlists, artists in the most playlists first.
artist_columns = ['song_position_in_playlist', 'apprx_song_position_in_playlist', 'playlist']
artist_playlists = pl.col('playlist').list.unique()

(
    df
    .group_by(['Artist'])
    .agg([pl.col(name).sort() for name in artist_columns])
    .with_columns(
        artist_playlists.list.sort(),
        num_playlists=artist_playlists.list.len(),
    )
    .sort('num_playlists', descending=True)
)

### Relative positions of songs in playlists.


In [None]:
# Print up to 50 rows so more songs are visible at once.
pl.Config.set_tbl_rows(50)

# One row per (Artist, Song) with its sorted positions and playlists, plus the
# number of distinct playlists, ordered from most to fewest playlists.
position_columns = ['song_position_in_playlist', 'apprx_song_position_in_playlist', 'playlist']
song_playlists = pl.col('playlist').list.unique()

(
    df
    .group_by(['Artist', 'Song'])
    .agg([pl.col(name).sort() for name in position_columns])
    .with_columns(
        song_playlists.list.sort(),
        num_playlists=song_playlists.list.len(),
    )
    .sort('num_playlists', descending=True)
    # .filter(pl.col('apprx_song_position_in_playlist').list.join(',').str.contains('middle'))
)

### What Artists are most played in my country?

In [None]:
# (pl.concat(dfs)
#  # .select(pl.all().cast(pl.String))
#  .group_by('Artist')
#  .agg(pl.col('Country').unique().sort(),
#       pl.col('playlist').unique().sort(),
#       pl.col('Song')
#      )
#  .with_columns(num_playlists = pl.col('playlist').list.len())
#  .sort('num_playlists', descending = True)
# )

### What's the most common time signature?
These results currently factor in all the songs in all the playlists, which might skew them; I should distill each playlist first, and then aggregate.


In [None]:
# One row per time signature: the sorted unique values of every other column,
# plus the number of distinct playlists that signature appears in, ordered
# from most to fewest playlists.
distinct_sorted_values = pl.all().unique().sort()
signature_playlists = pl.col('playlist').list.unique()

(
    df
    .group_by('Time Signature')
    .agg(distinct_sorted_values)
    .with_columns(
        signature_playlists.list.sort(),
        num_playlists=signature_playlists.list.len(),
    )
    .sort('num_playlists', descending=True)
)