In [8]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from os import listdir
from os.path import isfile, join

import time

sns.set()

Spot check an individual songs file to explore its format and data.

In [9]:
df = pd.read_csv('../data/Songs/songs285.csv')

In [10]:
print(df.shape)
list(df.columns)

(64928, 9)


['pid',
 'pos',
 'artist_name',
 'track_uri',
 'artist_uri',
 'track_name',
 'album_uri',
 'duration_ms',
 'album_name']

This is a *lot* of data: ball-park $65,000$ rows in $1,000$ files, totaling around $65,000,000$ observations – or songs in playlists, many of which are certainly repeated.

In [11]:
df.head()

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name
0,0,0,Deftones,spotify:track:4rEGJ9KirDlKiOHxqVwcVg,spotify:artist:6Ghvu1VvMGScGpOUJBAHNH,Sextape,spotify:album:4RQnFSkkZlA65Xxchhnaha,241533,Diamond Eyes
1,0,1,Muse,spotify:track:0It6VJoMAare1zdV2wxqZq,spotify:artist:12Chz98pHFMPJEknJQMWvI,Undisclosed Desires,spotify:album:0eFHYz8NmK75zSplL5qlfM,235000,The Resistance
2,0,2,Pearl Jam,spotify:track:0LBmvPJYmtEJ7kkWvc3kbT,spotify:artist:1w5Kfo2jwwIPruYS2UWh56,Oceans,spotify:album:5B4PYA7wNN4WdEXdIJu58a,161893,Ten
3,0,3,My Chemical Romance,spotify:track:0uukw2CgEIApv4IWAjXrBC,spotify:artist:7FBcuc1gsnv6Y1nwFtNRCb,Dead!,spotify:album:0FZK97MXMm5mUQ8mtudjuK,195520,The Black Parade
4,0,4,Red Hot Chili Peppers,spotify:track:1iFIZUVDBCCkWe705FLXto,spotify:artist:0L8ExT028jH3ddEcZwqJJ5,Dosed,spotify:album:6deiaArbeoqp1xPEGdEKp1,311866,By The Way


In [12]:
# Some very different playlist lengths
df.groupby('pid')['track_uri'].nunique()[10:20]

pid
10     22
11     63
12    212
13     12
14     11
15     25
16     96
17     24
18      9
19    165
Name: track_uri, dtype: int64

Every file seems to be an arbitrary-length list of playlists, each of which has an id `pid` (scoped to that file) and an arbitrary-length list of songs, each with a position id `pos` in that playlist. For each song, the `artist_name`, `track_uri`, `artist_uri`, `track_name`, `album_uri`, `duration_ms`, and `album_name` are all stored. Most of these are repeated every time a song is repeated (within and across files), so there is a lot of room for simplification here. Furthermore, any URI is effectively a unique identified for the song, artist, or album, and can be used as such.

$1,000$ playlist files, as expected.

In [13]:
all_files = listdir('../data/Songs')
print(len(all_files))
all_files[0:3]

1000


['songs284.csv', 'songs290.csv', 'songs247.csv']

### Structuring the data
A reasonable first step to slim down the size of the dataset without losing information or fidelity, is to parse through all the files to create a reference table/file of all songs and their metadata. Each playlist can then be stored as a simple named object, where the name is the overall playlist id and its value a vector of song ids.

### Looping over all our files to fill out the master DataFrame (songs) and Series (playlists)

In [14]:
start_time = time.time()
loop_start = time.time()

# List of all files
all_files = listdir('../data/Songs')
# Limit to some of the playlists
#all_files = all_files[0:200]

# Load first file to get columns (standard across all)
df = pd.read_csv('../data/Songs/' + all_files[0])

# Master DataFrame of all unique songs included across all playlists
#songs = pd.DataFrame(columns = list(df.columns)[2:])
songs = pd.DataFrame()

# Master Series of playlists and the songs included in each
playlists = pd.Series()

# Aggregator functions to limit to one row per song and count occurrences across playlists
a1 = dict()
for key in df.columns[2:]:
    a1[key] = 'first'
a1['track_uri'] = 'count'

# Aggregator to consolidate into sum of songs across playlists
a2= dict()
for key in df.columns[2:]:
    a2[key] = 'first'
del a2['track_uri']
a2['count'] = 'sum'

# Loop over each file to extract data
for i, file in enumerate(all_files):
    # split on "." to split into "filename" and "csv"
    # Then select "filename" and ditch the first five letters "songs"
    filenum = file.split(".")[0][5:]
    
    # Load file and store in temporary dataframe
    fdf = pd.read_csv('../data/Songs/' + file)
    
    # --- SONGS IN FILE ---
    fdf_counts = fdf.iloc[:, 2:]
    fdf_counts = fdf_counts.groupby('track_uri').agg(a1)
    fdf_counts.rename(columns = {'track_uri': 'count'}, inplace = True)
    
    # Add to df of unique songs, update counters, and remove duplicates
    songs = songs.append(fdf_counts)
    
    # -- SONGS IN EACH PLAYLIST --
    # Songs included in every playlist (ordered) in file
    # For each playlist, get list of track_uri's (unique identifiers)
    songs_in_playlist = fdf.groupby('pid')['track_uri'].unique()

    # Update index to be not the pid in file (id), but a combination of them
    #songs_in_playlist.index = [filenum + '_' + str(pid) for pid in songs_in_playlist.index.values]
    songs_in_playlist.index = list(map(lambda x: filenum + '_' + str(x), songs_in_playlist.index.values))
    
    # Add playlists to master Series of all playlists
    playlists = playlists.append(songs_in_playlist)
    
    # Every 50 files, consolidate the songs table so it doesn't grow too big
    if (i+1)%25 == 0: 
        print('{}/{} -- {} s'.format(i+1, len(all_files), time.time() - loop_start))
        loop_start = time.time()
        songs = songs.groupby('track_uri').agg(a2, sort = True)
        print('   Consolidation: {} s'.format(time.time() - loop_start))
    
print("--- %s seconds ---" % (time.time() - start_time))

25/1000 -- 27.366091012954712 s
   Consolidation: 3.045917272567749 s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


50/1000 -- 45.434144020080566 s
   Consolidation: 5.051048994064331 s
75/1000 -- 58.47398400306702 s
   Consolidation: 6.09772515296936 s
100/1000 -- 69.52241206169128 s
   Consolidation: 7.910443067550659 s
125/1000 -- 73.77834796905518 s
   Consolidation: 8.910668134689331 s
150/1000 -- 76.7184042930603 s
   Consolidation: 8.3334059715271 s
175/1000 -- 77.8903558254242 s
   Consolidation: 10.934021949768066 s
200/1000 -- 81.73972201347351 s
   Consolidation: 10.62891697883606 s
225/1000 -- 86.98233485221863 s
   Consolidation: 11.32219123840332 s
250/1000 -- 88.56120204925537 s
   Consolidation: 10.380228996276855 s
275/1000 -- 91.60101199150085 s
   Consolidation: 11.818976879119873 s
300/1000 -- 97.27157711982727 s
   Consolidation: 13.4116051197052 s
325/1000 -- 100.56316900253296 s
   Consolidation: 14.040070056915283 s
350/1000 -- 108.0798499584198 s
   Consolidation: 14.448730945587158 s
375/1000 -- 1242.5574660301208 s
   Consolidation: 578.487918138504 s
400/1000 -- 763.65207

In [15]:
# Do a final consolidation just to be safe (should be very fast)
# Add song ID to table, now that it only contains unique songs
start_time = time.time()
songs_counts = songs.groupby('track_uri').agg(a2)
songs_counts['song_id'] = np.arange(len(songs_counts))
print("--- %s seconds ---" % (time.time() - start_time))

print(songs_counts.shape)
display(songs_counts.head())

--- 28.654460906982422 seconds ---
(2262292, 8)


Unnamed: 0_level_0,artist_name,artist_uri,track_name,album_uri,duration_ms,album_name,count,song_id
track_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
spotify:track:0000uJA4xCdxThagdLkkLR,Cherryholmes,spotify:artist:5kEVfWQGTw0rIDO2Jqq1ww,Heart As Cold As Stone,spotify:album:3SPMBGMEvPw21lmT5b1ApW,161186,Cherryholmes,1,0
spotify:track:0002yNGLtYSYtc0X6ZnFvp,Sidney Bechet's Blue Note Jazzmen,spotify:artist:2XouUSO0EAJ9gMMoHiXqMt,Muskrat Ramble,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,220293,Jazz Classics,6,1
spotify:track:00039MgrmLoIzSpuYKurn9,Zach Farlow,spotify:artist:2jTojc4rAsOMx6200a8Ah1,Thas What I Do,spotify:album:0UHfgx3ITlxePDXLaN5Y6x,222727,The Great Escape 2,3,2
spotify:track:0003Z98F6hUq7XxqSRM87H,Жак-Энтони,spotify:artist:08RxfNkJpjJ4dJb4xASWzj,Меня больше тут нет,spotify:album:6bwGC40nfS1uxz5fkugvjw,241666,#NoName,1,3
spotify:track:0004ExljAge0P5XWn1LXmW,RPM,spotify:artist:7lfmakKFOeQWdUrbmVK6EQ,Gita,spotify:album:4lEFcaL9IwlJPCQyA3rjRA,358506,Maxximum - RPM,1,4


In [20]:
# Replace playlist track_uri with song_id
start_time = time.time()
loop_start = time.time()

playlists_songids = pd.Series(index = playlists.index)
playlists_songids = playlists_songids.astype(object)

i = 0
for ind, row in playlists.items():
    songids = np.array(songs_counts.loc[row, 'song_id'], 'int')
    playlists_songids.loc[str(ind)] = songids
    
    i += 1
    if i % (len(playlists)/100) == 0 == 0: 
        print('{}/{} -- {} s'.format(i, int(len(playlists)), time.time() - loop_start))
        loop_start = time.time()
    
print("--- %s seconds ---" % (time.time() - start_time))

print(playlists_songids.shape)
print(playlists_songids.head())

10000/1000000 -- 156.32376503944397 s
20000/1000000 -- 169.8954930305481 s
30000/1000000 -- 176.60787105560303 s
40000/1000000 -- 187.77768278121948 s
50000/1000000 -- 201.5223469734192 s
60000/1000000 -- 253.54231190681458 s
70000/1000000 -- 244.03459095954895 s
80000/1000000 -- 220.40614986419678 s
90000/1000000 -- 182.15377497673035 s
100000/1000000 -- 197.7716588973999 s
110000/1000000 -- 206.35164284706116 s
120000/1000000 -- 206.6775300502777 s
130000/1000000 -- 211.5508930683136 s
140000/1000000 -- 215.5301342010498 s
150000/1000000 -- 213.4286379814148 s
160000/1000000 -- 222.0904278755188 s
170000/1000000 -- 229.650043964386 s
180000/1000000 -- 235.4369432926178 s
190000/1000000 -- 235.65158987045288 s
200000/1000000 -- 263.13593792915344 s
210000/1000000 -- 286.1963789463043 s
220000/1000000 -- 248.16142678260803 s
230000/1000000 -- 251.43092584609985 s
240000/1000000 -- 275.1317768096924 s
250000/1000000 -- 257.06030225753784 s
260000/1000000 -- 291.9050660133362 s
270000/10

In [21]:
# Change songs table to have song id as index and track_uri as column
# We will be doing lookups on song_id while running
songs_counts_id = songs_counts.copy()
songs_counts_id['track_uri'] = songs_counts_id.index.values
songs_counts_id.set_index('song_id', inplace = True)
songs_counts_id.head()

Unnamed: 0_level_0,artist_name,artist_uri,track_name,album_uri,duration_ms,album_name,count,track_uri
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Cherryholmes,spotify:artist:5kEVfWQGTw0rIDO2Jqq1ww,Heart As Cold As Stone,spotify:album:3SPMBGMEvPw21lmT5b1ApW,161186,Cherryholmes,1,spotify:track:0000uJA4xCdxThagdLkkLR
1,Sidney Bechet's Blue Note Jazzmen,spotify:artist:2XouUSO0EAJ9gMMoHiXqMt,Muskrat Ramble,spotify:album:04hQBJ7YSuNnZ0nbuXNYbY,220293,Jazz Classics,6,spotify:track:0002yNGLtYSYtc0X6ZnFvp
2,Zach Farlow,spotify:artist:2jTojc4rAsOMx6200a8Ah1,Thas What I Do,spotify:album:0UHfgx3ITlxePDXLaN5Y6x,222727,The Great Escape 2,3,spotify:track:00039MgrmLoIzSpuYKurn9
3,Жак-Энтони,spotify:artist:08RxfNkJpjJ4dJb4xASWzj,Меня больше тут нет,spotify:album:6bwGC40nfS1uxz5fkugvjw,241666,#NoName,1,spotify:track:0003Z98F6hUq7XxqSRM87H
4,RPM,spotify:artist:7lfmakKFOeQWdUrbmVK6EQ,Gita,spotify:album:4lEFcaL9IwlJPCQyA3rjRA,358506,Maxximum - RPM,1,spotify:track:0004ExljAge0P5XWn1LXmW


### Write files to disk (csv and pickle)
Use `pd.read_pickle` to easily read back in a data frame or series with the exact same structure as the one you dumped.

In [22]:
songs_counts_id.to_csv('../data/songs_counts_'+str(len(all_files))+'.csv')
songs_counts_id.to_pickle('../data/songs_counts_'+str(len(all_files))+'.pkl')

In [23]:
playlists_songids.to_csv('../data/playlists_song_ids_'+str(len(all_files))+'.csv', header = False)
playlists_songids.to_pickle('../data/playlists_song_ids_'+str(len(all_files))+'.pkl')

### For later use:
Quickly filtering out from playlists songs that do not exist. Use to filter out songs that appear below a certain threshold by first filtering `songs_counts` on that threshold, and run `get` on that. Some songs in some playlists will then not exist in `songs_counts`, and the below code will drop them from the playlist. Handy for limiting the size of the dataset.

In [None]:
songs_counts.song_id.get(['derp', *playlists[0]], np.NaN)

In [None]:
np.array(songs_counts.song_id.get(['derp', *playlists[0]], np.NaN).dropna(), 'int')