In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from os import listdir
from os.path import isfile, join

sns.set()

Spot check an individual songs file to explore its format and data.

In [2]:
df = pd.read_csv('../data/Songs/songs285.csv')

In [3]:
print(df.shape)
list(df.columns)

(64928, 9)


['pid',
 'pos',
 'artist_name',
 'track_uri',
 'artist_uri',
 'track_name',
 'album_uri',
 'duration_ms',
 'album_name']

This is a *lot* of data: ball-park $65,000$ rows in $1,000$ files, totaling around $65,000,000$ observations – or songs in playlists, many of which are certainly repeated.

In [4]:
df.head()

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name
0,0,0,Deftones,spotify:track:4rEGJ9KirDlKiOHxqVwcVg,spotify:artist:6Ghvu1VvMGScGpOUJBAHNH,Sextape,spotify:album:4RQnFSkkZlA65Xxchhnaha,241533,Diamond Eyes
1,0,1,Muse,spotify:track:0It6VJoMAare1zdV2wxqZq,spotify:artist:12Chz98pHFMPJEknJQMWvI,Undisclosed Desires,spotify:album:0eFHYz8NmK75zSplL5qlfM,235000,The Resistance
2,0,2,Pearl Jam,spotify:track:0LBmvPJYmtEJ7kkWvc3kbT,spotify:artist:1w5Kfo2jwwIPruYS2UWh56,Oceans,spotify:album:5B4PYA7wNN4WdEXdIJu58a,161893,Ten
3,0,3,My Chemical Romance,spotify:track:0uukw2CgEIApv4IWAjXrBC,spotify:artist:7FBcuc1gsnv6Y1nwFtNRCb,Dead!,spotify:album:0FZK97MXMm5mUQ8mtudjuK,195520,The Black Parade
4,0,4,Red Hot Chili Peppers,spotify:track:1iFIZUVDBCCkWe705FLXto,spotify:artist:0L8ExT028jH3ddEcZwqJJ5,Dosed,spotify:album:6deiaArbeoqp1xPEGdEKp1,311866,By The Way


In [5]:
# Some very different playlist lengths
df.groupby('pid')['track_uri'].nunique()[10:20]

pid
10     22
11     63
12    212
13     12
14     11
15     25
16     96
17     24
18      9
19    165
Name: track_uri, dtype: int64

Every file seems to be an arbitrary-length list of playlists, each of which has an id `pid` (scoped to that file) and an arbitrary-length list of songs, each with a position id `pos` in that playlist. For each song, the `artist_name`, `track_uri`, `artist_uri`, `track_name`, `album_uri`, `duration_ms`, and `album_name` are all stored. Most of these are repeated every time a song is repeated (within and across files), so there is a lot of room for simplification here. Furthermore, any URI is effectively a unique identified for the song, artist, or album, and can be used as such.

$1,000$ playlist files, as expected.

In [6]:
all_files = listdir('../data/Songs')
print(len(all_files))
all_files[0:3]

1000


['songs284.csv', 'songs290.csv', 'songs247.csv']

### Structuring the data
A reasonable first step to slim down the size of the dataset without losing information or fidelity, is to parse through all the files to create a reference table/file of all songs and their metadata. Each playlist can then be stored as a simple named object, where the name is the overall playlist id and its value a vector of song ids.

In [7]:
# Master DataFrame of all unique songs included across all playlists
songs = pd.DataFrame(columns = list(df.columns)[2:])

# Master Series of playlists and the songs included in each
playlists = pd.Series()

### Looping over all our files to fill out the master DataFrame (songs) and Series (playlists)

In [8]:
import time
start_time = time.time()

# List of filenames for all files in dataset
all_files = listdir('../data/Songs')

# Loop over each file to extract data
for i, file in enumerate(all_files):
    # split on "." to split into "filename" and "csv"
    # Then select "filename" and ditch the first five letters "songs"
    filenum = file.split(".")[0][5:]
    
    # Load file and store in temporary dataframe
    fdf = pd.read_csv('../data/Songs/' + file)
    
    # -- SONGS IN EACH PLAYLIST --
    # Songs included in every playlist (ordered) in file
    # For each playlist, get list of track_uri's (unique identifiers)
    songs_in_playlist = fdf.groupby('pid')['track_uri'].unique()

    # Update index to be not the pid in file (id), but a combination of them
    songs_in_playlist.index = [filenum + '_' + str(pid) for pid in songs_in_playlist.index.values]

    # Add playlists to master Series of all playlists
    playlists = playlists.append(songs_in_playlist)
    
    # -- UNIQUE SONGS IN FILE --
    # Get the cumulative count of each track appearing
    # I.e. we only want cumcount == 1, as that's the first time
    track_occurrence = fdf.groupby(['track_uri']).cumcount()+1
    #assert len(track_occurrence) == len(df)

    # Filter to unique songs (i.e. first occurrence of each)
    # Also ditch the first two columns, as only the latter are needed
    unique_songs = fdf.iloc[:, 2:][track_occurrence == 1]

    # Ignore those that have been seen before
    new_songs = unique_songs[~unique_songs.track_uri.isin(songs.track_uri.values)]

    # Add the new songs to the master song table
    songs = songs.append(new_songs)
    
    if (i+1)%50 == 0: print('{}/{}'.format(i+1, len(all_files)))
    
print("--- %s seconds ---" % (time.time() - start_time))

50/1000
100/1000
150/1000
200/1000
250/1000
300/1000
350/1000
400/1000
450/1000
500/1000
550/1000
600/1000
650/1000
700/1000
750/1000
800/1000
850/1000
900/1000
950/1000
1000/1000
--- 3248.3688600063324 seconds ---


In [16]:
songs.to_csv('../data/unique_songs.csv')

In [13]:
print(len(songs))
songs.head()

2262292


Unnamed: 0,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name
0,Sleeping At Last,spotify:track:2d7LPtieXdIYzf7yHPooWd,spotify:artist:0MeLMJJcouYXCymQSHPn8g,Chasing Cars,spotify:album:0UIIvTTWNB3gRQWFoxoEDh,242564,"Covers, Vol. 2"
1,Rachael Yamagata,spotify:track:0y4TKcc7p2H6P0GJlt01EI,spotify:artist:7w0qj2HiAPIeUcoPogvOZ6,Elephants,spotify:album:6KzK9fDNmj7GHFbcE4gVJD,253701,Elephants...Teeth Sinking Into Heart
2,The Cinematic Orchestra,spotify:track:6q4c1vPRZREh7nw3wG7Ixz,spotify:artist:32ogthv0BdaSMPml02X9YB,That Home,spotify:album:5cPHT4yMCfETLRYAoBFcOZ,103920,Ma Fleur
3,The Cinematic Orchestra,spotify:track:54KFQB6N4pn926IUUYZGzK,spotify:artist:32ogthv0BdaSMPml02X9YB,To Build A Home,spotify:album:5cPHT4yMCfETLRYAoBFcOZ,371320,Ma Fleur
4,Leon Bridges,spotify:track:0NeJjNlprGfZpeX2LQuN6c,spotify:artist:3qnGvpP8Yth1AqSBMqON5x,River,spotify:album:4svLfrPPk2npPVuI4kXPYg,238560,Coming Home


In [19]:
playlists.to_csv('../data/playlist_songs.csv')

  """Entry point for launching an IPython kernel.


In [14]:
print(len(playlists))
playlists.head()

1000000


284_0    [spotify:track:2d7LPtieXdIYzf7yHPooWd, spotify...
284_1    [spotify:track:5j9iuo3tMmQIfnEEQOOjxh, spotify...
284_2    [spotify:track:4HBVGSeSPpSZ1QmMBhEtqp, spotify...
284_3    [spotify:track:1f5AW15GV76mk8JNxaPJIx, spotify...
284_4    [spotify:track:4Sj3djQIFuaH3VICDN3uAA, spotify...
dtype: object

Convert `track_uri` to ID of song in our master lookup table.

**This seems computationally intractable.**

In [48]:
start_time = time.time()

playlists_songids = pd.Series(index = playlists.index)
playlists_songids = playlists_songids.astype(object)

j = 0
for ind, row in playlists.items():
    songids = [songs.index[songs.track_uri == uri].values[0] for uri in row]
    playlists_songids[ind] = np.array(songids)
    
    j+= 1
    if j%50 == 0: print('{}/{}'.format(j, len(playlists)))
    
print("--- %s seconds ---" % (time.time() - start_time))

50/1000000


KeyboardInterrupt: 

In [None]:
playlists_songids.to_csv('../data/playlist_songs_ids.csv')