In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlalchemy as sa
import psycopg2
import json
import time
import sys
import csv
import uuid
import sqlite3

from tqdm import tqdm

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from difflib import get_close_matches
from dotenv import dotenv_values

In [2]:
cfg = dotenv_values("../.env")

PG_user = cfg['PGSQL_USER']
PG_pw = cfg['PGSQL_PW']
PG_host = cfg['PGSQL_HOST']
PG_db = cfg['PGSQL_DB']

SPOT_id = cfg['SPOTIPY_CLIENT_ID']
SPOT_secret = cfg['SPOTIPY_CLIENT_SECRET']

In [3]:
raw_data_path = '../data/_raw/'
processed_data_path = '../data/processed/'
df_songs = pd.read_csv(raw_data_path + 'SpotifyAudioFeaturesApril2019.csv')
print(df_songs.shape)

(130663, 17)


Trying to read the playlist csv was throwing errors due to the use of **"** in some song titles.  As we're only dealing with 8 problematic lines, we'll just drop them.

In [4]:
# df_playlists turns out to be problematic due to use of double quotes in the csv file!

err_lines = []
def capture_err_lines(err_line):
    err_lines.append(err_line)
    return None

df_playlists = pd.read_csv(raw_data_path + 'Playlist_dataset.csv',
                           engine="python",
                           on_bad_lines=capture_err_lines)

print('dropped entries: ', len(err_lines))

print(df_playlists.shape)

dropped entries:  8
(12791369, 4)


-----
### starting with the songs dataset -

In [5]:
# dropping the few duplicated track_id entries
df_len = len(df_songs)
uid_count = df_songs['track_id'].nunique()
print("df length: ", df_len, "unique id count: ", uid_count)

df_songs.drop_duplicates(subset=['track_id'], inplace=True)
df_songs.reset_index(drop=True)
print("dropped dupes, new df length: ", len(df_songs))

df length:  130663 unique id count:  130326
dropped dupes, new df length:  130326


In [6]:
df_songs.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
count,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0,130326.0
mean,0.342617,0.581332,212632.0,0.569137,0.224383,5.232409,0.194913,-9.979523,0.607906,0.111966,119.468106,3.878766,0.439659,24.130573
std,0.345706,0.190084,123270.6,0.260397,0.360517,3.60285,0.167782,6.547386,0.488219,0.124341,30.162445,0.514784,0.259157,19.662458
min,0.0,0.0,3203.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0315,0.459,163880.0,0.396,0.0,2.0,0.0975,-11.90575,0.0,0.0389,96.01325,4.0,0.224,7.0
50%,0.203,0.605,201905.0,0.603,0.000151,5.0,0.124,-7.983,1.0,0.0558,120.026,4.0,0.42,21.0
75%,0.637,0.727,241058.8,0.775,0.444,8.0,0.236,-5.686,1.0,0.129,139.613,4.0,0.638,38.0
max,0.996,0.996,5610020.0,1.0,1.0,11.0,0.999,1.806,1.0,0.966,249.983,5.0,1.0,100.0


In [7]:
df_songs.sample(5)

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
23284,Gunplay,26TywDaMqkUssNYSlqQmUm,My Phone,0.0183,0.938,256788,0.26,8.8e-05,0,0.0689,-17.122,0,0.121,100.017,4,0.56,14
46429,Mourn,1wvcDS3v4VD66u81Wgwq6d,Barcelona City Tour,0.00464,0.642,152185,0.93,0.0809,9,0.327,-5.052,1,0.067,166.966,4,0.742,10
5205,YG Hypnos,1s7qtaTu6fRBlvr985F2nr,Backtothebasics,0.845,0.845,81139,0.333,0.864,6,0.113,-9.265,0,0.519,142.146,4,0.555,18
106637,Johann Sebastian Bach,5qD3hP0aIM2SfbkENvyo3v,"Ich bin in mir vergnügt, BWV 204: No. 6, Meine...",0.924,0.269,407867,0.0955,3e-06,1,0.135,-19.478,0,0.0516,140.704,3,0.198,6
74119,Rain Sounds,2GHDyXMIXumoxhkCCYD3fq,Wild Rain,0.926,0.223,115526,0.999,0.992,4,0.958,-16.551,0,0.0442,71.355,5,1e-05,0


In [8]:
df_songs[['artist_name', 'key', 'mode', 'time_signature']].nunique()

artist_name       34599
key                  12
mode                  2
time_signature        5
dtype: int64

In [9]:
cat_cols = ['artist_name', 'key', 'mode', 'time_signature']
num_cols = ['acousticness', 'danceability', 'energy', 'instrumentalness',
            'liveness', 'loudness', 'speechiness', 'valence']

In [10]:
df_norm = df_songs.copy()
for col in num_cols:
    max_val = df_norm[col].max()
    min_val = df_norm[col].min()
    df_norm[col] = (df_songs[col] - min_val) / (max_val - min_val)

df_norm.head()

Unnamed: 0,artist_name,track_id,track_name,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,popularity
0,YG,2RM4jf1Xa9zPgMGRDiht8O,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj",0.005843,0.745984,238373,0.339,0.0,1,0.081281,0.846552,1,0.423395,203.927,4,0.118,15
1,YG,1tHDG53xJNGsItRA3vfVgs,BAND DRUM (feat. A$AP Rocky),0.024498,0.849398,214800,0.557,0.0,8,0.286286,0.853331,1,0.473085,159.009,4,0.371,0
2,R3HAB,6Wosx2euFPMT14UXiWudMy,Radio Silence,0.0251,0.605422,138913,0.723,0.0,9,0.082482,0.875481,0,0.046998,114.966,4,0.382,56
3,Chris Cooq,3J2Jpw61sO7l6Hc7qdYV91,Lactose,0.029518,0.803213,125381,0.579,0.912,5,0.099499,0.774714,0,0.072567,123.003,4,0.641,0
4,Chris Cooq,2jbYvQCyPgX3CdmAzeVeuS,Same - Original mix,3.5e-05,0.786145,124016,0.792,0.878,7,0.033233,0.804501,1,0.068427,120.047,4,0.928,0


In [11]:
df_songs.to_csv(processed_data_path + 'df_songs_norm.csv')

-----
### on to the playlists dataset!

In [12]:
print(df_playlists.shape)
df_playlists.sample(5)

(12791369, 4)


Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
4576181,fa849dabeb14a2800ad5130907fc5018,The Mahones,Queen And Tequila,Irish music
845475,984c9d805f92f3202a3f8cc34c99bf93,Weezer,Pink Triangle - Live at Reading Festival / 1996,Weezerlist
7498902,ed94664f1a9110733aa33bd6d3df0850,The Acorn,Dents,Woohoo
12208135,309b64b01cc2e4086a92e4f88c274006,Gorillaz,Welcome To The World of The Plastic Beach - fe...,rap
1044050,860db0604f9326813352aa04924da543,Grizzly Bear,Yet Again,Starred


In [13]:
df_playlists.isna().sum()

user_id                0
 "artistname"      33532
 "trackname"          85
 "playlistname"       41
dtype: int64

In [14]:
df_playlists.dropna(inplace=True)
df_playlists.reset_index(drop=True, inplace=True)

The `artistname` & `trackname` are means to matching it with the track_id in our songs dataset.

`user_id` will be used to group duplicated playlist names

In [15]:
new_cols = ['user_id', 'artist_name', 'track_name', 'playlist_name']
df_playlists.columns = new_cols
df_playlists.head()

Unnamed: 0,user_id,artist_name,track_name,playlist_name
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


-----

I've little faith this will work, but try we must!

If merging the datasets on artist & track name works, it will save us a ton of scraping efforts -

In [16]:
df_songs[['track_id', 'artist_name', 'track_name']].head()

Unnamed: 0,track_id,artist_name,track_name
0,2RM4jf1Xa9zPgMGRDiht8O,YG,"Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj"
1,1tHDG53xJNGsItRA3vfVgs,YG,BAND DRUM (feat. A$AP Rocky)
2,6Wosx2euFPMT14UXiWudMy,R3HAB,Radio Silence
3,3J2Jpw61sO7l6Hc7qdYV91,Chris Cooq,Lactose
4,2jbYvQCyPgX3CdmAzeVeuS,Chris Cooq,Same - Original mix


In [17]:
test_merge = pd.merge(df_playlists[['playlist_name', 'artist_name', 'track_name']],
                      df_songs[['track_id', 'artist_name', 'track_name']],
                      on=['artist_name', 'track_name'], how='left')
test_merge.shape

(12757768, 4)

In [18]:
test_merge.isna().sum()

playlist_name           0
artist_name             0
track_name              0
track_id         12624265
dtype: int64

In [19]:
test_merge[~test_merge.isna().any(axis=1)].sample(5)

Unnamed: 0,playlist_name,artist_name,track_name,track_id
7545544,Artistas,Elvis Presley,Wooden Heart,5p1vNflCDCkOKOJRyszojj
11287705,Rolig/sov,Ludovico Einaudi,Una Mattina,3iCNcS1uC2Pm9XKc9BERhq
9024297,Starred,Katy Perry,Roar,4qvduiVJVHg5WdjEAMV9Rg
10345423,Country,Thomas Rhett,Get Me Some Of That,5HhOsYysrTRWD2bCcSiAYB
276719,Mix,Journey,Don't Stop Believin',3DtK601PJiNPe8COo67unB


ah well, worth a try at least.

At this point, I gave fuzzy matching of `artist_name` and then `track_name` a go, but still came up with a woefully short list of accurate connections.

So, brutish though it may be, onto scraping.

Starting with creating a dict of artists and their tracks, and exporting it for easier access in future use.

-----
-----
<br>
<center> <b><i>Note!</b></i>
<br>

If this is the first time running this notebook, you'll need to uncomment and run the next few cells to create the artist-tracks dictionary.  This was process heavy and time consuming, so I exported the cleaned up data for future use!

-----

In [18]:
# # create and export artist-tracks dictionary

# scrape_df = df_playlists[['artist_name', 'track_name']].copy()
# scrape_df.drop_duplicates(inplace=True)
# print(f'Dropped rows: {len(df_playlists) - len(scrape_df):,}')
# print(f'remaining track count: {len(scrape_df):,}')

# artist_songs = {}

# for row in tqdm(scrape_df.iterrows(), total=len(scrape_df)):
#     artist, track = row[1][['artist_name', 'track_name']]
#     if artist in artist_songs:
#         if track not in artist_songs[artist]:
#             artist_songs[artist].append(track)
#         else:
#             pass
#     else:
#         artist_songs[artist] = [track]

# with open(processed_data_path + 'artist-songs-dict.json', 'w') as f:
#     json.dump(artist_songs, f)

In [20]:
with open(processed_data_path + 'artist-songs-dict.json', 'r') as f:
    artist_songs = json.load(f)

In [21]:
list_artists = list(artist_songs.keys())
print("\ncurrent count of artists: ", len(list_artists))

artists_lower = [i.lower() for i in list_artists]
print("\ncount of artists after .lower(): ", len(set(artists_lower)))

artists_nospace = [i.replace(" ", "") for i in artists_lower]
print("\ncount of .lower() after dropping spaces: ", len(set(artists_nospace)))



current count of artists:  287442

count of artists after .lower():  280206

count of .lower() after dropping spaces:  278241


As we can combine some of these to drop nearly 10k keys, let's give it a go.

We'll create a dictionary with keys made from the .lower().replace(" ", "") and a list of matching names that produce that key.  So - "GreenDay", "Greenday", "greenday", "Green Day", and "green day" all show up as:

`{"greenday": "GreenDay", "Greenday", "greenday", "Green Day", and "green day"}`

Once we have sussed out duplicate artists, we'll then go through our original dictionary and find which spelling has the most entries and consolidate all the tracks under that single key.

In [22]:
alt_names = {k:[] for k in artists_nospace}

for i in tqdm(list_artists):
    matching_key = i.lower().replace(" ", "")
    alt_names[matching_key].append(i)

list(alt_names.items())[:5]

100%|██████████| 287442/287442 [00:00<00:00, 628567.64it/s]


[('elviscostello', ['Elvis Costello']),
 ('elviscostello&theattractions',
  ['Elvis Costello & The Attractions', 'Elvis Costello & the Attractions']),
 ('tiffanypage', ['Tiffany Page']),
 ('lissie', ['Lissie']),
 ('paulmccartney', ['Paul McCartney', 'Paul Mccartney', 'Paul  McCartney'])]

In [23]:
# isolating all entries with multiple names
print('len(alt_names) before: ', len(alt_names))

for k, v in dict(alt_names).items():
    if len(v) == 1:
        del alt_names[k]

print('len(alt_names) after: ', len(alt_names), '\n')

list(alt_names.items())[:5]

len(alt_names) before:  278241
len(alt_names) after:  7862 



[('elviscostello&theattractions',
  ['Elvis Costello & The Attractions', 'Elvis Costello & the Attractions']),
 ('paulmccartney', ['Paul McCartney', 'Paul Mccartney', 'Paul  McCartney']),
 ('crowdedhouse', ['Crowded House', 'Crowded house', 'CROWDED HOUSE']),
 ('noahandthewhale',
  ['Noah And The Whale', 'Noah and the Whale', 'Noah and The Whale']),
 ('pearljam', ['Pearl Jam', 'Pearl jam', 'PEARL JAM'])]

In [24]:
print('before merge: ', len(artist_songs))

for k, v in alt_names.items():
    perms = list(v)
    best_key = perms[0]

    for i in perms:
        if len(artist_songs[i]) > len(artist_songs[best_key]):
            best_key = str(i)

    perms.remove(best_key)
    
    for i in perms:
        artist_songs[best_key].extend(artist_songs[i])
        del artist_songs[i]

print('after merge: ', len(artist_songs))

before merge:  287442
after merge:  278241


Removing duplicate tracks from merging these artists -

In [25]:
def track_filter(track_list):
    track_list = list(set(track_list))

    string_test = []
    trimmed_list = []

    for i in track_list:
        test_string = str(i).lower().replace(" ", "")
        if test_string in string_test:
            pass
        else:
            trimmed_list.append(i)
            string_test.append(test_string)

    return trimmed_list

In [26]:
# just to see how useful this filter is, let's count the dropped tracks along the way
dupes_removed = 0

for k, v in artist_songs.items():
    start_len = len(v)
    trim_list = track_filter(v)
    end_len = len(trim_list)
    dupes_removed += start_len - end_len
    artist_songs[k] = trim_list

print('Number of duplicate tracks dropped: ', dupes_removed)

Number of duplicate tracks dropped:  101464


Now that we've got some uniformity to these entries, let's bring this back to the `df_playlists` to make it more usable.

In [27]:
# applying that same process for consolidating the names to the artists in our df
replace_names = {i.lower().replace(" ", ""):i for i in list(artist_songs.keys())}

df_playlists['artist_name'] = df_playlists['artist_name'].str.lower()
df_playlists['artist_name'] = df_playlists['artist_name'].str.replace(" ", "")
df_playlists['artist_name'] = df_playlists['artist_name'].map(replace_names)

In [28]:
# track replacement is a little more in depth -
# we need a nested dictionary in case multiple artists have songs with the same title
track_rename_dict = {}
for k, v in artist_songs.items():
    lower_nospace = {str(i).lower().replace(" ", ""):str(i) for i in v}
    track_rename_dict[k] = lower_nospace

In [31]:
def consolidate_track_name(artist, track):
    track_check = track.lower().replace(" ", "")
    if track_check in track_rename_dict[artist]:
        return track_rename_dict[artist][track_check]
    else:
        return track

df_playlists['rename_track'] = df_playlists.apply(
                                    lambda x: consolidate_track_name(
                                        x['artist_name'], x['track_name']
                                        )
                                    , axis=1
                                )

In [39]:
df_playlists.iloc[np.where(df_playlists['track_name'] != df_playlists['rename_track'])]

Unnamed: 0,user_id,artist_name,track_name,playlist_name,rename_track
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010,(the Angels Wanna Wear My) Red Shoes
9,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Bright Side,HARD ROCK 2010,Brightside
13,9cc0cfd4d7d7885102480dd99e7a90d6,Crowded House,Don't Dream It's Over,HARD ROCK 2010,Don't dream it's over
15,9cc0cfd4d7d7885102480dd99e7a90d6,Crowded House,Fall At Your Feet,HARD ROCK 2010,Fall at Your Feet
68,9cc0cfd4d7d7885102480dd99e7a90d6,Noel Gallagher's High Flying Birds,AKA... What A Life!,IOW 2012,AKA...What A Life!
...,...,...,...,...,...
12757746,2302bf9c64dc63d88a750215ed187f2c,Rage Against The Machine,Sleep Now In The Fire,iPhone,Sleep Now in The Fire
12757756,2302bf9c64dc63d88a750215ed187f2c,Def Leppard,Too Late For Love,iPhone,Too Late for Love
12757760,2302bf9c64dc63d88a750215ed187f2c,Johnny Cash,When It's Springtime In Alaska (It's Forty Below),iPhone,When It's Springtime in Alaska (It's Forty Below)
12757761,2302bf9c64dc63d88a750215ed187f2c,U2,When Love Comes To Town,iPhone,When Love Comes to Town


In [40]:
df_playlists.nunique()

user_id            15906
artist_name       278235
track_name       1973112
playlist_name     156706
rename_track     1913345
dtype: int64

In [41]:
df_playlists['track_name'] = df_playlists['rename_track']
df_playlists.drop(['rename_track'], axis=1, inplace=True)
df_playlists.nunique()

user_id            15906
artist_name       278235
track_name       1913345
playlist_name     156706
dtype: int64

Further consolidation so that "Song Title" and "Song Title feat. Somebody Else" merge

In [44]:
list_artists = list(artist_songs.keys())

# apparently still had a song or two that were not saved as strings.
for k, v in artist_songs.items():
    artist_songs[k] = [str(i) for i in v]

In [45]:
def check_substring_matches(list_of_strings):
    test_list = sorted(list_of_strings, key=len)
    subb_dict = {}
    while test_list:
        test = test_list[0]
        matches = [i for i in test_list if test.lower() in i.lower()]
        test_list = [i for i in test_list if i not in matches]
        subb_dict[test] = matches
    return subb_dict

track_subs = {}

for artist in tqdm(list_artists):
    consolidated = check_substring_matches(artist_songs[artist])
    track_subs[artist] = {}
    artist_songs[artist] = list(consolidated.keys())
    for k,v in consolidated.items():
        track_subs[artist].update({i:k for i in v})

100%|██████████| 278241/278241 [01:37<00:00, 2850.85it/s] 


In [47]:
def trim_track(x, y):
    if y in track_subs[x]:
        return track_subs[x][y]
    else:
        return y

df_playlists['trim_track_name'] = df_playlists.apply(lambda x: trim_track(x['artist_name'], x['track_name']), axis=1)

print(df_playlists.isna().sum())
df_playlists.nunique()

user_id            0
artist_name        0
track_name         0
playlist_name      0
trim_track_name    0
dtype: int64


user_id              15906
artist_name         278235
track_name         1913345
playlist_name       156706
trim_track_name    1565726
dtype: int64

In [48]:
df_playlists.drop(columns=['track_name'], inplace=True)
df_playlists.rename({'trim_track_name':'track_name'}, axis=1, inplace=True)
df_playlists.head()

Unnamed: 0,user_id,artist_name,playlist_name,track_name
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,HARD ROCK 2010,(the Angels Wanna Wear My) Red Shoes
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,HARD ROCK 2010,"(What's So Funny 'Bout) Peace, Love And Unders..."
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,HARD ROCK 2010,7 Years Too Late
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,HARD ROCK 2010,Accidents Will Happen
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,HARD ROCK 2010,Alison


A few interesting points -

In [49]:
one_hit_wonders = []
for k, v in artist_songs.items():
    if len(v) == 1:
        one_hit_wonders.append(k)
print(f'Number of artists with only one song in the catalog: {len(one_hit_wonders)}')

artist_occurance = df_playlists['artist_name'].value_counts()
rare_artists = artist_occurance[artist_occurance < 10]
print(f'Number of artists showing up less than 10 times total: {len(rare_artists)}')

artist_fan_count = df_playlists.groupby(['artist_name'])['user_id'].nunique()
single_fans = artist_fan_count[artist_fan_count == 1]
print(f'Number of artists with only one user adding them: {len(single_fans)}')

Number of artists with only one song in the catalog: 142607
Number of artists showing up less than 10 times total: 206455
Number of artists with only one user adding them: 149365


-----

Let's decide upon what we feel is too niche -

* If an artist only shows up in one user's playlists

* If an artist shows up a total of < 10 times

In [50]:
# artist with only their superfan!
df_playlists = df_playlists[~df_playlists['artist_name'].isin(list(single_fans.keys()))]

In [51]:
# artists who only appear fewer than 10 times total
df_playlists = df_playlists[~df_playlists['artist_name'].isin(list(rare_artists.keys()))]

In [52]:
# resetting index for the dropped rows
df_playlists.reset_index(drop=True, inplace=True)
df_playlists.nunique()

user_id            15852
artist_name        61078
playlist_name     152687
track_name       1222131
dtype: int64

In [55]:
start_len = 12791369
print(f'original df len: \t{start_len:,}')
print(f'cleaned & trimmed: \t{len(df_playlists):,}')

original df len: 	12,791,369
cleaned & trimmed: 	12,070,547


Having now dropped over 700,000 rows in the playlists dataset for the niche songs, let's do one final cleanup and export the processed df.

While `user_id` is unique ids, different useres could have used the same `playlist_name` string.

We'll create uuids for the user-playlist combo and use that as the playlist_id moving forward.

In [62]:
df_playlists[['user_id', 'playlist_name']] = df_playlists[['user_id', 'playlist_name']].astype('str')
zip_cols = zip(df_playlists['user_id'], df_playlists['playlist_name'])
df_playlists['uuid_input'] = [''.join(i) for i in zip_cols]
unique_userplaylists = list(set(df_playlists['uuid_input']))

In [63]:
df_playlists.head(3)

Unnamed: 0,user_id,artist_name,playlist_name,track_name,uuid_input
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,HARD ROCK 2010,(the Angels Wanna Wear My) Red Shoes,9cc0cfd4d7d7885102480dd99e7a90d6HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,HARD ROCK 2010,"(What's So Funny 'Bout) Peace, Love And Unders...",9cc0cfd4d7d7885102480dd99e7a90d6HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,HARD ROCK 2010,Accidents Will Happen,9cc0cfd4d7d7885102480dd99e7a90d6HARD ROCK 2010


In [64]:
# confirming the suspicion that different users have the same playlist_name
df_playlists[['playlist_name', 'uuid_input']].nunique()

playlist_name    152687
uuid_input       226486
dtype: int64

In [70]:
# played around with the id length until each entry was unique
uuid_dict = {}
uuids = []
for i in unique_userplaylists:
    hash_val = abs(hash(i)) % (10 ** 11)
    uuid_dict[i] = hash_val
    uuids.append(hash_val)
print(len(uuids))
print(len(set(uuids)))

226486
226486


In [71]:
df_playlists['playlist_uuid'] = df_playlists['uuid_input'].map(uuid_dict)

In [72]:
df_playlists.drop(['playlist_name', 'uuid_input'], axis=1, inplace=True)
df_playlists = df_playlists[['artist_name', 'track_name', 'playlist_uuid', 'user_id']]
df_playlists.head()

Unnamed: 0,artist_name,track_name,playlist_uuid,user_id
0,Elvis Costello,(the Angels Wanna Wear My) Red Shoes,58048995723,9cc0cfd4d7d7885102480dd99e7a90d6
1,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",58048995723,9cc0cfd4d7d7885102480dd99e7a90d6
2,Elvis Costello & The Attractions,Accidents Will Happen,58048995723,9cc0cfd4d7d7885102480dd99e7a90d6
3,Elvis Costello,Alison,58048995723,9cc0cfd4d7d7885102480dd99e7a90d6
4,Lissie,All Be Okay,58048995723,9cc0cfd4d7d7885102480dd99e7a90d6


In [73]:
df_playlists.to_csv(processed_data_path + 'df_playlists.csv')

In [74]:
# create and export artist-tracks dictionary

trim_df = df_playlists[['artist_name', 'track_name']].copy()
trim_df.drop_duplicates(inplace=True)

artist_songs_trimmed = {}

for row in tqdm(trim_df.iterrows(), total=len(trim_df)):
    artist, track = row[1][['artist_name', 'track_name']]
    if artist in artist_songs_trimmed:
        if track not in artist_songs_trimmed[artist]:
            artist_songs_trimmed[artist].append(track)
        else:
            pass
    else:
        artist_songs_trimmed[artist] = [track]

with open(processed_data_path + 'artist-songs-trimmed.json', 'w') as f:
    json.dump(artist_songs_trimmed, f)

100%|██████████| 1747681/1747681 [18:13<00:00, 1598.63it/s]


-----
-----
-----
