# Data Collection

## Abstract

Text

## Imports

In [1]:
import requests
import json
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.util import prompt_for_user_token

import numpy as np
import pandas as pd

### Tracks Read In

In [2]:
my_library_df = pd.read_csv("../data/tracks.csv")
my_library_df.head()

Unnamed: 0,Artist,Title,Track,Album,Year,Genre,Length,Bitrate
0,"10,000 Maniacs",These Are Days,1,MTV Unplugged,1993,Rock/Pop,04:53,192
1,2814,恢复,1,新しい日の誕生,2015,,05:51,277
2,2814,遠くの愛好家,2,新しい日の誕生,2015,,06:18,233
3,2814,新宿ゴールデン街,3,新しい日の誕生,2015,,08:51,277
4,2814,ふわっと,4,新しい日の誕生,2015,,06:36,266


## Authorization Flow

In [3]:
# https://stackoverflow.com/questions/4906977/how-to-access-environment-variable-values
# Storing API keys in env variables helps obscure secrets from the public while making the code not break
# if you would like to use this notebook please either replace these assignments directly OR set your env variables with the given names!

SPOTIFY_CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID")
SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET")
# No real reason to set localhost to an env variable, nothing special about it
SPOTIFY_REDIRECT_URL = "http://localhost/"

In [4]:
#Make helper object to handle client_credential flow
client_credits = SpotifyClientCredentials(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)

#Setup up wrapper object with details setup
sp = spotipy.client.Spotify(client_credentials_manager=client_credits)

In [6]:
track_id_df = pd.read_csv("../data/raw/tracks.csv")

In [8]:
track_ids = [track_id for track_id in track_id_df.track_id]

## Collect Artist IDs

Spotify works on its own set of IDs, so I have to use the search endpoint to essentially cross reference them.

### Retrieve Artists

In [20]:
artists = [artist for artist in my_library_df.groupby("Artist").mean().index]

In [68]:
artist_ids = []
for artist in artists:
    result = sp.search(artist,type="artist")
    if result["artists"]["items"]: #non empty lists are true like in python
        artist_ids.append(result["artists"]["items"][0]["id"])
    else:
        artist_ids.append(None)

In [99]:
artist_df = pd.DataFrame.from_dict({ artist_name : [artist_id] for artist_name, artist_id in zip(artists,artist_ids)},orient="index")
artist_df.columns = ["artist_id"]
artist_df.head()

Unnamed: 0,artist_id
'Til Tuesday,1L0y9srZMyh9XUnYGv37IP
(T-T)b,36ofeSYIu5DGRarLS0GhnG
"10,000 Maniacs",0MBIKH9DjtBkv8O3nS6szj
2814,2wBpW4bAGYVe0yJcBeCTyd
311,41Q0HrwWBtuUkJc7C1Rp6K


## Collect Album IDs

### Call API

In [103]:
results = {}
for artist in artist_ids:
    if artist:
        result = sp.artist_albums(artist, album_type="album")
        results[artist] = result
    else:
        results[artist] = None

### Parse Results

In [136]:
album_ids = []
album_names = []
album_artist_ids = []
for artist_id, corpus in results.items():
    if corpus:
        albums = corpus["items"]
        for album in albums:
            album_artist_ids.append(artist_id)
            album_names.append(album["name"])
            album_ids.append(album["id"])
    else:
        album_artist_ids.append(artist_id)
        album_names.append(None)
        album_ids.append(None)

In [142]:
albums_df = pd.DataFrame({"album_id" : album_ids,
                          "album_names" : album_names,
                          "album_artist_id" : album_artist_ids})

albums_df.head()

Unnamed: 0,album_id,album_names,album_artist_id
0,67OxSfZotEq8cCa5SCeX5r,Everything's Different Now,1L0y9srZMyh9XUnYGv37IP
1,3HiUlOjCrUd9ATG6uHyPI7,Welcome Home,1L0y9srZMyh9XUnYGv37IP
2,1y4s0JN8CQMRwPTJ64jCUp,Voices Carry (Expanded Edition),1L0y9srZMyh9XUnYGv37IP
3,02y1bX0SrCWK6WvJgPPpw7,Coming Up Close: A Retrospective,1L0y9srZMyh9XUnYGv37IP
4,7ExRUn1YpGbfL2j4ILYHh3,Coming Up Close: A Retrospective,1L0y9srZMyh9XUnYGv37IP


## Collect Track IDs

### Call API

In [166]:
results = {}
for album in album_ids:
    if album:
        result = sp.album_tracks(album)
        results[album] = result
    else:
        results[album] = None

### Parse Results

In [214]:
track_ids = []
track_titles = []
track_album_ids = []
track_durations = []
track_numbers = []
track_disc_numbers = []
for album_id, value in results.items():
    if value:
        tracks = value["items"]
        for track in tracks:
            track_ids.append(track["id"])
            track_titles.append(track["name"])
            track_album_ids.append(album_id)
            track_durations.append(track["duration_ms"])
            track_numbers.append(track["track_number"])
            track_disc_numbers.append(track["disc_number"])
    else:
        track_album_ids.append(album_id)
        track_ids.append(None)
        track_titles.append(None)
        track_durations.append(None)
        track_numbers.append(None)
        track_disc_numbers.append(None)

In [215]:
tracks_df = pd.DataFrame({"track_id" : track_ids,
                          "track_number" : track_numbers,
                          "track_title" : track_titles,
                          "track_duration": track_durations,
                          "track_album_id" : track_album_ids,
                          "track_disc_number" : track_disc_numbers})

tracks_df.head()

Unnamed: 0,track_id,track_number,track_title,track_duration,track_album_id,track_disc_number
0,5n9QFM4EiMeLGO0Mbwaqov,1.0,Everything's Different Now,236733.0,67OxSfZotEq8cCa5SCeX5r,1.0
1,0BKslA1XqG8HBUKBl4d7EN,2.0,Rip In Heaven,211200.0,67OxSfZotEq8cCa5SCeX5r,1.0
2,5G7NBdQLHc7GkehNmLG3yJ,3.0,Why Must I,221000.0,67OxSfZotEq8cCa5SCeX5r,1.0
3,0J6nZDgEuwXtECq8Pukn1n,4.0,J For Jules,265493.0,67OxSfZotEq8cCa5SCeX5r,1.0
4,3Lm7hSTabr01oav0WdSsuD,5.0,(Believed You Were) Lucky,216973.0,67OxSfZotEq8cCa5SCeX5r,1.0


## Collect Audio Features

### Call API

In [None]:
results = {}
for track in track_ids:
    if track:
        result = sp.audio_features(track)
        results[track] = result
    else:
        results[track] = None

In [130]:
track_danceabilities = []
track_energies = []
track_loudnesses = []
track_keys = []
track_modes = []
track_speechinesses = []
track_acousticnesses = []
track_instrumentalnesses = []
track_livenesses = []
track_valences = []
track_tempos = []
track_time_signatures = []

for track_id, track_features in results.items():
    #some of them came out a list of a single dictionary, some just a dictionary, this try except corrects for this.
    try:
        track_features = track_features[0]
    except:
        pass
    if track_features:
        track_danceabilities.append(track_features["danceability"])
        track_energies.append(track_features["energy"])
        track_loudnesses.append(track_features["loudness"])
        track_keys.append(track_features["key"])
        track_modes.append(track_features["mode"])
        track_speechinesses.append(track_features["speechiness"])
        track_acousticnesses.append(track_features["acousticness"])
        track_instrumentalnesses.append(track_features["instrumentalness"])
        track_livenesses.append(track_features["liveness"])
        track_valences.append(track_features["valence"])
        track_tempos.append(track_features["tempo"])
        track_time_signatures.append(track_features["time_signature"])
    else:
        track_danceabilities.append(None)
        track_energies.append(None)
        track_loudnesses.append(None)
        track_keys.append(None)
        track_modes.append(None)
        track_speechinesses.append(None)
        track_acousticnesses.append(None)
        track_instrumentalnesses.append(None)
        track_livenesses.append(None)
        track_valences.append(None)
        track_tempos.append(None)
        track_time_signatures.append(None)

In [131]:
track_features_df = pd.DataFrame({
    "track_id" : track_ids,
    "danceability" : track_danceabilities,
    "energy" : track_energies,
    "key" : track_keys,
    "mode" : track_modes,
    "speechiness" : track_speechinesses,
    "acousticness" : track_acousticnesses,
    "instrumentalness" : track_instrumentalnesses,
    "liveness" : track_livenesses,
    "valence" : track_valences,
    "tempo" : track_tempos,
    "time_signature" : track_time_signatures,
})

In [132]:
track_features_df.head()

Unnamed: 0,track_id,danceability,energy,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,5n9QFM4EiMeLGO0Mbwaqov,0.672,0.455,10.0,1.0,0.0393,0.0693,3.2e-05,0.108,0.73,123.777,4.0
1,0BKslA1XqG8HBUKBl4d7EN,0.612,0.615,4.0,1.0,0.0351,0.428,0.0,0.347,0.815,131.809,4.0
2,5G7NBdQLHc7GkehNmLG3yJ,0.586,0.67,1.0,1.0,0.0268,0.229,3.5e-05,0.323,0.838,96.486,4.0
3,0J6nZDgEuwXtECq8Pukn1n,0.475,0.355,10.0,1.0,0.0308,0.462,1e-06,0.119,0.262,151.651,4.0
4,3Lm7hSTabr01oav0WdSsuD,0.612,0.523,5.0,1.0,0.0321,0.448,2e-06,0.0727,0.495,124.315,4.0


## Output

#### Raw Output

In [70]:
artist_df.to_csv("../data/raw/artists.csv",index_label="artist")
albums_df.to_csv("../data/raw/albums.csv",index=False)
tracks_df.to_csv("../data/raw/tracks.csv",index=False)
track_features_df.to_csv("../data/raw/track_features.csv",index=False)

#### Merging

In [136]:
artist_df["artist"] = artist_df.index

In [137]:
merged_df = pd.merge(artist_df, albums_df, "right", left_on="artist_id", right_on="album_artist_id").drop(columns="album_artist_id")

In [139]:
merged_df = pd.merge(merged_df, tracks_df, "right", left_on="album_id", right_on="track_album_id").drop(columns="track_album_id")

In [142]:
merged_df = pd.merge(merged_df, track_features_df, "right", left_on="track_id", right_on="track_id")

#### Merged Output

In [143]:
merged_df.head()

Unnamed: 0,artist_id,artist,album_id,album_names,track_id,track_number,track_title,track_duration,track_disc_number,danceability,energy,key,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,1L0y9srZMyh9XUnYGv37IP,'Til Tuesday,67OxSfZotEq8cCa5SCeX5r,Everything's Different Now,5n9QFM4EiMeLGO0Mbwaqov,1.0,Everything's Different Now,236733.0,1.0,0.672,0.455,10.0,1.0,0.0393,0.0693,3.2e-05,0.108,0.73,123.777,4.0
1,1L0y9srZMyh9XUnYGv37IP,'Til Tuesday,67OxSfZotEq8cCa5SCeX5r,Everything's Different Now,0BKslA1XqG8HBUKBl4d7EN,2.0,Rip In Heaven,211200.0,1.0,0.612,0.615,4.0,1.0,0.0351,0.428,0.0,0.347,0.815,131.809,4.0
2,1L0y9srZMyh9XUnYGv37IP,'Til Tuesday,67OxSfZotEq8cCa5SCeX5r,Everything's Different Now,5G7NBdQLHc7GkehNmLG3yJ,3.0,Why Must I,221000.0,1.0,0.586,0.67,1.0,1.0,0.0268,0.229,3.5e-05,0.323,0.838,96.486,4.0
3,1L0y9srZMyh9XUnYGv37IP,'Til Tuesday,67OxSfZotEq8cCa5SCeX5r,Everything's Different Now,0J6nZDgEuwXtECq8Pukn1n,4.0,J For Jules,265493.0,1.0,0.475,0.355,10.0,1.0,0.0308,0.462,1e-06,0.119,0.262,151.651,4.0
4,1L0y9srZMyh9XUnYGv37IP,'Til Tuesday,67OxSfZotEq8cCa5SCeX5r,Everything's Different Now,3Lm7hSTabr01oav0WdSsuD,5.0,(Believed You Were) Lucky,216973.0,1.0,0.612,0.523,5.0,1.0,0.0321,0.448,2e-06,0.0727,0.495,124.315,4.0


In [144]:
merged_df.to_csv("../data/musicdata.csv",index=False)