In [7]:
import sys
sys.path.insert(0, '../')
import os

import pdb

import numpy as np
import pandas as pd
import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.oauth2 as oauth2


from keys.spotify_keys import * 

## Load in Data

In [4]:
# Load the raw track table from the zipped CSV (the file name suggests duplicates were
# already dropped -- TODO confirm). Later cells read track ids from its `ID` column.
data = pd.read_csv("../data/raw/raw_dropped_duplicates.zip")

In [5]:
import time
class SpotifyAPIParser:
    """Fetch track metadata from the Spotify API in batches and save it as CSV chunks.

    Track ids are read from the ``ID`` column of a DataFrame, sent to
    ``sp.tracks`` in batches of ``BATCH_SIZE``, flattened into one row per
    track and buffered in ``self.current_df``. The buffer is written to
    ``f"{save_directory}{index}-{file_name}"`` whenever it reaches
    ``ROWS_PER_FILE`` rows and when the end of the data is reached.

    Attributes:
        save_directory: Directory prefix prepended verbatim to every output
            file name (so it must end with a path separator).
        file_name: Base file name appended after the ``"{index}-"`` prefix.
        save_loc: Path of the most recently written chunk (initially the
            path of chunk ``0``).
        current_df: Rows parsed since the last chunk was written.
    """

    # Column layout shared by every frame this class builds.
    COLUMNS = (
        "ID",
        "artist_names",
        "artist_ids",
        "release_date",
        "release_date_precision",
    )
    # Number of track ids sent per ``sp.tracks`` request.
    BATCH_SIZE = 50
    # Number of buffered rows that triggers writing a chunk file.
    ROWS_PER_FILE = 10_000

    def __init__(self, save_directory="../data/api_calls/", file_name="track_data.csv"):
        self.save_directory = save_directory
        self.file_name = file_name

        # Initial value; updated every time a chunk is written.
        self.save_loc = f"{save_directory}0-{file_name}"

        self.current_df = None

    def get_tracks_data(self, data, sp, index_range=(0,), final_file_index=None):
        """Query ``sp.tracks`` for batches of ids and write the results to CSV.

        Args:
            data: DataFrame with an ``ID`` column holding the track ids.
            sp: Client object exposing ``tracks(ids)`` that returns a mapping
                with a ``"tracks"`` list (e.g. ``spotipy.Spotify``).
            index_range: Iterable of positional start offsets; each offset
                ``i`` requests ``data.ID.iloc[i: i + BATCH_SIZE]``.
            final_file_index: Index used in the name of the last file. The
                last file is written when a batch shorter than ``BATCH_SIZE``
                is seen (end of data). If the offsets run out, or run past the
                end of ``data``, while rows are still buffered, those rows are
                written too, provided ``final_file_index`` is not ``None``;
                otherwise they are left in ``self.current_df``.

        Returns:
            None. Results are written to disk; unsaved rows (if any) remain in
            ``self.current_df``.
        """
        self.reset_current_df()
        for i in index_range:
            ids = list(data.ID.iloc[i: i + self.BATCH_SIZE])
            if not ids:
                # Offset is past the end of the data: do not send an empty request.
                break

            # Query the Spotify API and flatten the response.
            res = sp.tracks(ids)
            parsed = self.parse_api_response(res)

            # Entries the API could not resolve are parsed into all-NaN rows;
            # restore the requested id so the row can still be joined back.
            if len(parsed) == len(ids):
                parsed["ID"] = [
                    requested if pd.isna(found) else found
                    for found, requested in zip(parsed["ID"], ids)
                ]

            # Skip concatenating onto the empty placeholder frame.
            if self.current_df.empty:
                self.current_df = parsed
            else:
                self.current_df = pd.concat([self.current_df, parsed])

            # A short batch means the end of the data: write the final file and stop.
            if len(ids) < self.BATCH_SIZE:
                self._save_current_df(final_file_index)
                return

            # ``>=`` rather than ``% == 0`` so an unexpected row count cannot
            # skip the flush and let the buffer grow without bound.
            if self.current_df.shape[0] >= self.ROWS_PER_FILE:
                self._save_current_df(int(i / self.ROWS_PER_FILE))
                self.reset_current_df()

        # The loop ended without a short batch (e.g. data length is a multiple
        # of BATCH_SIZE): do not silently drop the buffered rows.
        if final_file_index is not None and not self.current_df.empty:
            self._save_current_df(final_file_index)

    def _save_current_df(self, file_index):
        """Write ``self.current_df`` to the chunk file for ``file_index``."""
        self.save_loc = f"{self.save_directory}{file_index}-{self.file_name}"
        self.current_df.to_csv(self.save_loc, index=False)

    def reset_current_df(self):
        """Replace ``self.current_df`` with an empty frame that has the output columns."""
        self.current_df = pd.DataFrame(columns=list(self.COLUMNS))

    def parse_api_response(self, res):
        """Turn a ``sp.tracks`` response into a DataFrame with one row per track entry."""
        rows = [self.parse_track_data(track) for track in res['tracks']]
        return pd.DataFrame(rows, columns=list(self.COLUMNS))

    def parse_track_data(self, track):
        """Return ``(ID, artist_names, artist_ids, release_date, release_date_precision)``.

        Any field that cannot be read (including every field when ``track`` is
        ``None``) is returned as ``np.nan``.
        """
        try:
            track_id = track['id']
        except (KeyError, TypeError):
            track_id = np.nan

        return (
            track_id,
            self.get_track_artists(track),
            self.get_track_artist_ids(track),
            self.get_track_release_date(track),
            self.get_track_release_date_precision(track),
        )

    def get_track_artists(self, track):
        """Return the artist names joined by ``","``, or ``np.nan`` if unavailable."""
        try:
            return ",".join(artist['name'] for artist in track['artists'])
        except (KeyError, TypeError):
            return np.nan

    def get_track_artist_ids(self, track):
        """Return the artist ids joined by ``","``, or ``np.nan`` if unavailable."""
        try:
            return ",".join(artist['id'] for artist in track['artists'])
        except (KeyError, TypeError):
            return np.nan

    def get_track_release_date(self, track):
        """Return ``track['album']['release_date']``, or ``np.nan`` if unavailable."""
        try:
            return track['album']['release_date']
        except (KeyError, TypeError):
            return np.nan

    def get_track_release_date_precision(self, track):
        """Return ``track['album']['release_date_precision']``, or ``np.nan`` if unavailable."""
        try:
            return track['album']['release_date_precision']
        except (KeyError, TypeError):
            return np.nan

## Spotify API 

* Get our API token from the `spotipy` package.
* Initialize a `SpotifyAPIParser` instance.
* Get data for each track ID by running the `.get_tracks_data` method.

In [6]:

# Read the Spotify application credentials from the `spotify_keys` mapping
# (brought in by the star import from keys.spotify_keys).
CLIENT_ID = spotify_keys["CLIENT_ID"]
CLIENT_SECRET = spotify_keys["CLIENT_SECRET"]

credentials = oauth2.SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)

# Hand the credentials manager to the client instead of a one-off token string:
# the client can then request a fresh access token when the current one expires
# during a long scrape. This also avoids depending on the return type of
# `get_access_token()`, which appears to differ between spotipy releases.
spotify = spotipy.Spotify(client_credentials_manager=credentials)

In [98]:
# Batch start offsets 100_000, 100_050, ..., 107_950; results go to file index 10.
batch_starts = np.arange(100_000, 107_973, 50)

parser = SpotifyAPIParser(
    save_directory="../data/api_calls/",
    file_name="track_data.csv",
)
parser.get_tracks_data(
    data,
    spotify,
    index_range=batch_starts,
    final_file_index=10,
)

## Combine Tracks Data

* Read each CSV file in the provided directory into a DataFrame.
* Concatenate them into a single DataFrame and save it to CSV.


In [20]:
def concat_track_data(path, exclude=("tracks_data_full.csv",)):
    """Read the CSV files directly inside ``path`` and concatenate them.

    Args:
        path: Directory containing the per-chunk CSV files. A trailing path
            separator is optional.
        exclude: File names to skip. Defaults to the combined output file so
            that re-running the notebook does not fold a previous result back
            into the new one.

    Returns:
        A DataFrame with the rows of every selected file, in order of their
        numeric ``"<n>-"`` file-name prefix (files without such a prefix come
        last, alphabetically). If no file is selected, an empty DataFrame with
        the track-data columns is returned.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    columns = [
        "ID",
        "artist_names",
        "artist_ids",
        "release_date",
        "release_date_precision",
    ]

    def chunk_order(name):
        # Sort "2-..." before "10-..." instead of relying on string order.
        prefix = name.split("-", 1)[0]
        if prefix.isdecimal():
            return (0, int(prefix), name)
        return (1, 0, name)

    # Only regular CSV files directly inside `path`; skips sub-directories,
    # hidden/system files and anything listed in `exclude`.
    files = sorted(
        (
            name
            for name in os.listdir(path)
            if name.lower().endswith(".csv")
            and name not in exclude
            and os.path.isfile(os.path.join(path, name))
        ),
        key=chunk_order,
    )

    frames = [pd.read_csv(os.path.join(path, name)) for name in files]
    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)
        

In [21]:
# Combine the files found under ../data/api_calls/ and write the merged table as one CSV.
tracks_full = concat_track_data("../data/api_calls/")
tracks_full.to_csv("../data/api_calls/tracks_data_full.csv", index=False)