# Set Up

In [None]:
pip install spotipy --upgrade

In [1]:
import json
import os
import time

import pandas as pd
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from tqdm import tqdm

# Auxiliary Functions

In [2]:
# Spotify application credentials are read from the environment so that no
# secret has to be pasted into (and saved with) the notebook. Set
# SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel.
cid = os.environ.get('SPOTIPY_CLIENT_ID')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET')

# Shared client used by the spotipy-based helpers defined below.
client_credentials_manager = SpotifyClientCredentials(client_id=cid,
                                                      client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
def getTrackIDs(user, playlist_id):
    '''
    Retrieves the track IDs of all tracks in a given playlist.

    The inputs are the username (found in the URL) of the person who created
    the playlist, and the playlist URI, which you can find by hitting the
    settings button on the playlist where you'd find the share link.

    Returns a list of track ids in playlist order. Playlist entries whose
    track object is missing (null) are skipped.
    '''
    ids = []
    playlist = sp.user_playlist(user, playlist_id)
    page = playlist['tracks']
    while page:
        for item in page['items']:
            track = item.get('track')
            if track is None:
                # Entry without a track object: nothing to read an id from.
                continue
            ids.append(track['id'])
        # The playlist payload only carries the first page of tracks; follow
        # the paging link until there is none left.
        page = sp.next(page) if page.get('next') else None
    return ids

In [4]:
def getTrackFeatures(id):
    '''
    Gets metadata, audio features and artist genres for a single track id.
    https://medium.com/better-programming/how-to-extract-any-artists-data-using-spotify-s-api-python-and-spotipy-4c079401bc37

    Returns a list with 19 entries in this order:
    name, track_id, album, artists, release_date, duration_ms, popularity,
    cover_url, danceability, acousticness, danceability, energy,
    instrumentalness, liveness, loudness, speechiness, tempo, time_signature,
    genres.

    The artist name and the genres are taken from the first artist listed on
    the album. cover_url is None when the album has no images, and all audio
    feature entries are None when no audio features are returned for the track.
    '''
    meta = sp.track(id)
    features = sp.audio_features(id)

    # meta
    album = meta['album']
    main_artist = album['artists'][0]
    images = album['images']
    cover_url = images[0]['url'] if images else None

    # features: the lookup yields a one-element list whose entry may be None
    audio = (features[0] if features else None) or {}
    danceability = audio.get('danceability')

    # artists' genres
    genres = sp.artist(main_artist['id'])['genres']

    # NOTE: danceability occupies two slots (index 8 and 10). The duplicate is
    # kept on purpose so the length and order of the returned row do not
    # change for existing consumers.
    track = [
        meta['name'], meta['id'], album['name'], main_artist['name'],
        album['release_date'], meta['duration_ms'], meta['popularity'],
        cover_url, danceability, audio.get('acousticness'), danceability,
        audio.get('energy'), audio.get('instrumentalness'),
        audio.get('liveness'), audio.get('loudness'),
        audio.get('speechiness'), audio.get('tempo'),
        audio.get('time_signature'), genres]
    return track

In [5]:
def getInfos(track):
    '''
    Flattens one track object (as found in search results) into a list of
    metadata values.

    The artist name and id come from the first artist listed on the track's
    album, and the cover url from the first album image.

    Returned order: name, track_id, album, artists, artists_id, release_date,
    duration_ms, popularity, cover_url, explicit, preview_url, album_track_no,
    total_album_tracks, available_markets.
    '''
    album = track['album']

    return [
        track['name'],
        track['id'],
        album['name'],
        album['artists'][0]['name'],
        album['artists'][0]['id'],
        album['release_date'],
        track['duration_ms'],
        track['popularity'],
        album['images'][0]['url'],
        track['explicit'],
        track['preview_url'],
        track['track_number'],
        album['total_tracks'],
        album['available_markets'],
    ]

In [28]:
def getFeatures(track_ids):
    '''
    Gets audio features for a batch of track ids. Takes a list of max 100.

    Returns one row per entry of the audio-features response, in response
    order. Each row is
    [acousticness, danceability, energy, instrumentalness, liveness,
     loudness, speechiness, tempo, time_signature];
    a track without usable features (null entry or missing key) yields an
    empty list instead.
    '''
    feature_keys = (
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature')

    # features
    features = sp.audio_features(track_ids)

    feature_info = []
    for entry in features:
        try:
            row = [entry[key] for key in feature_keys]
        except (TypeError, KeyError):
            # TypeError: the entry is None; KeyError: a feature is missing.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            row = []
        feature_info.append(row)
    return feature_info

In [133]:
def getGenres(artist_ids, token):
    '''
    Gets the genres of a batch of artists via their artist ids. Takes a list
    of max 50.

    Returns a list of [artist_id, genres] pairs in the order of artist_ids,
    where genres is None for an artist the API returned no object for.
    Returns None if the request did not succeed (a 401 additionally prints
    'Token expired!!'), so callers can record the batch as failed.
    '''
    # Materialise once: the ids are needed for the URL and for pairing.
    requested_ids = list(artist_ids)

    headers = {'Content-Type': 'application/json',
               'Authorization': 'Bearer {0}'.format(token)}

    api_url = 'https://api.spotify.com/v1/artists?ids=' + ','.join(requested_ids)
    response = requests.get(api_url, headers=headers)

    if response.status_code == 401:
        print('Token expired!!')
        return None
    if response.status_code != 200:
        return None

    artists = response.json()['artists']

    genres = []
    # Response entries are paired with the requested ids by position.
    for artist_id, artist in zip(requested_ids, artists):
        genre = artist['genres'] if artist is not None else None
        genres.append([artist_id, genre])
    return genres

In [88]:
def spotify_search(search_term, token, year=2019):
    '''
    Searches Spotify for tracks whose title matches search_term and that were
    released in the given year, and returns their metadata.

    Results are requested in pages of 50 for offsets 0 to 1950. Paging stops
    early when a page comes back empty, when the token is rejected (401,
    prints 'Token expired!!') or when any other non-200 status is returned
    (a short notice is printed).

    search_term : text placed after 'track:' in the query
    token       : bearer token for the Spotify Web API
    year        : release year filter, defaults to 2019

    Returns a DataFrame with one row per track, built with getInfos. Tracks
    whose metadata is incomplete are left out.
    '''
    headers = {'Content-Type': 'application/json',
               'Authorization': 'Bearer {0}'.format(token)}

    columns = [
        "name", "track_id", "album", "artists", "artists_id", "release_date", "duration_ms",
        "popularity", "cover_url", "explicit", "preview_url", "album_track_no",
        "total_album_tracks", "available_markets"]

    rows = []
    for offset in range(0, 2000, 50):
        api_url = ('https://api.spotify.com/v1/search?q=track:' + search_term
                   + '+year:' + str(year)
                   + '&type=track&offset=' + str(offset) + '&limit=50')
        response = requests.get(api_url, headers=headers)

        if response.status_code == 401:
            print('Token expired!!')
            break
        if response.status_code != 200:
            # Do not keep requesting further pages after a failed one.
            print('Search for ' + search_term + ' stopped at offset '
                  + str(offset) + ': HTTP ' + str(response.status_code))
            break

        items = response.json()['tracks']['items']
        if not items:
            # Past the last result: later offsets would be empty as well.
            break

        for track in items:
            try:
                rows.append(getInfos(track))
            except (KeyError, IndexError, TypeError):
                # Incomplete track object (e.g. album without images).
                continue

    # create dataset
    return pd.DataFrame(rows, columns=columns)

In [18]:
def spotify_search_results(search_term, token, year=2019):
    '''
    Returns the total number of tracks Spotify reports for a title search on
    search_term restricted to the given release year (default 2019).

    Only a single result is requested; just the 'total' field of the
    response is used. Returns None when the request does not succeed; a 401
    additionally prints 'Token expired!!'.
    '''
    headers = {'Content-Type': 'application/json',
               'Authorization': 'Bearer {0}'.format(token)}
    api_url = ('https://api.spotify.com/v1/search?q=track:' + search_term
               + '+year:' + str(year) + '&type=track&offset=0&limit=1')
    response = requests.get(api_url, headers=headers)

    if response.status_code == 401:
        print('Token expired!!')
        return None
    if response.status_code != 200:
        return None
    return response.json()['tracks']['total']

# Retrieving songs through API (alphabetically)

In [121]:
# The bearer token is a credential, so it is read from the environment
# instead of being stored in the notebook. Export SPOTIFY_API_TOKEN with a
# currently valid token before running the cells below.
api_token = os.environ["SPOTIFY_API_TOKEN"]

## Searching for character combinations in Spotify library

In [122]:
# Search alphabet: the letters 'a'-'z' followed by the digits 0-9.
characters = [chr(code) for code in range(97, 123)] + list(range(10))

# Maps each two-character search term to the DataFrame of its search results.
search_dict = {}

# Only the first characters from position 34 onwards (the digits 8 and 9)
# are used as prefixes here; each one is combined with every character.
for first in tqdm(characters[34:]):
    for second in characters:
        term = '{}{}'.format(first, second)
        search_dict[term] = spotify_search(term, api_token)
        print(term)


# Disabled alternative, kept as a bare string literal so it is not executed.
# It first asks spotify_search_results how many hits a term has and only
# collects the term when that count is below 2000; otherwise it appends
# another character to the term, up to four characters in total.
'''

# Longer version:

for c1 in tqdm(characters[34:]):
    term=str(c1)
    if spotify_search_results(term, api_token)<2000:
        search_dict[term]= spotify_search(term, api_token)
        print(term)
    else:
        for c2 in characters:
            term=str(c1)+str(c2)
            if spotify_search_results(term, api_token)<2000:
                search_dict[term]= spotify_search(term, api_token)
                print(term)
            else:
                for c3 in characters:
                    term=str(c1)+str(c2)+str(c3)
                    if spotify_search_results(term, api_token)<2000:
                        search_dict[term]= spotify_search(term, api_token)
                        print(term)
                    else:
                        for c4 in characters:
                            term=str(c1)+str(c2)+str(c3)+str(c4)
                            search_dict[term]= spotify_search(term, api_token)
                            print(term)
    '''


  0%|          | 0/2 [00:00<?, ?it/s]

8a
8b
8c
8d
8e
8f
8g
8h
8i
8j
8k
8l
8m
8n
8o
8p
8q
8r
8s
8t
8u
8v
8w
8x
8y
8z
80
81
82
83
84
85
86
87
88


 50%|█████     | 1/2 [04:45<04:45, 285.52s/it]

89
9a
9b
9c
9d
9e
9f
9g
9h
9i
9j
9k
9l
9m
9n
9o
9p
9q
9r
9s
9t
9u
9v
9w
9x
9y
9z
90
91
92
93
94
95
96
97
98


100%|██████████| 2/2 [08:49<00:00, 264.57s/it]

99





In [123]:
# Stack the per-term result frames into a single table with a fresh
# 0..n-1 row index.
df = (
    pd.concat(search_dict, axis=0, ignore_index=True)
      .reset_index(drop=True)
)

##  Audio Features

In [127]:
audio_columns = [
    "acousticness", "danceability", "energy", "instrumentalness",
    "liveness", "loudness", "speechiness", "tempo", "time_signature"]

# Fetch the audio features in batches of 100 track ids.
feature_rows = []
for start in tqdm(range(0, df.shape[0], 100)):
    batch_ids = df['track_id'][start:start + 100]
    feature_rows.extend(getFeatures(batch_ids))

# The rows are attached to the tracks by index position.
features = pd.DataFrame(feature_rows, columns=audio_columns)
df = df.join(features)

100%|██████████| 377/377 [01:43<00:00,  3.64it/s]


In [129]:
# Save the search results together with their audio features. The row index
# is written as well (to_csv default), which shows up as an unnamed first
# column when the file is read back.
df.to_csv("tracks_2019_abc.csv", sep = ',')

## Genres

In [131]:
# Partial exports, one file per range of search terms.
csv_files = [
    'tracks_2019_aa-f8.csv',
    'tracks_2019_ga-lo.csv',
    'tracks_2019_la-pa.csv',
    'tracks_2019_pa-01.csv',
    'tracks_2019_0a-80.csv',
    'tracks_2019_8a-99.csv',
]
frames = [pd.read_csv(path) for path in csv_files]

# The files do not all have identical columns. sort=True keeps the
# alphabetical column order this cell has always produced and makes that
# explicit instead of relying on a deprecated default (FutureWarning).
df = pd.concat(frames, axis=0, sort=True)

# Drop the index columns left over from earlier to_csv/read_csv round trips,
# keep the first occurrence of every track and renumber the rows.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
df = df.drop_duplicates(subset='track_id', keep="first").reset_index(drop=True)
df.info()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1047790 entries, 0 to 1047789
Data columns (total 23 columns):
acousticness          1047547 non-null float64
album                 1047787 non-null object
album_track_no        1047790 non-null int64
artists               1047783 non-null object
artists_id            1047790 non-null object
available_markets     1047790 non-null object
cover_url             1047790 non-null object
danceability          1047547 non-null float64
duration_ms           1047790 non-null int64
energy                1047547 non-null float64
explicit              1047790 non-null bool
instrumentalness      1047547 non-null float64
liveness              1047547 non-null float64
loudness              1047547 non-null float64
name                  1047790 non-null object
popularity            1047790 non-null int64
preview_url           999789 non-null object
release_date          1047790 non-null object
speechiness           1047547 non-null float64
tempo       

In [134]:
# Every artist only needs to be looked up once.
artists_ids = df['artists_id'].drop_duplicates().tolist()

genres = []
errors = []  # artist ids of batches for which no genres could be fetched
for start in tqdm(range(0, len(artists_ids), 50)):
    batch = artists_ids[start:start + 50]
    batch_genres = getGenres(batch, api_token)
    if batch_genres is None:
        errors.extend(batch)
    else:
        genres.extend(batch_genres)

# Attach the genres to every track of the corresponding artist.
genres = pd.DataFrame(genres, columns=["artists_id", "genres"])
df = df.merge(genres, how="left", on='artists_id')

100%|██████████| 4723/4723 [10:41<00:00,  7.36it/s]  


In [140]:
# Save the de-duplicated track table, which now includes the genres column
# merged in per artist.
df.to_csv("tracks_2019_genres.csv", sep = ',')