I need to decide which songs I want to collect

In [427]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import config
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

### FMA metadata needed for spotify scraping

Get the track names, artist names, and album names for the rows whose set subset is 'small'.

In [23]:
# Load the FMA track metadata table from the working directory.
# NOTE(review): the cell output below is the tail of a warning raised while running this
# cell - presumably pandas' mixed-dtype warning for this file; confirm and handle it.
tracks_file_raw = pd.read_csv('tracks.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [68]:
# Index labels of every row that belongs to the 'small' subset
is_small_subset = tracks_file_raw['set.1'] == 'small'
small_indices = tracks_file_raw.index[is_small_subset]

In [72]:
# `small_indices` holds index *labels*, so select with the label-based `.loc`.
# The positional `.iloc` only gives the same rows while the frame still has its
# default 0..n-1 index, and would silently pick the wrong rows otherwise.
subset_tracks_df = tracks_file_raw.loc[small_indices]
# Sanity check: number of rows/columns in the 'small' subset
subset_tracks_df.shape

(8000, 53)

In [417]:
# Finding which columns I need: row 0 of the raw frame carries the field names
# (see the output below), so show it for column positions 33-52 to see which
# 'track.N' label corresponds to which field.
tracks_file_raw.loc[[0]].iloc[:,33:53]

Unnamed: 0,track,track.1,track.2,track.3,track.4,track.5,track.6,track.7,track.8,track.9,track.10,track.11,track.12,track.13,track.14,track.15,track.16,track.17,track.18,track.19
0,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title


In [425]:
# Pull out the columns needed for the Spotify lookup: artist name, album name,
# track name, track bit rate (cast to int) and track top genre.
artists = subset_tracks_df['artist.12']
albums = subset_tracks_df['album.10']
tracks = subset_tracks_df['track.19']
tracks_br = subset_tracks_df['track'].astype(int)
tracks_top_genre = subset_tracks_df['track.7']

# Report the number of missing values in each selected column.
null_reports = [
    ('There are {} nulls in the artists name column', artists),
    ('There are {} nulls in the album name column', albums),
    ('There are {} nulls in the track name column', tracks),
    ('There are {} nulls in the track bit rate column', tracks_br),
    ('There are {} nulls in the track top genre column', tracks_top_genre),
]
for report_template, selected_column in null_reports:
    print(report_template.format(selected_column.isnull().sum()))

There are 0 nulls in the artists name column
There are 0 nulls in the album name column
There are 0 nulls in the track name column
There are 0 nulls in the track bit rate column
There are 0 nulls in the track top genre column


In [123]:
# Peek at the first few track names; the index keeps the row labels of the raw file
tracks.head()

2                   Food
4             This World
5                Freeway
17    Queen Of The Wires
18                  Ohio
Name: track.19, dtype: object

In [461]:
 df = pd.concat([tracks,artists,albums,tracks_br,tracks_top_genre],axis=1)

In [467]:
# Mapping from the positional FMA column labels to descriptive column names
new_col_names = dict([
    ('track.19', 'track_name'),
    ('artist.12', 'artist_name'),
    ('album.10', 'album_name'),
    ('track', 'track_bit'),
    ('track.7', 'track_top_genre'),
])

In [469]:
# Give the columns their descriptive names. Rebinding the result (instead of
# `inplace=True`) avoids mutating a frame that earlier cells built.
df = df.rename(columns=new_col_names)

### Using Spotipy to retrieve from Spotify the time signature of the songs in the FMA small dataset (the tracks above)

In [179]:
# To match the search query results from spotify and the tracks which I have from FMA
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [175]:
# Spotify API client used by every request below. It authenticates through
# SpotifyClientCredentials, with the client ID and secret read from the local
# `config` module so they are not written into the notebook.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=config.SPOTIPY_CLIENT_ID, client_secret= config.SPOTIPY_CLIENT_SECRET))

Function that builds and sends a Spotify search request with multiple criteria (track, artist, album)

In [186]:
def search_q(track, artist=None, album=None, qtype='track'):
    '''
    Build a field-filtered Spotify search query and run it through the
    module-level `spotify` client.

    The query always contains a `track:` filter; `artist:` and `album:` filters
    are appended (in that order) only when the corresponding argument is truthy.
    For example 'album:gold artist:abba' with type='album' returns albums with
    'gold' in the name as long as the artist contains 'abba'.

    Parameters:
        track:  track name to search for
        artist: optional artist name used to narrow the search
        album:  optional album name used to narrow the search
        qtype:  item type passed to the search as `type`; can be anything Spotify allows, see
                https://developer.spotify.com/documentation/web-api/reference/search/search/

    Returns whatever `spotify.search` returns for the assembled query.
    '''
    # The track filter is always present; the other filters are optional
    filters = ['track:{t}'.format(t=track)]
    if artist:
        filters.append('artist:{ar}'.format(ar=artist))
    if album:
        filters.append('album:{al}'.format(al=album))

    return spotify.search(q=' '.join(filters), type=qtype)

In [295]:
# Smoke test: search Spotify for one sample track (position 400) by track and artist name
s1 = search_q(track=tracks.iloc[400], artist=artists.iloc[400])

In [199]:
# List the fields available on the first item returned by the test search
s1['tracks']['items'][0].keys()

dict_keys(['album', 'artists', 'available_markets', 'disc_number', 'duration_ms', 'explicit', 'external_ids', 'external_urls', 'href', 'id', 'is_local', 'name', 'popularity', 'preview_url', 'track_number', 'type', 'uri'])

Function to test how well the search results match using Fuzzywuzzy

In [208]:
def match_results(query_result, track, artist=None, album=None):
    '''
    Score how closely the first hit of a Spotify search matches the wanted track.

    Each requested field (track always; artist and album only when a truthy value
    is passed) is compared with `fuzz.ratio` against the corresponding name in the
    first item of the search result.

    Parameters:
        query_result: JSON result returned directly by `search_q`, called with
                      qtype='track'. It must contain at least one item, because the
                      first item is read unconditionally.
        track:  wanted track name
        artist: optional wanted artist name (compared with the hit's first artist)
        album:  optional wanted album name

    Returns:
        dict mapping the name returned by Spotify to its similarity score, in
        track, artist, album order, e.g.
        {'Mississippi Kite': 100, 'Kristin Hersh': 100, 'Crooked': 15}.
        When two returned names are identical (self-titled album, eponymous
        track), the later key is written as '<name> (<field>)' so that no score
        is overwritten and every requested field contributes exactly one score.
    '''
    top_hit = query_result['tracks']['items'][0]

    # (field label, name returned by Spotify, name we were looking for)
    comparisons = [('track', top_hit['name'], track)]
    if artist:
        comparisons.append(('artist', top_hit['artists'][0]['name'], artist))
    if album:
        comparisons.append(('album', top_hit['album']['name'], album))

    scores = {}
    for field, found, wanted in comparisons:
        key = found
        if key in scores:
            # Identical names would collapse into one dict key and drop a score,
            # which would change the average used to accept or reject the match
            key = '{} ({})'.format(found, field)
        scores[key] = fuzz.ratio(found, wanted)
    return scores

In [219]:
# Side-by-side check of the test query: Spotify's first hit (left) vs. the FMA metadata (right)
first_hit = s1['tracks']['items'][0]
for returned_name, fma_name in [
    (first_hit['album']['name'], albums.iloc[400]),
    (first_hit['artists'][0]['name'], artists.iloc[400]),
    (first_hit['name'], tracks.iloc[400]),
]:
    print(returned_name, ':', fma_name)

Crooked : Speedbath (working)
Kristin Hersh : Kristin Hersh
Mississippi Kite : Mississippi Kite


In [215]:
# Exercise match_results with every combination of the optional artist/album arguments
sample_artist = artists.iloc[400]
sample_album = albums.iloc[400]
for optional_fields in (
    {},
    {'artist': sample_artist},
    {'album': sample_album},
    {'artist': sample_artist, 'album': sample_album},
):
    print(match_results(s1, tracks.iloc[400], **optional_fields))

{'Mississippi Kite': 100}
{'Mississippi Kite': 100, 'Kristin Hersh': 100}
{'Mississippi Kite': 100, 'Crooked': 15}
{'Mississippi Kite': 100, 'Kristin Hersh': 100, 'Crooked': 15}


In [234]:
def measure_results(match_results, thresh=66):
    '''
    Decide whether a set of fuzzy-match scores counts as a successful match.

    Parameters:
        match_results: dictionary of fuzz match scores, as produced by the
                       function 'match_results' (only its values are used)
        thresh: minimum mean score required for success; default is 66

    Returns True when the mean of the scores is at least `thresh`, else False.
    '''
    scores = list(match_results.values())
    return bool(np.mean(scores) >= thresh)

In [235]:
# Check the accept/reject decision for the test query, using track, artist and album scores
measure_results(match_results(s1, tracks.iloc[400], artist = artists.iloc[400], album = albums.iloc[400]))

True

### Loop through the tracks, search Spotify for each one, and keep the song ID only when the match score is above a certain threshold

In [296]:
# Result stores, each holding tuples that start with the FMA track index:
#   store_S       - (track index, returned query json); should be len(8000)
#   store_M       - (track index, match result) for queries with a result, like so
#                   (400, {'Mississippi Kite': 100, 'Kristin Hersh': 100, 'Crooked': 15})
#   store_B       - (track index, boolean) like so (400, True)
#   track_spot_id - (track index, spotify ID), only for those which passed the test in B
store_S, store_M, store_B, track_spot_id = [], [], [], []

for i, song in tracks.items():
    # Read all three fields under the same index label
    track, artist, album = tracks.loc[i], artists.loc[i], albums.loc[i]

    # Spotipy makes a search query in spotify and returns json; the raw
    # response is kept even when it contains no hits
    S = search_q(track=track, artist=artist, album=album)
    store_S.append((i, S))

    # Only score responses that actually returned something
    if S['tracks']['items']:
        M = match_results(S, track=track, artist=artist, album=album)
        B = measure_results(M)

        store_M.append((i, M))
        store_B.append((i, B))

        # Keep the spotify id only when the result passes the accuracy test
        if B:
            spot_ID = S['tracks']['items'][0]['id']
            track_spot_id.append((i, spot_ID))

In [243]:
# Fetch Spotify's audio analysis for the first hit of the test query
s1_analysis = spotify.audio_analysis(s1['tracks']['items'][0]['id'])

In [365]:
# Report how many tracks ended up with a Spotify ID, i.e. passed the measure_results check
print('There are {} songs which matched over 66% retreived from spotify'.format(len(track_spot_id)))

There are 1419 songs which matched over 66% retreived from spotify


In [366]:
# One-column frame of Spotify track IDs, indexed by the FMA track index
spot_id_df = pd.DataFrame(
    data=[spotify_id for _, spotify_id in track_spot_id],
    index=[fma_index for fma_index, _ in track_spot_id],
    columns=['spotify_track_id'],
)
spot_id_df.head()

Unnamed: 0,spotify_track_id
5,66381EvBZ6e3RXzYATpGmN
18,2ee4Do6eRNXGg7XqKGkvz3
86,1OrbImTwzIVWtW1msIfa2r
316,4ypiBQX6nsXTwLkaHwnaRI
322,4m8jcQJ7wg4RyFNnIvZZgn


### Loop through songs with a spotify ID and retrieve their time signature

In [379]:
# Collect (FMA index, Spotify ID, time signature, time signature confidence)
# for every matched track, using Spotify's audio analysis.
spot_time_sig = []
# (FMA index, Spotify ID, error) for lookups that failed, so skipped tracks can
# be inspected or retried instead of disappearing silently.
spot_time_sig_failed = []
for i, s_id in track_spot_id:
    try:
        S_ID = spotify.audio_analysis(s_id)
        S_time = S_ID['track']['time_signature']
        S_time_conf = S_ID['track']['time_signature_confidence']
    except Exception as err:
        # Still best-effort: one bad track must not stop the run. Catching
        # `Exception` rather than using a bare `except` lets KeyboardInterrupt
        # and SystemExit through, so the loop can be interrupted.
        spot_time_sig_failed.append((i, s_id, repr(err)))
        continue

    spot_time_sig.append((i, s_id, S_time, S_time_conf))

In [393]:
# Turn the collected (FMA index, Spotify ID, time signature, confidence) tuples into a frame
time_sig_df = pd.DataFrame(spot_time_sig,columns=['track_fma_id', 'track_spot_id', 'time_signature', 'time_sig_confidence'])

In [487]:
# Preview the first three time-signature rows
time_sig_df.head(3)

Unnamed: 0,track_fma_id,track_spot_id,time_signature,time_sig_confidence
0,5,66381EvBZ6e3RXzYATpGmN,4,1.0
1,18,2ee4Do6eRNXGg7XqKGkvz3,3,1.0
2,86,1OrbImTwzIVWtW1msIfa2r,4,0.815


In [472]:
# Attach the FMA metadata to each time-signature row: 'track_fma_id' on the left
# is matched against the index of `df` on the right
complete_df = time_sig_df.merge(df, left_on='track_fma_id', right_index=True)

In [488]:
# Preview the merged time-signature + FMA metadata frame
complete_df.head()

Unnamed: 0,track_fma_id,track_spot_id,time_signature,time_sig_confidence,track_name,artist_name,album_name,track_bit,track_top_genre
0,5,66381EvBZ6e3RXzYATpGmN,4,1.0,Freeway,Kurt Vile,Constant Hitmaker,192000,Pop
1,18,2ee4Do6eRNXGg7XqKGkvz3,3,1.0,Ohio,Alec K. Redfearn & the Eyesores,Every Man For Himself,128000,Folk
2,86,1OrbImTwzIVWtW1msIfa2r,4,0.815,Boute,Au,Au,256000,Pop
3,316,4ypiBQX6nsXTwLkaHwnaRI,4,0.349,I Can See You,Charles Manson,One Mind,128000,Folk
4,322,4m8jcQJ7wg4RyFNnIvZZgn,3,0.143,The Black Pirate,Charles Manson,One Mind,128000,Folk


In [485]:
# Save the merged table to the working directory. No `index` argument is passed,
# so pandas' default applies and the frame's index is written as the first column.
complete_df.to_csv('clean_track_info.csv')