# Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
music_data = pd.read_csv('wikipedia_scraper.csv', encoding='ISO-8859-1')

In [3]:
music_data.head()

Unnamed: 0,rank,title,artist,year
0,1,Theme from A Summer Place,Percy Faith,1960
1,2,He'll Have to Go,Jim Reeves,1960
2,3,Cathy's Clown,The Everly Brothers,1960
3,4,Running Bear,Johnny Preston,1960
4,5,Teen Angel,Mark Dinning,1960


In [4]:
music_data.loc[0]

rank                              1
title     Theme from A Summer Place
artist                  Percy Faith
year                           1960
Name: 0, dtype: object

In [5]:
file = 'Spotify.txt'# replace this with the path and file name you use
with open(file,'r') as f: 
    f = f.read().splitlines()
    cid = f[0]
    secret = f[1]
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

Test

In [6]:
track = music_data.title[0]
artist = music_data.artist[0]

In [7]:
artist,track

('Percy Faith', 'Theme from A Summer Place')

In [8]:
track_id = sp.search(q='artist:' + artist + ' track:' + track, type='track')
track_id

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=artist%3APercy+Faith+track%3ATheme+from+A+Summer+Place&type=track&offset=0&limit=10',
  'items': [{'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/24DQLSng7bKZD4GXLIaQbv'},
       'href': 'https://api.spotify.com/v1/artists/24DQLSng7bKZD4GXLIaQbv',
       'id': '24DQLSng7bKZD4GXLIaQbv',
       'name': 'Percy Faith & His Orchestra',
       'type': 'artist',
       'uri': 'spotify:artist:24DQLSng7bKZD4GXLIaQbv'}],
     'available_markets': ['AD',
      'AE',
      'AL',
      'AR',
      'AT',
      'AU',
      'BA',
      'BE',
      'BG',
      'BH',
      'BO',
      'BR',
      'BY',
      'CA',
      'CH',
      'CL',
      'CO',
      'CR',
      'CY',
      'CZ',
      'DE',
      'DK',
      'DO',
      'DZ',
      'EC',
      'EE',
      'EG',
      'ES',
      'FI',
      'FR',
      'GB',
      'GR',
      'GT',
      'HK',
      'HN',
      'HR',
     

In [9]:
id_ = track_id['tracks']['items'][0]['id']
popularity = track_id['tracks']['items'][0]['popularity']

In [10]:
id_,popularity

('6zwvB879PJSpTyFcg2wwnL', 57)

In [11]:
def get_spotify_data(dataframe):
    """
    Takes a dataframe as input.
    Returns a list of track ids and a list of popularity scores from the Spotify API. 
    """
    from numpy import nan # we import np.nan to handle empty queries 
    track_ids = []
    popularities = []
    for [artist, song] in list(zip(dataframe['artist'], dataframe['title'])):
        try:
            song_data = sp.search(q='artist:' + artist + ' track:' + song, type='track')
            track_id = song_data['tracks']['items'][0]['id']
            popularity = song_data['tracks']['items'][0]['popularity']
            track_ids.append(track_id)
            popularities.append(popularity)
        except:
            track_ids.append(nan)
            popularities.append(nan)
    return track_ids, popularities
track_ids, popularities = get_spotify_data(music_data)
music_data['Spotify id'] = track_ids
music_data['Popularity'] = popularities
music_data

Unnamed: 0,rank,title,artist,year,Spotify id,Popularity
0,1,Theme from A Summer Place,Percy Faith,1960,6zwvB879PJSpTyFcg2wwnL,57.0
1,2,He'll Have to Go,Jim Reeves,1960,7dDE59NX0n466e705E8Itz,18.0
2,3,Cathy's Clown,The Everly Brothers,1960,1MA9StLzlFftLbuqOmoWij,52.0
3,4,Running Bear,Johnny Preston,1960,1RYznli2VNO7FCbW1Hq4KM,39.0
4,5,Teen Angel,Mark Dinning,1960,36NPEs4S7ik50NrlzaqoIJ,31.0
...,...,...,...,...,...,...
6096,96,More Than My Hometown,Morgan Wallen,2020,0eBXyY4SatzpE7opnzgXvz,83.0
6097,97,Lovin' on You,Luke Combs,2020,0nYvjcSlCgjcwogQAwIwNp,76.0
6098,98,Said Sum,Moneybagg Yo,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0
6099,99,Slide,H.E.R. featuring YG,2020,,


In [13]:
music_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6101 entries, 0 to 6100
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   rank        6101 non-null   object 
 1   title       6101 non-null   object 
 2   artist      6101 non-null   object 
 3   year        6101 non-null   int64  
 4   Spotify id  4950 non-null   object 
 5   Popularity  4950 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 286.1+ KB


In [14]:
music_data['Spotify id'].isnull().sum()

1151

There are a lot of songs without a Spotify id. One reason why the Spotify ids for these songs were not extracted is because the get_spotify_data function does not recognize artists in the artist column that have multiple artists. These data contain strings such as "featuring", "and", and "&", so we'll split the artists into multiple columns that contain the primary artist and featured artists.

In [15]:
# the last two songs are good examples where spotify ids weren't extracted
music_data.tail()

Unnamed: 0,rank,title,artist,year,Spotify id,Popularity
6096,96,More Than My Hometown,Morgan Wallen,2020,0eBXyY4SatzpE7opnzgXvz,83.0
6097,97,Lovin' on You,Luke Combs,2020,0nYvjcSlCgjcwogQAwIwNp,76.0
6098,98,Said Sum,Moneybagg Yo,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0
6099,99,Slide,H.E.R. featuring YG,2020,,
6100,100,Walk Em Down,NLE Choppa featuring Roddy Ricch,2020,,


In [45]:
artist_list = []
feat_artist_list = []

def split_artists(dataframe):
    '''
    Check if the artist column has featured artists, then split artists into lists
    '''
    for index, row in dataframe.iterrows():
        if(pd.isnull(row['Spotify id'])):
            artist_list.append(row['artist'].replace(" & ", "_" ).replace(" featuring ","_").replace(" and ","_").replace(", ","_").replace(" with ","_").split("_")[0])
            
            #there may or not be a featured artist, so check if this is true
            try:
                feat_artist_list.append(row['artist'].replace(" & ", "_" ).replace(" featuring ","_").replace(" and ","_").replace(", ","_").replace(" with ","_").split("_")[1])
            except:
                feat_artist_list.append("None")
        else:
            artist_list.append(row['artist'])
            feat_artist_list.append("None")                               

In [17]:
split_artists(music_data)

In [18]:
artist_list[-5:]

['Morgan Wallen', 'Luke Combs', 'Moneybagg Yo', 'H.E.R.', 'NLE Choppa']

In [19]:
feat_artist_list[-5:]

['None', 'None', 'None', 'YG', 'Roddy Ricch']

In [20]:
music_data

Unnamed: 0,rank,title,artist,year,Spotify id,Popularity
0,1,Theme from A Summer Place,Percy Faith,1960,6zwvB879PJSpTyFcg2wwnL,57.0
1,2,He'll Have to Go,Jim Reeves,1960,7dDE59NX0n466e705E8Itz,18.0
2,3,Cathy's Clown,The Everly Brothers,1960,1MA9StLzlFftLbuqOmoWij,52.0
3,4,Running Bear,Johnny Preston,1960,1RYznli2VNO7FCbW1Hq4KM,39.0
4,5,Teen Angel,Mark Dinning,1960,36NPEs4S7ik50NrlzaqoIJ,31.0
...,...,...,...,...,...,...
6096,96,More Than My Hometown,Morgan Wallen,2020,0eBXyY4SatzpE7opnzgXvz,83.0
6097,97,Lovin' on You,Luke Combs,2020,0nYvjcSlCgjcwogQAwIwNp,76.0
6098,98,Said Sum,Moneybagg Yo,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0
6099,99,Slide,H.E.R. featuring YG,2020,,


In [21]:
music_data["artist"] = artist_list

In [22]:
music_data["other artists"] = feat_artist_list

In [23]:
music_data

Unnamed: 0,rank,title,artist,year,Spotify id,Popularity,other artists
0,1,Theme from A Summer Place,Percy Faith,1960,6zwvB879PJSpTyFcg2wwnL,57.0,
1,2,He'll Have to Go,Jim Reeves,1960,7dDE59NX0n466e705E8Itz,18.0,
2,3,Cathy's Clown,The Everly Brothers,1960,1MA9StLzlFftLbuqOmoWij,52.0,
3,4,Running Bear,Johnny Preston,1960,1RYznli2VNO7FCbW1Hq4KM,39.0,
4,5,Teen Angel,Mark Dinning,1960,36NPEs4S7ik50NrlzaqoIJ,31.0,
...,...,...,...,...,...,...,...
6096,96,More Than My Hometown,Morgan Wallen,2020,0eBXyY4SatzpE7opnzgXvz,83.0,
6097,97,Lovin' on You,Luke Combs,2020,0nYvjcSlCgjcwogQAwIwNp,76.0,
6098,98,Said Sum,Moneybagg Yo,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0,
6099,99,Slide,H.E.R.,2020,,,YG


In [24]:
music_data = music_data[['rank', 'title', 'artist', 'other artists', 'year', 'Spotify id', 'Popularity']]

In [25]:
music_data

Unnamed: 0,rank,title,artist,other artists,year,Spotify id,Popularity
0,1,Theme from A Summer Place,Percy Faith,,1960,6zwvB879PJSpTyFcg2wwnL,57.0
1,2,He'll Have to Go,Jim Reeves,,1960,7dDE59NX0n466e705E8Itz,18.0
2,3,Cathy's Clown,The Everly Brothers,,1960,1MA9StLzlFftLbuqOmoWij,52.0
3,4,Running Bear,Johnny Preston,,1960,1RYznli2VNO7FCbW1Hq4KM,39.0
4,5,Teen Angel,Mark Dinning,,1960,36NPEs4S7ik50NrlzaqoIJ,31.0
...,...,...,...,...,...,...,...
6096,96,More Than My Hometown,Morgan Wallen,,2020,0eBXyY4SatzpE7opnzgXvz,83.0
6097,97,Lovin' on You,Luke Combs,,2020,0nYvjcSlCgjcwogQAwIwNp,76.0
6098,98,Said Sum,Moneybagg Yo,,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0
6099,99,Slide,H.E.R.,YG,2020,,


In [29]:
track_ids, popularities = get_spotify_data(music_data)
music_data['Spotify id'] = track_ids
music_data['Popularity'] = popularities
music_data

Unnamed: 0,rank,title,artist,other artists,year,Spotify id,Popularity
0,1,Theme from A Summer Place,Percy Faith,,1960,6zwvB879PJSpTyFcg2wwnL,57.0
1,2,He'll Have to Go,Jim Reeves,,1960,7dDE59NX0n466e705E8Itz,18.0
2,3,Cathy's Clown,The Everly Brothers,,1960,1MA9StLzlFftLbuqOmoWij,52.0
3,4,Running Bear,Johnny Preston,,1960,1RYznli2VNO7FCbW1Hq4KM,39.0
4,5,Teen Angel,Mark Dinning,,1960,36NPEs4S7ik50NrlzaqoIJ,31.0
...,...,...,...,...,...,...,...
6096,96,More Than My Hometown,Morgan Wallen,,2020,0eBXyY4SatzpE7opnzgXvz,83.0
6097,97,Lovin' on You,Luke Combs,,2020,0nYvjcSlCgjcwogQAwIwNp,76.0
6098,98,Said Sum,Moneybagg Yo,,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0
6099,99,Slide,H.E.R.,YG,2020,2rTnVB1bvwxHtaIl4uVu7f,77.0


In [30]:
music_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6101 entries, 0 to 6100
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   rank           6101 non-null   object 
 1   title          6101 non-null   object 
 2   artist         6101 non-null   object 
 3   other artists  6101 non-null   object 
 4   year           6101 non-null   int64  
 5   Spotify id     5690 non-null   object 
 6   Popularity     5690 non-null   float64
dtypes: float64(1), int64(1), object(5)
memory usage: 333.8+ KB


In [31]:
music_data['Spotify id'].isnull().sum()

411

We dropped the amount of null values from 1151 to 411. At this point, there's nothing else we can do with python to extract the remaining Spotify ids, so the rest must be done manually.

We will save the data frame as a csv file so we can add the Spotify ids manually in Excel.

In [32]:
music_data.to_csv('spotify_ids.csv')

After manually inserting Spotify ids and doing some cleaning in Excel, we import the csv file.

In [3]:
new_music_data = pd.read_csv('spotify_ids.csv', encoding='ISO-8859-1')

In [11]:
new_music_data

Unnamed: 0.1,Unnamed: 0,rank,title,artist,other artists,year,Spotify id,Popularity,Spotify id.1
0,0,1,Theme from A Summer Place,Percy Faith,,1960,6zwvB879PJSpTyFcg2wwnL,57.0,6zwvB879PJSpTyFcg2wwnL
1,1,2,He'll Have to Go,Jim Reeves,,1960,7dDE59NX0n466e705E8Itz,18.0,7dDE59NX0n466e705E8Itz
2,2,3,Cathy's Clown,The Everly Brothers,,1960,1MA9StLzlFftLbuqOmoWij,52.0,1MA9StLzlFftLbuqOmoWij
3,3,4,Running Bear,Johnny Preston,,1960,1RYznli2VNO7FCbW1Hq4KM,39.0,1RYznli2VNO7FCbW1Hq4KM
4,4,5,Teen Angel,Mark Dinning,,1960,36NPEs4S7ik50NrlzaqoIJ,31.0,36NPEs4S7ik50NrlzaqoIJ
...,...,...,...,...,...,...,...,...,...
6096,6096,96,More Than My Hometown,Morgan Wallen,,2020,0eBXyY4SatzpE7opnzgXvz,83.0,0eBXyY4SatzpE7opnzgXvz
6097,6097,97,Lovin' on You,Luke Combs,,2020,0nYvjcSlCgjcwogQAwIwNp,76.0,0nYvjcSlCgjcwogQAwIwNp
6098,6098,98,Said Sum,Moneybagg Yo,,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0,3sKz6Sd72K0ofPWcJPPk6H
6099,6099,99,Slide,H.E.R.,YG,2020,2rTnVB1bvwxHtaIl4uVu7f,77.0,2rTnVB1bvwxHtaIl4uVu7f


In [6]:
new_music_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6101 entries, 0 to 6100
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     6101 non-null   int64  
 1   rank           6101 non-null   object 
 2   title          6101 non-null   object 
 3   artist         6101 non-null   object 
 4   other artists  6101 non-null   object 
 5   year           6101 non-null   int64  
 6   Spotify id     6098 non-null   object 
 7   Popularity     5690 non-null   float64
 8   Spotify id.1   6098 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 429.1+ KB


In [7]:
new_music_data.isnull().sum()

Unnamed: 0         0
rank               0
title              0
artist             0
other artists      0
year               0
Spotify id         3
Popularity       411
Spotify id.1       3
dtype: int64

In [8]:
new_music_data[new_music_data["Spotify id"].isnull()]

Unnamed: 0.1,Unnamed: 0,rank,title,artist,other artists,year,Spotify id,Popularity,Spotify id.1
2182,2182,82,It's Now or Never,John Schneider,,1981,,,
2297,2297,97,Goin' Down,Greg Guidry,,1982,,,
3772,3772,72,ESPN Presents The Jock Jam,Various Artists,,1997,,,


There are still 3 Spotify ids that are missing. Since these are just a very small portion of our entire dataset, it should be okay to just drop these rows.

In [12]:
new_music_data = new_music_data.drop([new_music_data.index[2182],
                                      new_music_data.index[2297],
                                      new_music_data.index[3772]])

In [13]:
new_music_data

Unnamed: 0.1,Unnamed: 0,rank,title,artist,other artists,year,Spotify id,Popularity,Spotify id.1
0,0,1,Theme from A Summer Place,Percy Faith,,1960,6zwvB879PJSpTyFcg2wwnL,57.0,6zwvB879PJSpTyFcg2wwnL
1,1,2,He'll Have to Go,Jim Reeves,,1960,7dDE59NX0n466e705E8Itz,18.0,7dDE59NX0n466e705E8Itz
2,2,3,Cathy's Clown,The Everly Brothers,,1960,1MA9StLzlFftLbuqOmoWij,52.0,1MA9StLzlFftLbuqOmoWij
3,3,4,Running Bear,Johnny Preston,,1960,1RYznli2VNO7FCbW1Hq4KM,39.0,1RYznli2VNO7FCbW1Hq4KM
4,4,5,Teen Angel,Mark Dinning,,1960,36NPEs4S7ik50NrlzaqoIJ,31.0,36NPEs4S7ik50NrlzaqoIJ
...,...,...,...,...,...,...,...,...,...
6096,6096,96,More Than My Hometown,Morgan Wallen,,2020,0eBXyY4SatzpE7opnzgXvz,83.0,0eBXyY4SatzpE7opnzgXvz
6097,6097,97,Lovin' on You,Luke Combs,,2020,0nYvjcSlCgjcwogQAwIwNp,76.0,0nYvjcSlCgjcwogQAwIwNp
6098,6098,98,Said Sum,Moneybagg Yo,,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0,3sKz6Sd72K0ofPWcJPPk6H
6099,6099,99,Slide,H.E.R.,YG,2020,2rTnVB1bvwxHtaIl4uVu7f,77.0,2rTnVB1bvwxHtaIl4uVu7f


After dropping the 3 rows, we now have no more rows with missing Spotify ids. We just need to deal with missing Popularity information.

In [14]:
new_music_data.isnull().sum()

Unnamed: 0         0
rank               0
title              0
artist             0
other artists      0
year               0
Spotify id         0
Popularity       408
Spotify id.1       0
dtype: int64

Let's clean the columns with our new dataframe.

In [15]:
# Drop index column, as well as the old spotify id column
new_music_data = new_music_data.drop(['Unnamed: 0', 'Spotify id'], axis=1)

# Rename column
new_music_data = new_music_data.rename(columns={'Spotify id.1': 'Spotify id'})

# Reorder columns
new_music_data = new_music_data[['rank', 'title', 'artist', 'other artists', 'year', 'Spotify id', 'Popularity']]

In [17]:
new_music_data

Unnamed: 0,rank,title,artist,other artists,year,Spotify id,Popularity
0,1,Theme from A Summer Place,Percy Faith,,1960,6zwvB879PJSpTyFcg2wwnL,57.0
1,2,He'll Have to Go,Jim Reeves,,1960,7dDE59NX0n466e705E8Itz,18.0
2,3,Cathy's Clown,The Everly Brothers,,1960,1MA9StLzlFftLbuqOmoWij,52.0
3,4,Running Bear,Johnny Preston,,1960,1RYznli2VNO7FCbW1Hq4KM,39.0
4,5,Teen Angel,Mark Dinning,,1960,36NPEs4S7ik50NrlzaqoIJ,31.0
...,...,...,...,...,...,...,...
6096,96,More Than My Hometown,Morgan Wallen,,2020,0eBXyY4SatzpE7opnzgXvz,83.0
6097,97,Lovin' on You,Luke Combs,,2020,0nYvjcSlCgjcwogQAwIwNp,76.0
6098,98,Said Sum,Moneybagg Yo,,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0
6099,99,Slide,H.E.R.,YG,2020,2rTnVB1bvwxHtaIl4uVu7f,77.0


In [18]:
artist_list = []
feat_artist_list = []

def split_artists(dataframe):
    '''
    Check if the artist column has featured artists, then split artists into lists
    '''
    for index, row in dataframe.iterrows():
        if(row['other artists'] == "None"):
            artist_list.append(row['artist'].replace(" & ", "_").replace(" featuring ","_").replace(" and ","_").replace(", ","_").replace(" with ","_").split("_")[0])
            
            #there may or not be a featured artist, so check if this is true
            try:
                feat_artist_list.append(row['artist'].replace(" & ", "_").replace(" featuring ","_").replace(" and ","_").replace(", ","_").replace(" with ","_").split("_")[1])
            except:
                feat_artist_list.append("None")
        else:
            artist_list.append(row['artist'])
            feat_artist_list.append(row['other artists'])                               

In [19]:
split_artists(new_music_data)

In [20]:
new_music_data["artist"] = artist_list
new_music_data["other artists"] = feat_artist_list

In [21]:
new_music_data

Unnamed: 0,rank,title,artist,other artists,year,Spotify id,Popularity
0,1,Theme from A Summer Place,Percy Faith,,1960,6zwvB879PJSpTyFcg2wwnL,57.0
1,2,He'll Have to Go,Jim Reeves,,1960,7dDE59NX0n466e705E8Itz,18.0
2,3,Cathy's Clown,The Everly Brothers,,1960,1MA9StLzlFftLbuqOmoWij,52.0
3,4,Running Bear,Johnny Preston,,1960,1RYznli2VNO7FCbW1Hq4KM,39.0
4,5,Teen Angel,Mark Dinning,,1960,36NPEs4S7ik50NrlzaqoIJ,31.0
...,...,...,...,...,...,...,...
6096,96,More Than My Hometown,Morgan Wallen,,2020,0eBXyY4SatzpE7opnzgXvz,83.0
6097,97,Lovin' on You,Luke Combs,,2020,0nYvjcSlCgjcwogQAwIwNp,76.0
6098,98,Said Sum,Moneybagg Yo,,2020,3sKz6Sd72K0ofPWcJPPk6H,75.0
6099,99,Slide,H.E.R.,YG,2020,2rTnVB1bvwxHtaIl4uVu7f,77.0


In [26]:
new_music_data.isnull().sum()

rank               0
title              0
artist             0
other artists      0
year               0
Spotify id         0
Popularity       408
dtype: int64

In [22]:
track_ids, popularities = get_spotify_data(new_music_data)
new_music_data['Popularity'] = popularities

NameError: name 'get_spotify_data' is not defined

In [46]:
def get_spotify_data(dataframe):
    """
    Takes a dataframe as input.
    Returns a list of popularity scores from the Spotify API. 
    """
    from numpy import nan # we import np.nan to handle empty queries 
    popularities = []
    for id_ in list(zip(dataframe['Spotify id'])):
        try:
            print("uri:spotify:track:" + id_[0])
            #song_data = sp.search(q="uri:spotify:track: " + id_[0], type='track')
            #popularity = song_data['tracks']['items'][0]['popularity']
            popularities.append(popularity)
        except:
            print("Error in popularity extraction.")
            break
    return popularities

popularities = get_spotify_data(new_music_data)
#new_music_data['Popularity'] = popularities

uri:spotify:track:6zwvB879PJSpTyFcg2wwnL
Error in popularity extraction.


In [43]:
popularities

[]

In [49]:
new_music_data.loc[0]["Spotify id"]

'6zwvB879PJSpTyFcg2wwnL'

In [57]:
popularity = []
for id_ in new_music_data['Spotify id']:
    feature = sp.track(id_)
    features_2.append(feature)
    break

In [59]:
features_2[0]['popularity']

57