# Data Collection and Cleaning

### Installations and Imports

In [1]:
!pip install spotipy --upgrade

Requirement already up-to-date: spotipy in /usr/local/lib/python3.7/dist-packages (2.17.1)


In [2]:
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd

In [3]:
import os

# Spotify API credentials are read from the environment so that they are never
# stored in the notebook. Set SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the values that were previously committed in this cell should be
# treated as leaked and rotated in the Spotify developer dashboard.
SPOTIFY_CLIENT_ID = os.environ["SPOTIFY_CLIENT_ID"]
SPOTIFY_CLIENT_SECRET = os.environ["SPOTIFY_CLIENT_SECRET"]

client_id = SPOTIFY_CLIENT_ID
client_secret = SPOTIFY_CLIENT_SECRET

# Client-credentials manager passed to the Spotify client used by every later cell.
client_credentials_manager = spotipy.oauth2.SpotifyClientCredentials(client_id, client_secret)
spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [4]:
# Spotify playlist URI (presumably the Top 50 chart, going by the name);
# the same URI is assigned again to `top50` further down.
TOP50 = 'spotify:playlist:37i9dQZEVXbLRQDuF5jeBp'
# Raw playlist_tracks response for that playlist.
# NOTE(review): the name `json` shadows the standard-library `json` module and is
# not read by any of the cells shown here — consider renaming or removing it.
json = spotify.playlist_tracks(TOP50)

In [5]:
# Genius web API credentials, read from the environment instead of being stored
# in the notebook. The Genius client built below uses only the access token, so
# a missing variable is tolerated here and simply yields None.
# NOTE(review): the values that were previously committed in this cell should be
# treated as leaked and regenerated on the Genius API clients page.
import os

GENIUS_CLIENT_ID = os.environ.get("GENIUS_CLIENT_ID")
GENIUS_CLIENT_SECRET = os.environ.get("GENIUS_CLIENT_SECRET")
!pip install lyricsgenius



In [6]:
import os

import lyricsgenius

# The Genius access token is read from the environment so that it is never
# stored in the notebook; a missing GENIUS_ACCESS_TOKEN raises KeyError here.
# NOTE(review): the token that was previously committed in this cell should be
# treated as leaked and regenerated.
GENIUS_ACCESS_TOKEN = os.environ["GENIUS_ACCESS_TOKEN"]
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)

### Functions to assign a single genre to each song
We ran into issues with the way Spotify labels genres: it gives a list of genres for each artist and no genres for individual songs. These functions turn that list into a single predefined genre (one of 14 parent genres, or "other") by mapping each labeled subgenre to its parent genre and picking the parent genre that occurs most often. Compound subgenres like "pop rock" would otherwise boost the frequency of pop, which already dominated all of the hits playlists, so in such cases we gave priority to the less frequent genre (for example, "pop rock" is counted as rock).

In [7]:
def topGenre(genreList):
  """Collapse a list of Spotify subgenre labels into one parent genre.

  Each label is translated with mapGenre and the most frequent parent genre is
  returned. An empty list yields "other".
  """
  if len(genreList) == 0:
    return "other"
  parents = [mapGenre(label) for label in genreList]
  # value_counts() orders by descending frequency, so the first index entry
  # is the most common parent genre.
  counts = pd.Series(parents).value_counts()
  return counts.index[0]

In [8]:
def mapGenre(g):
  """Map a Spotify subgenre label to its parent genre.

  The rules are tried from top to bottom and the first keyword contained in
  `g` decides the result, so the order matters: e.g. "pop rock" becomes "rock"
  because "rock" is tested before "pop", and "reggaeton" is tested before
  "reggae". Labels matching no keyword are returned as "other".
  """
  rules = (
      ("hip hop", "hip hop"),
      ("r&b", "r&b"),
      ("rap", "rap"),
      ("rock", "rock"),
      ("punk", "punk"),
      ("country", "country"),
      ("pop", "pop"),
      ("metal", "metal"),
      ("reggaeton", "reggaeton"),
      ("reggae", "reggae"),
      ("funk", "funk"),
      ("grunge", "grunge"),
      ("edm", "edm"),
      ("house", "edm"),  # house subgenres are folded into edm
      ("singer-songwriter", "singer-songwriter"),
  )
  for keyword, parent in rules:
    if keyword in g:
      return parent
  return "other"

### Function to make our Dataframe

This is the main function we used to turn a given Spotify playlist into a usable, analyzable DataFrame.

In [9]:
# Columns of the DataFrame returned by makeDF, in order:
#   rank, trackId, trackName, artistName, albumId, albumName, genres, popularity,
#   tempo, energy, danceability, loudness, speechiness, instrumentalness,
#   duration_ms, valence, explicit, duration_m, topGenre

def makeDF(playListID, numSongs):
  """Build a DataFrame describing the first `numSongs` tracks of a Spotify playlist.

  Parameters
  ----------
  playListID : playlist identifier passed unchanged to spotify.playlist_tracks.
  numSongs   : maximum number of playlist entries to use. Only the items of the
               single playlist_tracks response are read, so at most that many
               entries are available.

  Returns
  -------
  pandas.DataFrame with one row per usable track and the columns listed in the
  comment above this function:
    * rank       - 1-based position of the track within the playlist slice
    * genres     - genre list of the track's first listed artist
    * duration_m - duration_ms converted to minutes and rounded to a whole number
    * topGenre   - topGenre(genres)
  Playlist entries without track data or without a track id are skipped. Audio
  feature columns are NaN for tracks for which no audio features are returned.
  """
  baseColumns = ['rank',
                 'trackId',
                 'trackName',
                 'artistName',
                 'albumId',
                 'albumName',
                 'genres',
                 'popularity',
                 'tempo',
                 'energy',
                 'danceability',
                 'loudness',
                 'speechiness',
                 'instrumentalness',
                 'duration_ms',
                 'valence',
                 'explicit']
  audioColumns = ['tempo',
                  'energy',
                  'danceability',
                  'loudness',
                  'speechiness',
                  'instrumentalness',
                  'valence']
  audioBatchSize = 100  # maximum number of ids accepted by one audio_features call
  missing = float('nan')

  jsonPlaylist = spotify.playlist_tracks(playListID)

  # Keep (playlist position, track) pairs for entries that can actually be looked up.
  rankedTracks = []
  for position, item in enumerate(jsonPlaylist['items'][:numSongs], start=1):
    track = item.get('track')
    if track is None or track.get('id') is None:
      continue
    rankedTracks.append((position, track))

  # Fetch audio features in batches instead of one request per track. The
  # response lists one entry per requested id, in request order, with None for
  # tracks that have no audio features.
  trackIds = [track['id'] for _, track in rankedTracks]
  featuresById = {}
  for start in range(0, len(trackIds), audioBatchSize):
    batch = trackIds[start:start + audioBatchSize]
    for trackId, features in zip(batch, spotify.audio_features(batch) or []):
      featuresById[trackId] = features or {}

  # Artists often appear several times in one playlist; look each one up once.
  genresByArtist = {}
  records = []
  for position, track in rankedTracks:
    artist = track['artists'][0]
    artistId = artist['id']
    if artistId not in genresByArtist:
      genresByArtist[artistId] = spotify.artist(artistId)['genres']

    features = featuresById.get(track['id'], {})
    record = {'rank': position,
              'trackId': track['id'],
              'trackName': track['name'],
              'artistName': artist['name'],
              'albumId': track['album']['id'],
              'albumName': track['album']['name'],
              'genres': list(genresByArtist[artistId]),  # own copy per row
              'popularity': track['popularity'],
              'duration_ms': track['duration_ms'],
              'explicit': track['explicit']}
    for column in audioColumns:
      value = features.get(column)
      record[column] = missing if value is None else value
    records.append(record)

  # Build the frame once from all records rather than appending row by row.
  df = pd.DataFrame(records, columns=baseColumns)

  dtypes = {column: float for column in audioColumns}
  dtypes['popularity'] = int
  df = df.astype(dtypes)
  df['duration_m'] = round(df['duration_ms'].astype(int)/60000)
  df['topGenre'] = df['genres'].map(topGenre)

  return df

### Making all of the Dataframes and Saving

In [10]:
# Top From Years 2009-2019
import time

# One playlist URI per year; the playlist at position i is labelled FIRST_YEAR + i.
years = ['spotify:playlist:37i9dQZF1DX4UkKv8ED8jp',
        'spotify:playlist:37i9dQZF1DXc6IFF23C9jj',
        'spotify:playlist:37i9dQZF1DXcagnSNtrGuJ',
        'spotify:playlist:37i9dQZF1DX0yEZaMOXna3',
        'spotify:playlist:37i9dQZF1DX3Sp0P28SIer',
        'spotify:playlist:37i9dQZF1DX0h0QnLkMBl4',
        'spotify:playlist:37i9dQZF1DX9ukdrXQLJGZ',
        'spotify:playlist:37i9dQZF1DX8XZ6AUo9R4R',
        'spotify:playlist:37i9dQZF1DWTE7dVUebpUW',
        'spotify:playlist:37i9dQZF1DXe2bobNYDtW8',
        'spotify:playlist:37i9dQZF1DWVRSukIED0e9']

FIRST_YEAR = 2009       # topYear assigned to years[0]
SONGS_PER_YEAR = 99     # number of tracks requested from each yearly playlist
REQUEST_PAUSE_S = 0.5   # pause between playlists, in seconds

# An empty frame with the leading column order goes first so that the combined
# frame starts with topYear; columns that only makeDF produces follow after it.
yearly_frames = [pd.DataFrame(columns=['topYear', 'rank', 'trackId', 'trackName', 'artistName', 'albumId', 'albumName', 'genres',
                  'popularity', 'tempo', 'energy', 'danceability', 'loudness', 'speechiness', 'instrumentalness',
                  'duration_ms', 'valence'])]

# Iterate over the list itself so adding or removing a playlist needs no other change.
for i, playlist_uri in enumerate(years):
  tempdf = makeDF(playlist_uri, SONGS_PER_YEAR)
  tempdf["topYear"] = FIRST_YEAR + i
  yearly_frames.append(tempdf)
  time.sleep(REQUEST_PAUSE_S)

# Concatenate once instead of growing master_df inside the loop.
master_df = pd.concat(yearly_frames)
# NOTE(review): without drop=True the per-playlist row positions are kept as an
# `index` column, which is then written to the saved CSV — confirm this is wanted.
master_df = master_df.reset_index()

In [11]:
# Playlist URI kept as NMF (presumably "New Music Friday" — confirm); the first
# 50 of its tracks are collected into NMF_df.
NMF = 'spotify:playlist:37i9dQZF1DX4JAvHpjipBk'
NMF_df = makeDF(playListID=NMF, numSongs=50)

In [12]:
# Same playlist URI as the TOP50 constant defined near the top of the notebook;
# the first 50 of its tracks are collected into top50_df.
top50 = 'spotify:playlist:37i9dQZEVXbLRQDuF5jeBp'
top50_df = makeDF(playListID=top50, numSongs=50)

In [13]:
# Write each collected DataFrame to a CSV file in the current working directory,
# leaving out the row index.
csv_outputs = (
    (master_df, "spotify_2010sHits.csv"),
    (NMF_df, "spotify_NewMusic.csv"),
    (top50_df, "spotify_top50.csv"),
)
for frame, csv_path in csv_outputs:
  frame.to_csv(csv_path, index=False)