In [30]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spotipy

In [2]:
# Spotify API credentials are read from the environment so that no secret is
# stored in the notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the
# variable names spotipy itself looks up; both values are None when unset.
# NOTE(review): any secret that was ever committed here in plain text should
# be rotated in the Spotify developer dashboard.
client_id = os.getenv('SPOTIPY_CLIENT_ID')
client_secret_id = os.getenv('SPOTIPY_CLIENT_SECRET')

In [3]:
# Build the credentials manager from the id/secret pair defined above, then
# create the shared `sp` client that every helper in this notebook calls.
client_credentials_manager = spotipy.SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret_id,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [146]:
def get_random_songs(number_of_songs=1, genre='Pop', year_range='1970-1979', random_state = 101):
    """Sample pseudo-random tracks from Spotify search results.

    For every requested song a search query is built from `genre`,
    `year_range` and one random lowercase letter, issued through the
    module-level `sp` client at a random offset, and one track is picked from
    the returned page.

    Parameters
    ----------
    number_of_songs : int
        Number of searches to run. Offsets are drawn without replacement from
        range(0, 300), so this cannot exceed 300.
    genre : str
        Value placed after `genre:` in the search query.
    year_range : str
        Value placed after `year:` in the search query, e.g. '1970-1979'.
    random_state : int
        Seed for the private random generator; the same seed always yields
        the same offsets, letters and page positions.

    Returns
    -------
    pandas.DataFrame
        One row per track found, indexed 0..n-1, with the columns track_id,
        track_name, duration, popularity, explicit, artist_id, artist_name,
        album_id, album_name and release_date. It holds fewer than
        `number_of_songs` rows when some searches return no tracks.

    Raises
    ------
    ValueError
        If `number_of_songs` is larger than 300 (raised by the sampling
        without replacement).
    """
    df_columns = ['track_id', 'track_name', 'duration', 'popularity', 'explicit',
                  'artist_id', 'artist_name', 'album_id', 'album_name', 'release_date']

    # A private generator yields the same sequence as seeding the module-level
    # one, without resetting the random state of the rest of the process.
    rng = random.Random(random_state)

    # Random offsets (sampling without replacement)
    random_offset = rng.sample(range(0, 300), number_of_songs)

    # Random search character for the query (sampling with replacement)
    chars = 'abcdefghijklmnopqrstuvwxyz'
    random_char = rng.choices(chars, k=number_of_songs)

    # Random position inside a result page of up to 10 tracks (with replacement)
    random_id = rng.choices(range(0, 10), k=number_of_songs)

    rows = []
    for offset, char, position in zip(random_offset, random_char, random_id):
        # Pseudo-random query selection
        results = sp.search(q=f'genre:{genre} year:{year_range} {char}', type='track', offset=offset)
        items = results['tracks']['items']
        if not items:
            # Nothing found for this query/offset: skip instead of failing.
            continue

        # Wrap the position so a short page cannot raise IndexError; for a
        # full page of 10 this selects exactly the position that was drawn.
        track = items[position % len(items)]
        first_artist = track['artists'][0]  # only the first listed artist is kept
        album = track['album']

        rows.append([
            track['id'], track['name'], track['duration_ms'], track['popularity'], track['explicit'],
            first_artist['id'], first_artist['name'],
            album['id'], album['name'], album['release_date'],
        ])

    # Build the frame once rather than concatenating inside the loop.
    return pd.DataFrame(rows, columns=df_columns)
    

In [40]:
def get_song_info(track_id):
    """Fetch the audio features of one track as a single-row DataFrame.

    Parameters
    ----------
    track_id : str
        Track identifier passed to `sp.audio_features`.

    Returns
    -------
    pandas.DataFrame
        One row with the columns track_id, danceability, energy, key,
        loudness, mode, speechiness, acousticness, instrumentalness,
        liveness, valence, tempo, type and time_signature. When the service
        has no features for the track, or a field is absent, the
        corresponding cells are missing values rather than an exception.
    """
    feature_names = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence',
                     'tempo', 'type', 'time_signature']

    # 1. Extract the audio features of the track
    song_results = sp.audio_features(track_id)

    # The response can carry a null entry for a track without features;
    # treat that (or an empty response) as "no features available".
    features = song_results[0] if song_results else None
    if features is None:
        features = {}

    # 2. Collect the audio information about the track
    row = {'track_id': track_id}
    for name in feature_names:
        row[name] = features.get(name)

    return pd.DataFrame([row], columns=['track_id'] + feature_names)
    

In [125]:
def get_song_additional_features(track_id):
    """Derive summary features from the audio analysis of one track.

    The analysis is fetched with `sp.audio_analysis`; its `segments` provide
    the pitch vectors and loudness readings, its `sections` the durations.

    Returns a single-row DataFrame with:
      * tone_purity         - mean over every pitch value of every segment
      * loudness_var        - variance of all `loudness_start` and
                              `loudness_max` readings taken together
      * mean_section_lenght - mean section duration
      * C ... B             - per-position mean of the segment pitch vectors
                              (twelve columns)
    """
    analysis = sp.audio_analysis(track_id)
    segment_list, section_list = analysis['segments'], analysis['sections']

    pitch_matrix = np.array([seg['pitches'] for seg in segment_list])
    # Start and peak loudness of each segment go into one common pool.
    loudness_pool = [level
                     for seg in segment_list
                     for level in (seg['loudness_start'], seg['loudness_max'])]
    section_durations = [sec['duration'] for sec in section_list]

    # Averaging over the segment axis leaves one value per pitch position.
    pitch_position_means = np.mean(pitch_matrix, axis=0)

    column_names = ['tone_purity', 'loudness_var', 'mean_section_lenght',
                    'C', 'C_sharp', 'D', 'D_sharp', 'E', 'F',
                    'F_sharp', 'G', 'G_sharp', 'A', 'A_sharp', 'B']
    row = [np.mean(pitch_matrix), np.var(loudness_pool), np.mean(section_durations),
           *pitch_position_means]

    return pd.DataFrame([row], columns=column_names)
    

In [180]:
def enhance_audio_features(data):
    """Append audio features and derived analysis features to a track table.

    For every value in `data.track_id` this calls `get_song_info` and
    `get_song_additional_features`, then places the collected columns to the
    right of the original ones. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Track table with a `track_id` column.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` (same rows, same index) followed by the audio-feature
        columns and the additional-feature columns. Because `get_song_info`
        also returns a `track_id` column, the result contains `track_id`
        twice; this is kept so existing consumers see the same layout.

    Notes
    -----
    Prints 'Done!' when finished.
    """
    feature_columns = ['track_id', 'danceability', 'energy', 'key', 'loudness', 'mode',
                       'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence',
                       'tempo', 'type', 'time_signature']
    extra_columns = ['tone_purity', 'loudness_var', 'mean_section_lenght',
                     'C', 'C_sharp', 'D', 'D_sharp', 'E', 'F', 'F_sharp', 'G', 'G_sharp', 'A', 'A_sharp', 'B']

    data = data.copy()

    # Collect the per-track frames and concatenate once, instead of growing
    # two frames inside the loop.
    feature_frames = []
    extra_frames = []
    for track_id in data.track_id:
        feature_frames.append(get_song_info(track_id))
        extra_frames.append(get_song_additional_features(track_id))

    if feature_frames:
        features_df = pd.concat(feature_frames, ignore_index=True)
        extra_features_df = pd.concat(extra_frames, ignore_index=True)
    else:
        # No tracks: keep the expected columns on an empty result.
        features_df = pd.DataFrame(columns=feature_columns)
        extra_features_df = pd.DataFrame(columns=extra_columns)

    enhancement = pd.concat([features_df, extra_features_df], axis=1)

    # Rows were produced in the order of `data`, so give them its index;
    # otherwise the column-wise concat would align on labels and mismatch
    # rows whenever `data` does not carry a default 0..n-1 index.
    enhancement.index = data.index

    enhanced_df = pd.concat([data, enhancement], axis=1)
    print('Done!')

    return enhanced_df

In [172]:
# Draw 300 Rock tracks for each period. Every call uses the same seed (101),
# only the year range differs; the calls run in chronological order.
(seventies, eighties, nineties,
 twothousands, twothousand_tens, twothousand_twenties) = (
    get_random_songs(number_of_songs=300, genre='Rock', year_range=period, random_state=101)
    for period in ('1970-1979', '1980-1989', '1990-1999',
                   '2000-2009', '2010-2019', '2020-2022')
)

In [183]:
# Add the audio features to each period's sample, in chronological order.
# enhance_audio_features prints 'Done!' once per frame.
(enhanced_seventies, enhanced_eighties, enhanced_nineties,
 enhanced_twothousands, enhanced_twothousand_tens, enhanced_twothousand_twenties) = (
    enhance_audio_features(period_songs)
    for period_songs in (seventies, eighties, nineties,
                         twothousands, twothousand_tens, twothousand_twenties)
)

Done!
Done!
Done!
Done!
Done!
Done!


In [185]:
# Tag every frame with a constant `Decade` label so the period stays
# identifiable once the frames are stacked together.
enhanced_seventies = enhanced_seventies.assign(Decade='1970´s')
enhanced_eighties = enhanced_eighties.assign(Decade='1980´s')
enhanced_nineties = enhanced_nineties.assign(Decade='1990´s')
enhanced_twothousands = enhanced_twothousands.assign(Decade='2000´s')
enhanced_twothousand_tens = enhanced_twothousand_tens.assign(Decade='2010´s')
enhanced_twothousand_twenties = enhanced_twothousand_twenties.assign(Decade='2020´s')

In [190]:
# Stack the six labelled frames row-wise into one table with a fresh
# 0..n-1 index.
data = pd.concat(
    objs=[
        enhanced_seventies,
        enhanced_eighties,
        enhanced_nineties,
        enhanced_twothousands,
        enhanced_twothousand_tens,
        enhanced_twothousand_twenties,
    ],
    axis=0,
    ignore_index=True,
)

In [None]:
# Load the Billboard table, then discard every row that has a missing value.
billboard = pd.read_csv('billboard.csv')
billboard = billboard.dropna()

In [238]:
# Flag tracks whose title appears in the Billboard `song` column (1) or not (0).
# `isin` does the membership test in one vectorised pass instead of rebuilding
# and scanning a Python list for every row.
# NOTE(review): the match is on the exact title only, so different songs that
# share a title are flagged as well - confirm this is intended.
data['top_song'] = data['track_name'].isin(billboard['song']).astype('int64')

# Store the explicit flag as 1/0 based on the truthiness of each value.
data['explicit'] = data['explicit'].astype(bool).astype('int64')

In [241]:
# Persist the combined table; the row index is written as the first column.
data.to_csv('songs.csv', index=True)

In [251]:
# Frequency of each `mode` value across all sampled tracks.
data.loc[:, 'mode'].value_counts()

1    1345
0     455
Name: mode, dtype: int64

In [243]:
# Show the column labels of the combined table (the displayed output lists
# `track_id` twice).
data.columns

Index(['track_id', 'track_name', 'duration', 'popularity', 'explicit',
       'artist_id', 'artist_name', 'album_id', 'album_name', 'release_date',
       'track_id', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'time_signature', 'tone_purity',
       'loudness_var', 'mean_section_lenght', 'C', 'C_sharp', 'D', 'D_sharp',
       'E', 'F', 'F_sharp', 'G', 'G_sharp', 'A', 'A_sharp', 'B', 'Decade',
       'top_song'],
      dtype='object')