# Major Improvement: Better Dataset
- More features for each track
- More 'relevant' songs in dataset

In [22]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
import pandas as pd
import datetime
import pytz
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import MinMaxScaler

# Dataset Setup & Feature Engineering

In [45]:
# This dataset has genre and popularity but does not have `explicit`: a net total of
# one more raw feature, which will be expanded into more columns later on.
# artist_name and mode are dropped immediately after the CSV is read.
dataset_df = pd.read_csv('SpotifyFeatures.csv').drop(columns=['artist_name', 'mode'])
dataset_df.head()

Unnamed: 0,genre,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence
0,Movie,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,0.0525,166.969,4/4,0.814
1,Movie,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,0.0868,174.003,4/4,0.816
2,Movie,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,0.0362,99.488,5/4,0.368
3,Movie,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,0.0395,171.758,4/4,0.227
4,Movie,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,0.0456,140.576,4/4,0.39


In [46]:
def create_feature_vectors(track_dataset_df, genre_weight=1, key_weight=1,
                           time_signature_weight=0.2, popularity_weight=0.25,
                           numeric_weight=0.2):
    """
    Creates a weighted feature vector for each track in the dataset.

    genre, key and time_signature are one-hot encoded, popularity is bucketed
    into 5-point bins and one-hot encoded, and the continuous audio features
    are scaled with MinMaxScaler. Every group is multiplied by its weight so
    the relative influence of each group can be tuned.

    Parameters:
    - track_dataset_df: consists of all tracks in the used dataset, mimicking
      the "spotify db". Must contain the columns genre, key, time_signature,
      popularity, track_id and the ten numeric audio features listed below.
      The frame is not modified; its index may be arbitrary.
    - genre_weight: multiplier for the genre indicator columns (default 1).
    - key_weight: multiplier for the key indicator columns (default 1).
    - time_signature_weight: multiplier for the time_signature indicator
      columns (default 0.2).
    - popularity_weight: multiplier for the popularity bucket indicator
      columns (default 0.25).
    - numeric_weight: multiplier for the min-max scaled numeric columns
      (default 0.2).
    Returns:
    - dataframe with one row per input track, in input order, indexed
      0..n-1. Columns are, in order: genre indicators, key indicators,
      time_signature indicators, "pop|<bucket>" indicators, the scaled
      numeric features, and 'id' holding the track id.
    Raises:
    - KeyError: if a required column is missing from track_dataset_df.
    """
    numeric_cols = ['acousticness', 'danceability', 'duration_ms', 'energy',
                    'instrumentalness', 'liveness', 'loudness', 'speechiness',
                    'tempo', 'valence']
    required_cols = ['genre', 'key', 'time_signature', 'popularity', 'track_id'] + numeric_cols
    missing_cols = [col for col in required_cols if col not in track_dataset_df.columns]
    if missing_cols:
        raise KeyError(f"track_dataset_df is missing required columns: {missing_cols}")

    # Work on a positionally indexed copy. pd.concat(axis=1) aligns on the
    # index, so every component must share the same 0..n-1 index; otherwise a
    # filtered/re-indexed input yields NaN-padded rows. Using a copy also
    # guarantees the caller's frame is never mutated.
    tracks = track_dataset_df.reset_index(drop=True)

    # One indicator column per unique genre / key / time signature
    genre_df = pd.get_dummies(tracks['genre']) * genre_weight
    key_df = pd.get_dummies(tracks['key']) * key_weight
    time_sig_df = pd.get_dummies(tracks['time_signature']) * time_signature_weight

    # Create 5 point buckets for popularity feature (OHE) - reduces sensitivity
    # to the feature. Dividing then casting truncates toward zero, exactly
    # like int(x / 5) applied per row.
    popularity_bucket = (tracks['popularity'] / 5).astype(int)
    popularity_cols_df = pd.get_dummies(popularity_bucket)
    popularity_cols_df.columns = ["pop" + "|" + str(bucket) for bucket in popularity_cols_df.columns]
    popularity_cols_df = popularity_cols_df * popularity_weight

    # Scale and normalize the continuous audio features
    float_cols = tracks[numeric_cols]
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(float_cols), columns=float_cols.columns) * numeric_weight

    # Combine all components; every frame shares the 0..n-1 index at this point
    tracks_feature_set = pd.concat(
        [genre_df, key_df, time_sig_df, popularity_cols_df, floats_scaled], axis=1
    )
    tracks_feature_set['id'] = tracks['track_id'].values

    return tracks_feature_set

In [47]:
# Build the weighted feature vectors for every track, then preview the first rows
tracks_feature_set = create_feature_vectors(track_dataset_df=dataset_df)
tracks_feature_set.head(5)

Unnamed: 0,A Capella,Alternative,Anime,Blues,Children's Music,Children’s Music,Classical,Comedy,Country,Dance,...,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,id
0,0,0,0,0,0,0,0,0,0,0,...,0.071258,0.003033,0.182182,0.0,0.067923,0.180171,0.006414,0.128541,0.1628,0BRjO6ga9RKCKjfDqeFgWV
1,0,0,0,0,0,0,0,0,0,0,...,0.114387,0.004406,0.147546,0.0,0.028542,0.166894,0.013675,0.13516,0.1632,0BjC1NfoEOOusryehmNudP
2,0,0,0,0,0,0,0,0,0,0,...,0.13005,0.005594,0.026223,0.0,0.018848,0.137286,0.002964,0.065036,0.0736,0CoSDzoNIKCRs124s9uTVy
3,0,0,0,0,0,0,0,0,0,0,...,0.039288,0.004949,0.065263,0.0,0.017939,0.143339,0.003662,0.133048,0.0454,0Gc6TVm52BwZD07Ki6tIvf
4,0,0,0,0,0,0,0,0,0,0,...,0.058813,0.002428,0.045042,0.024625,0.038842,0.111411,0.004953,0.103703,0.078,0IuslXpMROHdEPvSl1fTQK
