# Plots

In [1]:
import pandas as pd
import altair as alt
import numpy as np

## Read the data

In [2]:
# Location of the raw Spotify tracks CSV in the project repository
url = (
    'https://raw.githubusercontent.com/luckyberen/DSCI320_Project/'
    'main/spotify_tracks.csv'
)

# Load the whole table and preview the first rows
data = pd.read_csv(filepath_or_buffer=url)
data.head()

Unnamed: 0,track_id,track_name,album_name,artist_1,artist_2,artist_3,artist_4,track_genre_1,track_genre_2,track_genre_3,...,duartion_s,popularity,danceability,loudness,speechiness,acousticness,instrumentalness,liveness,energy,valence
0,71ehTADpxs85ULrZgSEKCy,Almost Lover,One Cell In The Sea,A Fine Frenzy,,,,acoustic,,,...,268.8,57,0.549,-7.778,0.0338,0.947,0.0,0.106,0.24,0.167
1,2fPb58e6f8KxejYDCaARwS,Say Something,Is There Anybody Out There?,A Great Big World,,,,acoustic,piano,,...,233.27,57,0.447,-7.571,0.036,0.872,2e-06,0.0816,0.158,0.142
2,6Vc5wAMmXdKIAM7WUoEb7N,Say Something,Is There Anybody Out There?,A Great Big World,Christina Aguilera,,,acoustic,piano,,...,229.4,74,0.407,-8.822,0.0355,0.857,3e-06,0.0913,0.147,0.0765
3,7xLhousIHDxoGgeJNhO4Ye,Say Something,Is There Anybody Out There? - Track by Track C...,A Great Big World,,,,acoustic,piano,,...,233.27,57,0.447,-7.571,0.036,0.872,2e-06,0.0816,0.158,0.142
4,0jJqIi0uMG8IhGlLx7U85J,Already Home,Is There Anybody Out There? - Track by Track C...,A Great Big World,,,,acoustic,,,...,230.19,44,0.602,-7.344,0.0272,0.31,0.0,0.14,0.404,0.243


In [3]:
# Write the full table to a local JSON file in column orientation.
# NOTE(review): the name `json` would shadow the standard-library module of the
# same name if that module is ever imported in this notebook.
json = 'data.json'
data.to_json(path_or_buf=json, orient='columns')

In [4]:
# NOTE(review): `data_server` is not referenced by name below; presumably the import
# is kept to make sure the package is installed and loaded — confirm.
from altair_data_server import data_server

# Switch Altair to the 'data_server' data transformer for the charts that follow.
alt.data_transformers.enable('data_server')

DataTransformerRegistry.enable('data_server')

##  Is there a correlation between the audio features and popularity?

In [5]:
# dtypes that count as numeric when selecting columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Names of the numeric columns, plus the same names in reverse order
num_col = data.select_dtypes(include=numerics).columns.tolist()
num_col_reverse = num_col[::-1]
num_col

['time_signature',
 'tempo',
 'duartion_s',
 'popularity',
 'danceability',
 'loudness',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'energy',
 'valence']

In [6]:
# One scatter panel per numeric column (stacked as rows): the column on x,
# popularity on y. Tiny, faint marks keep the dense point cloud readable.
feature_reg = (
    alt.Chart(data)
    .mark_circle(size=1, opacity=0.1)
    .encode(
        x=alt.X(alt.repeat("row"), type='quantitative'),
        y=alt.Y("popularity:Q"),
    )
    .properties(width=200, height=200)
    .repeat(row=num_col)
)
# The chart is built but not displayed here; evaluate `feature_reg` to render it.

## Rank most popular Genre
We tried several ways of ranking genres by popularity, as shown below: first by the mean popularity of each genre, and then by sorting the tracks and counting how many of the top 500 songs belong to each genre. We concluded that the second approach made more sense, so we used it to build our visualization.

In [7]:
# Keep only the primary genre and the popularity score
genre_pop = data[['track_genre_1', 'popularity']]

# Mean popularity of each primary genre, ordered from highest to lowest
pop_mean_by_genre = (
    genre_pop
    .groupby('track_genre_1')['popularity']
    .mean()
    .sort_values(ascending=False)
)
pop_mean_by_genre

track_genre_1
k-pop             59.358779
pop-film          56.733858
metal             56.422414
chill             53.738683
latino            51.788945
                    ...    
detroit-techno    11.130753
latin              9.855072
jazz               9.790076
romance            3.549779
iranian            2.224696
Name: popularity, Length: 113, dtype: float64

In [8]:
# The 500 highest-popularity tracks, reduced to primary genre and popularity
genre_pop500 = (
    data[['track_genre_1', 'popularity']]
    .sort_values(by='popularity', ascending=False)
    .head(500)
)

# Number of those tracks that fall in each primary genre
pop_by_genre500 = genre_pop500.groupby('track_genre_1')['popularity'].count()

# Genres ordered from most to fewest tracks in the top 500
pop_genre = pop_by_genre500.sort_values(ascending=False)
pop_genre_list = list(pop_genre.index)

In [9]:
# Per-genre top-500 track counts ('popularity') next to the mean popularity over
# all tracks ('popularity_mean'); the genre is also copied from the index into a
# regular column.
gen_pop = (
    pop_genre.to_frame()
    .join(pop_mean_by_genre, rsuffix='_mean')
    .assign(track_genre_1=lambda frame: frame.index)
)
gen_pop

Unnamed: 0_level_0,popularity,popularity_mean,track_genre_1
track_genre_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dance,87,23.719409,dance
latino,58,51.788945,latino
pop,50,49.642617,pop
hip-hop,47,42.429929,hip-hop
rock,32,18.305233,rock
indie,28,38.621868,indie
alt-rock,20,33.896897,alt-rock
latin,16,9.855072,latin
k-pop,15,59.358779,k-pop
edm,12,40.439481,edm


In [10]:
# Radial chart of the 20 genres with the most tracks in the top 500:
# arc angle  -> number of tracks the genre has in the top 500 ('popularity')
# arc radius -> mean popularity of the genre over all tracks ('popularity_mean')
# The genre column in `gen_pop` is 'track_genre_1'; there is no 'genre' field, so
# colour and tooltip must reference 'track_genre_1' to be populated.
genre_popularity = alt.Chart(gen_pop.iloc[0:20]).mark_arc(innerRadius=20, stroke="#fff").encode(
    alt.Theta('popularity:Q'),
    alt.Radius("popularity_mean:Q"),
    color="track_genre_1:N",
    tooltip=['track_genre_1:N', 'popularity:Q', "popularity_mean:Q"]
)

genre_popularity

In [93]:
# First 20 rows of the genre table
gen20 = gen_pop.head(20)

In [46]:
# Top 20 most popular genres by count of songs in the top 500

# top_genre

## Range of song length by main genre

In [115]:
# # box plot
# song_length = alt.Chart(data).mark_boxplot(extent = 'min-max').encode(
#     alt.X("track_genre_1:N"),
#     alt.Y("duartion_s:Q"), 
# )

# song_length

# top 50 most popular songs

In [91]:
# The 50 highest-popularity tracks
top50 = data.sort_values(by='popularity', ascending=False).head(50)

# Of those, the tracks whose primary genre appears in the top-500 genre list
in_ranked_genre = top50['track_genre_1'].isin(pop_genre_list)
top_song_with_genre = top50.loc[in_ranked_genre]


In [86]:
from urllib.parse import urlencode

def make_youtube_query(name):
    """Build a YouTube search URL for ``name``.

    The name is wrapped in double quotes and URL-encoded as the ``q``
    query parameter.
    """
    quoted_phrase = f'"{name}"'
    query_string = urlencode({'q': quoted_phrase})
    return "https://www.youtube.com/search?" + query_string


# Attach a YouTube search link to each of the top-50 tracks
top50['url'] = top50['track_name'].map(make_youtube_query)

# The 500 highest-popularity tracks, with the same kind of link
top500 = data.sort_values(by='popularity', ascending=False).head(500)
top500['url'] = top500['track_name'].map(make_youtube_query)

In [138]:

# Selection keyed on track name.
# NOTE(review): this selection is not attached to any chart in this cell.
click2 = alt.selection_multi(
    fields=['track_name'],
)

# Genre selection bound to the colour legend; it dims unselected bars and
# filters the scatter plot.
click = alt.selection_multi(
    fields=['track_genre_1'],
    bind='legend',
)

# Bar chart of the top 20 genres. In `gen20`, 'popularity' holds the number of
# tracks each genre has in the top 500 (not a mean), so the axis is titled
# accordingly; the mean is available in the tooltip via 'popularity_mean'.
top_genre = alt.Chart(gen20).mark_bar().encode(
    alt.Y("track_genre_1:N", sort='-x'),
    alt.X("popularity:Q", title='Number of Songs in Top 500'),
    color="track_genre_1:N",
    tooltip=['track_genre_1:N', "popularity_mean:Q"],
    opacity=alt.condition(click, alt.value(1), alt.value(0.2))
).properties(
    width=300,
    height=500,
    title='Top 20 Popular Genres'
).add_selection(
    click
)

# Danceability vs popularity for the top 500 tracks. Each point links to a
# YouTube search for the track and is filtered by the genre selection.
top500_scatter = alt.Chart(top500).mark_point(opacity=0.5, color='Red').encode(
    alt.Y("popularity:Q", scale=alt.Scale(domain=[80, 101])),
    alt.X("danceability:Q"),
    alt.Size('popularity:Q', scale=alt.Scale(domain=[80, 101])),
    href='url:N',
    tooltip=['track_name:N', 'artist_1:N', 'track_genre_1:N', 'url']
).properties(
    width=500,
    height=500,
    title='Relationship Between Danceability and Popularity among Top 500 Songs'
).transform_filter(click).interactive()


alt.hconcat(top500_scatter, top_genre, title='Top 500 Songs and Top 20 Genres Dashboard')


