# Milestone 2

In [17]:
import pandas as pd
import altair as alt
from urllib.parse import urlencode
# Switch off Altair's max-rows limit on chart data for the rest of the notebook.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [18]:
# Load the tracks table (spotify_tracks.csv) directly from the project's GitHub repository.
url = (
    'https://raw.githubusercontent.com/'
    'luckyberen/DSCI320_Project/main/spotify_tracks.csv'
)
data = pd.read_csv(url)

## View 1. Dashboard of the Distribution of, and Relationship between, Duration and Popularity for the Top 20 Genres

Tasks:

- What is the distribution of duration within each of the top 20 most popular genres?
- What is the distribution of popularity within each of the top 20 most popular genres?
- What is the relationship between popularity and duration within each of the top 20 most popular genres?
- What is the range of popularity and duration within each of the top 20 most popular genres?

In [None]:
# Rank the primary genres by the mean popularity of their tracks and keep the first 20.

genre_pop = data.loc[:, ['track_genre_1', 'popularity']]
pop_mean_by_genre = (
    genre_pop
    .groupby(['track_genre_1'])['popularity']
    .mean()
    .sort_values(ascending=False)
)
top_20_genre = pop_mean_by_genre.iloc[:20]
top_20_genre_list = list(top_20_genre.index)
# `genres` is an alias of the same list; the View 1 dropdown reads it.
genres = top_20_genre_list

In [20]:
# Keep only the three columns View 1 needs ('duartion_s' is the column name as spelled in the data).
dua_genre_data = data.loc[:, ['track_genre_1', 'duartion_s', 'popularity']]

# Restrict to tracks whose primary genre is one of the top 20.
dua_genre_data_top20 = dua_genre_data.loc[
    dua_genre_data['track_genre_1'].isin(top_20_genre_list)
]
# Optional outlier trim, left disabled as in the original:
# dua_genre_data_top20 = dua_genre_data_top20[dua_genre_data_top20['duartion_s'] < 1000]
dua_genre_data_top20

Unnamed: 0,track_genre_1,duartion_s,popularity
4404,anime,241.17,32
4405,anime,260.51,32
4406,anime,260.69,0
4407,anime,262.33,0
4408,anime,210.97,57
...,...,...,...
77757,sertanejo,243.20,44
77758,sertanejo,215.00,47
77759,sertanejo,291.11,44
77760,sertanejo,250.47,46


In [21]:
# Initialize interactions

# Dropdown-bound single selection on the primary genre; it starts on the
# first entry of `genres` and is used to filter every chart in this view.
selectGenre = alt.selection_single(
    name='Select',
    fields=['track_genre_1'],
    init={'track_genre_1': genres[0]},
    bind=alt.binding_select(options=genres)
)

# Interval brush; it is attached to the scatter plot and filters both histograms.
brush = alt.selection(type='interval')


def _genre_histogram(field, axis_title, chart_title):
    """Build a 300x300 count histogram of `field` from `dua_genre_data_top20`.

    The chart is filtered by the genre dropdown and by the scatter-plot brush,
    so both histograms stay in sync with the rest of the dashboard.

    Parameters
    ----------
    field : str
        Name of the quantitative column to bin (at most 50 bins).
    axis_title : str
        Title shown on the x axis.
    chart_title : str
        Title shown above the chart.
    """
    return alt.Chart(dua_genre_data_top20).mark_bar(opacity=0.8).encode(
        alt.X(field + ':Q', title=axis_title, bin=alt.BinParams(maxbins=50)),
        alt.Y('count()'),
        alt.Tooltip(['count()', 'track_genre_1'])
    ).properties(
        width=300,
        height=300,
        title=chart_title
    ).add_selection(selectGenre).transform_filter(
        selectGenre).transform_filter(brush).interactive()


# Duration histogram
duration_dist = _genre_histogram(
    'duartion_s', 'Duration (s)', 'Distribution of Duration')

# Popularity histogram (the title previously read "Distribution of Population")
pop_dist = _genre_histogram(
    'popularity', 'Popularity', 'Distribution of Popularity')

# Scatter plot between popularity and duration; points inside the brush are
# drawn in steelblue, the rest in grey.
dura_pop = alt.Chart(dua_genre_data_top20).mark_circle(opacity=0.8).encode(
    alt.X('duartion_s:Q', title='Duration (s)', scale=alt.Scale(zero=False)),
    alt.Y('popularity:Q', title='Popularity', scale=alt.Scale(zero=False)),
    color=alt.condition(brush, alt.value('steelblue'), alt.value('grey'))
).properties(
    width=300,
    height=300,
    title='Relationship between Duration and Popularity'
).add_selection(selectGenre).transform_filter(selectGenre).add_selection(brush)


# Boxplots of duration and popularity: one column per measure via `repeat`.
dura_box = alt.Chart(dua_genre_data_top20).mark_boxplot(opacity=0.8, outliers={'size': 5}).encode(
    alt.Y(alt.repeat('column'), type='quantitative'),
    alt.X('track_genre_1:N'),
    color=alt.condition(selectGenre, alt.value('steelblue'), alt.value('grey'))
).properties(
    width=110,
    height=300,
).repeat(
    column=['duartion_s', 'popularity'],
    title='Range of Duration and Popularity'
).add_selection(selectGenre).transform_filter(selectGenre).interactive()


# Assemble the 2x2 dashboard: histograms on top, scatter and boxplots below.
view1 = alt.vconcat(
    alt.hconcat(duration_dist, pop_dist),
    alt.hconcat(dura_pop, dura_box),
    title='Duration and Popularity Dashboard'
)
view1

## View 2. Dashboard of the 500 Most Popular Songs and the Top 20 Genres

Tasks:

- What is the relationship between danceability and popularity among the 500 most popular songs?
- Rank the top 20 most popular genres.
- Which of the 500 most popular songs belong to each of the top 20 genres?
- Link each song to its YouTube search page.

In [22]:
# Mean popularity of every primary genre, highest first.
genre_pop = data.loc[:, ['track_genre_1', 'popularity']]
pop_mean_by_genre = (
    genre_pop
    .groupby('track_genre_1')['popularity']
    .mean()
    .sort_values(ascending=False)
)

# The 500 most popular tracks, reduced to primary genre and popularity.
genre_pop500 = (
    data.loc[:, ['track_genre_1', 'popularity']]
    .sort_values(by='popularity', ascending=False)
    .head(500)
)

# Number of top-500 tracks per genre -- this is a count, not a mean.
pop_by_genre500 = genre_pop500.groupby('track_genre_1')['popularity'].count()

# Genres ordered by how many top-500 tracks they contribute.
pop_genre = pop_by_genre500.sort_values(ascending=False)
pop_genre_list = list(pop_genre.index)

# Attach each genre's overall mean popularity. After the join, `popularity`
# holds the top-500 count and `popularity_mean` holds the mean.
gen_pop = pop_genre.to_frame().join(pop_mean_by_genre, rsuffix='_mean')

# Repeat the genre index as an ordinary column so charts can encode it.
gen_pop['track_genre_1'] = gen_pop.index
gen20 = gen_pop.head(20)
gen20.head()

Unnamed: 0_level_0,popularity,popularity_mean,track_genre_1
track_genre_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dance,87,23.719409,dance
latino,58,51.788945,latino
pop,50,49.642617,pop
hip-hop,47,42.429929,hip-hop
rock,32,18.305233,rock


In [23]:
# Build the YouTube search link for a track name
def make_youtube_query(name):
    """Return a YouTube search URL whose `q` parameter is `name` in double quotes.

    `name` is formatted with `str.format`, wrapped in literal double quotes,
    and URL-encoded with `urlencode`.
    """
    quoted_name = '"{0}"'.format(name)
    query_string = urlencode({'q': quoted_name})
    return "https://www.youtube.com/search?" + query_string


# Take the 500 most popular tracks and attach a YouTube search link to each.
# `.assign` returns a new frame, so the `url` column is never written into a
# slice of the sorted table (assigning into such a slice is what makes pandas
# emit SettingWithCopyWarning).
top500 = (
    data.sort_values(by='popularity', ascending=False)
    .iloc[0:500]
    .assign(url=lambda tracks: tracks['track_name'].apply(make_youtube_query))
)
top500.head()

Unnamed: 0,track_id,track_name,album_name,artist_1,artist_2,artist_3,artist_4,track_genre_1,track_genre_2,track_genre_3,...,popularity,danceability,loudness,speechiness,acousticness,instrumentalness,liveness,energy,valence,url
19677,3nqQXoyQOWXiESFLlDF1hG,Unholy (feat. Kim Petras),Unholy (feat. Kim Petras),Sam Smith,Kim Petras,,,dance,pop,,...,100,0.714,-7.375,0.0864,0.013,5e-06,0.266,0.472,0.238,https://www.youtube.com/search?q=%22Unholy+%28...
45368,2tTmW7RDtMQtBk7m2rYeSw,"Quevedo: Bzrp Music Sessions, Vol. 52","Quevedo: Bzrp Music Sessions, Vol. 52",Bizarrap,Quevedo,,,hip-hop,,,...,99,0.621,-5.548,0.044,0.0125,0.033,0.23,0.782,0.55,https://www.youtube.com/search?q=%22Quevedo%3A...
19287,4uUG5RXrOk84mYEfFvj3cK,I'm Good (Blue),I'm Good (Blue),David Guetta,Bebe Rexha,,,dance,edm,pop,...,98,0.561,-3.673,0.0343,0.00383,7e-06,0.371,0.965,0.304,https://www.youtube.com/search?q=%22I%27m+Good...
57415,5ww2BF9slyYgNOk37BlC4u,La Bachata,La Bachata,Manuel Turizo,,,,latin,latino,reggae,...,98,0.835,-5.329,0.0364,0.583,2e-06,0.218,0.679,0.85,https://www.youtube.com/search?q=%22La+Bachata%22
56970,6Sq7ltF9Qa7SNFBsV5Cogx,Me Porto Bonito,Un Verano Sin Ti,Bad Bunny,Chencho Corleone,,,latin,latino,reggae,...,97,0.911,-5.105,0.0817,0.0901,2.7e-05,0.0933,0.712,0.425,https://www.youtube.com/search?q=%22Me+Porto+B...


In [24]:
# Initialize interactions

# Genre selection, bound to the colour legend of the bar chart.
click = alt.selection_multi(
    fields=['track_genre_1'],
    bind='legend'
)
# Selection on track name. It is not attached to any chart in this cell;
# opening the YouTube page is handled by the `href` encoding below.
click2 = alt.selection_multi(
    fields=['track_name'],
)

# Top 20 genres bar chart.
# In `gen20` the `popularity` column is the number of top-500 songs in each
# genre (a count), so the x axis is labelled accordingly; the overall mean
# popularity of the genre is available in the tooltip.
top_genre = alt.Chart(gen20).mark_bar().encode(
    alt.Y("track_genre_1:N", sort='-x'),
    alt.X("popularity", title='Number of Songs in Top 500'),
    color="track_genre_1:N",
    tooltip=['track_genre_1:N', "popularity_mean:Q"],
    opacity=alt.condition(click, alt.value(1), alt.value(0.2))
).properties(
    width=300,
    height=500,
    title='Top 20 Popular Genres'
).add_selection(
    click
)

# Top 500 songs: scatter plot between popularity and danceability.
# Each point links to the song's YouTube search page via `href`.
top500_scatter = alt.Chart(top500).mark_point(opacity=0.5, color='Red').encode(
    alt.Y("popularity:Q", scale=alt.Scale(domain=[80, 101])),
    alt.X("danceability:Q"),
    alt.Size('popularity:Q', scale=alt.Scale(domain=[80, 101])),
    # alt.Color('track_genre_1:N'), # doesn't work after adding href interaction
    href='url:N',
    tooltip=['track_name:N', 'artist_1:N', 'track_genre_1:N', 'url']
).properties(
    width=500,
    height=500,
    title='Relationship Between Danceability and Popularity among Top 500 Songs'
).transform_filter(
    click
).interactive()


# Scatter plot on the left, genre ranking (and its legend) on the right.
view2 = alt.hconcat(top500_scatter, top_genre, title='Top 500 Songs and Top 20 Genres Dashboard')
view2

## View 3. Dashboard of the Correlation Matrix

Task: Is there a correlation between the audio features and popularity?

In [25]:
# Columns whose pairwise correlations are charted in View 3.
corr_columns = ['popularity', 'tempo', 'duartion_s', 'danceability', 'loudness',
                'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                'energy', 'valence']

# Long-format correlation table: one row per (variable1, variable2) pair.
cor_data = (data[corr_columns]
            .corr().stack()
            .reset_index()     # The stacking results in an index on the correlation values, we need the index as normal columns for Altair
            .rename(columns={0: 'correlation', 'level_0': 'variable1', 'level_1': 'variable2'}))

# Text label with the correlation formatted to two decimal places (used in the tooltip).
cor_data['correlation_label'] = cor_data['correlation'].map('{:.2f}'.format)

# Show only the rows that relate popularity to each variable.
corr_popularity = cor_data[cor_data['variable1'] == 'popularity']
corr_popularity

Unnamed: 0,variable1,variable2,correlation,correlation_label
0,popularity,popularity,1.0,1.0
1,popularity,tempo,0.007267,0.01
2,popularity,duartion_s,-0.023118,-0.02
3,popularity,danceability,0.064278,0.06
4,popularity,loudness,0.07168,0.07
5,popularity,speechiness,-0.047086,-0.05
6,popularity,acousticness,-0.038854,-0.04
7,popularity,instrumentalness,-0.127469,-0.13
8,popularity,liveness,-0.013846,-0.01
9,popularity,energy,0.013734,0.01


In [26]:
# Correlation heatmap. Correlations are signed, so the colour scale is a
# diverging scheme on a fixed [-1, 1] domain: zero sits at the neutral
# midpoint and negative and positive values get different hues. (A sequential
# scheme with a data-driven domain hides the sign of weak correlations.)
view3 = alt.Chart(cor_data).mark_rect().encode(
    alt.X('variable1:O'),
    alt.Y('variable2:O'),
    alt.Color('correlation:Q', scale=alt.Scale(scheme='redblue', domain=[-1, 1])),
    alt.Tooltip(['variable1', 'variable2', 'correlation_label']),
).interactive().properties(
    width=400, height=400,
    title='Dashboard of Correlation among Audio Features')
view3