In [1]:
import altair as alt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Custom Altair theme for this notebook, registered and enabled on definition.
@alt.theme.register('theme_1', enable=True)
def theme_1():
    """Return the Vega-Lite config for the notebook's custom chart theme.

    The decorator registers the theme under the name ``'theme_1'`` and
    enables it immediately.

    Returns:
        dict: ``{"config": {...}}`` holding default mark colours, typography
        for titles, axes, headers and legends, and the categorical,
        diverging and sequential colour ranges.
    """
    font = "IBM Plex Mono"
    primary_color = "#F63366"
    font_color = "#262730"
    grey_color = "#f0f2f6"
    base_size = 16
    lg_font = base_size * 1.1
    sm_font = base_size * 0.8

    # One light-to-dark pink ramp is used for every sequential range
    # (heatmap, ramp, ordinal); each range gets its own copy of the list.
    pink_ramp = [
        "#ffb5d4",
        "#ff97b8",
        "#ff7499",
        "#fc4c78",
        "#ec245f",
        "#d2004b",
        "#b10034",
        "#91001f",
        "#720008",
    ]

    config = {
        "config": {
            "arc": {"fill": primary_color},
            "area": {"fill": primary_color},
            "circle": {"fill": primary_color, "stroke": font_color, "strokeWidth": 0.5},
            "line": {"stroke": primary_color},
            "path": {"stroke": primary_color},
            "point": {"stroke": primary_color},
            "rect": {"fill": primary_color},
            "shape": {"stroke": primary_color},
            "symbol": {"fill": primary_color},
            "title": {
                "font": font,
                "color": font_color,
                "fontSize": lg_font,
                "anchor": "start",
                # Subtitle styling is part of the title config in Vega-Lite;
                # a separate top-level "subtitle" entry is not a config
                # property, so these settings previously had no effect.
                # The subtitle shares the title's anchor.
                "subtitleFont": font,
                "subtitleColor": font_color,
                "subtitleFontSize": 18,
            },
            "axis": {
                "titleFont": font,
                "titleColor": font_color,
                "titleFontSize": sm_font,
                "labelFont": font,
                "labelColor": font_color,
                "labelFontSize": sm_font,
                "gridColor": grey_color,
                "domainColor": font_color,
                "tickColor": "#fff",
            },
            "header": {
                "labelFont": font,
                "titleFont": font,
                "labelFontSize": base_size,
                "titleFontSize": base_size,
            },
            "legend": {
                "titleFont": font,
                "titleColor": font_color,
                "titleFontSize": sm_font,
                "labelFont": font,
                "labelColor": font_color,
                "labelFontSize": sm_font,
            },
            "range": {
                "category": ["#fcca46", "#fe7f2d", "#233d4d", "#a1c181", "#717744", "#00798c"],
                "diverging": [
                    "#850018",
                    "#cd1549",
                    "#f6618d",
                    "#fbafc4",
                    "#f5f5f5",
                    "#93c5fe",
                    "#5091e6",
                    "#1d5ebd",
                    "#002f84",
                ],
                "heatmap": list(pink_ramp),
                "ramp": list(pink_ramp),
                "ordinal": list(pink_ramp),
            },
        }
    }
    return config

In [3]:
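# Legacy registration through the older `alt.themes` registry, kept for reference only:
# the `@alt.theme.register('theme_1', enable=True)` decorator on `theme_1` already
# registers and enables the theme, so these calls are intentionally disabled.
# alt.themes.register("theme_1", theme_1)
# alt.themes.enable("theme_1")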

In [4]:
# Load the raw track-level dataset from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='dataset.csv')

In [5]:
# Preview the first five rows.
# NOTE(review): the saved output of this cell lists an extra 'Unnamed: 0.1'
# column that the `data.columns` / `data.info()` outputs do not show - the
# stored outputs look like they come from different runs; re-run to confirm.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,release_date
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,2022-04-08
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,2021-04-30
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,2021-03-17
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,2018-08-10
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,2017-02-03


In [6]:
# List the column labels of the loaded frame.
data.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre', 'release_date'],
      dtype='object')

In [7]:
# Print each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [8]:
# Parse the release dates; errors='coerce' turns unparseable values into NaT
# instead of raising.
parsed_release_dates = pd.to_datetime(data['release_date'], errors='coerce')
data['release_date'] = parsed_release_dates
# Display the earliest and latest parsed date as a (min, max) tuple.
(data['release_date'].min(), data['release_date'].max())

(Timestamp('1899-12-31 00:00:00'), Timestamp('2024-12-27 00:00:00'))

In [9]:
# Calendar year of each release date.
# NOTE(review): this runs before rows with NaT dates are dropped, so the
# column may come out as float rather than integer - confirm.
release_years = data['release_date'].dt.year
data['release_year'] = release_years

In [10]:
# Keep only the rows whose release_date parsed to a real date (not NaT).
has_release_date = data['release_date'].notna()
data = data[has_release_date]

In [11]:
# Keep only tracks released from 1985 through 2022 (both ends inclusive):
#   - before 1985 the data is too sparse;
#   - 2023 and later are excluded (the filter has always been `< 2023`,
#     i.e. 2023 itself is dropped too).
# `between` is False for missing years, so rows without a release year are
# removed by the same mask and no separate dropna is needed.
FIRST_YEAR = 1985
LAST_YEAR = 2022
in_year_window = data['release_year'].between(FIRST_YEAR, LAST_YEAR)
# .copy() yields an independent frame, so later column assignments on `data`
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
data = data.loc[in_year_window].copy()
# No missing years remain, so the year can be stored as a whole number.
data['release_year'] = data['release_year'].astype('int64')

In [12]:
# Distinct genre labels present in the filtered data, in order of first appearance.
unique_genres = pd.unique(data['track_genre'])
print(unique_genres)

['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa'
 'samba' 'sertanejo' 'show

In [13]:
# Load the prepared genre mapping table; later cells read its 'genre' and
# 'color' columns to give every genre a fixed colour.
mapping_file = pd.read_csv(filepath_or_buffer='Genre_Color_and_Display_Mapping.csv')

In [14]:
# Mean popularity per (release year, genre).
# Swap 'mean' for 'count' to rank genres by number of songs instead.
genre_year = data.groupby(['release_year', 'track_genre'], as_index=False).agg(
    avg_popularity=('popularity', 'mean')
)
# Guarantee a numeric year column; anything non-numeric becomes NaN.
numeric_years = pd.to_numeric(genre_year['release_year'], errors='coerce')
genre_year['release_year'] = numeric_years

In [15]:
# Attach the mapping columns to every (year, genre) row: left join of
# genre_year.track_genre onto mapping_file.genre, so genres without a mapping
# entry are kept with missing mapping values.
genre_year = pd.merge(
    genre_year,
    mapping_file,
    how='left',
    left_on='track_genre',
    right_on='genre',
)

In [16]:
# Names of the most popular tracks for each (release year, genre), joined into
# one comma-separated string and attached to genre_year as 'top_tracks'.
TOP_N_TRACKS = 3  # how many tracks to list per (year, genre)

# Rows without a track name are excluded up front: a missing name reaching
# ', '.join would raise TypeError. Within each (year, genre) the tracks are
# ordered from most to least popular.
ranked_tracks = (
    data.dropna(subset=['track_name'])
    .sort_values(['release_year', 'track_genre', 'popularity'], ascending=[True, True, False])
)

top_songs = (
    ranked_tracks.groupby(['release_year', 'track_genre'])
    .head(TOP_N_TRACKS)  # top N rows of each already-sorted group
    .groupby(['release_year', 'track_genre'])['track_name']
    .agg(lambda names: ', '.join(names.astype(str)))
    .reset_index()
    .rename(columns={'track_name': 'top_tracks'})
)

# Drop any 'top_tracks' column left by a previous run of this cell so that
# re-running it does not create top_tracks_x / top_tracks_y duplicates.
genre_year = (
    genre_year.drop(columns='top_tracks', errors='ignore')
    .merge(top_songs, on=['release_year', 'track_genre'], how='left')
)

In [17]:
# Fixed genre -> colour scale built from the mapping table: domain and range
# are taken row by row from its 'genre' and 'color' columns.
genre_domain = mapping_file['genre'].tolist()
genre_palette = mapping_file['color'].tolist()
fixed_color_scale = alt.Scale(domain=genre_domain, range=genre_palette)
#
# color_encoding = alt.Color('track_genre:N', scale=fixed_color_scale, legend=None)
#
# # Define the slider
# year_param = alt.param(
#     name='year_selector',
#     bind=alt.binding_range(
#         min=int(genre_year['release_year'].min()),
#         max=int(genre_year['release_year'].max()),
#         step=1,
#         name='Year:'
#     ),
#     value=int(genre_year['release_year'].min())
# )
#
# # Create the bar chart
# bar_chart = alt.Chart(genre_year).transform_filter(
#     alt.datum.release_year == year_param
# ).transform_window(
#     rank='rank(avg_popularity)',
#     sort=[alt.SortField('avg_popularity', order='descending')],
#     groupby=['release_year']
# ).transform_filter(
#     'datum.rank <= 10'
# ).mark_bar().encode(
#     y=alt.Y('track_genre:N', sort='-x', title=None, axis=alt.Axis(labelLimit=150)),
#     x=alt.X('avg_popularity:Q', title='Popularity'),
#     color=color_encoding,
#     tooltip=[
#         alt.Tooltip('track_genre:N', title='Genre'),
#         alt.Tooltip('avg_popularity:Q', title='Popularity', format='.2f'),
#         alt.Tooltip('top_tracks:N', title='Top Songs'),
#     ],
# ).add_params(
#     year_param
# ).properties(
#     title=alt.TitleParams(
#         text=['Top 10 Genres by Popularity per Year'],
#         subtitle=['Popularity is the average popularity of songs in each genre (drag the slider to change the year)'],
#         anchor='start'
#     ),
#     width=600,
#     height=400
# )
# bar_chart.configure_view(
#     strokeWidth=0,
# ).configure_axis(
#     grid=False
# ).save("../../public/charts/top10_genre.json")

In [18]:
# Summary statistics of track popularity for the rows kept so far.
data['popularity'].describe()

count    104455.000000
mean         32.872270
std          22.343535
min           0.000000
25%          16.000000
50%          34.000000
75%          50.000000
max         100.000000
Name: popularity, dtype: float64

In [19]:
# Yearly average of each audio feature, drawn as one small line chart per feature.
feature_cols = [
    'danceability', 'energy', 'valence', 'acousticness',
    'instrumentalness', 'liveness', 'speechiness'
]

# Long form: one row per (track, feature) carrying the track's release year.
long_df = data[['release_year'] + feature_cols].melt(
    id_vars='release_year',
    value_vars=feature_cols,
    var_name='feature',
    value_name='value'
)

# Mean value of every feature in every release year.
trend_df = long_df.groupby(['release_year', 'feature'])['value'].mean().reset_index()

# Label only the decade years that actually occur in the data. The x axis is
# ordinal, so hard-coded ticks outside the data's years (previously 1960-1980)
# do not correspond to any position on the axis.
decade_ticks = sorted(
    {int(year) for year in trend_df['release_year'].dropna() if int(year) % 10 == 0}
)

facet_chart = alt.Chart(trend_df).mark_line(point=True).encode(
    x=alt.X('release_year:O', title='Year', axis=alt.Axis(values=decade_ticks)),
    y=alt.Y('value:Q', title='Average Feature Level'),
    color=alt.Color('feature:N', legend=None),  # the facet header already names the feature
    tooltip=['release_year', 'value']
).properties(
    width=160,
    height=120
).facet(
    facet=alt.Facet('feature:N', title=None), columns=4
)

facet_chart

In [20]:
# Genre longevity: in how many distinct release years does each genre appear?
# `release_year` is already derived from `release_date` in an earlier cell, so
# it is reused here rather than recomputed and written back into `data`.
MIN_ACTIVE_YEARS = 10  # threshold is adjustable

# Number of tracks per (genre, release year).
genre_year_counts = data.groupby(['track_genre', 'release_year']).size().reset_index(name='count')
# Number of distinct release years with at least one track, per genre.
genre_year_span = genre_year_counts.groupby('track_genre')['release_year'].nunique().reset_index()
genre_year_span.columns = ['track_genre', 'active_years']
long_lasting_genres = genre_year_span[genre_year_span['active_years'] >= MIN_ACTIVE_YEARS]

print(long_lasting_genres.sort_values(by='active_years', ascending=False))

           track_genre  active_years
106            swedish            38
76               opera            38
41                goth            38
47           hard-rock            38
48            hardcore            38
..                 ...           ...
83   progressive-house            16
53               house            16
15               chill            16
101              sleep            15
30                 edm            13

[112 rows x 2 columns]


In [21]:
# Average audio-feature profile of every long-lasting genre.
features = ['danceability', 'energy', 'valence', 'acousticness', 'instrumentalness', 'liveness', 'speechiness']

# Restrict the main dataset to the genres that passed the longevity threshold.
lasting_genres_list = long_lasting_genres['track_genre']
is_long_lasting = data['track_genre'].isin(lasting_genres_list)
filtered_data = data[is_long_lasting]

# One row per genre holding the mean of each feature.
genre_feature_means = filtered_data.groupby('track_genre', as_index=False)[features].mean()

# Summary table, ordered by genre name.
print(genre_feature_means.sort_values('track_genre'))

     track_genre  danceability    energy   valence  acousticness  \
0       acoustic      0.548152  0.428808  0.415441      0.575408   
1       afrobeat      0.668244  0.709412  0.698272      0.264666   
2       alt-rock      0.534710  0.754142  0.516095      0.122366   
3    alternative      0.558448  0.721111  0.494502      0.144665   
4        ambient      0.367931  0.236628  0.166825      0.778337   
..           ...           ...       ...       ...           ...   
107       techno      0.685338  0.749398  0.309688      0.072150   
108       trance      0.581176  0.845960  0.273283      0.035271   
109     trip-hop      0.636980  0.623174  0.472196      0.224655   
110      turkish      0.620844  0.612482  0.458195      0.311770   
111  world-music      0.414071  0.535595  0.250327      0.295782   

     instrumentalness  liveness  speechiness  
0            0.039708  0.152583     0.043211  
1            0.253921  0.187144     0.088612  
2            0.055160  0.211847     0.0549

In [22]:
# Scatter plot: genre longevity (x) against output (y), bubble size = popularity.

# Per-genre totals: number of songs and mean popularity.
genre_summary = data.groupby('track_genre', as_index=False).agg(
    total_count=('track_id', 'count'),
    avg_popularity=('popularity', 'mean'),
)

# Add each genre's number of active years, computed in an earlier cell
# (default inner join on the genre name).
genre_summary = pd.merge(genre_summary, genre_year_span, on='track_genre')

# Encodings, spelled out one by one.
x_active_years = alt.X('active_years:Q', title='Number of Active Years')
# NOTE(review): the y domain is pinned to [500, 1100]; genres whose song count
# lies outside that range would not fit the plot area - confirm it is intended.
y_total_songs = alt.Y('total_count:Q', title='Total Number of Songs', scale=alt.Scale(domain=[500, 1100]))
size_popularity = alt.Size(
    'avg_popularity:Q',
    title='Average Popularity',
    scale=alt.Scale(range=[20, 600]),
    legend=alt.Legend(title="Avg Popularity"),
)
color_genre = alt.Color('track_genre:N', scale=fixed_color_scale, legend=None)
hover_fields = [
    alt.Tooltip('track_genre:N', title='Genre'),
    alt.Tooltip('active_years:Q', title='Active Years'),
    alt.Tooltip('total_count:Q', title='Total Songs'),
    alt.Tooltip('avg_popularity:Q', title='Avg Popularity', format='.2f'),
]

scatter = (
    alt.Chart(genre_summary)
    .mark_circle()
    .encode(
        x=x_active_years,
        y=y_total_songs,
        size=size_popularity,
        color=color_genre,
        tooltip=hover_fields,
    )
    .properties(
        title='Genre Longevity vs Output vs Popularity',
        width=600,
        height=400,
    )
)

# Export the pan/zoom-enabled chart as a JSON spec.
interactive_scatter = scatter.interactive()
interactive_scatter.save("../../public/charts/enduring_genre.json")

In [23]:
# Normalize popularity and count: min-max scale both columns so the two
# measures share a common range. MinMaxScaler is imported in the notebook's
# import cell together with the other dependencies.
scaler = MinMaxScaler()
genre_summary[['norm_popularity', 'norm_count']] = scaler.fit_transform(
    genre_summary[['avg_popularity', 'total_count']]
)

# weight_slider = alt.param(
#     name='pop_weight',
#     bind=alt.binding_range(min=0, max=1, step=0.01, name='Popularity Weight:'),
#     value=0.5
# )
#
# scatter = alt.Chart(genre_summary).transform_calculate(
#     score='datum.norm_popularity * pop_weight + datum.norm_count * (1 - pop_weight)'
# ).encode(
#     x=alt.X('norm_count:Q', title='Quantity (Song Count)'),
#     y=alt.Y('norm_popularity:Q', title='Quality (Popularity)'),
#     size=alt.Size('score:Q', title='Weighted Score', scale=alt.Scale(range=[20, 1000]), legend=None),
#     color=alt.Color('track_genre:N', legend=None, scale=fixed_color_scale),
#     tooltip=[
#         alt.Tooltip('track_genre:N', title='Genre'),
#         alt.Tooltip('total_count:Q', title='Total Songs'),
#         alt.Tooltip('avg_popularity:Q', title='Average Popularity'),
#         alt.Tooltip('score:Q', title='Score', format='.2f')
#     ]
# ).add_params(
#     weight_slider
# ).mark_circle().properties(
#     title=alt.TitleParams(
#         text=['Genre Score Based on Popularity and Song Count'],
#         subtitle=['Drag the slider to adjust the weight of popularity vs song count']
#     ),
#     width=600,
#     height=400
# )
#
# # Shared score calculation
# score_calc = 'datum.norm_popularity * pop_weight + datum.norm_count * (1 - pop_weight)'
#
# # Score + rank
# ranked = alt.Chart(genre_summary).transform_calculate(
#     score=score_calc
# ).transform_window(
#     rank='rank(score)',
#     sort=[alt.SortField('score', order='descending')]
# ).transform_filter(
#     'datum.rank <= 10'
# )
#
# # Right-side ranked genre list (as text)
# top_genres = ranked.mark_text(align='left', dx=-50).encode(
#     y=alt.Y('rank:O', sort='ascending', title=None, axis=None),
#     text=alt.Text('track_genre:N'),
#     tooltip=[
#         alt.Tooltip('track_genre:N', title='Genre'),
#         alt.Tooltip('score:Q', title='Score', format='.2f')
#     ]
# ).properties(
#     width=120,
#     height=400,
#     title='Top 10 Genres'
# )

# genre_list = top_genres
# final_chart = scatter | genre_list
# final_chart.add_params(weight_slider).interactive().save("../../public/charts/whos_winning.json")

In [24]:
# Facet chart: yearly song count and mean popularity for four selected genres.
genres_of_interest = ['chill', 'sad', 'k-pop', 'pop-film']
is_selected_genre = data['track_genre'].isin(genres_of_interest)
filtered_data = data[is_selected_genre].copy()

# Release year derived from the release date on the copied subset.
filtered_data['release_year'] = pd.to_datetime(filtered_data['release_date'], errors='coerce').dt.year

# One row per (year, genre): number of songs and their mean popularity.
summary = filtered_data.groupby(['release_year', 'track_genre'], as_index=False).agg(
    total_songs=('track_id', 'count'),
    avg_popularity=('popularity', 'mean'),
)

# Long form so both metrics can be drawn as separate coloured lines.
melted = pd.melt(
    summary,
    id_vars=['release_year', 'track_genre'],
    value_vars=['total_songs', 'avg_popularity'],
    var_name='metric',
    value_name='value',
)

# Encodings for the line chart.
x_year = alt.X('release_year:O', title='Year', axis=alt.Axis(values=list(range(1985, 2022, 5))))
y_value = alt.Y('value:Q', title='Value')
metric_color = alt.Color(
    'metric:N',
    title='Metric',
    scale=alt.Scale(domain=['total_songs', 'avg_popularity'], range=['#1f77b4', '#ff7f0e']),
)
hover_fields = [
    alt.Tooltip('release_year:O', title='Year'),
    alt.Tooltip('track_genre:N', title='Genre'),
    alt.Tooltip('metric:N', title='Metric'),
    alt.Tooltip('value:Q', title='Value'),
]

# NOTE(review): the same title is set on the inner chart and on the facet
# itself - check whether it is meant to appear in both places.
panel = (
    alt.Chart(melted)
    .mark_line(point=True)
    .encode(x=x_year, y=y_value, color=metric_color, tooltip=hover_fields)
    .properties(
        title='Trends of Total Songs and Popularity Over Time',
        width=340,
        height=180,
    )
)

# One panel per genre, laid out two per row.
chart = panel.facet(
    alt.Facet('track_genre:N', title=None),
    columns=2,
    title='Trends of Total Songs and Popularity Over Time',
)
chart.save("../../public/charts/rise_moods.json")

In [25]:
# bubble chart
def feature_profile_chart(df, genres, feature_names, title):
    """Build a bubble chart of mean audio-feature values for selected genres.

    Each genre in ``genres`` gets one row of bubbles (one bubble per feature,
    sized by the genre's mean value). A grey, semi-transparent layer labelled
    'Average' shows the mean of every feature over all rows of ``df``.

    Args:
        df: Frame with a 'track_genre' column and one column per feature name.
        genres: Genre labels to include; also used as the order of the rows.
        feature_names: Feature columns to average; also the order along x.
        title: Title of the layered chart.

    Returns:
        The layered Altair chart (genre bubbles plus dataset-mean bubbles),
        400 px wide and 150 px high.
    """
    # Mean of every feature per selected genre, reshaped to long form.
    genre_means = (
        df[df['track_genre'].isin(genres)]
        .groupby('track_genre')[feature_names]
        .mean()
        .reset_index()
    )
    genre_long = genre_means.melt(id_vars='track_genre', var_name='feature', value_name='value')

    # Mean of every feature over the whole frame, shown as its own row.
    dataset_means = df[feature_names].mean().reset_index()
    dataset_means.columns = ['feature', 'mean_value']
    dataset_means['track_genre'] = 'Average'

    genre_layer = alt.Chart(genre_long).mark_circle().encode(
        x=alt.X('feature:N', title=None, sort=feature_names, axis=alt.Axis(labelAngle=30)),
        y=alt.Y('track_genre:N', title=None, sort=genres),
        size=alt.Size('value:Q', scale=alt.Scale(range=[0, 1000]), legend=alt.Legend(title='Feature Value')),
        color=alt.Color('feature:N', legend=None),
        tooltip=[alt.Tooltip('track_genre:N', title='Genre'),
                 alt.Tooltip('feature:N', title='Feature'),
                 alt.Tooltip('value:Q', title='Value', format='.2f')]
    )

    mean_layer = alt.Chart(dataset_means).mark_circle(
        color='gray',
        fillOpacity=0.5,
    ).encode(
        x=alt.X('feature:N', sort=feature_names),
        y=alt.Y('track_genre:N', sort=genres),
        size=alt.Size('mean_value:Q', scale=alt.Scale(range=[0, 1000]), legend=None),
        tooltip=[alt.Tooltip('mean_value:Q', title='Dataset Mean', format='.2f')]
    )

    # Combine the layers
    return (genre_layer + mean_layer).properties(
        title=title,
        width=400,
        height=150
    )


genres_of_interest = ['chill', 'sad', 'k-pop', 'pop-film']
features = ['danceability', 'energy', 'valence', 'acousticness', 'instrumentalness', 'liveness', 'speechiness']

final_chart_top = feature_profile_chart(
    data,
    genres_of_interest,
    features,
    'Musical Feature Profile - Most Popular Genres',
)

final_chart_top

In [26]:
# Bubble chart for the least popular genres: mean audio-feature values per
# genre, with the dataset-wide mean overlaid as a grey 'Average' row.
genres_of_interest = ['latin', 'romance', 'iranian', 'jazz']
features = ['danceability', 'energy', 'valence', 'acousticness', 'instrumentalness', 'liveness', 'speechiness']

# Per-genre feature means in long form (one row per genre/feature pair).
in_selected_genres = data['track_genre'].isin(genres_of_interest)
feature_profile_df = data[in_selected_genres].groupby('track_genre', as_index=False)[features].mean()
melted_features = pd.melt(feature_profile_df, id_vars='track_genre', var_name='feature', value_name='value')

# Mean of each feature over all rows of `data`, labelled as its own row.
overall_means = data[features].mean().rename_axis('feature').reset_index(name='mean_value')
overall_means['track_genre'] = 'Average'

# Layer 1: one coloured bubble per (genre, feature), sized by the mean value.
genre_tooltip = [
    alt.Tooltip('track_genre:N', title='Genre'),
    alt.Tooltip('feature:N', title='Feature'),
    alt.Tooltip('value:Q', title='Value', format='.2f'),
]
genre_bubbles = (
    alt.Chart(melted_features)
    .mark_circle()
    .encode(
        x=alt.X('feature:N', title=None, sort=features, axis=alt.Axis(labelAngle=30)),
        y=alt.Y('track_genre:N', title=None, sort=genres_of_interest),
        size=alt.Size('value:Q', scale=alt.Scale(range=[0, 1000]), legend=alt.Legend(title='Feature Value')),
        color=alt.Color('feature:N', legend=None),
        tooltip=genre_tooltip,
    )
)

# Layer 2: grey, half-transparent bubbles for the dataset-wide means.
mean_tooltip = [alt.Tooltip('mean_value:Q', title='Dataset Mean', format='.2f')]
mean_bubbles = (
    alt.Chart(overall_means)
    .mark_circle(color='gray', fillOpacity=0.5)
    .encode(
        x=alt.X('feature:N', sort=features),
        y=alt.Y('track_genre:N', sort=genres_of_interest),
        size=alt.Size('mean_value:Q', scale=alt.Scale(range=[0, 1000]), legend=None),
        tooltip=mean_tooltip,
    )
)

# Stack the two layers into one chart.
final_chart_least = alt.layer(genre_bubbles, mean_bubbles).properties(
    title='Musical Feature Profile - Least Popular Genres',
    width=400,
    height=150,
)

final_chart_least

In [27]:
# Stack the two feature-profile charts vertically and export the result as a JSON spec.
combined_features_chart = alt.vconcat(final_chart_top, final_chart_least)
combined_features_chart.save("../../public/charts/features.json")