<a href="https://colab.research.google.com/github/SteveOat/Hit-Song/blob/main/Python_Approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Key finding</h1>

Working with data from Spotify's API:
1. Visualize characteristics of hit songs on Billboard charts.
2. Find the most streamed genre on the platform.
3. Does the artist's fame affect whether a song makes it onto the Billboard charts?

<hr>

<h1>Import Libraries</h1>

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display_html

!pip install plotly==5.15.0
import plotly.express as px
import plotly.graph_objects as go

df_1 = pd.read_csv('Hot100.csv')
df_2 = pd.read_csv('spotify_full_list_20102023.csv')



<h1>Preparing Data</h1>

>Remove **bracketed text "(...)"** from the **"Track"** and **"Artist and Title"** columns so that song names share the same text format.

In [2]:
# Drop parenthesised text from the title columns of both sources so the names
# line up for the later merge. The pattern is greedy: it removes everything
# from the first "(" to the last ")" in a value.
BRACKETED_TEXT = r'\(.*\)'

for frame, title_column in ((df_1, 'Track'), (df_2, 'Artist and Title')):
    frame[title_column] = frame[title_column].str.replace(BRACKETED_TEXT, '', regex=True)


>Remove **spaces " "** and convert all text to lowercase.

>Split **"Artist and Title"** into **"Artist"** and **"Track"**.


In [3]:
def _squash(text):
    """Return the string Series with every space removed and lower-cased."""
    return text.str.replace(' ','').str.lower()


# Hot 100 side: normalised title plus a normalised copy of the artist name
# (the original 'Artist' column is left untouched for display).
df_1['Artist_No_Space'] = _squash(df_1['Artist'])
df_1['Track'] = _squash(df_1['Track'])

# Spotify side: split "artist-title" at the first hyphen only, then normalise
# both halves the same way.
# NOTE(review): an artist name that itself contains "-" would be cut at that
# hyphen - confirm against the data.
df_2[['Artist', 'Track']] = df_2['Artist and Title'].str.split('-', n=1, expand=True)
for name_column in ('Artist', 'Track'):
    df_2[name_column] = _squash(df_2[name_column])

df_2.columns

Index(['Unnamed: 0', 'Artist and Title', 'Artist', 'Streams', 'Daily', 'year',
       'main_genre', 'genres', 'first_genre', 'second_genre', 'third_genre',
       'Track'],
      dtype='object')

>Merge the tables and fill *NaN* values in **"main_genre"** with *Unknown*.

>Remove the duplicate **"Artist"** column.

In [4]:
# Spotify columns needed downstream, reduced to one row per (Track, Artist).
# After brackets and spaces are stripped, different versions of a song can
# collapse onto the same key; without de-duplication the left merge below
# would repeat the matching Hot 100 row once per Spotify row and inflate every
# later count. The row with the most streams is kept (NaN streams sort last).
df_column = (
    df_2[['Track', 'Artist', 'main_genre', 'Streams']]
    .sort_values('Streams', ascending=False)
    .drop_duplicates(subset=['Track', 'Artist'], keep='first')
)

# Left merge keeps every Hot 100 row; `validate` makes pandas raise if the
# right-hand keys are ever not unique.
df_pre_final = pd.merge(
    df_1,
    df_column,
    left_on=['Track', 'Artist_No_Space'],
    right_on=['Track', 'Artist'],
    how='left',
    validate='many_to_one',
)

# Tracks with no Spotify match have no genre; label them explicitly.
df_pre_final['main_genre'] = df_pre_final['main_genre'].fillna('Unknown')

# Both inputs carry an 'Artist' column, so the merge suffixes them: keep the
# Hot 100 spelling (Artist_x) and drop the normalised Spotify one (Artist_y).
df_pre_final = df_pre_final.drop(columns=['Artist_y'])

df_final = df_pre_final.rename(columns={'Artist_x': 'Artist'})

>Create a new column that joins the track name and the artist name, so that songs sharing the same title but performed by different artists can be told apart.

In [5]:
# Composite "<track>_<artist>" key built from the normalised names; the result
# is missing wherever either part is missing.
df_final['Track_n_artist'] = df_final['Track'].str.cat(df_final['Artist_No_Space'], sep='_')

df_final.head()

Unnamed: 0,Track,Artist,Album,Year,Duration,Time_Signature,Danceability,Energy,Key,Loudness,...,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity,Artist_No_Space,main_genre,Streams,Track_n_artist
0,7rings,Ariana Grande,"thank u, next",2019,178626,4,0.78,0.321,1,-10.747,...,0.562,0.0,0.0881,0.315,139.961,50,arianagrande,Pop,2307550000.0,7rings_arianagrande
1,breakfree,Ariana Grande,My Everything - Deluxe,2014,214840,4,0.686,0.702,7,-5.325,...,0.00637,4.5e-05,0.204,0.29,129.948,76,arianagrande,Pop,918831700.0,breakfree_arianagrande
2,dangerouswoman,Ariana Grande,Dangerous Woman,2016,235946,3,0.664,0.602,4,-5.369,...,0.0529,0.0,0.356,0.289,134.049,70,arianagrande,Pop,1161277000.0,dangerouswoman_arianagrande
3,godisawoman,Ariana Grande,Sweetener,2018,197546,4,0.602,0.658,1,-5.934,...,0.0233,6e-05,0.237,0.268,145.031,75,arianagrande,Pop,1079614000.0,godisawoman_arianagrande
4,intoyou,Ariana Grande,Dangerous Woman,2016,244453,4,0.623,0.734,9,-5.948,...,0.0162,2e-06,0.145,0.37,107.853,71,arianagrande,Pop,1405367000.0,intoyou_arianagrande


<hr>

<h1>Clustering Data</h1>

**1. Key**
*   Musical key of the track, represented by integers (0 = C, 1 = C#/Db, etc.).



In [6]:
# Pitch-class names indexed by the integer key code (0 = C ... 11 = B).
pitch_names = ['C', 'C#/Db', 'D', 'Eb', 'E', 'F', 'F#/Gb', 'G', 'G#/Ab', 'A', 'A#/Bb', 'B']

# Translate each key code to its name; any value outside 0-11 (or missing)
# becomes an empty label.
key_codes = df_final['Key']
df_final['Cluster_Key'] = np.select(
    [key_codes == code for code in range(len(pitch_names))],
    pitch_names,
    default='',
)

**2. Mode**
* Indicates whether the track is in a major (1) or minor (0) key.

In [7]:
# 0 -> 'Minor', 1 -> 'Major'; anything else (including missing) -> ''.
mode_flag = df_final['Mode']
df_final['Cluster_Mode'] = np.select(
    [mode_flag == 0, mode_flag == 1],
    ['Minor', 'Major'],
    default='',
)

**3. Duration**
- Length of the track, usually in milliseconds.
 - Short: 0 - 180,000 ms (0 - 3 minutes)
 - Medium: 180,001 - 300,000 ms (3 - 5 minutes)
 - Long: 300,001 ms and above (5+ minutes)

In [8]:
# Bucket track length (milliseconds) into the three documented bands. The upper
# bounds are inclusive so that exactly 180,000 ms counts as "0-3mins" and
# exactly 300,000 ms as "3-5mins", matching the bin definitions for this
# feature. Anything longer, and any missing duration, lands in "5+mins".
duration_ms = df_final['Duration']
df_final['Cluster_Duration(Min)'] = np.select(
    [duration_ms <= 180_000, duration_ms <= 300_000],
    ['0-3mins', '3-5mins'],
    default='5+mins',
)

**4. Danceability**
- How suitable a track is for dancing, from 0.0 to 1.0.
 - Low: 0.0 - 0.4 (e.g., classical, ambient)
 - Medium: 0.4 - 0.7 (e.g., soft rock, indie)
 - High: 0.7 - 1.0 (e.g., pop, dance, hip-hop)

In [9]:
# Three danceability bands; the first matching bound wins, and a value that
# matches none (missing or above 1) gets an empty label.
dance_score = df_final['Danceability']
df_final['Cluster_Danceability'] = np.select(
    [dance_score <= 0.4, dance_score <= 0.7, dance_score <= 1],
    ['Low_Dance', 'Mid_Dance', 'High_Dance'],
    default='',
)

**5. Energy**
- Measure of intensity and activity in the track, from 0.0 to 1.0.
 - Low Energy: 0.0 - 0.4 (e.g., acoustic, ambient, soft ballads)
 - Medium Energy: 0.4 - 0.7 (e.g., indie rock, chill electronic)
 - High Energy: 0.7 - 1.0 (e.g., EDM, rock, fast-paced pop)

In [10]:
# Three energy bands; the first matching bound wins, and a value that matches
# none (missing or above 1) gets an empty label.
energy_score = df_final['Energy']
df_final['Cluster_Energy'] = np.select(
    [energy_score <= 0.4, energy_score <= 0.7, energy_score <= 1],
    ['Low_Energy', 'Mid_Energy', 'High_Energy'],
    default='',
)

**6. Loudness**
- Spotify offers three loudness settings to control how normalization is applied:

 - Loud : This applies a normalization level of around -11 dB LUFS, suitable for noisier environments where higher volume is needed.
 - Normal (default) : This is the standard setting at -14 dB LUFS, aiming for balanced playback across all tracks.
 - Quiet : This setting lowers the loudness normalization target to -23 dB LUFS, ideal for quiet environments or more dynamic listening experiences.

In [11]:
# Loudness bands using -11 and -14 as cut-offs, checked from loudest down.
# Everything below -14, and any missing value, is labelled 'Low_Loud'.
loudness_db = df_final['Loudness']
df_final['Cluster_Loudness'] = np.select(
    [loudness_db >= -11, loudness_db >= -14],
    ['High_Loud', 'Mid_Loud'],
    default='Low_Loud',
)

**7. Speechiness**
-  Detects the presence of spoken words in a track, from 0.0 to 1.0.
 - Low Speechiness: 0.0 - 0.33 (e.g., music without much spoken word)
 - Medium Speechiness: 0.33 - 0.66 (e.g., tracks with both music and speech, like rap)
 - High Speechiness: 0.66 - 1.0 (e.g., podcasts, spoken word tracks)

In [12]:
# Speechiness bands cut at 0.33 and 0.66 - the thresholds given in the band
# definitions for this feature (the earlier 0.333 / 0.666 cut-offs put values
# such as 0.331 in the wrong band). The first matching bound wins; a value that
# matches none (missing or above 1) gets an empty label.
speech_score = df_final['Speechiness']
df_final['Cluster_Speechiness'] = np.select(
    [speech_score <= 0.33, speech_score <= 0.66, speech_score <= 1],
    ['Low_Speech', 'Mid_Speech', 'High_Speech'],
    default='',
)

**8. Acousticness**
- Confidence level that the track is acoustic, from 0.0 to 1.0.
 - Low Acousticness: 0.0 - 0.3 (e.g., electronic, heavily produced)
 - Medium Acousticness: 0.3 - 0.7 (e.g., some balance between acoustic and electronic elements)
 - High Acousticness: 0.7 - 1.0 (e.g., acoustic tracks, singer-songwriter)

In [13]:
# Three acousticness bands; the first matching bound wins, and a value that
# matches none (missing or above 1) gets an empty label.
acoustic_score = df_final['Acousticness']
df_final['Cluster_Acousticness'] = np.select(
    [acoustic_score <= 0.3, acoustic_score <= 0.7, acoustic_score <= 1],
    ['Low_Acoustic', 'Mid_Acoustic', 'High_Acoustic'],
    default='',
)

**9. Instrumentalness**
- Probability that the track contains no vocals, from 0.0 to 1.0.
 - Vocal-heavy: 0.0 - 0.1 (most mainstream music)
 - Medium Instrumentalness: 0.1 - 0.5 (vocals present but not dominant)
 - Instrumental: 0.5 - 1.0 (mostly instrumental, no vocals)

In [14]:
# Three instrumentalness bands; the first matching bound wins, and a value that
# matches none (missing or above 1) gets an empty label.
instrumental_score = df_final['Instrumentalness']
df_final['Cluster_Instrumentalness'] = np.select(
    [instrumental_score <= 0.1, instrumental_score <= 0.5, instrumental_score <= 1],
    ['Low_Instru', 'Mid_Instru', 'High_Instru'],
    default='',
)

**10. Liveness**
Detects the presence of a live audience in the recording, from 0.0 to 1.0.
- Studio quality group
 - Studio-like: 0.0 - 0.3 (recorded in a studio without live ambiance)
- Outdoor quality group
 - Medium Liveness: 0.3 - 0.6 (some audience noise or live characteristics)
 - Live Recording: 0.6 - 1.0 (recorded live in concert with audience presence)


In [15]:
# Two groups only: liveness up to 0.3 is 'Studio_Liveness'; everything else
# (including a missing value) is 'Outdoor_Liveness'.
is_studio_like = df_final['Liveness'] <= 0.3
df_final['Cluster_Liveness'] = np.where(is_studio_like, 'Studio_Liveness', 'Outdoor_Liveness')

**11. Valence**
- Describes the musical positiveness conveyed by a track, from 0.0 to 1.0.
 - Low Valence: 0.0 - 0.3 (e.g., sad, melancholic tracks)
 - Medium Valence: 0.3 - 0.6 (e.g., neutral or emotionally mixed tracks)
 - High Valence: 0.6 - 1.0 (e.g., happy, cheerful, upbeat tracks)

In [16]:
# Three valence bands; the first matching bound wins, and a value that matches
# none (missing or above 1) gets an empty label.
valence_score = df_final['Valence']
df_final['Cluster_Valence'] = np.select(
    [valence_score <= 0.3, valence_score <= 0.6, valence_score <= 1],
    ['Low_Valence', 'Mid_Valence', 'High_Valence'],
    default='',
)

**12. Tempo**
- Speed of the track in beats per minute (BPM).
 - Slow: 0 - 60 BPM (e.g., ballads, ambient music)
 - Medium: 60 - 120 BPM (e.g., pop, mid-tempo rock)
 - Fast: 120 - 180+ BPM (e.g., dance, EDM, upbeat tracks)

In [17]:
# Tempo bands in BPM: up to 60 is low, up to 120 is mid, and everything else
# (including a missing value) is 'High_Tempo'.
tempo_bpm = df_final['Tempo']
df_final['Cluster_Tempo'] = np.select(
    [tempo_bpm <= 60, tempo_bpm <= 120],
    ['Low_Tempo', 'Mid_Tempo'],
    default='High_Tempo',
)

<hr>

<h2>1. Visualize characteristics of hit songs on Billboard charts.</h2>

> Key Signature

In [18]:
# Bars are shown in chromatic order rather than alphabetically.
sort_key = ['C', 'C#/Db', 'D', 'Eb', 'E', 'F', 'F#/Gb', 'G', 'G#/Ab', 'A', 'A#/Bb', 'B']

# Number of tracks in each musical key.
cluster_key_count = df_final.groupby('Cluster_Key', as_index=False)['Track'].count()

fig = px.bar(
    cluster_key_count,
    x='Cluster_Key',
    y='Track',
    text='Track',
    title='Track Count by Cluster Key',
    labels={'Track': 'Count of Track', 'Cluster_Key': 'Cluster Key'},
    category_orders={'Cluster_Key': sort_key},
)

# Put each count above its bar.
fig.update_traces(marker_color='skyblue', textposition='outside')

fig.update_layout(
    xaxis=dict(title='Cluster Key', tickangle=-45),
    yaxis=dict(title='Count of Track'),
    uniformtext=dict(minsize=8, mode='hide'),
)

fig.show()


> Time Signature

In [19]:
# Number of tracks for each time-signature value.
cluster_time_count = df_final.groupby('Time_Signature')['Track'].count().reset_index()

# Spotify reports the meter as beats per bar over a quarter-note pulse, so a
# value of n is displayed as "n/4" - 5 is 5/4 (the previous "5/8" tick label
# was wrong).
time_signature_labels = {1: '1/4', 2: '2/4', 3: '3/4', 4: '4/4', 5: '5/4'}

fig = go.Figure(data=[
    go.Bar(
        x=cluster_time_count['Time_Signature'],
        y=cluster_time_count['Track'],
        text=cluster_time_count['Track'],
        textposition='auto',
        marker_color='teal',
    )
])

fig.update_layout(
    title='Track Count by Time Signature',
    xaxis_title='Time Signature',
    yaxis_title='Count of Track',
    # Fixed y-range; counts above 700 would be clipped.
    yaxis=dict(range=[0, 700]),
    xaxis_tickvals=list(time_signature_labels.keys()),
    xaxis_ticktext=list(time_signature_labels.values()),
    xaxis_tickangle=0,
    plot_bgcolor='white',
)


fig.show()

> Major or Minor

In [20]:
# Number of tracks per mode label.
cluster_mode_count = df_final.groupby('Cluster_Mode', as_index=False)['Track'].count()

# Donut chart (hole=0.3) of the split between modes.
donut_options = dict(
    names='Cluster_Mode',
    values='Track',
    hole=0.3,
    color_discrete_sequence=['#3F72AF', '#DBE2EF'],
    title='Track Count by Cluster Mode',
)
fig = px.pie(cluster_mode_count, **donut_options)

# Show the percentage and the label inside each slice.
fig.update_traces(textinfo='percent+label', textposition='inside')

fig.show()

Tempo VS Track

In [21]:
sort_tempo = ['Low_Tempo', 'Mid_Tempo', 'High_Tempo']

# Count tracks per tempo band, then lay the counts out in slow-to-fast order,
# using 0 for a band that has no tracks at all.
tracks_per_band = df_final.groupby('Cluster_Tempo')['Track'].count()
cluster_tempo_count = pd.DataFrame({
    'Cluster_Tempo': sort_tempo,
    'Track': [tracks_per_band.get(band, 0) for band in sort_tempo],
})

fig = px.bar(
    cluster_tempo_count,
    x='Cluster_Tempo',
    y='Track',
    category_orders={'Cluster_Tempo': sort_tempo},
    color_discrete_sequence=['#08D9D6'],
    labels={'Cluster_Tempo': 'Cluster Tempo', 'Track': 'Count of Track'},
    title='Track Count by Cluster Tempo',
)

# Write each count (with thousands separators) just above its bar.
for band, n_tracks in zip(cluster_tempo_count['Cluster_Tempo'], cluster_tempo_count['Track']):
    fig.add_annotation(
        x=band,
        y=n_tracks,
        text=f"{int(n_tracks):,}",
        showarrow=False,
        yshift=10,
    )

# Fixed y-range; counts above 380 would be clipped.
fig.update_yaxes(range=[0, 380])

fig.show()


Duration VS Track

In [22]:
# Number of tracks in each duration band.
cluster_duration_count = df_final.groupby('Cluster_Duration(Min)', as_index=False)['Track'].count()

duration_bars = go.Bar(
    x=cluster_duration_count['Cluster_Duration(Min)'],
    y=cluster_duration_count['Track'],
    text=cluster_duration_count['Track'],
    textposition='auto',
    marker_color='#FC5185',
)
fig = go.Figure(data=[duration_bars])

fig.update_layout(
    title='Track Count by Cluster Duration',
    xaxis=dict(title='Cluster Duration (Min)', tickangle=0),
    # Fixed y-range; counts above 500 would be clipped.
    yaxis=dict(title='Count of Track', range=[0, 500]),
    plot_bgcolor='white',
)

fig.show()


Loudness VS Track

In [23]:
sort_loud = ['Low_Loud', 'Mid_Loud', 'High_Loud']

# Count tracks per loudness band and order the rows quiet-to-loud by turning
# the band label into an ordered categorical before sorting.
loud_order = pd.CategoricalDtype(categories=sort_loud, ordered=True)
cluster_loud_count = (
    df_final.groupby('Cluster_Loudness', as_index=False)['Track'].count()
    .astype({'Cluster_Loudness': loud_order})
    .sort_values('Cluster_Loudness')
)

loudness_bars = go.Bar(
    x=cluster_loud_count['Cluster_Loudness'],
    y=cluster_loud_count['Track'],
    text=cluster_loud_count['Track'],
    textposition='auto',
    marker_color='#609966',
)
fig = go.Figure(data=[loudness_bars])

fig.update_layout(
    title='Track Count by Cluster Loudness',
    xaxis=dict(title='Cluster Loudness'),
    # Fixed y-range; counts above 680 would be clipped.
    yaxis=dict(title='Count of Track', range=[0, 680]),
    plot_bgcolor='white',
)

fig.show()

Danceability VS Popularity VS Cluster

In [24]:
color_map = {'Low_Dance': '#FDA403', 'Mid_Dance': '#FF204E', 'High_Dance': '#1E3E62'}

# Tracks per danceability cluster, used to annotate the legend.
counts = df_final['Cluster_Danceability'].value_counts()

sort_dance = ['Low_Dance', 'Mid_Dance', 'High_Dance']

# Legend text per cluster, e.g. "Low_Dance (12)".
sorted_labels = [f"{label} ({counts.get(label, 0)})" for label in sort_dance]
legend_names = dict(zip(sort_dance, sorted_labels))

fig = px.scatter(
    df_final,
    x='Danceability',
    y='Popularity',
    color='Cluster_Danceability',
    color_discrete_map=color_map,
    category_orders={'Cluster_Danceability': sort_dance},
    labels={'Cluster_Danceability': 'Cluster Danceability'},
    title='Danceability vs Popularity by Cluster'
)

# Rows whose Danceability is missing or above 1 carry a blank cluster label and
# get their own trace. Looking the name up with a fallback leaves such a trace
# unchanged instead of raising ValueError from list.index().
fig.for_each_trace(lambda t: t.update(name=legend_names.get(t.name, t.name)))

fig.show()






Energy VS Popularity VS Cluster

In [25]:
color_map = {'Low_Energy': '#FDA403', 'Mid_Energy': '#FF204E', 'High_Energy': '#1E3E62'}

# Tracks per energy cluster, used to annotate the legend.
counts = df_final['Cluster_Energy'].value_counts()

sort_energy = ['Low_Energy', 'Mid_Energy', 'High_Energy']

# Legend text per cluster, e.g. "Low_Energy (12)".
sorted_labels = [f"{label} ({counts.get(label, 0)})" for label in sort_energy]
legend_names = dict(zip(sort_energy, sorted_labels))

fig = px.scatter(
    df_final,
    x='Energy',
    y='Popularity',
    color='Cluster_Energy',
    color_discrete_map=color_map,
    category_orders={'Cluster_Energy': sort_energy},
    labels={'Cluster_Energy': 'Cluster Energy'},
    title='Energy vs Popularity by Cluster'
)

# Rows whose Energy is missing or above 1 carry a blank cluster label and get
# their own trace. Looking the name up with a fallback leaves such a trace
# unchanged instead of raising ValueError from list.index().
fig.for_each_trace(lambda t: t.update(name=legend_names.get(t.name, t.name)))

fig.show()






Speechiness VS Popularity VS Cluster_Speechiness

In [26]:
color_map = {'Low_Speech': '#FDA403', 'Mid_Speech': '#FF204E', 'High_Speech': '#1E3E62'}

# Tracks per speechiness cluster, used to annotate the legend.
counts = df_final['Cluster_Speechiness'].value_counts()

sort_speech = ['Low_Speech', 'Mid_Speech', 'High_Speech']

# Legend text per cluster, e.g. "Low_Speech (12)".
sorted_labels = [f"{label} ({counts.get(label, 0)})" for label in sort_speech]
legend_names = dict(zip(sort_speech, sorted_labels))

fig = px.scatter(
    df_final,
    x='Speechiness',
    y='Popularity',
    color='Cluster_Speechiness',
    color_discrete_map=color_map,
    category_orders={'Cluster_Speechiness': sort_speech},
    labels={'Cluster_Speechiness': 'Cluster Speechiness'},
    title='Speechiness vs Popularity by Cluster'
)

# Rows whose Speechiness is missing or above 1 carry a blank cluster label and
# get their own trace. Looking the name up with a fallback leaves such a trace
# unchanged instead of raising ValueError from list.index().
fig.for_each_trace(lambda t: t.update(name=legend_names.get(t.name, t.name)))

fig.show()





Acousticness VS Popularity VS Cluster_Acousticness

In [27]:
color_map = {'Low_Acoustic': '#FDA403', 'Mid_Acoustic': '#FF204E', 'High_Acoustic': '#1E3E62'}

# Tracks per acousticness cluster, used to annotate the legend.
counts = df_final['Cluster_Acousticness'].value_counts()

sort_acoustic = ['Low_Acoustic', 'Mid_Acoustic', 'High_Acoustic']

# Legend text per cluster, e.g. "Low_Acoustic (12)".
sorted_labels = [f"{label} ({counts.get(label, 0)})" for label in sort_acoustic]
legend_names = dict(zip(sort_acoustic, sorted_labels))

fig = px.scatter(
    df_final,
    x='Acousticness',
    y='Popularity',
    color='Cluster_Acousticness',
    color_discrete_map=color_map,
    category_orders={'Cluster_Acousticness': sort_acoustic},
    labels={'Cluster_Acousticness': 'Cluster Acousticness'},
    title='Acousticness vs Popularity by Cluster'
)

# Rows whose Acousticness is missing or above 1 carry a blank cluster label and
# get their own trace. Looking the name up with a fallback leaves such a trace
# unchanged instead of raising ValueError from list.index().
fig.for_each_trace(lambda t: t.update(name=legend_names.get(t.name, t.name)))

fig.show()





Instrumentalness VS Popularity VS Cluster_Instrumentalness

In [28]:
color_map = {'Low_Instru': '#FDA403', 'Mid_Instru': '#FF204E', 'High_Instru': '#1E3E62'}

# Tracks per instrumentalness cluster, used to annotate the legend.
counts = df_final['Cluster_Instrumentalness'].value_counts()

sort_instru = ['Low_Instru', 'Mid_Instru', 'High_Instru']

# Legend text per cluster, e.g. "Low_Instru (12)".
sorted_labels = [f"{label} ({counts.get(label, 0)})" for label in sort_instru]
legend_names = dict(zip(sort_instru, sorted_labels))

fig = px.scatter(
    df_final,
    x='Instrumentalness',
    y='Popularity',
    color='Cluster_Instrumentalness',
    color_discrete_map=color_map,
    category_orders={'Cluster_Instrumentalness': sort_instru},
    labels={'Cluster_Instrumentalness': 'Cluster Instrumentalness'},
    title='Instrumentalness vs Popularity by Cluster'
)

# Rows whose Instrumentalness is missing or above 1 carry a blank cluster label
# and get their own trace. Looking the name up with a fallback leaves such a
# trace unchanged instead of raising ValueError from list.index().
fig.for_each_trace(lambda t: t.update(name=legend_names.get(t.name, t.name)))

fig.show()





Valence VS Popularity VS Cluster_Valence




In [29]:
color_map = {'Low_Valence': '#FDA403', 'Mid_Valence': '#FF204E', 'High_Valence': '#1E3E62'}

# Tracks per valence cluster, used to annotate the legend.
counts = df_final['Cluster_Valence'].value_counts()

sort_valence = ['Low_Valence', 'Mid_Valence', 'High_Valence']

# Legend text per cluster, e.g. "Low_Valence (12)".
sorted_labels = [f"{label} ({counts.get(label, 0)})" for label in sort_valence]
legend_names = dict(zip(sort_valence, sorted_labels))

fig = px.scatter(
    df_final,
    x='Valence',
    y='Popularity',
    color='Cluster_Valence',
    color_discrete_map=color_map,
    category_orders={'Cluster_Valence': sort_valence},
    labels={'Cluster_Valence': 'Cluster Valence'},
    title='Valence vs Popularity by Cluster'
)

# Rows whose Valence is missing or above 1 carry a blank cluster label and get
# their own trace. Looking the name up with a fallback leaves such a trace
# unchanged instead of raising ValueError from list.index().
fig.for_each_trace(lambda t: t.update(name=legend_names.get(t.name, t.name)))

fig.show()





Liveness VS Popularity VS Cluster_Liveness

In [30]:
sort_liveness = ['Studio_Liveness', 'Outdoor_Liveness']
color_map = dict(zip(sort_liveness, ['#DC84F3', '#756AB6']))

# Tracks per liveness group, used to annotate the legend.
counts = df_final['Cluster_Liveness'].value_counts()

# Legend text per group, e.g. "Studio_Liveness (12)".
sorted_labels = []
for label in sort_liveness:
    sorted_labels.append(f"{label} ({counts.get(label, 0)})")
legend_names = dict(zip(sort_liveness, sorted_labels))

scatter_options = dict(
    x='Liveness',
    y='Popularity',
    color='Cluster_Liveness',
    color_discrete_map=color_map,
    category_orders={'Cluster_Liveness': sort_liveness},
    labels={'Cluster_Liveness': 'Cluster Liveness'},
    title='Liveness vs Popularity by Cluster',
)
fig = px.scatter(df_final, **scatter_options)

# Replace each trace's name with its "<group> (<count>)" legend text.
fig.for_each_trace(lambda t: t.update(name=legend_names[t.name]))

fig.show()





<hr>

<h2>2. Find the most streamed genre on platform.</h2>

Genre VS Streams

In [31]:
def _annotate_bar_totals(figure, totals, category_col, value_col):
    """Write each bar's value, with thousands separators, just above the bar."""
    for category, value in zip(totals[category_col], totals[value_col]):
        figure.add_annotation(
            x=category,
            y=value,
            text=f"{int(value):,}",
            showarrow=False,
            yshift=10,
        )


# Per-genre aggregates: number of tracks, and total streams.
genre_groups = df_final.groupby('main_genre', as_index=False)
cluster_genre_count = genre_groups['Track'].count()
cluster_streams_sum = genre_groups['Streams'].sum()

# Chart 1: how many tracks fall under each genre.
fig1 = px.bar(
    cluster_genre_count,
    x='main_genre',
    y='Track',
    color='Track',
    color_continuous_scale='Reds',
    labels={'main_genre': 'Genre', 'Track': 'Count of Track'},
    title='Track Count by Genre',
)
_annotate_bar_totals(fig1, cluster_genre_count, 'main_genre', 'Track')

# Chart 2: total streams accumulated by each genre.
fig2 = px.bar(
    cluster_streams_sum,
    x='main_genre',
    y='Streams',
    color='Streams',
    color_continuous_scale='Reds',
    labels={'main_genre': 'Genre', 'Streams': 'Sum of Streams'},
    title='Streams by Genre',
)
_annotate_bar_totals(fig2, cluster_streams_sum, 'main_genre', 'Streams')

fig1.show()
fig2.show()

<hr>

<h2>3. Does the artist's fame affect whether a song makes it onto the Billboard charts?</h2>

Artist VS Track Count VS Average Popularity

In [32]:
from plotly.subplots import make_subplots

# Per-artist track count and mean popularity, most tracks first.
cluster_artist_count = (
    df_final.groupby('Artist')[['Track', 'Popularity']]
    .agg(Track=('Track', 'count'), Popularity=('Popularity', 'mean'))
    .reset_index()
    .sort_values(by='Track', ascending=False)
)

# The ten artists with the most tracks and the ten with the fewest. The
# `*_pop` names point at the same rows and are kept for later cells.
top_10_track = cluster_artist_count.head(10)
top_10_pop = top_10_track
bottom_10_track = cluster_artist_count.tail(10)
bottom_10_pop = bottom_10_track


def _add_artist_bars(figure, artists, col, show_legend):
    """Add the track-count and mean-popularity bars for `artists` to subplot column `col`."""
    names = artists['Artist'].tolist()
    series_specs = (
        ('Track', 'Num of Track', 'purple'),
        ('Popularity', 'Avg of Pop.', '#30E3CA'),
    )
    for column, trace_name, color in series_specs:
        values = artists[column].tolist()
        figure.add_trace(
            go.Bar(
                x=names,
                y=values,
                name=trace_name,
                marker_color=color,
                # Bar labels are truncated to whole numbers.
                text=[f'{int(val)}' for val in values],
                textposition='auto',
                # Both subplots share one legend entry per series, so the
                # legend no longer lists every series twice.
                legendgroup=trace_name,
                showlegend=show_legend,
            ),
            row=1, col=col
        )


fig = make_subplots(rows=1, cols=2, subplot_titles=('Top 10 Track and Avg Popularity of Artists', 'Bottom 10 Track and Avg Popularity of Artists'))

_add_artist_bars(fig, top_10_track, col=1, show_legend=True)
_add_artist_bars(fig, bottom_10_track, col=2, show_legend=False)

# Axis titles and tick angle are set for both subplots; previously only the
# first subplot's axes (xaxis / yaxis) were configured.
fig.update_layout(
    title_text='Track Counts and Average Popularity by Artist',
    barmode='group',
    xaxis_title='Artist',
    xaxis2_title='Artist',
    yaxis_title='Count of Track and Avg Pop',
    yaxis2_title='Count of Track and Avg Pop',
    xaxis_tickangle=-45,
    xaxis2_tickangle=-45,
)

fig.show()


<hr>

<h1>Challenges</h1>

<h1>Pearson's Correlation</h1>

In [33]:
from scipy.stats import pearsonr

df_p = pd.read_csv('Pearson_Mat.csv')

# Correlation of every other column with Popularity (Series.corr defaults to
# Pearson's r), keyed by column name in file order.
popularity = df_p['Popularity']
correlation_results = {
    feature: popularity.corr(df_p[feature])
    for feature in df_p.columns
    if feature != 'Popularity'
}

# Two-column table: feature name and its correlation with Popularity.
correlation_df = (
    pd.Series(correlation_results, name='Pearson_r')
    .rename_axis('Feature')
    .reset_index()
)
print(correlation_df)

             Feature  Pearson_r
0           Duration   0.170051
1     Time_Signature  -0.032756
2       Danceability  -0.048724
3             Energy  -0.077441
4                Key  -0.002714
5           Loudness  -0.054008
6               Mode  -0.041995
7        Speechiness  -0.224381
8       Acousticness   0.158303
9   Instrumentalness   0.070755
10          Liveness  -0.017248
11           Valence  -0.046704
12             Tempo  -0.034341
13           Streams   0.539580


In [34]:
# Single-column frame: each column's correlation with Popularity (the
# Popularity row itself is included).
popularity_corr = df_p.corr().loc[:, ['Popularity']]

heatmap = go.Heatmap(
    z=popularity_corr.values,
    x=popularity_corr.columns,
    y=popularity_corr.index,
    zmin=-1, zmax=1,
    colorscale='RdBu',
    colorbar=dict(title='Correlation'),
)
fig = go.Figure(data=heatmap)

# One text annotation per heatmap cell, showing the value rounded to 2 places.
annotations = [
    dict(
        x=col_label,
        y=row_label,
        text=str(round(cell_value, 2)),
        showarrow=False,
        font=dict(color="red"),
    )
    for row_label, row_values in zip(popularity_corr.index, popularity_corr.to_numpy())
    for col_label, cell_value in zip(popularity_corr.columns, row_values)
]

fig.update_layout(
    title="Pearson Correlation with Popularity",
    annotations=annotations,
    xaxis=dict(title="Features"),
    yaxis=dict(title="Features"),
    width=800, height=600,
)

fig.show()



<hr>