In [1]:
import pandas as pd
import altair as alt
import os
import kagglehub

# Enable VegaFusion for large datasets: register it as Altair's active data
# transformer so the charts in this notebook can be built from the full DataFrame.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [2]:
# Kaggle dataset handle and the CSV file expected inside the download folder
KAGGLE_DATASET = "sanjanchaudhari/spotify-dataset"
CSV_FILENAME = "cleaned_dataset.csv"

# Redirect kagglehub's download cache to the repo's raw-data folder.
# This is set before the download call so the files land under ../data/raw.
os.environ.update(KAGGLEHUB_CACHE="../data/raw")

# Download the dataset; `path` is the location returned by kagglehub
path = kagglehub.dataset_download(KAGGLE_DATASET)

# Build the full path to the CSV (note: despite its name, `data_directory`
# points at the file itself) and load it into a DataFrame
data_directory = os.path.join(path, CSV_FILENAME)
df = pd.read_csv(data_directory)

In [3]:
# Peek at the top of the table (row count spelled out explicitly)
df.head(n=5)

Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,...,Title,Channel,Views,Likes,Comments,Licensed,official_video,Stream,EnergyLiveness,most_playedon
0,Gorillaz,Feel Good Inc.,Demon Days,album,0.818,0.705,-6.679,0.177,0.00836,0.00233,...,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221.0,6220896.0,169907.0,True,True,1040235000.0,1.150082,Spotify
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,0.676,0.703,-5.815,0.0302,0.0869,0.000687,...,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645.0,1079128.0,31003.0,True,True,310083700.0,15.183585,Spotify
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,0.695,0.923,-3.93,0.0522,0.0425,0.0469,...,Gorillaz - New Gold ft. Tame Impala & Bootie B...,Gorillaz,8435055.0,282142.0,7399.0,True,True,63063470.0,7.956897,Spotify
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,0.689,0.739,-5.81,0.026,1.5e-05,0.509,...,Gorillaz - On Melancholy Hill (Official Video),Gorillaz,211754952.0,1788577.0,55229.0,True,True,434663600.0,11.546875,Spotify
4,Gorillaz,Clint Eastwood,Gorillaz,album,0.663,0.694,-8.627,0.171,0.0253,0.0,...,Gorillaz - Clint Eastwood (Official Video),Gorillaz,618480958.0,6197318.0,155930.0,True,True,617259700.0,9.942693,Youtube


In [4]:
# Check data types and nulls: print each column's dtype and non-null count
# so type problems and missing values are visible at a glance
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 20594 entries, 0 to 20593
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Artist            20594 non-null  str    
 1   Track             20594 non-null  str    
 2   Album             20594 non-null  str    
 3   Album_type        20594 non-null  str    
 4   Danceability      20594 non-null  float64
 5   Energy            20594 non-null  float64
 6   Loudness          20594 non-null  float64
 7   Speechiness       20594 non-null  float64
 8   Acousticness      20594 non-null  float64
 9   Instrumentalness  20594 non-null  float64
 10  Liveness          20594 non-null  float64
 11  Valence           20594 non-null  float64
 12  Tempo             20594 non-null  float64
 13  Duration_min      20594 non-null  float64
 14  Title             20594 non-null  str    
 15  Channel           20594 non-null  str    
 16  Views             20594 non-null  float64
 17  Like

In [5]:
# Count missing values per column, then keep only the columns that have any
null_counts = df.isna().sum()
null_counts.loc[null_counts.gt(0)]

EnergyLiveness    2
dtype: int64

In [6]:
# Flag every row that repeats an earlier row in full, then count the flags
is_duplicate_row = df.duplicated()
is_duplicate_row.sum()

np.int64(0)

In [7]:
# Distribution of numerical features: summary statistics
# (count, mean, std, min, quartiles, max) for every numeric column
df.describe()

Unnamed: 0,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_min,Views,Likes,Comments,Stream,EnergyLiveness
count,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20594.0,20592.0
mean,0.620102,0.635176,-7.678254,0.096733,0.291391,0.056162,0.193653,0.530077,120.562616,3.742439,92037400.0,647990.2,26846.79,132644600.0,5.167227
std,0.165504,0.214274,4.639481,0.112182,0.286117,0.193622,0.168832,0.245542,29.588093,2.085211,272602600.0,1773648.0,191175.1,242358200.0,4.117431
min,0.0,0.0,-46.251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.9e-05
25%,0.519,0.507,-8.868,0.0357,0.0452,0.0,0.0941,0.34,96.994,2.996746,1478284.0,17542.0,406.0,15590980.0,2.38619
50%,0.638,0.666,-6.5405,0.05065,0.193,2e-06,0.125,0.538,119.959,3.551267,13313480.0,115315.5,3006.0,47305250.0,4.256881
75%,0.741,0.798,-4.935,0.104,0.47675,0.000474,0.237,0.727,139.9235,4.202163,67396820.0,500019.8,13736.75,134345300.0,6.822034
max,0.975,1.0,0.92,0.964,0.996,1.0,1.0,0.993,243.372,77.9343,8079649000.0,50788650.0,16083140.0,3386520000.0,59.113924


In [8]:
# Create summary table grouped by Danceability

# Label every track by its Danceability score: below 0.4 is "Low",
# 0.7 and above is "High", and anything else keeps the "Mid" default.
danceability = df["Danceability"]
dance_level = (
    pd.Series("Mid", index=df.index)
    .mask(danceability < 0.4, "Low")
    .mask(danceability >= 0.7, "High")
)
df["dance_bin"] = dance_level.astype(str)

# Display order for the bins (alphabetical order would give High, Low, Mid)
bin_order = ["Low", "Mid", "High"]

# Mean Likes and Views per bin, rounded to whole numbers, with the bin column
# converted to an ordered categorical so the rows sort Low -> Mid -> High
summary = (
    df.groupby("dance_bin")[["Likes", "Views"]]
    .mean()
    .round(0)
    .astype(int)
    .reset_index()
    .assign(
        dance_bin=lambda table: pd.Categorical(
            table["dance_bin"], categories=bin_order, ordered=True
        )
    )
    .sort_values("dance_bin")
    .reset_index(drop=True)
)

summary

Unnamed: 0,dance_bin,Likes,Views
0,Low,368803,52332879
1,Mid,566328,80829862
2,High,861425,121742251


In [9]:
# Category order shared by the x-axis sort and the colour scale
dance_levels = ["Low", "Mid", "High"]

# Both layers are drawn from the same DataFrame
base = alt.Chart(df)

# Create base bar chart: one bar per danceability level, height = mean Likes
chart = base.mark_bar().encode(
    x=alt.X(
        "dance_bin:N",
        title="Danceability Level",
        sort=dance_levels,
    ),
    y=alt.Y(
        "mean(Likes):Q",
        title="Average Likes",
        axis=alt.Axis(format="~s"),
    ),
    color=alt.Color(
        "dance_bin:N",
        legend=None,
        scale=alt.Scale(scheme="set2", domain=dance_levels),
    ),
)

# Create error bars: confidence interval of Likes within each level
error_bars = base.mark_errorbar(
    extent="ci",
    ticks={"thickness": 3, "size": 16},
    color="black",
).encode(
    x=alt.X("dance_bin:N", sort=dance_levels),
    y=alt.Y("Likes:Q"),
)

# Stack the two layers, then apply sizing, titles and font configuration
chart_title = alt.TitleParams(
    text="Average Song Likes by Danceability Level",
    subtitle="Error bars represent 95% Confidence Intervals (CI)",
)
layered = alt.layer(chart, error_bars).properties(
    title=chart_title,
    width=300,
    height=300,
)
final_chart = layered.configure_axis(
    labelFontSize=12,
    titleFontSize=16,
    labelAngle=0,
).configure_title(
    fontSize=20,
)

final_chart

In [10]:
# Save chart to img folder
output_path = "../img/danceability_eda.png"

# Make sure the target folder exists first, so a fresh checkout without
# ../img does not fail on save; exist_ok keeps re-runs of this cell harmless.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the layered chart as a PNG, passing ppi=300 to the exporter
final_chart.save(output_path, ppi=300)