In [46]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## Column Specifications

| Column             | Type              | Description |
|--------------------|-------------------|-------------|
| track_id          | String           | Unique Spotify ID for the track (e.g., "spotify:track:2takcwOaAZWiXQijPHIx7B"). |
| artists           | String           | Names of performing artists, separated by semicolons for multiples. |
| album_name        | String           | Name of the album containing the track. |
| track_name        | String           | Title of the track. |
| popularity        | Integer (0-100)  | Score where higher values indicate greater popularity based on plays and recency. |
| duration_ms       | Integer          | Track length in milliseconds. |
| explicit          | Boolean          | Indicates if the track contains explicit lyrics. |
| danceability      | Float (0.0-1.0)  | Suitability for dancing based on tempo, rhythm stability, beat strength, and regularity (0.0 least, 1.0 most). |
| energy            | Float (0.0-1.0)  | Perceived intensity and activity (0.0 calm, 1.0 high-energy). |
| key               | Integer (-1 to 11)| Numeric key (0=C, 1=C♯/D♭); -1 if no key detected. |
| loudness          | Float            | Overall loudness in decibels (typically -60 to 0). |
| mode              | Integer (0-1)    | Modality (0=major, 1=minor). |
| speechiness       | Float (0.0-1.0)  | Presence of spoken words (0.0=music, 1.0=spoken like podcast). |
| acousticness      | Float (0.0-1.0)  | Confidence that the track is acoustic (0.0=electric, 1.0=acoustic). |
| instrumentalness  | Float (0.0-1.0)  | Likelihood of no vocals (0.0=vocalic, 1.0=instrumental). |
| liveness          | Float (0.0-1.0)  | Detection of live audience (higher for live recordings). |
| valence           | Float (0.0-1.0)  | Musical positiveness (0.0=sad/angry, 1.0=happy/cheerful). |
| tempo             | Float            | Estimated beats per minute (BPM). |
| time_signature    | Integer          | An estimated overall time signature (e.g., 4/4 as 4). |
| track_genre       | String           | Assigned genre label (e.g., from datasets covering 125 genres). |

In [47]:
df = pd.read_csv(r'..\datasets\dataset.csv')

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [49]:
df_objects = df.select_dtypes(include="object")

In [50]:
for column in df_objects.columns:
    print(f"column: {column}", df_objects[column].unique())

column: track_id ['5SuOikwiRyPMVoIQDJUgSV' '4qPNDBW1i3p13qLCt0Ki3A'
 '1iJBSr7s7jYXzM8EGcbK5b' ... '6x8ZfSoqDjuNa5SVP5QjvX'
 '2e6sXL2bYv4bSz6VTdnfLs' '2hETkH7cOfqmz3LqZDHZf5']
column: artists ['Gen Hoshino' 'Ben Woodward' 'Ingrid Michaelson;ZAYN' ...
 'Cuencos Tibetanos Sonidos Relajantes'
 'Bryan & Katie Torwalt;Brock Human' 'Jesus Culture']
column: album_name ['Comedy' 'Ghost (Acoustic)' 'To Begin Again' ...
 '#20 Sueños Vividos - Música Intrumental Suave 2018 para Dormir Bien y Relajarse Profundamente'
 'Frecuencias Álmicas en 432hz (Solo Piano)' 'Revelation Songs']
column: track_name ['Comedy' 'Ghost - Acoustic' 'To Begin Again' ... 'Water Into Light'
 'Miss Perfumado' 'Barbincor']
column: track_genre ['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'di

In [51]:
genre_list = df_objects['track_genre'].value_counts().index.tolist()
genre_list

['acoustic',
 'afrobeat',
 'alt-rock',
 'alternative',
 'ambient',
 'anime',
 'black-metal',
 'bluegrass',
 'blues',
 'brazil',
 'breakbeat',
 'british',
 'cantopop',
 'chicago-house',
 'children',
 'chill',
 'classical',
 'club',
 'comedy',
 'country',
 'dance',
 'dancehall',
 'death-metal',
 'deep-house',
 'detroit-techno',
 'disco',
 'disney',
 'drum-and-bass',
 'dub',
 'dubstep',
 'edm',
 'electro',
 'electronic',
 'emo',
 'folk',
 'forro',
 'french',
 'funk',
 'garage',
 'german',
 'gospel',
 'goth',
 'grindcore',
 'groove',
 'grunge',
 'guitar',
 'happy',
 'hard-rock',
 'hardcore',
 'hardstyle',
 'heavy-metal',
 'hip-hop',
 'honky-tonk',
 'house',
 'idm',
 'indian',
 'indie-pop',
 'indie',
 'industrial',
 'iranian',
 'j-dance',
 'j-idol',
 'j-pop',
 'j-rock',
 'jazz',
 'k-pop',
 'kids',
 'latin',
 'latino',
 'malay',
 'mandopop',
 'metal',
 'metalcore',
 'minimal-techno',
 'mpb',
 'new-age',
 'opera',
 'pagode',
 'party',
 'piano',
 'pop-film',
 'pop',
 'power-pop',
 'progressive

In [52]:
genre_categories = {
    'pop-mainstream': [
        'pop', 'pop-film', 'power-pop', 'k-pop', 'j-pop', 'mandopop', 
        'cantopop', 'indie-pop', 'synth-pop', 'j-idol'
    ],
    'rock': [
        'rock', 'alt-rock', 'grunge', 'punk', 'punk-rock', 'indie', 
        'psych-rock', 'garage', 'rock-n-roll', 'rockabilly', 'hard-rock'
    ],
    'electronic': [
        'house', 'techno', 'trance', 'dubstep', 'edm', 'electro', 'electronic',
        'drum-and-bass', 'deep-house', 'progressive-house', 'chicago-house',
        'detroit-techno', 'hardstyle', 'minimal-techno', 'idm'
    ],
    'hiphop-rnb': [
        'hip-hop', 'r-n-b'
    ],
    'metal': [
        'metal', 'heavy-metal', 'death-metal', 'black-metal', 'metalcore',
        'grindcore', 'hardcore'
    ],
    'country-folk': [
        'country', 'folk', 'honky-tonk', 'singer-songwriter', 'songwriter'
    ],
    'jazz-blues': [
        'jazz', 'blues', 'soul'
    ],
    'world-regional': [
        'latin', 'latino', 'afrobeat', 'brazil', 'forro', 'salsa', 'samba',
        'sertanejo', 'pagode', 'mpb', 'french', 'spanish', 'german', 'swedish',
        'indian', 'iranian', 'malay', 'turkish', 'j-dance', 'j-rock'
    ],
    'dance-club': [
        'dance', 'dancehall', 'disco', 'club', 'reggaeton', 'reggae', 'dub'
    ],
    'classical': [
        'classical', 'opera', 'new-age'
    ],
    'niche-mood': [
        'acoustic', 'ambient', 'anime', 'bluegrass', 'breakbeat', 'british',
        'children', 'chill', 'comedy', 'disney', 'emo', 'funk', 'gospel', 
        'goth', 'guitar', 'groove', 'happy', 'industrial', 'kids', 'party',
        'piano', 'romance', 'sad', 'show-tunes', 'ska', 'sleep', 'study', 'world-music'
    ]
}

In [53]:
# Create category column
genre_to_cat = {}
for cat, genres in genre_categories.items():
    for genre in genres:
        genre_to_cat[genre] = cat

df['genre_subcategory'] = df['track_genre'].map(genre_to_cat).fillna('uncategorized')
print(df['genre_subcategory'].value_counts())

genre_subcategory
niche-mood        28000
world-regional    20000
electronic        15000
rock              11000
pop-mainstream    10000
metal              7000
dance-club         7000
country-folk       5000
uncategorized      3000
jazz-blues         3000
classical          3000
hiphop-rnb         2000
Name: count, dtype: int64


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         114000 non-null  int64  
 1   track_id           114000 non-null  object 
 2   artists            113999 non-null  object 
 3   album_name         113999 non-null  object 
 4   track_name         113999 non-null  object 
 5   popularity         114000 non-null  int64  
 6   duration_ms        114000 non-null  int64  
 7   explicit           114000 non-null  bool   
 8   danceability       114000 non-null  float64
 9   energy             114000 non-null  float64
 10  key                114000 non-null  int64  
 11  loudness           114000 non-null  float64
 12  mode               114000 non-null  int64  
 13  speechiness        114000 non-null  float64
 14  acousticness       114000 non-null  float64
 15  instrumentalness   114000 non-null  float64
 16  li

In [55]:
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,genre_subcategory
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,niche-mood
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,niche-mood
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,niche-mood
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,niche-mood
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,niche-mood


In [56]:
popularity_list = df['popularity'].value_counts().index.to_list()
popularity_list

[0,
 22,
 21,
 44,
 1,
 23,
 20,
 43,
 45,
 41,
 40,
 46,
 38,
 42,
 39,
 24,
 47,
 48,
 37,
 26,
 49,
 19,
 25,
 50,
 56,
 36,
 27,
 51,
 28,
 52,
 35,
 57,
 18,
 55,
 54,
 34,
 29,
 58,
 53,
 59,
 32,
 33,
 30,
 60,
 31,
 17,
 62,
 61,
 2,
 63,
 16,
 65,
 64,
 66,
 67,
 12,
 11,
 10,
 68,
 13,
 69,
 70,
 15,
 5,
 3,
 71,
 14,
 8,
 9,
 74,
 72,
 73,
 7,
 6,
 75,
 76,
 4,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 90,
 89,
 93,
 91,
 92,
 97,
 96,
 98,
 94,
 95,
 100,
 99]

| Bin | Range  | Label           | Vibe/Example             |
|-----|--------|-----------------|--------------------------|
| 0   | 0-20   | Niche Tracks    | Underground, cult classics |
| 1   | 21-40  | Club Filler     | Dancefloor staples        |
| 2   | 41-60  | Radio Hits      | Everyday streaming        |
| 3   | 61-80  | Chart Climbers  | Viral potential           |
| 4   | 81-100 | Bangers         | Anthems & TikTok smashes |


In [57]:
df['popularity_bin'] = pd.cut(df['popularity'], 
                             bins=[-1, 20, 40, 60, 80, 101],
                             labels=['Niche Tracks', 'Club Filler', 'Radio Hits', 
                                    'Chart Climbers', 'Bangers'])

In [58]:
print(df['popularity_bin'].value_counts())

popularity_bin
Niche Tracks      34177
Club Filler       33149
Radio Hits        33104
Chart Climbers    12616
Bangers             954
Name: count, dtype: int64


In [59]:
# Top 10% threshold (101 genres * 0.1 = ~10 highest)
top_10_threshold = 90  # indices 90-100

# Or as 1/0 numeric
df['popularity_top_10'] = (df['popularity'] >= top_10_threshold).astype(int)

In [60]:
df['popularity_top_10'].value_counts().unique 

<bound method Series.unique of popularity_top_10
0    113902
1        98
Name: count, dtype: int64>

In [61]:
energy_list = df['energy'].value_counts().index.tolist()
energy_list

[0.876,
 0.937,
 0.931,
 0.801,
 0.886,
 0.858,
 0.961,
 0.948,
 0.92,
 0.981,
 0.979,
 0.978,
 0.964,
 0.818,
 0.906,
 0.909,
 0.939,
 0.959,
 0.995,
 0.977,
 0.953,
 0.803,
 0.72,
 0.934,
 0.902,
 0.933,
 0.913,
 0.855,
 0.93,
 0.938,
 0.941,
 0.943,
 0.95,
 0.924,
 0.988,
 0.573,
 0.856,
 0.739,
 0.828,
 0.954,
 0.955,
 0.838,
 0.845,
 0.94,
 0.942,
 0.8,
 0.907,
 0.792,
 0.817,
 0.714,
 0.625,
 0.871,
 0.962,
 0.878,
 0.97,
 0.874,
 0.875,
 0.89,
 0.929,
 0.917,
 0.9,
 0.728,
 0.946,
 0.833,
 0.841,
 0.894,
 0.884,
 0.932,
 0.903,
 0.842,
 0.606,
 0.7,
 0.67,
 0.849,
 0.666,
 0.788,
 0.951,
 0.873,
 0.935,
 0.896,
 0.86,
 0.726,
 0.991,
 0.936,
 0.974,
 0.869,
 0.914,
 0.706,
 0.522,
 0.843,
 0.966,
 0.823,
 0.711,
 0.737,
 0.727,
 0.867,
 0.859,
 0.675,
 0.947,
 0.983,
 0.949,
 0.98,
 0.879,
 0.957,
 0.85,
 0.968,
 0.796,
 0.969,
 0.963,
 0.743,
 0.921,
 0.877,
 0.892,
 0.857,
 0.713,
 0.715,
 0.872,
 0.69,
 0.565,
 0.802,
 0.958,
 0.742,
 0.912,
 0.952,
 0.744,
 0.898,
 0.945,
 0

| Label       | Range     | Count (sample) | Vibe/Example      |
|-------------|-----------|----------------|-------------------|
| Chill       | 0.00-0.70 | ~3%           | Lounge, ballads   |
| Mellow      | 0.70-0.82 | ~15%          | Acoustic, chill   |
| Upbeat      | 0.82-0.92 | ~35%          | Pop, indie        |
| High-Energy | 0.92-1.00 | ~47%          | EDM, rock anthems |

In [62]:
df['energy_code'] = pd.cut(df['energy'], 
                         bins=[0.0, 0.70, 0.82, 0.92, 1.0],
                         labels=['Chill', 'Mellow', 'Upbeat', 'High-Energy'])

In [63]:
print(df['energy_code'].value_counts())

energy_code
Chill          59554
Mellow         20044
Upbeat         18805
High-Energy    15596
Name: count, dtype: int64


| Label     | Range (dB) | % Songs | Vibe/Example       |
|-----------|------------|---------|--------------------|
| Quiet     | -20 to -12 | 0.5%   | ASMR, ambient      |
| Low       | -12 to -8  | 11%    | Acoustic, chill    |
| Medium    | -8 to -5   | 56%    | Pop, hip-hop       |
| Loud      | -5 to -3   | 30%    | Rock, dance        |
| Very Loud | -3 to 0    | 2%     | Festivals  |

In [64]:
loudness_list = df['loudness'].value_counts().index.tolist()
loudness_list

[-5.662,
 -4.457,
 -9.336,
 -7.57,
 -4.034,
 -8.871,
 -3.725,
 -4.324,
 -5.08,
 -12.472,
 -6.196,
 -4.156,
 -3.718,
 -5.501,
 -6.135,
 -8.483,
 -5.119,
 -8.05,
 -5.956,
 -2.749,
 -5.395,
 -7.779,
 -6.961,
 -6.04,
 -5.795,
 -6.9,
 -6.83,
 -3.956,
 -5.579,
 -5.645,
 -6.347,
 -4.867,
 -5.915,
 -3.758,
 -5.598,
 -6.152,
 -7.877,
 -7.125,
 -7.282,
 -6.264,
 -6.941,
 -5.199,
 -9.175,
 -8.494,
 -4.627,
 -4.177,
 -5.301,
 -3.394,
 -3.809,
 -6.296,
 -5.098,
 -3.318,
 -6.512,
 -4.938,
 -5.025,
 -5.62,
 -4.649,
 -9.01,
 -7.035,
 -3.378,
 -4.179,
 -11.717,
 -6.054,
 -5.327,
 -3.826,
 -4.431,
 -5.414,
 -3.702,
 -5.926,
 -4.152,
 -3.933,
 -7.097,
 -4.499,
 -4.612,
 -5.371,
 -6.09,
 -7.525,
 -5.157,
 -5.242,
 -6.827,
 -8.365,
 -5.243,
 -7.295,
 -3.673,
 -7.639,
 -4.909,
 -5.294,
 -2.117,
 -7.858,
 -10.312,
 -8.169,
 -5.032,
 -10.493,
 -7.079,
 -7.87,
 -6.305,
 -5.296,
 -8.957,
 -4.164,
 -7.3,
 -5.57,
 -5.702,
 -5.81,
 -4.563,
 -6.205,
 -6.373,
 -5.498,
 -4.898,
 -6.727,
 -5.071,
 -6.348,
 -5.049,
 -4

In [65]:
df['loudness_code'] = pd.cut(df['loudness'], 
                           bins=[-20, -12, -8, -5, -3, 0],
                           labels=['Quiet', 'Low', 'Medium', 'Loud', 'Very Loud'])
print(df['loudness_code'].value_counts())

loudness_code
Medium       39921
Low          27328
Loud         22590
Quiet        14100
Very Loud     5776
Name: count, dtype: int64


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 26 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   Unnamed: 0         114000 non-null  int64   
 1   track_id           114000 non-null  object  
 2   artists            113999 non-null  object  
 3   album_name         113999 non-null  object  
 4   track_name         113999 non-null  object  
 5   popularity         114000 non-null  int64   
 6   duration_ms        114000 non-null  int64   
 7   explicit           114000 non-null  bool    
 8   danceability       114000 non-null  float64 
 9   energy             114000 non-null  float64 
 10  key                114000 non-null  int64   
 11  loudness           114000 non-null  float64 
 12  mode               114000 non-null  int64   
 13  speechiness        114000 non-null  float64 
 14  acousticness       114000 non-null  float64 
 15  instrumentalness   114000 non-null

In [67]:
danceability_list = df['danceability'].value_counts().index.tolist()
danceability_list

[0.647,
 0.609,
 0.579,
 0.685,
 0.602,
 0.524,
 0.689,
 0.598,
 0.607,
 0.626,
 0.631,
 0.586,
 0.576,
 0.56,
 0.582,
 0.545,
 0.593,
 0.534,
 0.568,
 0.532,
 0.687,
 0.627,
 0.653,
 0.588,
 0.714,
 0.596,
 0.533,
 0.603,
 0.671,
 0.795,
 0.639,
 0.497,
 0.523,
 0.623,
 0.616,
 0.569,
 0.573,
 0.637,
 0.705,
 0.535,
 0.546,
 0.589,
 0.674,
 0.503,
 0.661,
 0.521,
 0.555,
 0.565,
 0.601,
 0.654,
 0.679,
 0.592,
 0.543,
 0.516,
 0.65,
 0.549,
 0.649,
 0.541,
 0.658,
 0.608,
 0.629,
 0.636,
 0.498,
 0.606,
 0.634,
 0.611,
 0.615,
 0.678,
 0.683,
 0.553,
 0.509,
 0.572,
 0.651,
 0.536,
 0.519,
 0.518,
 0.529,
 0.51,
 0.67,
 0.707,
 0.715,
 0.621,
 0.64,
 0.69,
 0.665,
 0.619,
 0.688,
 0.682,
 0.499,
 0.581,
 0.693,
 0.63,
 0.807,
 0.663,
 0.632,
 0.558,
 0.544,
 0.55,
 0.635,
 0.494,
 0.563,
 0.633,
 0.686,
 0.574,
 0.527,
 0.58,
 0.594,
 0.657,
 0.561,
 0.591,
 0.604,
 0.567,
 0.559,
 0.522,
 0.62,
 0.625,
 0.673,
 0.614,
 0.512,
 0.489,
 0.605,
 0.737,
 0.648,
 0.694,
 0.695,
 0.584,
 0

## Danceability Levels

| Dance Label | Range     | Vibe Description                  |
|-------------|-----------|-----------------------------------|
| Still       | [0.0, 0.2)| Minimal rhythm, no floor action   |
| Sway        | [0.2, 0.4)| Gentle head-nod, light movement   |
| Groove      | [0.4, 0.6)| Steady beat, easy body roll       |
| Bump        | [0.6, 0.8)| Club-ready, hip-shake energy      |
| Rave        | [0.8, 1.0]| Full floor-filler, peak dance     |


In [68]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
labels = ['Still', 'Sway', 'Groove', 'Bump', 'Rave']
df['dance_level'] = pd.cut(df['danceability'], bins=bins, labels=labels, include_lowest=True)

In [69]:
print(df['dance_level'].value_counts())

dance_level
Bump      43066
Groove    42720
Sway      16061
Rave       8768
Still      3385
Name: count, dtype: int64


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 27 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   Unnamed: 0         114000 non-null  int64   
 1   track_id           114000 non-null  object  
 2   artists            113999 non-null  object  
 3   album_name         113999 non-null  object  
 4   track_name         113999 non-null  object  
 5   popularity         114000 non-null  int64   
 6   duration_ms        114000 non-null  int64   
 7   explicit           114000 non-null  bool    
 8   danceability       114000 non-null  float64 
 9   energy             114000 non-null  float64 
 10  key                114000 non-null  int64   
 11  loudness           114000 non-null  float64 
 12  mode               114000 non-null  int64   
 13  speechiness        114000 non-null  float64 
 14  acousticness       114000 non-null  float64 
 15  instrumentalness   114000 non-null

In [71]:
valence_list = df['valence'].value_counts().index.tolist()
valence_list

[0.961,
 0.304,
 0.717,
 0.962,
 0.324,
 0.963,
 0.55,
 0.365,
 0.949,
 0.202,
 0.464,
 0.56,
 0.435,
 0.619,
 0.351,
 0.326,
 0.964,
 0.342,
 0.706,
 0.336,
 0.471,
 0.662,
 0.399,
 0.965,
 0.478,
 0.517,
 0.549,
 0.305,
 0.374,
 0.237,
 0.235,
 0.483,
 0.774,
 0.397,
 0.369,
 0.0,
 0.569,
 0.354,
 0.218,
 0.398,
 0.64,
 0.671,
 0.415,
 0.198,
 0.4,
 0.429,
 0.456,
 0.227,
 0.343,
 0.548,
 0.473,
 0.722,
 0.641,
 0.238,
 0.541,
 0.345,
 0.49,
 0.356,
 0.148,
 0.338,
 0.558,
 0.364,
 0.696,
 0.152,
 0.371,
 0.312,
 0.922,
 0.664,
 0.525,
 0.382,
 0.193,
 0.185,
 0.179,
 0.427,
 0.62,
 0.563,
 0.159,
 0.639,
 0.358,
 0.546,
 0.38,
 0.386,
 0.332,
 0.446,
 0.299,
 0.488,
 0.373,
 0.623,
 0.537,
 0.208,
 0.575,
 0.475,
 0.229,
 0.691,
 0.357,
 0.433,
 0.233,
 0.631,
 0.239,
 0.507,
 0.836,
 0.372,
 0.692,
 0.454,
 0.48,
 0.262,
 0.394,
 0.323,
 0.395,
 0.96,
 0.199,
 0.346,
 0.327,
 0.3,
 0.515,
 0.506,
 0.609,
 0.526,
 0.697,
 0.314,
 0.69,
 0.256,
 0.966,
 0.649,
 0.68,
 0.683,
 0.533,


In [72]:
#df.to_csv('spotify_features.csv', index = False)

| Range     | Label Pair          | Musical Context                  |
|-----------|---------------------|----------------------------------|
| 0.00–0.33 | sad/angry          | Minor keys, slow/dissonant       |
| 0.33–0.66 | neutral/melancholy | Mid-tempo ballads, bittersweet   |
| 0.66–0.85 | happy/content      | Major keys, steady/warm          |
| 0.85–1.00 | cheerful/upbeat    | Fast tempo, bright/euphoric      |

In [73]:
bins = [0.0, 0.33, 0.66, 0.85, 1.0]
labels = ['sad/angry', 'neutral/melancholy', 'happy/content', 'cheerful/upbeat']
df['music_valence'] = pd.cut(df['valence'], bins=bins, labels=labels, include_lowest=True)

In [74]:
print(df['music_valence'].value_counts())

music_valence
neutral/melancholy    44446
sad/angry             38207
happy/content         20769
cheerful/upbeat       10578
Name: count, dtype: int64


In [75]:
tempo_list = df['tempo'].value_counts().index.tolist()
tempo_list

[0.0,
 151.925,
 95.004,
 130.594,
 87.925,
 92.988,
 125.004,
 76.783,
 77.321,
 90.04,
 77.117,
 105.016,
 94.999,
 123.045,
 109.976,
 106.998,
 122.958,
 100.011,
 97.993,
 92.994,
 138.908,
 101.988,
 126.002,
 123.997,
 119.989,
 122.872,
 82.229,
 92.025,
 91.982,
 119.935,
 124.994,
 83.506,
 124.005,
 109.991,
 207.478,
 130.0,
 105.0,
 120.031,
 71.078,
 130.027,
 119.986,
 103.025,
 125.008,
 100.006,
 122.031,
 176.088,
 100.002,
 92.23,
 125.007,
 104.059,
 80.87,
 170.023,
 170.082,
 120.041,
 119.995,
 124.977,
 117.953,
 126.108,
 125.073,
 119.993,
 122.975,
 127.949,
 108.729,
 123.998,
 124.993,
 128.003,
 123.993,
 91.993,
 125.012,
 120.007,
 127.994,
 127.998,
 101.483,
 123.992,
 119.999,
 171.973,
 126.042,
 124.002,
 78.763,
 119.992,
 79.31,
 125.005,
 161.948,
 120.0,
 125.99,
 90.001,
 120.006,
 123.96,
 120.028,
 120.01,
 100.015,
 124.999,
 120.003,
 130.001,
 99.762,
 139.994,
 134.999,
 143.912,
 119.968,
 169.965,
 125.992,
 99.993,
 100.021,
 127.993,


| Category       | BPM Range | Examples                     |
|----------------|-----------|------------------------------|
| Slow/Largo     | 0–80      | Ballads, Dub, Reggae         |
| Mid/Andante    | 80–120    | Hip-hop, Pop, Chill-out      |
| Fast/Presto    | 120–160   | House, Rock, Techno          |
| Very Fast      | 160+      | Drum & Bass, Hardstyle       |


In [76]:
bins = [0, 80, 120, 160, float('inf')]
labels = ['slow', 'mid', 'fast', 'very_fast']
df['tempo_bin'] = pd.cut(df['tempo'], bins=bins, labels=labels, right=False)

print(df['tempo_bin'].value_counts())

tempo_bin
fast         46548
mid          45411
very_fast    14148
slow          7893
Name: count, dtype: int64
