In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder

In [15]:
# Load the input CSV that every feature function below operates on.
# Forward slashes are used because they resolve on Windows, macOS and Linux
# alike, whereas a backslash-separated path is read as a single file name on
# POSIX systems.
df = pd.read_csv('../datasets/sam_df_clean.csv')

In [16]:
# Maps each coarse genre category (key) to the list of `track_genre` values
# that belong to it. `add_genre_cat` inverts this mapping to label rows; any
# genre that is not listed under some category here ends up 'uncategorized'.
genre_categories = {
    'pop-mainstream': [
        'pop', 'pop-film', 'power-pop', 'k-pop', 'j-pop', 'mandopop', 
        'cantopop', 'indie-pop', 'synth-pop', 'j-idol'
    ],
    'rock': [
        'rock', 'alt-rock', 'grunge', 'punk', 'punk-rock', 'indie', 
        'psych-rock', 'garage', 'rock-n-roll', 'rockabilly', 'hard-rock'
    ],
    'electronic': [
        'house', 'techno', 'trance', 'dubstep', 'edm', 'electro', 'electronic',
        'drum-and-bass', 'deep-house', 'progressive-house', 'chicago-house',
        'detroit-techno', 'hardstyle', 'minimal-techno', 'idm'
    ],
    'hiphop-rnb': [
        'hip-hop', 'r-n-b'
    ],
    'metal': [
        'metal', 'heavy-metal', 'death-metal', 'black-metal', 'metalcore',
        'grindcore', 'hardcore'
    ],
    'country-folk': [
        'country', 'folk', 'honky-tonk', 'singer-songwriter', 'songwriter'
    ],
    'jazz-blues': [
        'jazz', 'blues', 'soul'
    ],
    'world-regional': [
        'latin', 'latino', 'afrobeat', 'brazil', 'forro', 'salsa', 'samba',
        'sertanejo', 'pagode', 'mpb', 'french', 'spanish', 'german', 'swedish',
        'indian', 'iranian', 'malay', 'turkish', 'j-dance', 'j-rock'
    ],
    'dance-club': [
        'dance', 'dancehall', 'disco', 'club', 'reggaeton', 'reggae', 'dub'
    ],
    'classical': [
        'classical', 'opera', 'new-age'
    ],
    'niche-mood': [
        'acoustic', 'ambient', 'anime', 'bluegrass', 'breakbeat', 'british',
        'children', 'chill', 'comedy', 'disney', 'emo', 'funk', 'gospel', 
        'goth', 'guitar', 'groove', 'happy', 'industrial', 'kids', 'party',
        'piano', 'romance', 'sad', 'show-tunes', 'ska', 'sleep', 'study', 'world-music'
    ]
}

def add_genre_cat(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with a ``genre_subcategory`` column added.

    Each ``track_genre`` value is looked up in the module-level
    ``genre_categories`` mapping. Genres that are listed under no category
    (and missing values) are labelled ``'uncategorized'``. The number of rows
    per category is printed as a side effect.

    Args:
        X: Frame containing a ``track_genre`` column.

    Returns:
        A new frame; the input is not modified.
    """
    # Invert {category: [genres]} into {genre: category}. Should a genre ever
    # be listed under several categories, the last listing wins.
    lookup = {
        genre_name: category
        for category, members in genre_categories.items()
        for genre_name in members
    }

    tagged = X.copy()
    tagged['genre_subcategory'] = (
        tagged['track_genre'].map(lookup).fillna('uncategorized')
    )
    print(tagged['genre_subcategory'].value_counts())

    return tagged

In [17]:
def add_popularity_bins(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with a categorical ``popularity_bin`` column.

    ``popularity`` is cut into five right-closed intervals:
    (-1, 20], (20, 40], (40, 60], (60, 80] and (80, 101]. The lowest edge is
    -1 so that a popularity of 0 falls into the first bin. The number of rows
    per bin is printed as a side effect.

    Args:
        X: Frame containing a numeric ``popularity`` column.

    Returns:
        A new frame; the input is not modified.
    """
    edges = [-1, 20, 40, 60, 80, 101]
    tier_names = [
        'Niche Tracks',
        'Club Filler',
        'Radio Hits',
        'Chart Climbers',
        'Bangers',
    ]

    tiers = pd.cut(X['popularity'], bins=edges, labels=tier_names)
    # `assign` hands back a fresh frame, so the caller's data stays untouched.
    binned = X.assign(popularity_bin=tiers)
    print(binned['popularity_bin'].value_counts())

    return binned

In [18]:
def add_pop_top_10(X: pd.DataFrame, top_10_threshold: int = 90) -> pd.DataFrame:
    """Return a copy of ``X`` with a binary ``popularity_top_10`` flag.

    A row is flagged with 1 when its ``popularity`` is greater than or equal
    to ``top_10_threshold`` and with 0 otherwise (missing popularity compares
    as False and is therefore flagged 0). The number of rows per flag value
    is printed as a side effect.

    Args:
        X: Frame containing a numeric ``popularity`` column.
        top_10_threshold: Inclusive cut-off for the flag. Defaults to 90, the
            value that used to be hard-coded; a different cut-off can be
            supplied through ``FunctionTransformer(..., kw_args={...})``.

    Returns:
        A new frame; the input is not modified.
    """
    X = X.copy()

    X['popularity_top_10'] = X['popularity'].ge(top_10_threshold).astype(int)
    print(X['popularity_top_10'].value_counts())

    return X

In [19]:
def add_energy_bins(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with a categorical ``energy_code`` column.

    ``energy`` is cut into four bins: [0.0, 0.70] 'Chill',
    (0.70, 0.82] 'Mellow', (0.82, 0.92] 'Upbeat' and (0.92, 1.0]
    'High-Energy'. Values outside [0.0, 1.0] and missing values become NaN.
    The number of rows per bin is printed as a side effect.

    Args:
        X: Frame containing a numeric ``energy`` column.

    Returns:
        A new frame; the input is not modified.
    """
    X = X.copy()
    # Bin the frame that was passed in rather than the notebook-global `df`:
    # reading the global fails when `df` is not defined and silently
    # misaligns (or yields NaN) when `X` has different rows or a different
    # index than `df`.
    # `include_lowest=True` keeps an energy of exactly 0.0 in 'Chill' instead
    # of turning it into NaN, matching the danceability and valence binning.
    X['energy_code'] = pd.cut(X['energy'],
                              bins=[0.0, 0.70, 0.82, 0.92, 1.0],
                              labels=['Chill',
                                      'Mellow',
                                      'Upbeat',
                                      'High-Energy'],
                              include_lowest=True)
    print(X['energy_code'].value_counts())

    return X

In [20]:
def add_loudness_bins(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with a categorical ``loudness_code`` column.

    ``loudness`` is cut into seven right-closed intervals with open-ended
    outer bins: (-inf, -20] 'Very Quiet', (-20, -12] 'Quiet', (-12, -8] 'Low',
    (-8, -5] 'Medium', (-5, -3] 'Loud', (-3, 0] 'Very Loud' and
    (0, inf) 'Mega Loud'. The number of rows per bin is printed as a side
    effect.

    Args:
        X: Frame containing a numeric ``loudness`` column.

    Returns:
        A new frame; the input is not modified.
    """
    X = X.copy()
    # Bin the frame that was passed in rather than the notebook-global `df`:
    # reading the global fails when `df` is not defined and silently
    # misaligns (or yields NaN) when `X` has different rows or a different
    # index than `df`.
    X['loudness_code'] = pd.cut(X['loudness'],
                                bins=[-float('inf'), -20, -12, -8, -5, -3, 0, float('inf')],
                                labels=['Very Quiet',
                                        'Quiet',
                                        'Low',
                                        'Medium',
                                        'Loud',
                                        'Very Loud',
                                        'Mega Loud'
                                        ])
    print(X['loudness_code'].value_counts())

    return X

In [21]:
def add_danceability_bins(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with a categorical ``dance_level`` column.

    ``danceability`` is cut into five equal-width bins between 0 and 1,
    labelled 'Still', 'Sway', 'Groove', 'Bump' and 'Rave'. Bins are
    right-closed and the first bin also includes 0. The number of rows per
    bin is printed as a side effect.

    Args:
        X: Frame containing a numeric ``danceability`` column.

    Returns:
        A new frame; the input is not modified.
    """
    # Each level is described by its inclusive upper edge.
    levels = [
        (0.2, 'Still'),
        (0.4, 'Sway'),
        (0.6, 'Groove'),
        (0.8, 'Bump'),
        (1, 'Rave'),
    ]
    edges = [0] + [upper for upper, _ in levels]
    level_names = [name for _, name in levels]

    result = X.copy()
    result['dance_level'] = pd.cut(
        result['danceability'],
        bins=edges,
        labels=level_names,
        include_lowest=True,
    )
    print(result['dance_level'].value_counts())

    return result

In [22]:
def add_valence_bins(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with a categorical ``music_valence`` column.

    ``valence`` is cut into four bins: [0.0, 0.33] 'sad/angry',
    (0.33, 0.66] 'neutral/melancholy', (0.66, 0.85] 'happy/content' and
    (0.85, 1.0] 'cheerful/upbeat'. The number of rows per bin is printed as a
    side effect.

    Args:
        X: Frame containing a numeric ``valence`` column.

    Returns:
        A new frame; the input is not modified.
    """
    mood_edges = [0.0, 0.33, 0.66, 0.85, 1.0]
    mood_names = [
        'sad/angry',
        'neutral/melancholy',
        'happy/content',
        'cheerful/upbeat',
    ]

    moods = pd.cut(
        X['valence'],
        bins=mood_edges,
        labels=mood_names,
        include_lowest=True,  # keep a valence of exactly 0.0 in the first bin
    )
    # `assign` hands back a fresh frame, so the caller's data stays untouched.
    labelled = X.assign(music_valence=moods)
    print(labelled['music_valence'].value_counts())

    return labelled

In [23]:
def add_tempo_bins(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with a categorical ``tempo_bin`` column.

    ``tempo`` is cut into four left-closed intervals: [0, 80) 'slow',
    [80, 120) 'mid', [120, 160) 'fast' and [160, inf) 'very_fast'. The number
    of rows per bin is printed as a side effect.

    Args:
        X: Frame containing a numeric ``tempo`` column.

    Returns:
        A new frame; the input is not modified.
    """
    out = X.copy()
    out['tempo_bin'] = pd.cut(
        out['tempo'],
        bins=[0, 80, 120, 160, float('inf')],
        labels=['slow', 'mid', 'fast', 'very_fast'],
        right=False,  # lower edge inclusive, so a tempo of 0 lands in 'slow'
    )

    print(out['tempo_bin'].value_counts())

    return out

In [24]:
# Feature-engineering pipeline: every step wraps one of the `add_*` functions
# in a stateless FunctionTransformer, and the steps run in the order listed.
pipe = Pipeline(steps=[
    (step_name, FunctionTransformer(step_func, validate=False))
    for step_name, step_func in (
        ('genre category', add_genre_cat),
        ('popularity bins', add_popularity_bins),
        ('popular top 10', add_pop_top_10),
        ('energy bins', add_energy_bins),
        ('loudness bins', add_loudness_bins),
        ('danceability bins', add_danceability_bins),
        ('valence bins', add_valence_bins),
        ('tempo bins', add_tempo_bins),
    )
])

In [25]:
# Run every pipeline step in order on the loaded frame. Each step prints the
# value counts of the column it adds, which is the text output shown below.
df_transformed = pipe.fit_transform(df)

genre_subcategory
niche-mood        24134
world-regional    15528
electronic        10467
pop-mainstream     7585
rock               6308
metal              5516
dance-club         4340
country-folk       2781
classical          2549
uncategorized      2125
jazz-blues         1231
hiphop-rnb         1115
Name: count, dtype: int64
popularity_bin
Club Filler       29659
Radio Hits        25207
Niche Tracks      20291
Chart Climbers     8050
Bangers             472
Name: count, dtype: int64
popularity_top_10
0    83639
1       40
Name: count, dtype: int64
energy_code
Chill          44324
Mellow         13881
Upbeat         13412
High-Energy    12061
Name: count, dtype: int64
loudness_code
Medium        28432
Low           20744
Loud          15336
Quiet         11329
Very Loud      4162
Very Quiet     3610
Mega Loud        66
Name: count, dtype: int64
dance_level
Groove    31502
Bump      30550
Sway      12428
Rave       6236
Still      2963
Name: count, dtype: int64
music_valence
neutral

In [26]:
# Write the frame with the engineered feature columns next to the input file.
# Forward slashes avoid the invalid `\d` / `\s` escape sequences the former
# non-raw backslash path contained (a SyntaxWarning on recent Python versions)
# and resolve on Windows, macOS and Linux alike.
df_transformed.to_csv('../datasets/sam_df_clean_featured.csv', index=False)