# Moosic Feature Engineering



* data extraction and data management
* data preprocessing 
* load combined spotify data + main genres engineered
* remove null entries from main genres
* create core genre : one gener associated to a track
* mood engineering: research, quadrants and subquadrants
* drop nulls and duplicates
* encode genre to be used as features during training
* make data balanced with respect the moods






## Importing required libraries




In [1]:


# IMPORT LIBRARIES


try:

    import numpy as np
    import pandas as pd
    import random as rnd
    #from tqdm.notebook import tqdm as tqdm
    from tqdm import tqdm 
    #from .autonotebook import tqdm as notebook_tqdm
    import time

    # databases - sql
    #from dotenv import dotenv_values
    #import sqlalchemy

    # visualisation
    import seaborn as sns
    import matplotlib.pyplot as plt
    from matplotlib.colors import ListedColormap

    # split data - avoid data leakage
    from sklearn.model_selection import train_test_split



except ImportError as error:
    print(f"Installation of the required dependencies necessary! {error}")

    %pip install numpy
    %pip install pandas
    #%pip install dotenv
    #%pip install sqlalchemy
    %pip install seaborn
    %pip install matplotlib
    %pip install tqdm
    %pip install ipywidgets
    %pip install yellowbrick  

    print(f"Successful installation of the required dependencies necessary")


import warnings
warnings.filterwarnings('ignore')





## Loading the data

In [2]:
# load the data files for moosic


df_musgenre = pd.read_csv('../data/processed/df_with_main_genres_v1.csv', low_memory=False)

# get shape of the artist and tracks dataframe

print(f"Music data: There are {df_musgenre.shape[0]} observations and {df_musgenre.shape[1]} feature variables ")
print('----------'*10)

df_musgenre.head(2)

Music data: There are 457150 observations and 24 feature variables 
----------------------------------------------------------------------------------------------------


Unnamed: 0,artists_id,track_id,artist_name,track_name,genres,release_date,explicit,duration_ms,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,followers,artist_popularity,track_popularity,main_genres
0,45tIt06XoI0Iio4LBEVpls,35iwgR4jXetI318WEWsa1Q,Uli,Carve,,1922-02-22,0,126903,0.645,0.445,...,0.674,0.744,0.151,0.127,104.851,3,91.0,6,4,[]
1,14jtPCOoNZwquk5wd9DxrY,021ht4sdgPcrDgSk7JTbKY,Fernando Pessoa,Capítulo 2.16 - Banquero Anarquista,,1922-06-01,0,98200,0.695,0.263,...,0.797,0.0,0.148,0.655,102.009,1,3.0,0,0,[]


In [3]:


moosic_data = df_musgenre.copy(deep=True)
moosic_data = moosic_data.query("main_genres != '[]' ").reset_index(drop=True)
moosic_data = moosic_data.convert_dtypes()

null_rows = moosic_data[moosic_data.isnull().T.any()].index
moosic_data = moosic_data.drop(null_rows)

print(moosic_data.isnull().T.any())
print(moosic_data.shape)

moosic_data.head(2)



0         False
1         False
2         False
3         False
4         False
          ...  
343471    False
343472    False
343473    False
343474    False
343475    False
Length: 343476, dtype: bool
(343476, 24)


Unnamed: 0,artists_id,track_id,artist_name,track_name,genres,release_date,explicit,duration_ms,danceability,energy,...,acousticness,instrumentalness,liveness,valence,tempo,time_signature,followers,artist_popularity,track_popularity,main_genres
0,5ESobCkc6JI4tIMxQttqeg,2wAfHM7Whz67VFbdanhZlk,Bessie Smith,Nobody Knows You When Youre Down and Out,"blues, harlem renaissance, jazz blues, traditi...",1923,0,177133,0.614,0.0423,...,0.996,0.00293,0.183,0.211,89.822,4,116265,41,44,['blues']
1,5ESobCkc6JI4tIMxQttqeg,2AZgaYZSwUosJD71J2N2Zo,Bessie Smith,Taint Nobodys Bizness If I Do,"blues, harlem renaissance, jazz blues, traditi...",1923,0,206600,0.537,0.0443,...,0.996,0.000265,0.152,0.137,80.468,4,116265,30,44,['blues']


In [4]:
moosic_data['main_genres'].unique()



<StringArray>
[                           "['blues']",
                     "['soul', 'jazz']",
                  "['folk', 'country']",
                             "['jazz']",
                "['classical', 'jazz']",
                    "['blues', 'jazz']",
                        "['classical']",
                           "['gospel']",
            "['soul', 'blues', 'jazz']",
                          "['hip hop']",
 ...
  "['rock', 'rockabilly', 'punk rock']",
              "['rock', 'soul', 'r&b']",
           "['blues', 'jazz', 'other']",
               "['rock', 'pop', 'r&b']",
      "['soul', 'rock', 'pop', 'jazz']",
                      "['r&b', 'jazz']",
                "['edm', 'r&b', 'pop']",
           "['pop', 'blues', 'reggae']",
     "['pop', 'folk', 'jazz', 'other']",
 "['rock', 'hip hop', 'pop', 'reggae']"]
Length: 603, dtype: string

In [5]:
# from one_genre_each_song.ipynb


# Create an empty list to hold rows
rows_list = []

# Iterate through each row in the original DataFrame
for index, row in moosic_data.iterrows():
    # Get the list of genres in the 'main_genres' column
    genres_list = eval(row['main_genres'])  # Convert the string representation to a list
    
    # Iterate through each genre in the list and create a new row
    for genre in genres_list:
        # Create a copy of the row and update the 'main_genres' column with the current genre
        new_row = row.copy()
        new_row['core_genres'] = genre #changed here to core_genres because we already have main_genres column
        
        # Append the new row to the list
        rows_list.append(new_row)

# Create a new DataFrame from the list of rows
moosic_data = pd.DataFrame(rows_list)



In [6]:
# Count the number of songs for each unique genre
genre_counts = moosic_data['core_genres'].value_counts()

# Count the total number of unique genres
num_unique_genres = len(genre_counts)

# Print the number of unique genres and the number of songs for each genre
print("Number of Unique Genres:", num_unique_genres)
print("\nNumber of Songs for Each Genre:")
print(genre_counts)

Number of Unique Genres: 25

Number of Songs for Each Genre:
core_genres
pop            191802
rock           142848
jazz            31935
folk            23958
hip hop         20628
metal           15795
soul            15708
other           11358
blues           11006
alternative     10944
funk            10019
country          8896
reggae           6267
disco            5879
house            5268
rockabilly       4901
r&b              4292
indie rock       3112
classical        3071
gospel           2261
edm              1996
techno           1960
punk rock        1284
electronic       1219
dubstep            45
Name: count, dtype: int64


In [7]:
# Check for empty values in the entire DataFrame
empty_values = moosic_data.isna().sum()

# Print the number of empty values for each column
print("Empty Values in Each Column:")
print(empty_values)

Empty Values in Each Column:
artists_id           0
track_id             0
artist_name          0
track_name           0
genres               0
release_date         0
explicit             0
duration_ms          0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
followers            0
artist_popularity    0
track_popularity     0
main_genres          0
core_genres          0
dtype: int64


In [8]:
# drop null/empty rows

null_rows1 = moosic_data[moosic_data.isnull().T.any()].index
moosic_data = moosic_data.drop(null_rows1)
empty_values = moosic_data.isna().sum()
print(empty_values)

artists_id           0
track_id             0
artist_name          0
track_name           0
genres               0
release_date         0
explicit             0
duration_ms          0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
followers            0
artist_popularity    0
track_popularity     0
main_genres          0
core_genres          0
dtype: int64


In [9]:
# current features for moosic data

features = moosic_data.columns.tolist()
features



['artists_id',
 'track_id',
 'artist_name',
 'track_name',
 'genres',
 'release_date',
 'explicit',
 'duration_ms',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'followers',
 'artist_popularity',
 'track_popularity',
 'main_genres',
 'core_genres']

In [10]:
# save current moosic (mood-music) dataset : removed nulls and empty lists

#moosic_data.to_csv('../data/processed/moosic_data.csv', chunksize=len(moosic_data)//5, index=False)





![1](https://www.researchgate.net/profile/Angelo-Ciaramella/publication/263964114/figure/fig1/AS:566286145867776@1512024546717/Two-dimensional-emotion-representation-in-Thayers-model.png)
[ 1](https://www.researchgate.net/figure/Two-dimensional-emotion-representation-in-Thayers-model_fig1_263964114/download?_tp=eyJjb250ZXh0Ijp7ImZpcnN0UGFnZSI6Il9kaXJlY3QiLCJwYWdlIjoiX2RpcmVjdCJ9fQ)


![2](https://www.researchgate.net/profile/Sarfaraz-Masood/publication/309648785/figure/fig1/AS:424403734405121@1478197143294/Thayers-2-D-Emotion-Model.png)
[ 2](https://www.researchgate.net/publication/309648785_MFCC_Spectral_and_Temporal_Feature_based_Emotion_Identification_in_Songs)





### Circumplex model of affect (mood) - Russell 2-D model formula

* valence = radius (lenght) x cos(angle of affect term)
* energy = radius (lenght) x sin(angle of affect term)


### Plutchik's wheel of affect/Emotions + Circumplex

* mood_indicators = (valence , energy) = (r.cos(theta) , r.sin(theta)) 
  - the radius = 0.5
  - the origin = (0.5, 0.5)
  - theta = angle of affect for mood
  - proposed affect-mood sub-quadrants thetas : happy (0°), euphoric (45°), tense (90°), angry (135°), depressive (180°), sad (225°), calm (270°), relaxed (315°).
* our data for both variables lie between the range 0 and 1 so 

* given the fact that we have the valence and energy values already, we can calculate the 8 basic moods for the model with:
 - theta = arctan2( energy, valence) - the arctangent when given for 2 parameters

* 8 basic mood quadrants and sub-quadrants : 8 mood co-ordinates (4 main quadrants) for the 8 basic moods 
    - Quadrants : 
        - Q1 (Happy/Exuberant)  : [(0.5, 1.0), (0.5, 1.0)], # high valence, high energy
        - Q2 (Anger/anxious)   : [(0.0, 0.5), (0.5, 1.0)], # low valence, high energy
        - Q3(Sad/depressed)        : [(0.0, 0.5), (0.0, 0.5)], # low valence, low energy
        - Q4 (Relaxed/Content/calm) : [(0.5, 1.0), (0.0, 0.5)], # high valence, low energy 

    <br>

  - Sub-quadrants (using the polar method) : * not in use here, need to obtain the vertices of the triangles in each quadrant, so using the other to save time
    - Q1 (Happy)  : M1 - happy [r.cos(0°), r.sin(0°)],  M2 - euphoric [r.cos(45°), r.sin(45°)] 
    - Q2 (Angry)  : M3 - tense [r.cos(90°), r.sin(90°)],  M4 - angry [r.cos(135°), r.sin(135°)] 
    - Q3 (Sad) : M5 - depressive [r.cos(180°), r.sin(180°)],  M6 - sad [r.cos(225°), r.sin(225°)]
    - Q4 (Relaxed)  : M7 - calm [r.cos(270°), r.sin(270°)],  M8 - relaxed [r.cos(315°), r.sin(315°)] 




  - Sub-quadrants (modified-russell-thayer): 
    - Q1 (Happy)  : M1 - happy [(0.5, 1.0), (0.5, 0.75)],  M2 - euphoric [(0.5, 1.0), (0.75, 1.0)] 
    - Q2 (Angry)  : M3 - tense [(0.0, 0.5), (0.75, 1.0)],  M4 - angry [(0.0, 0.5), (0.5, 0.75)]
    - Q3 (Sad) : M5 - depressive [(0.0, 0.5), (0.25, 0.5)],  M6 - sad [(0.0, 0.5), (0.0, 0.25)]
    - Q4 (Relaxed)  : M7 - calm [(0.5, 1.0), (0.0, 0.25)],  M8 - relaxed [(0.5, 1.0), (0.0, 0.25)]
  

 so, using the valence and energy values we have for the affect terms listed in the mood_labels variable to calculate the angle it lies on the circumplex 2d graph using the russell 2-d valence and arousal 8 (energy) formula we get

```python 

# quadrants
mood_quadrants = {
                    'Q1 (Happy/Exuberant)'  : [(0.5, 1.0), (0.5, 1.0)], # high valence, high energy
                    'Q2 (Angry/anxious)'   : [(0.0, 0.5), (0.5, 1.0)], # low valence, high energy                  
                    'Q3(Sad/depressed)'        : [(0.0, 0.5), (0.0, 0.5)], # low valence, low energy
                    'Q4 (Relaxed/Content/calm)' : [(0.5, 1.0), (0.0, 0.5)] # high valence, low energy

}



# 1D : subquadrants (valence)

mood_sub_quadrants = {
                        'happy' : [0.875, 1.0),
                        'euphoric' : [0.875, 1.0),                 
                        'tense' : [0.875, 1.0),
                        'angry' :[0.875, 1.0),
                        'depressed' : [0.0, 0.125),
                        'sad' :[0.125, 0.25),
                        'calm' : [0.875, 1.0),
                        'relaxed' : [0.875, 1.0),

    }

# 2D : subquadrants (valence & energy)

mood_sub_quadrants = {
                        'happy' : [(0.5, 1.0), (0.5, 0.75)],
                        'euphoric' : [(0.5, 1.0), (0.75, 1.0)],                  
                        'tense' : [(0.0, 0.5), (0.75, 1.0)], 
                        'angry' :[(0.0, 0.5), (0.5, 0.75)], 
                        'depressed' : [(0.0, 0.5), (0.25, 0.5)], 
                        'sad' :[(0.0, 0.5), (0.0, 0.25)], 
                        'calm' : [(0.5, 1.0), (0.0, 0.25)], 
                        'relaxed' : [(0.5, 1.0), (0.25, 0.5)]

    }


```







In [11]:


# quadrants
mood_quadrants = {
                    'Q1 (Happy/Exuberant)'  : [(0.5, 1.0), (0.5, 1.0)], # high valence, high energy
                    'Q2 (Angry/anxious)'   : [(0.0, 0.5), (0.5, 1.0)], # low valence, high energy                  
                    'Q3(Sad/depressed)'        : [(0.0, 0.5), (0.0, 0.5)], # low valence, low energy
                    'Q4 (Relaxed/Content/calm)' : [(0.5, 1.0), (0.0, 0.5)] # high valence, low energy

}


# 1D : subquadrants (valence)

mood_1sub_quadrants = {
                        'happy' : (0.875, 1.0),
                        'euphoric' : (0.875, 1.0),                 
                        'tense' : (0.875, 1.0),
                        'angry' :(0.875, 1.0),
                        'depressed' : (0.0, 0.125),
                        'sad' :(0.125, 0.25),
                        'calm' : (0.875, 1.0),
                        'relaxed' : (0.875, 1.0),

    }

# for the 2d cartesian plot based on valence and energy
# The list of tuples represents the quadrants in which the core 8 moods dominate

# 2D : subquadrants (valence & energy)

mood_2sub_quadrants = {
                        'happy' : [(0.5, 1.0), (0.5, 0.75)],
                        'euphoric' : [(0.5, 1.0), (0.75, 1.0)],                  
                        'tense' : [(0.0, 0.5), (0.75, 1.0)], 
                        'angry' :[(0.0, 0.5), (0.5, 0.75)], 
                        'depressed' : [(0.0, 0.5), (0.25, 0.5)], 
                        'sad' :[(0.0, 0.5), (0.0, 0.25)], 
                        'calm' : [(0.5, 1.0), (0.0, 0.25)], 
                        'relaxed' : [(0.5, 1.0), (0.25, 0.5)]

    }


In [12]:


# mood maps for 1d and 2d


def mood_map(data):


    valence = data['valence']
    energy = data['energy']
    
    if (valence >= 0.5 and valence <= 1.0) and (energy >= 0.5 and energy <= 1.0):
        return 'happy' #hvhe
    elif (valence >= 0.0 and valence <= 0.5) and (energy >= 0.5 and energy <= 1.0):
        return 'tense' #lvhe
    elif (valence >= 0.0 and valence <= 0.5) and (energy >= 0.0 and energy <= 0.5):
        return 'sad' #lvle
    elif (valence >= 0.5 and valence <= 1.0) and (energy >= 0.0 and energy <= 0.5):
        return 'relaxed' #hvle
    else:
        pass



def mood_2d_map(data):


    valence = data['valence']
    energy = data['energy']
    
    if (valence >= 0.5 and valence <= 1.0) and (energy >= 0.5 and energy <= 0.75):
        return 'happy'
    elif (valence >= 0.5 and valence <= 1.0) and (energy >= 0.75 and energy <= 1.0):
        return 'euphoric'
    elif (valence >= 0.0 and valence <= 0.5) and (energy >= 0.75 and energy <= 1.0):
        return 'tense'
    elif (valence >= 0.0 and valence <= 0.5) and (energy >= 0.5 and energy <= 0.75):
        return 'angry'
    elif (valence >= 0.0 and valence <= 0.5) and (energy >= 0.25 and energy <= 0.5):
        return 'depressed'
    elif (valence >= 0.0 and valence <= 0.5) and (energy >= 0.0 and energy <= 0.25):
        return 'sad'
    elif (valence >= 0.5 and valence <= 1.0) and (energy >= 0.0 and energy <= 0.25):
        return 'calm'
    elif (valence >= 0.5 and valence <= 1.0) and (energy >= 0.25 and energy <= 0.5):
        return 'relaxed'
    else:
        pass




def mood_1d_map(data):

    valence = data['valence']

    if (valence > 0.875 and valence <= 1.0):
        return 'happy'
    elif (valence > 0.75 and valence <= 0.875):
        return 'euphoric'
    elif (valence > 0.375 and valence <= 0.5):
        return 'tense'
    elif (valence > 0.25 and valence <= 0.375):
        return 'angry'
    elif (valence > 0.0 and valence <= 0.125):
        return 'depressed'
    elif (valence > 0.125 and valence <= 0.25):
        return 'sad'
    elif (valence > 0.5 and valence <= 0.625):
        return 'calm'
    elif (valence > 0.625 and valence <= 0.75):
        return 'relaxed'
    else:
        pass

moosic_data['mood_42d_label'] = moosic_data.apply(mood_map , axis=1)
moosic_data['mood_1d_label'] = moosic_data.apply(mood_1d_map, axis=1)
moosic_data['mood_goal'] = moosic_data.apply(mood_2d_map, axis=1)


moosic_data.head(3)



Unnamed: 0,artists_id,track_id,artist_name,track_name,genres,release_date,explicit,duration_ms,danceability,energy,...,tempo,time_signature,followers,artist_popularity,track_popularity,main_genres,core_genres,mood_42d_label,mood_1d_label,mood_goal
0,5ESobCkc6JI4tIMxQttqeg,2wAfHM7Whz67VFbdanhZlk,Bessie Smith,Nobody Knows You When Youre Down and Out,"blues, harlem renaissance, jazz blues, traditi...",1923,0,177133,0.614,0.0423,...,89.822,4,116265,41,44,['blues'],blues,sad,sad,sad
1,5ESobCkc6JI4tIMxQttqeg,2AZgaYZSwUosJD71J2N2Zo,Bessie Smith,Taint Nobodys Bizness If I Do,"blues, harlem renaissance, jazz blues, traditi...",1923,0,206600,0.537,0.0443,...,80.468,4,116265,30,44,['blues'],blues,sad,sad,sad
2,19eLuQmk9aCobbVDHc6eek,7lRFR5GJCxK87ZbVMtQSeS,Louis Armstrong,Aint Misbehavin,"adult standards, dixieland, harlem renaissance...",1923,0,192600,0.665,0.254,...,125.734,4,2256652,28,74,"['soul', 'jazz']",soul,relaxed,calm,relaxed


In [13]:
# current features for moosic data

features = moosic_data.columns.tolist()
features

['artists_id',
 'track_id',
 'artist_name',
 'track_name',
 'genres',
 'release_date',
 'explicit',
 'duration_ms',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'followers',
 'artist_popularity',
 'track_popularity',
 'main_genres',
 'core_genres',
 'mood_42d_label',
 'mood_1d_label',
 'mood_goal']

In [14]:
moosic_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 536452 entries, 0 to 343475
Data columns (total 28 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   artists_id         536452 non-null  object 
 1   track_id           536452 non-null  object 
 2   artist_name        536452 non-null  object 
 3   track_name         536452 non-null  object 
 4   genres             536452 non-null  object 
 5   release_date       536452 non-null  object 
 6   explicit           536452 non-null  int64  
 7   duration_ms        536452 non-null  int64  
 8   danceability       536452 non-null  float64
 9   energy             536452 non-null  float64
 10  key                536452 non-null  int64  
 11  loudness           536452 non-null  float64
 12  mode               536452 non-null  int64  
 13  speechiness        536452 non-null  float64
 14  acousticness       536452 non-null  float64
 15  instrumentalness   536452 non-null  float64
 16  livenes

In [15]:
# drop null/empty rows

null_rows11 = moosic_data[moosic_data.isnull().T.any()].index
moosic_data = moosic_data.drop(null_rows11)
empty_values = moosic_data.isna().sum()
print(empty_values)

artists_id           0
track_id             0
artist_name          0
track_name           0
genres               0
release_date         0
explicit             0
duration_ms          0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
followers            0
artist_popularity    0
track_popularity     0
main_genres          0
core_genres          0
mood_42d_label       0
mood_1d_label        0
mood_goal            0
dtype: int64


In [16]:
# current features for moosic data

features = moosic_data.columns.tolist()
features



['artists_id',
 'track_id',
 'artist_name',
 'track_name',
 'genres',
 'release_date',
 'explicit',
 'duration_ms',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'followers',
 'artist_popularity',
 'track_popularity',
 'main_genres',
 'core_genres',
 'mood_42d_label',
 'mood_1d_label',
 'mood_goal']

In [17]:
moosic_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 536350 entries, 0 to 343475
Data columns (total 28 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   artists_id         536350 non-null  object 
 1   track_id           536350 non-null  object 
 2   artist_name        536350 non-null  object 
 3   track_name         536350 non-null  object 
 4   genres             536350 non-null  object 
 5   release_date       536350 non-null  object 
 6   explicit           536350 non-null  int64  
 7   duration_ms        536350 non-null  int64  
 8   danceability       536350 non-null  float64
 9   energy             536350 non-null  float64
 10  key                536350 non-null  int64  
 11  loudness           536350 non-null  float64
 12  mode               536350 non-null  int64  
 13  speechiness        536350 non-null  float64
 14  acousticness       536350 non-null  float64
 15  instrumentalness   536350 non-null  float64
 16  livenes

In [18]:
# save current moosic (mood-music) dataset : removed nulls and empty lists
# compress and send processed file to be for further analysis and modelling

#moosic_data.to_csv('../data/processed/moosic_data_processed.csv', chunksize=len(moosic_data)//5, index=False)




In [19]:
# encode categorical data : genre 

#genre
genre2d_dummies = pd.get_dummies(moosic_data['core_genres'], drop_first=True).replace({True: 1, False: 0})
genre2d_encoded = pd.concat([moosic_data['core_genres'], genre2d_dummies], axis=1)
display(genre2d_encoded.head(2))



Unnamed: 0,core_genres,blues,classical,country,disco,dubstep,edm,electronic,folk,funk,...,metal,other,pop,punk rock,r&b,reggae,rock,rockabilly,soul,techno
0,blues,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,blues,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:

# add labels to moosic data

moosic_data = pd.concat([moosic_data, genre2d_encoded.drop(['core_genres'], axis = 1) ], axis=1)
display(moosic_data.head(2))




Unnamed: 0,artists_id,track_id,artist_name,track_name,genres,release_date,explicit,duration_ms,danceability,energy,...,metal,other,pop,punk rock,r&b,reggae,rock,rockabilly,soul,techno
0,5ESobCkc6JI4tIMxQttqeg,2wAfHM7Whz67VFbdanhZlk,Bessie Smith,Nobody Knows You When Youre Down and Out,"blues, harlem renaissance, jazz blues, traditi...",1923,0,177133,0.614,0.0423,...,0,0,0,0,0,0,0,0,0,0
1,5ESobCkc6JI4tIMxQttqeg,2AZgaYZSwUosJD71J2N2Zo,Bessie Smith,Taint Nobodys Bizness If I Do,"blues, harlem renaissance, jazz blues, traditi...",1923,0,206600,0.537,0.0443,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# get shape of the current moosic data

print(f''' 

    Processed Mood - Music (MOOSIC) data
    Number of observations : {moosic_data.shape[0]} 
    Number of feature variables : {moosic_data.shape[1]} 

    ''')

 

    Processed Mood - Music (MOOSIC) data
    Number of observations : 536350 
    Number of feature variables : 52 

    


# Make data balanced for modelling purposes



In [22]:
# dataset is unbalanced from the perspective of the associated mood 
#    get the count of how the mood is distributed wrt the data


def get_balanced_data(processed_dataset):

    ''' 
    get the count of how the mood is distributed wrt the data
        - count of tracks by each mood
        - get mood label with lowest count
        - group the data based on the mood labels (mood_goal of tracks for user)
        - get dataset with randomly selected track samples for each mood labels based on 
        the value from the least occuring mood label
        
    '''

    balanced_mood_data = pd.DataFrame()

    mood_label_counts = processed_dataset['mood_goal'].value_counts()
    least_frequent_mood_label = mood_label_counts.idxmin()
    count_least_frequent_mood_label = mood_label_counts.min()

    print(f"The mood label count {mood_label_counts} ")
    print("______"*10)

    print(f"The least frequent mood label is '{least_frequent_mood_label}' with {count_least_frequent_mood_label} features.")
    print("______"*10)

    grouped_data = processed_dataset.groupby('mood_goal')

    sample_size = 11879

    for mood_label, group in grouped_data:
        
        #print(f' getting balanced data for the mood : {mood_label} \n ')

        if len(group) >= count_least_frequent_mood_label: 
            random_rows = group.sample(count_least_frequent_mood_label, random_state=42) 
        else:
            random_rows = group  

        balanced_mood_data = pd.concat([balanced_mood_data, random_rows])

        continue

    print(f' Finished processing, data has balanced number of samples for all categories. ')

    balanced_mood_data = balanced_mood_data.reset_index(drop=True) 

    current_mood_label_counts = balanced_mood_data['mood_goal'].value_counts()
    print(f"The size of data mood label count {current_mood_label_counts} ")
    print("______"*10)

    return balanced_mood_data




In [23]:

## display balanced moosic : mood-music data

moosic_data = get_balanced_data(moosic_data)





The mood label count mood_goal
happy        121041
euphoric     112413
depressed     78012
relaxed       66200
angry         58995
tense         45238
sad           42572
calm          11879
Name: count, dtype: int64 
____________________________________________________________
The least frequent mood label is 'calm' with 11879 features.
____________________________________________________________
 Finished processing, data has balanced number of samples for all categories. 
The size of data mood label count mood_goal
angry        11879
calm         11879
depressed    11879
euphoric     11879
happy        11879
relaxed      11879
sad          11879
tense        11879
Name: count, dtype: int64 
____________________________________________________________


In [24]:
# info etc

print(f"-----"*10)


print(f''' 

    Processed Mood - Music (MOOSIC) data
    Number of observations : {moosic_data.shape[0]} 
    Number of feature variables : {moosic_data.shape[1]} 

    ''')

print(f"-----"*10)

moosic_data.head(2)

--------------------------------------------------
 

    Processed Mood - Music (MOOSIC) data
    Number of observations : 95032 
    Number of feature variables : 52 

    
--------------------------------------------------


Unnamed: 0,artists_id,track_id,artist_name,track_name,genres,release_date,explicit,duration_ms,danceability,energy,...,metal,other,pop,punk rock,r&b,reggae,rock,rockabilly,soul,techno
0,3rj96ybf29aaJsJRg4Qci2,7drcUyUsrkdXoBDCqQ29Ax,THE BOOM,島唄 ウチナーグチ・ヴァージョン - Uchinaguchi Version,"j-pop, j-poprock, j-rock, okinawan folk, ryuky...",1992,0,304733,0.602,0.696,...,0,0,0,0,0,0,0,0,0,0
1,3o13BpwuBBj106YoNIoUrW,1K1JvzDBPzi34wkELMiwB8,Daniel Landa,Neklidne nebe,"czech folk, czech rock, slovak pop",2015-09-18,0,201933,0.708,0.73,...,0,0,0,0,0,0,0,0,0,0


In [25]:
moosic_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95032 entries, 0 to 95031
Data columns (total 52 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   artists_id         95032 non-null  object 
 1   track_id           95032 non-null  object 
 2   artist_name        95032 non-null  object 
 3   track_name         95032 non-null  object 
 4   genres             95032 non-null  object 
 5   release_date       95032 non-null  object 
 6   explicit           95032 non-null  int64  
 7   duration_ms        95032 non-null  int64  
 8   danceability       95032 non-null  float64
 9   energy             95032 non-null  float64
 10  key                95032 non-null  int64  
 11  loudness           95032 non-null  float64
 12  mode               95032 non-null  int64  
 13  speechiness        95032 non-null  float64
 14  acousticness       95032 non-null  float64
 15  instrumentalness   95032 non-null  float64
 16  liveness           950

In [26]:

# saving balanced data  

#moosic_data.to_csv('../data/processed/moodsic_data.csv', chunksize=len(moosic_data)//5, index=False)



