# Music emotion classifier

## Setup

In [None]:
!pip install opensmile

### Imports

In [3]:
import pandas as pd
import numpy as np
import librosa
import opensmile
import lzma, pickle

### Constants

In [4]:
AUDIO_PATH = "../data/processed/wav/"
ANNOTATIONS_PATH = "../data/processed/annotations/annotations averaged per song/song_level/static_annotations_averaged_songs_1_2000.csv"

## Data preparation

### Data loading

In [5]:
dataframe = pd.read_csv(ANNOTATIONS_PATH)

audio_clips = dataframe["song_id"]
targets = dataframe[["valence_mean", "arousal_mean"]]

print(audio_clips)
print(targets['valence_mean'], targets['arousal_mean'])

0          2
1          3
2          4
3          5
4          7
        ... 
1739    1996
1740    1997
1741    1998
1742    1999
1743    2000
Name: song_id, Length: 1744, dtype: int64
0       3.1
1       3.5
2       5.7
3       4.4
4       5.8
       ... 
1739    3.9
1740    5.3
1741    6.4
1742    4.6
1743    5.8
Name: valence_mean, Length: 1744, dtype: float64 0       3.0
1       3.3
2       5.5
3       5.3
4       6.4
       ... 
1739    5.9
1740    3.9
1741    6.2
1742    5.4
1743    6.0
Name: arousal_mean, Length: 1744, dtype: float64


### Data transformation

## Feature engineering

### Feature extraction

Librosa features (use this OR openSMILE)

In [36]:
features_list = []
iter = 0

for song_id in song_id_list:
    if iter == 100:
        break
    iter += 1
    waveform, sample_rate = librosa.load(AUDIO_PATH + f"{song_id}.mp3")

    mfcc = librosa.feature.mfcc(y=waveform, sr=sample_rate)
    rolloff = librosa.feature.spectral_rolloff(y=waveform,sr=sample_rate)
    centroid = librosa.feature.spectral_centroid(y=waveform, sr=sample_rate)
    rms = librosa.feature.rms(y=waveform)
    tempo = librosa.feature.tempo(y=waveform,sr=sample_rate)
    onset_env = librosa.onset.onset_strength(y=waveform, sr=sample_rate)
    zcr = librosa.feature.zero_crossing_rate(waveform)
    chromagram = librosa.feature.chroma_stft(y=waveform, sr=sample_rate)
    pitches, magnitudes = librosa.piptrack(y=waveform, sr=sample_rate)
    features_list.append([song_id, mfcc, rolloff, centroid, rms, tempo, onset_env, zcr, chromagram, pitches, magnitudes])

features = pd.DataFrame(
    data=features_list,
    columns=["song_id", "mfcc", "rolloff", "centroid", "rms", "tempo", "onset_env", "zcr", "chromagram", "pitches", "magnitudes"]
)

openSMILE

In [7]:
def get_matched_smile(audio, targets):
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    all_smiles = [] # list of smile features for each clip
    iters = 0
    for file in audio:
        iters += 1
        print(str(iters) + "/" + str(len(audio)))
        
        # get smile features
        filepath = AUDIO_PATH + str(file) + ".wav"
        smile_feats = smile.process_file(filepath)
        # convert from df to list
        smile_feats = smile_feats.values.tolist()
        # convert from 2d list to 1d list
        smile_feats = sum(smile_feats, [])
        all_smiles.append(smile_feats)

    df = pd.DataFrame({'features': all_smiles, 'valence': targets['valence_mean'], 'arousal': targets['arousal_mean']})
    
    return df

matched_smile_df = get_matched_smile(audio_clips[:10], targets[:10])

# Optionally save the matched_midi_df DataFrame as a pickle file    
# with lzma.open("matched_smile.xz", "wb") as f:
#     pickle.dump(matched_smile_df, f)

['pcm_intensity_sma_max', 'pcm_intensity_sma_min', 'pcm_intensity_sma_range', 'pcm_intensity_sma_maxPos', 'pcm_intensity_sma_minPos', 'pcm_intensity_sma_amean', 'pcm_intensity_sma_linregc1', 'pcm_intensity_sma_linregc2', 'pcm_intensity_sma_linregerrA', 'pcm_intensity_sma_linregerrQ', 'pcm_intensity_sma_stddev', 'pcm_intensity_sma_skewness', 'pcm_intensity_sma_kurtosis', 'pcm_intensity_sma_quartile1', 'pcm_intensity_sma_quartile2', 'pcm_intensity_sma_quartile3', 'pcm_intensity_sma_iqr1-2', 'pcm_intensity_sma_iqr2-3', 'pcm_intensity_sma_iqr1-3', 'pcm_loudness_sma_max', 'pcm_loudness_sma_min', 'pcm_loudness_sma_range', 'pcm_loudness_sma_maxPos', 'pcm_loudness_sma_minPos', 'pcm_loudness_sma_amean', 'pcm_loudness_sma_linregc1', 'pcm_loudness_sma_linregc2', 'pcm_loudness_sma_linregerrA', 'pcm_loudness_sma_linregerrQ', 'pcm_loudness_sma_stddev', 'pcm_loudness_sma_skewness', 'pcm_loudness_sma_kurtosis', 'pcm_loudness_sma_quartile1', 'pcm_loudness_sma_quartile2', 'pcm_loudness_sma_quartile3', '

Import from pickle

In [5]:
with lzma.open('matched_smile.xz', 'rb') as f:
    matched_smile_df = pickle.load(f)

print(matched_smile_df.head())
print(matched_smile_df.shape)

                                            features  valence  arousal
0  [9.9690914794337e-05, 0.0, 9.9690914794337e-05...      3.1      3.0
1  [5.123164737597108e-05, 0.0, 5.123164737597108...      3.5      3.3
2  [8.819365757517517e-05, 0.0, 8.819365757517517...      5.7      5.5
3  [5.9923360822722316e-05, 0.0, 5.99233608227223...      4.4      5.3
4  [0.00011262285988777876, 0.0, 0.00011262285988...      5.8      6.4
(1744, 3)


### Feature selection

In [42]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [54]:
smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.emobase,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
feature_names = smile.feature_names

# Convert to numpy array of features
smile_features = matched_smile_df['features']
smile_features = np.stack(smile_features.values)
# Convert to numpy array of labels
valence_labels = matched_smile_df['valence']
arousal_labels = matched_smile_df['arousal']

selector = SelectKBest(score_func=f_regression, k=100) # Choose the 100 most effective features

valence_fit = selector.fit(smile_features, valence_labels)
valence_features = selector.transform(smile_features)
print(selector.get_feature_names_out())

arousal_fit = selector.fit(smile_features, arousal_labels)
arousal_features = selector.transform(smile_features)
print(selector.get_feature_names_out())

print(arousal_features.shape, valence_features.shape)



['x3' 'x5' 'x7' 'x8' 'x9' 'x10' 'x11' 'x13' 'x14' 'x15' 'x16' 'x17' 'x18'
 'x24' 'x26' 'x27' 'x28' 'x29' 'x30' 'x32' 'x33' 'x34' 'x35' 'x36' 'x37'
 'x42' 'x49' 'x51' 'x78' 'x106' 'x111' 'x114' 'x133' 'x135' 'x168' 'x169'
 'x170' 'x191' 'x198' 'x199' 'x200' 'x206' 'x207' 'x208' 'x262' 'x315'
 'x335' 'x339' 'x341' 'x353' 'x365' 'x387' 'x391' 'x394' 'x395' 'x498'
 'x500' 'x501' 'x502' 'x503' 'x504' 'x507' 'x508' 'x509' 'x510' 'x511'
 'x512' 'x513' 'x519' 'x520' 'x521' 'x522' 'x523' 'x526' 'x527' 'x528'
 'x529' 'x530' 'x531' 'x647' 'x648' 'x676' 'x706' 'x773' 'x775' 'x776'
 'x777' 'x778' 'x799' 'x820' 'x863' 'x864' 'x865' 'x868' 'x906' 'x909'
 'x910' 'x911' 'x942' 'x975']
['x3' 'x5' 'x7' 'x8' 'x9' 'x10' 'x11' 'x12' 'x13' 'x14' 'x15' 'x16' 'x17'
 'x18' 'x22' 'x24' 'x26' 'x27' 'x28' 'x29' 'x30' 'x31' 'x32' 'x33' 'x34'
 'x35' 'x36' 'x37' 'x42' 'x49' 'x76' 'x111' 'x114' 'x133' 'x176' 'x185'
 'x186' 'x206' 'x296' 'x315' 'x318' 'x327' 'x365' 'x391' 'x437' 'x439'
 'x498' 'x499' 'x501' 'x502' 'x50

## Model

### Training

### Testing

### Validation

## Results

### Metrics

### Visualization