In [1]:
# ! pip3 install category_encoders

In [2]:
import numpy as np
import pandas as pd
from   category_encoders          import *
from   sklearn.compose            import *
from   sklearn.ensemble           import *
from   sklearn.experimental       import enable_iterative_imputer
from   sklearn.impute             import *
from   sklearn.linear_model       import *
from   sklearn.metrics            import * 
from   sklearn.pipeline           import Pipeline
from   sklearn.preprocessing      import *
from   sklearn.tree               import *
from   sklearn.model_selection    import *
from   sklearn.base               import BaseEstimator
from   sklearn.decomposition      import PCA
from   sklearn.inspection         import permutation_importance

import warnings
warnings.filterwarnings("ignore")

# 1. Data and Target Transformations

* ### Goal
    Genre Classification with music features extracted from audio files

In [3]:
data = pd.read_csv("input.csv")

X = data.drop(columns=["filename", "length", "label"])
y = data["label"]

In [4]:
X.shape, y.shape

((1000, 57), (1000,))

In [5]:
X.tail()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
995,0.352063,0.080487,0.079486,0.000345,2008.149458,282174.689224,2106.541053,88609.749506,4253.557033,1222421.0,...,1.789867,45.050526,-13.289984,41.754955,2.484145,36.778877,-6.713265,54.866825,-1.193787,49.950665
996,0.398687,0.075086,0.076458,0.000588,2006.843354,182114.70951,2068.942009,82426.016726,4149.338328,1046621.0,...,3.73902,33.851742,-10.848309,39.395096,1.881229,32.01004,-7.461491,39.196327,-2.795338,31.773624
997,0.432142,0.075268,0.081651,0.000322,2077.526598,231657.96804,1927.293153,74717.124394,4031.405321,804215.4,...,1.83809,33.597008,-12.845291,36.367264,3.440978,36.00111,-12.58807,42.502201,-2.106337,29.865515
998,0.362485,0.091506,0.08386,0.001211,1398.699344,240318.731073,1818.45028,109090.207161,3015.631004,1332712.0,...,-2.812176,46.324894,-4.41605,43.583942,1.556207,34.331261,-5.041897,47.22718,-3.590644,41.299088
999,0.358401,0.085884,0.054454,0.000336,1609.795082,422203.216152,1797.213044,120115.632927,3246.90893,1753476.0,...,1.794104,59.167755,-7.069775,73.760391,0.028346,76.504326,-2.025783,72.189316,1.155239,49.66251


# 2. Feature Engineering and Target Transformation

In [6]:
# define a loop cross validation to return a mean accuracy score due to the randomness of random forest
def loop_cv(pipe, X, y, split_size, n):
    """returns the mean accuracy score of predictions, using X, y to split n times"""
    BAS = []
    for i in range(n):
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=split_size)
        pipe.fit(X_train, y_train)
        pred = pipe.predict(X_valid)
        bas = accuracy_score(y_valid, pred)
        BAS.append(bas)
    return np.mean(BAS)

### 2.1 Target Transformation

In [7]:
le = LabelEncoder() # encodes from 0 to 9, 10 - music genre 
y = le.fit_transform(y)
np.unique(y)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### 2.2 Feature Transformation

In [8]:
pipe_preprocessing = Pipeline([
    ("scaler", StandardScaler()),
    ("transformer", QuantileTransformer(output_distribution='normal')),
])

### 2.3 Feature Selection

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=7)

In [10]:
X_train_valid, X_valid, y_train_valid, y_valid = train_test_split(X_train, y_train,
                                                                  train_size=0.8, random_state=7)

In [11]:
pipe_test = Pipeline([('preprocessing', pipe_preprocessing),
                      ('rf', RandomForestClassifier())])
pipe_test.fit(X_train_valid, y_train_valid)
r = permutation_importance(pipe_test,
                           X_valid,
                           y_valid,
                           n_repeats=20,
                           random_state=7)

for i in r.importances_mean.argsort()[::-1]:
    print(f"{X_train_valid.columns[i]:<8}"
          f"{r.importances_mean[i]:.3f}"
          f" ± {r.importances_std[i]:.3f}")

chroma_stft_mean0.041 ± 0.014
perceptr_var0.024 ± 0.009
mfcc13_mean0.024 ± 0.008
rolloff_var0.022 ± 0.009
spectral_centroid_var0.018 ± 0.007
mfcc10_var0.017 ± 0.007
mfcc4_mean0.017 ± 0.007
perceptr_mean0.016 ± 0.009
spectral_bandwidth_var0.016 ± 0.006
harmony_mean0.015 ± 0.010
mfcc4_var0.014 ± 0.014
mfcc11_mean0.013 ± 0.006
harmony_var0.013 ± 0.009
mfcc2_mean0.012 ± 0.008
mfcc12_mean0.012 ± 0.008
mfcc3_mean0.012 ± 0.006
mfcc5_var0.011 ± 0.010
chroma_stft_var0.010 ± 0.013
mfcc15_mean0.009 ± 0.004
mfcc14_var0.008 ± 0.007
zero_crossing_rate_var0.007 ± 0.010
mfcc5_mean0.007 ± 0.006
mfcc8_mean0.007 ± 0.008
mfcc16_var0.006 ± 0.007
rms_var 0.006 ± 0.010
mfcc11_var0.006 ± 0.006
mfcc9_mean0.005 ± 0.008
mfcc20_mean0.005 ± 0.006
mfcc9_var0.005 ± 0.008
spectral_bandwidth_mean0.004 ± 0.008
mfcc2_var0.004 ± 0.007
mfcc7_mean0.004 ± 0.006
mfcc17_var0.004 ± 0.005
mfcc18_mean0.004 ± 0.008
mfcc16_mean0.004 ± 0.007
mfcc8_var0.003 ± 0.008
zero_crossing_rate_mean0.003 ± 0.006
mfcc7_var0.002 ± 0.007
mfcc19_v

In [12]:
mean_bas_test = loop_cv(pipe_test, X_train, y_train, 0.8, 50)
print(f"Mean Balacend Accuracy Score with 50 loops: {mean_bas_test: .3f}")

Mean Balacend Accuracy Score with 50 loops:  0.682


In [13]:
# Apply PCA() requires standardization
pca = PCA() # Principal component analysis (PCA)
rf = RandomForestClassifier()
pipe_pca = Pipeline([('preprocessing', pipe_preprocessing), ('pca', pca),
                     ('rf', rf)])
mean_bas_pca = loop_cv(pipe_pca, X_train, y_train, 0.8, 50)
print(f"Mean Balacend Accuracy Score with 50 loops: {mean_bas_pca: .3f}")

Mean Balacend Accuracy Score with 50 loops:  0.676


In [14]:
pipe_pca.fit(X_train_valid, y_train_valid)
for i, r in enumerate(pca.explained_variance_ratio_):
    print(f"Component #{i}: {r:>6.2%}")

Component #0: 26.67%
Component #1: 16.84%
Component #2: 10.26%
Component #3:  7.38%
Component #4:  5.19%
Component #5:  3.74%
Component #6:  3.05%
Component #7:  2.60%
Component #8:  2.13%
Component #9:  1.71%
Component #10:  1.55%
Component #11:  1.49%
Component #12:  1.10%
Component #13:  1.07%
Component #14:  0.98%
Component #15:  0.91%
Component #16:  0.82%
Component #17:  0.74%
Component #18:  0.70%
Component #19:  0.65%
Component #20:  0.61%
Component #21:  0.54%
Component #22:  0.54%
Component #23:  0.51%
Component #24:  0.48%
Component #25:  0.46%
Component #26:  0.46%
Component #27:  0.42%
Component #28:  0.41%
Component #29:  0.40%
Component #30:  0.39%
Component #31:  0.38%
Component #32:  0.36%
Component #33:  0.35%
Component #34:  0.33%
Component #35:  0.32%
Component #36:  0.31%
Component #37:  0.29%
Component #38:  0.28%
Component #39:  0.27%
Component #40:  0.26%
Component #41:  0.25%
Component #42:  0.23%
Component #43:  0.21%
Component #44:  0.21%
Component #45:  0.20

In [15]:
# check the change of accuracy score with dropping insignificant PCA components
pca = PCA(n_components=14)
rf = RandomForestClassifier()
pipe_pca = Pipeline([('preprocessing', pipe_preprocessing), 
                     ('pca', pca),
                     ('rf', rf)])
mean_bas_pca = loop_cv(pipe_pca, X_train, y_train, 0.8, 50)
print(f"Mean Balacend Accuracy Score with 50 loops: {mean_bas_pca: .3f}")

Mean Balacend Accuracy Score with 50 loops:  0.671


# 3. Algorithms & Model Selection

Three appropriate algorithms were chosen for the random search.
* **LogisticRegression**
* **RidgeClassifier**
* **RandomForestClassifier**: tree based model, does not require standardization or normality of features
  * n_estimators: try to explore the effect of number of trees in random forest on accuracy
  * max_depth: try to explore the effect of the depth of trees in random forest
  * min_samples_leaf: try to explore the effect of the minimun samples leaf

In [16]:
# Create space of candidate learning algorithms and their hyperparameters
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."

    def fit(self): pass
    def score(self): pass

pipe_search = Pipeline([
    ('prep', pipe_preprocessing),
    ('pca', PCA(n_components=14)),
    ('clf', DummyEstimator())
])

search = [
            {
                'clf': [LogisticRegression()],
                'clf__C': np.logspace(0, 5, 50)
            },
            {
                'clf': [RidgeClassifier()]
                
            },
            {
                'clf': [RandomForestClassifier()],
                'clf__n_estimators': range(50, 400, 50),
                'clf__max_depth': [20, 30, 50, 100],
                'clf__min_samples_leaf': range(1, 6)
            }  
         ]
clf_algos_rand = RandomizedSearchCV(estimator=pipe_search,
                                    param_distributions=search,
                                    n_iter=30,
                                    cv=5,
                                    n_jobs=-1,
                                    verbose=1,
                                    scoring='accuracy')

clf_algos_rand.fit(X_train, y_train)

clf_algos_rand.best_estimator_

Fitting 5 folds for each of 30 candidates, totalling 150 fits


Pipeline(steps=[('prep',
                 Pipeline(steps=[('scaler', StandardScaler()),
                                 ('transformer',
                                  QuantileTransformer(output_distribution='normal'))])),
                ('pca', PCA(n_components=14)),
                ('clf',
                 RandomForestClassifier(max_depth=20, n_estimators=200))])

# 5. Evaluation

In [17]:
# final Model
pipe = Pipeline(steps=[('prep',
                        Pipeline(steps=[('scaler', StandardScaler()),
                                        ('transformer',
                                         QuantileTransformer(output_distribution='normal'))])),
                       ('pca', PCA(n_components=14)),
                       ('clf',
                         RandomForestClassifier(max_depth=20, min_samples_leaf=2,
                                                n_estimators=100))])

In [18]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
bas = accuracy_score(y_test, y_pred)
print(f"accuracy_score: {bas: .4f}")

accuracy_score:  0.6450


# 5. Conclusion

In [19]:
# explained variance ration of each component (new feature) after PCA
for i, r in enumerate(pca.explained_variance_ratio_):
    print(f"Component #{i}: {r:>6.2%}")

Component #0: 26.03%
Component #1: 17.76%
Component #2: 10.01%
Component #3:  7.49%
Component #4:  4.77%
Component #5:  3.42%
Component #6:  2.98%
Component #7:  2.71%
Component #8:  2.05%
Component #9:  1.88%
Component #10:  1.54%
Component #11:  1.48%
Component #12:  1.27%
Component #13:  1.07%
