In [1]:
import sys
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold

from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier, XGBRFClassifier

# Add the directory two levels up to the import path; the `helpers` imports
# below are presumably resolved from there — confirm against the repo layout.
sys.path.append("../../")

from helpers.split import tag_label_feature_split
from helpers.assess import make_classification_report, make_confusion_matrix

# Location of the dataset files, given as a path relative to the working directory.
DATASET_FOLDER = "../../datasets/"

**MTG-Jamendo Dataset**

Our project uses the MTG-Jamendo dataset.

It is built using music available at Jamendo under Creative Commons licenses and tags provided by content uploaders. The dataset contains over 55,000 full audio tracks with 195 tags from genre, instrument, and mood/theme categories.

As we describe in greater detail in our report, we limit our analysis to songs that are identified as belonging to exactly one of ten genres.  We eliminate songs in other genres.  We also eliminate songs labelled as belonging to more than one genre.

After flattening arrays, the dataset includes 2,654 features, excluding mel spectrogram data. We considered several collections of features. For example, one collection included all features in the dataset other than the mel spectrograms, and another consisted of just the mel spectrogram data. In addition, we looked at nine other combinations of features.

https://mtg.github.io/mtg-jamendo-dataset/

**Exploratory Data Analysis**

The genres within the scope of our project are:

1. blues
2. classical
3. country
4. disco
5. hiphop
6. jazz
7. metal
8. pop
9. reggae
10. rock

As noted above, we removed songs from the dataset that were labelled as being in other genres and we removed songs labelled as belonging to more than one of the genres listed above.

The exploratory data analysis that follows is on the version of the dataset that includes all of the features other than mel spectrograms, which we handle separately. We refer to this dataset as our **all features** dataset.

In [28]:
# Load the pickled "all features" dataset from the dataset folder.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
dataset_path = DATASET_FOLDER + "dataset_00_all.pickle"
df = pd.read_pickle(dataset_path)

In [None]:
# (Cell intentionally left without code: the stray bare name `BAS` that was
# here raised NameError and stopped "Restart Kernel -> Run All".)

In [3]:
# Split the frame into its parts; only the encoded labels `y`, the label
# encoder `le`, and the features `X` are kept (the first item is discarded).
split_parts = tag_label_feature_split(df, label_format="encoded")
_, (y, le), X = split_parts

In [26]:
# Class balance: number of tracks per genre name, decoded from the encoded labels.
genre_names = le.inverse_transform(y)
pd.DataFrame({"genre": genre_names}).value_counts()

genre          
genre_pop          5343
genre_classical    5276
genre_rock         4000
genre_hiphop       2126
genre_jazz         1552
genre_reggae        985
genre_metal         831
genre_blues         468
genre_country       409
genre_disco         361
dtype: int64

In [27]:
# Sanity-check the dimensions of the features and the label vector.
shape_report = f"{X.shape= } {y.shape= }"
print(shape_report)

X.shape= (21351, 2654) y.shape= (21351,)


In [None]:
# Per-genre track counts, this time as a plain Series of decoded labels.
decoded_labels = pd.Series(le.inverse_transform(y))
decoded_labels.value_counts()

In [None]:
# Hold out 20% of the tracks for testing; shuffle and stratify on the label
# so the partitions are drawn with the class proportions of `y`.
split_options = dict(test_size=0.2, random_state=1962, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Confirm the sizes of the training and test partitions.
print(
    f"{X_train.shape = }\t{y_train.shape = }",
    f"{X_test.shape =  }\t{y_test.shape =  }",
    sep="\n",
)

In [None]:
# Compare the label distribution of the full set with both partitions.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist([y, y_train, y_test], label=["y", "y_train", "y_test"])
ax.set_xlabel("Genre ID")
ax.legend()
plt.show()

In [None]:
# Standardize the features using statistics learned from the training partition only.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_test)
)

# Fit PCA on the scaled training data (n_components=.95 selects components by
# explained-variance fraction) and apply the same projection to the test data.
pca = PCA(n_components=.95, random_state=1962)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
# 5-fold stratified cross-validation over the scaled training partition.
# One XGBoost classifier is trained per fold and collected in `classifiers`.
# NOTE(review): the scaler was fitted on all of X_train before this loop, so
# each validation fold has influenced the scaling statistics — confirm this
# is acceptable for the reported numbers.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1962)

# The fold indices produced below index into the TRAINING partition, so the
# labels must be taken from y_train (the original code indexed the full `y`,
# pairing features with labels of unrelated tracks).  np.asarray keeps the
# lookup positional even if y_train happens to be a pandas Series.
y_train_values = np.asarray(y_train)

classifiers = []

for train_idx, validate_idx in skf.split(X_train, y_train):
    X_fold_train, X_fold_validate = X_train_scaled[train_idx], X_train_scaled[validate_idx]
    y_fold_train, y_fold_validate = y_train_values[train_idx], y_train_values[validate_idx]

    print("resample")
    # Resampling (SMOTE / SMOTETomek) is currently disabled: the fold's
    # training data is passed through unchanged.
    X_fold_train_resampled = X_fold_train
    y_fold_train_resampled = y_fold_train

    print("create classifier")
    classifier = XGBClassifier(
        # Previously tried, currently disabled: learning_rate=.221461,
        # n_estimators=827, max_depth=4, gamma=.524969, reg_alpha=4.327827
        use_label_encoder=False,
        tree_method="gpu_hist",
        sampling_method="gradient_based",
        objective="multi:softprob",
        eval_metric=["mlogloss", "auc"],
        early_stopping_rounds=10,
        seed=1962,
    )

    # Order matters: evals_result() reports these as "validation_0" (training
    # fold) and "validation_1" (validation fold).
    eval_set = [
        (X_fold_train_resampled, y_fold_train_resampled),
        (X_fold_validate, y_fold_validate),
    ]
    classifier.fit(
        X_fold_train_resampled,
        y_fold_train_resampled,
        eval_set=eval_set,
        verbose=False,
    )

    classifiers.append(classifier)
    

In [None]:
# Inspect the training history of the classifier stored at index 4 (the fifth fold).
test_model = classifiers[4]
test_model_results = test_model.evals_result()
# List the top-level keys of the recorded evaluation history.
for eval_set_name in test_model_results:
    print(eval_set_name)

In [None]:
# "validation_0" / "validation_1" follow the order of the eval_set given to
# fit(): training fold first, validation fold second.
train_results, test_results = (
    test_model_results[name] for name in ("validation_0", "validation_1")
)

In [None]:
# Recorded "mlogloss" history for the training and validation folds.
fig, ax = plt.subplots()
ax.plot(train_results["mlogloss"], label="train")
ax.plot(test_results["mlogloss"], label="test")
ax.legend()
plt.show()

In [None]:
# Recorded "auc" history for the training and validation folds.
fig, ax = plt.subplots()
ax.plot(train_results["auc"], label="train")
ax.plot(test_results["auc"], label="test")
ax.legend()
plt.show()

In [None]:
# Accuracy of the selected fold model on the held-out (scaled) test partition.
y_test_predicted = test_model.predict(X_test_scaled)
accuracy_score(y_test, y_test_predicted)

In [None]:
# Matthews correlation coefficient of the selected fold model on the test partition.
y_test_predicted = test_model.predict(X_test_scaled)
matthews_corrcoef(y_test, y_test_predicted)