In [1]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import numpy as np
import re

In [2]:
HIDDEN_PARAMS = [(100,80,60), (150,120, 100), (150,100)]
ALPHA = [1e-3, 1e-5]
ESTIMATORS = [100,200]
MAX_DEPTH =[3,5,7]
MIN_LEAF = [50, 100]

In [3]:
def metrics(model, x, y):
    prediction = model.predict(x)

    f1 = f1_score(y, prediction, average='macro')
    precision = precision_score(y, prediction, average='macro')
    recall = recall_score(y, prediction, average='macro')
    acc = accuracy_score(y, prediction)
    scores = {'f1' : f1, 'precision' : precision, 'recall' : recall, 'accuracy':acc}

    return scores

In [4]:

def run_MLP(trainx, trainy, testx, testy):
    
    #activations = ['logistic', 'tanh', 'relu']
   
    def tune_nn(x, y, hiddenparams, alphaparams):

        grid = {'hidden_layer_sizes': hiddenparams, 'alpha': alphaparams}

        nn = MLPClassifier(activation ='relu', max_iter = 500)

        grid_search = GridSearchCV(estimator=nn, param_grid=grid, scoring='accuracy', verbose=3, cv = 3)

        grid_search.fit(x, y)

        besthidden = grid_search.best_params_['hidden_layer_sizes']
        # bestactivation = grid_search.best_params_['activation']
        bestalpha = grid_search.best_params_['alpha']

        results = {'best-hidden': besthidden, 'best-activation': 'relu', 'best-alpha': bestalpha}
        return results

    best_results = tune_nn(trainx, trainy, HIDDEN_PARAMS, ALPHA)
    hidden = best_results['best-hidden']
    activation = 'relu'
    alpha = best_results['best-alpha']
    nn = MLPClassifier(hidden_layer_sizes=hidden, activation=activation, alpha=alpha)

    nn.fit(trainx, trainy)

    # metrics for test
    test_metrics = metrics(nn, testx, testy)
    return test_metrics

In [5]:
def run_RF(trainx, trainy, testx, testy):
    num_estimators = []
    max_depths = []
    min_samples_leaf = []
    def tune_rf(x, y, num_estimators, dparams, lsparams):
        # 2a
        grid = {'n_estimators' : num_estimators, 'max_depth': dparams, 'min_samples_leaf': lsparams}
        rf = RandomForestClassifier()
        grid_search = GridSearchCV(estimator=rf, param_grid=grid, scoring='accuracy', verbose=3, cv = 3)
        grid_search.fit(x, y)
        best_estimator = grid_search.best_params_['n_estimators']
        best_depth = grid_search.best_params_['max_depth']
        best_leaf_samples = grid_search.best_params_['min_samples_leaf']
        results = {'best-estimator' : best_estimator, 'best-depth': best_depth, 'best-leaf-samples': best_leaf_samples}
        return results

    best_results = tune_rf(trainx, trainy, ESTIMATORS, MAX_DEPTH, MIN_LEAF)
    estimator = best_results['best-estimator']
    max_depth = best_results['best-depth']
    leaves = best_results['best-leaf-samples']
    forest = RandomForestClassifier(n_estimators=estimator, max_depth=max_depth, min_samples_leaf=leaves)

    forest.fit(trainx, trainy)
    # metrics for test
    test_metrics = metrics(forest, testx, testy)
    return test_metrics


In [6]:
input_csv = 'mega_data_3.csv'
df = pd.read_csv(input_csv)

y = df['genre']

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
# splitting data

lyrics_train = scaler.fit_transform(X_train.iloc[:, 282:])
lyrics_test = scaler.transform(X_test.iloc[:, 282:])
midi_train = scaler.fit_transform(X_train.iloc[:, 26:282])
midi_test = scaler.transform(X_test.iloc[:, 26:282])

audio_indices = [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
audio_metadata_train = scaler.fit_transform(X_train.iloc[:, audio_indices].copy())
audio_metadata_test = scaler.transform(X_test.iloc[:, audio_indices].copy())
# audio_metadata_train = audio_metadata_train.values
# audio_metadata_test = audio_metadata_test.values

In [28]:
np.concatenate([audio_metadata_train, lyrics_train], axis =1 )

(5783, 395)

In [None]:
# -----------MLP models--------------
# SINGLE MODALITY

In [39]:
trainx = midi_train
testx = midi_test
# trainx = audio_metadata_train.join(lyrics_train)
# testx = audio_metadata_test.join(lyrics_test)
run_MLP(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.880 total time=   4.0s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.885 total time=   6.6s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.873 total time=   3.5s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.879 total time=   6.6s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.870 total time=   5.1s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.878 total time=   7.1s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.881 total time=   4.1s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.883 total time=   3.3s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.887 total time=   4.0s
[CV 1/3] END alpha=1e-05, hidden_layer_sizes=(100, 80, 60);, score=0.871 total time=   3.4s
[CV 2/3] END alpha=1e-0

{'f1': 0.9159903235604174,
 'precision': 0.9199423453594889,
 'recall': 0.9140297807065925,
 'accuracy': 0.9204702627939142}

In [7]:
trainx = lyrics_train
testx = lyrics_test
# trainx = audio_metadata_train.join(lyrics_train)
# testx = audio_metadata_test.join(lyrics_test)
run_MLP(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.793 total time=   6.0s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.813 total time=   4.7s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.801 total time=   3.6s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.788 total time=   5.3s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.820 total time=   9.7s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.806 total time=   5.1s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.792 total time=   6.4s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.818 total time=   3.6s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.811 total time=   6.5s
[CV 1/3] END alpha=1e-05, hidden_layer_sizes=(100, 80, 60);, score=0.786 total time=   6.8s
[CV 2/3] END alpha=1e-0

{'f1': 0.8227845223758011,
 'precision': 0.8313175997583,
 'recall': 0.8194547042634082,
 'accuracy': 0.8340248962655602}

In [38]:
trainx = lyrics_train
testx = lyrics_test
# trainx = audio_metadata_train.join(lyrics_train)
# testx = audio_metadata_test.join(lyrics_test)
run_MLP(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.781 total time=   5.9s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.810 total time=   3.6s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.795 total time=   3.9s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.786 total time=   5.4s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.806 total time=   5.3s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.814 total time=   6.5s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.791 total time=   6.5s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.808 total time=  11.7s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.800 total time=   8.7s
[CV 1/3] END alpha=1e-05, hidden_layer_sizes=(100, 80, 60);, score=0.788 total time=   4.5s
[CV 2/3] END alpha=1e-0

{'f1': 0.8268088254409075,
 'precision': 0.849632105427005,
 'recall': 0.8148133871394714,
 'accuracy': 0.8381742738589212}

In [None]:
# ------------MLP-------------
# DOUBLE MODALITIES

In [8]:
trainx = np.concatenate([audio_metadata_train, lyrics_train], axis =1 )
testx = np.concatenate([audio_metadata_test, lyrics_test], axis =1 )
# trainx = audio_metadata_train.join(lyrics_train)
# testx = audio_metadata_test.join(lyrics_test)
run_MLP(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.789 total time=   4.3s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.816 total time=   3.3s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.806 total time=   3.3s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.812 total time=   5.8s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.816 total time=   5.7s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.811 total time=   7.5s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.807 total time=   5.6s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.822 total time=   5.3s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.811 total time=   6.9s
[CV 1/3] END alpha=1e-05, hidden_layer_sizes=(100, 80, 60);, score=0.792 total time=   3.3s
[CV 2/3] END alpha=1e-0

{'f1': 0.8334596629330626,
 'precision': 0.8314787599290354,
 'recall': 0.838625197270161,
 'accuracy': 0.8499308437067773}

In [9]:
trainx = np.concatenate([midi_train, lyrics_train], axis =1 )
testx = np.concatenate([midi_test, lyrics_test], axis =1 )
# trainx = midi_train.join(lyrics_train)
# testx = midi_test.join(lyrics_test)
run_MLP(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.863 total time=   2.9s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.871 total time=   2.2s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.862 total time=   2.6s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.863 total time=   2.9s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.866 total time=   3.7s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.873 total time=   4.4s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.865 total time=   3.0s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.874 total time=   3.5s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.872 total time=   3.2s
[CV 1/3] END alpha=1e-05, hidden_layer_sizes=(100, 80, 60);, score=0.860 total time=   2.7s
[CV 2/3] END alpha=1e-0

{'f1': 0.9136775625258814,
 'precision': 0.9217149903479919,
 'recall': 0.9063206871092634,
 'accuracy': 0.9100968188105117}

In [32]:
trainx = np.concatenate([midi_train, audio_metadata_train], axis =1 )
testx = np.concatenate([midi_test, audio_metadata_test], axis =1 )
# trainx = audio_metadata_train.join(midi_train)
# testx = audio_metadata_test.join(midi_test)
run_MLP(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.878 total time=   2.3s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.880 total time=   2.5s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.881 total time=   2.5s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.885 total time=   3.0s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.875 total time=   4.0s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.880 total time=   2.1s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.887 total time=   2.9s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.885 total time=   5.0s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.894 total time=   6.3s
[CV 1/3] END alpha=1e-05, hidden_layer_sizes=(100, 80, 60);, score=0.872 total time=   3.2s
[CV 2/3] END alpha=1e-0

{'f1': 0.9181253857964793,
 'precision': 0.92425100332191,
 'recall': 0.9134133664418427,
 'accuracy': 0.9246196403872753}

In [None]:
# --------------- MLP------------
# ALL MODALITIES

In [33]:
trainx = np.concatenate([midi_train, audio_metadata_train, lyrics_train], axis =1 )
testx = np.concatenate([midi_test, audio_metadata_test, lyrics_test], axis =1 )
# trainx = audio_metadata_train.join(midi_train)
# testx = audio_metadata_test.join(midi_test)
run_MLP(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.867 total time=   3.3s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.859 total time=   2.6s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(100, 80, 60);, score=0.868 total time=   2.3s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.862 total time=   2.9s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.873 total time=   3.1s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 120, 100);, score=0.873 total time=   3.3s
[CV 1/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.868 total time=   3.2s
[CV 2/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.876 total time=   3.2s
[CV 3/3] END alpha=0.001, hidden_layer_sizes=(150, 100);, score=0.881 total time=   3.3s
[CV 1/3] END alpha=1e-05, hidden_layer_sizes=(100, 80, 60);, score=0.870 total time=   2.2s
[CV 2/3] END alpha=1e-0

{'f1': 0.9225468905013273,
 'precision': 0.9383192616094808,
 'recall': 0.9090540717458606,
 'accuracy': 0.9204702627939142}

In [None]:
# -----------RF models--------------
# SINGLE MODALITY

In [None]:
trainx = midi_train
testx = midi_test
# trainx = audio_metadata_train.join(lyrics_train)
# testx = audio_metadata_test.join(lyrics_test)
run_RF(trainx, y_train, testx, y_test)

In [None]:
trainx = lyrics_train
testx = lyrics_test
# trainx = audio_metadata_train.join(lyrics_train)
# testx = audio_metadata_test.join(lyrics_test)
run_RF(trainx, y_train, testx, y_test)

In [None]:
trainx = audio_metadata_train
testx = audio_metadata_test
# trainx = audio_metadata_train.join(lyrics_train)
# testx = audio_metadata_test.join(lyrics_test)
run_RF(trainx, y_train, testx, y_test)

In [None]:
# -----------RF models--------------
# DOUBLE MODALITY

In [34]:
trainx = np.concatenate([audio_metadata_train, lyrics_train], axis =1 )
testx = np.concatenate([audio_metadata_test, lyrics_test], axis =1 )
run_RF(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.429 total time=   1.1s
[CV 2/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.432 total time=   1.1s
[CV 3/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.429 total time=   1.1s
[CV 1/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.430 total time=   2.2s
[CV 2/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.429 total time=   2.2s
[CV 3/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.428 total time=   2.2s
[CV 1/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.423 total time=   1.1s
[CV 2/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.423 total time=   1.1s
[CV 3/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.424 total time=   1.1s
[CV 1/3] END max_depth=3, min_samples_leaf=100, n_estimators=20

  _warn_prf(average, modifier, msg_start, len(result))


{'f1': 0.15424246509925357,
 'precision': 0.5572916666666666,
 'recall': 0.16879921864086705,
 'accuracy': 0.4785615491009682}

In [35]:
trainx = np.concatenate([midi_train, lyrics_train], axis =1 )
testx = np.concatenate([midi_test, lyrics_test], axis =1 )
run_RF(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.448 total time=   1.3s
[CV 2/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.446 total time=   1.3s
[CV 3/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.441 total time=   1.3s
[CV 1/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.449 total time=   2.5s
[CV 2/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.446 total time=   2.5s
[CV 3/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.444 total time=   2.5s
[CV 1/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.438 total time=   1.2s
[CV 2/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.436 total time=   1.2s
[CV 3/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.436 total time=   1.2s
[CV 1/3] END max_depth=3, min_samples_leaf=100, n_estimators=20

  _warn_prf(average, modifier, msg_start, len(result))


{'f1': 0.31876596931360085,
 'precision': 0.793465771649734,
 'recall': 0.27551455627338706,
 'accuracy': 0.5573997233748271}

In [36]:
trainx = np.concatenate([midi_train, audio_metadata_train], axis =1 )
testx = np.concatenate([midi_test, audio_metadata_test], axis =1 )
run_RF(trainx, y_train, testx, y_test)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.450 total time=   0.7s
[CV 2/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.449 total time=   0.7s
[CV 3/3] END max_depth=3, min_samples_leaf=50, n_estimators=100;, score=0.461 total time=   0.7s
[CV 1/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.453 total time=   1.3s
[CV 2/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.449 total time=   1.4s
[CV 3/3] END max_depth=3, min_samples_leaf=50, n_estimators=200;, score=0.456 total time=   1.3s
[CV 1/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.441 total time=   0.7s
[CV 2/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.436 total time=   0.7s
[CV 3/3] END max_depth=3, min_samples_leaf=100, n_estimators=100;, score=0.440 total time=   0.7s
[CV 1/3] END max_depth=3, min_samples_leaf=100, n_estimators=20

  _warn_prf(average, modifier, msg_start, len(result))


{'f1': 0.3695760615692208,
 'precision': 0.6853662953907452,
 'recall': 0.3191849358499159,
 'accuracy': 0.5912863070539419}

In [None]:
# --------------- MLP------------
# ALL MODALITIES

In [None]:
trainx = np.concatenate([midi_train, audio_metadata_train, lyrics_train], axis =1 )
testx = np.concatenate([midi_test, audio_metadata_test, lyrics_test], axis =1 )
# trainx = audio_metadata_train.join(midi_train)
# testx = audio_metadata_test.join(midi_test)
run_RF(trainx, y_train, testx, y_test)