In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
from ipynb.fs.full.data_loading import load_data_from_csv

In [3]:
data = load_data_from_csv()

In [4]:
new_data = data.copy()

In [5]:
df = pd.DataFrame(columns=["label1", "label2", "DTaccuracy", "RFaccuracy", "MLPaccuracy"])

In [6]:
labels = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
for i in range(len(labels)):
    for j in range(len(labels) - i - 1):
                
        print(labels[i], labels[i + j + 1])
        
        new_data_2genre = new_data[(new_data.label == labels[i]) | (new_data.label == labels[i + j + 1])]

        y = new_data_2genre['label']
        X = new_data_2genre.drop(labels=['label', 'filename', 'audio_data', 'audio_sr'],axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)
        
        param_grid_dt = {'splitter': ['best', 'random'], 'max_depth': ['None', 2, 4, 8],
                         'random_state': [0], 'min_samples_leaf': [1, 2, 4, 8]}
        grid_dt = GridSearchCV(DecisionTreeClassifier(), param_grid_dt, refit = True, verbose = 0, cv = 5)
        grid_dt.fit(X_train, y_train)
        y_pred_dt = grid_dt.predict(X_test)
        dt_accuracy = accuracy_score(y_test, y_pred_dt)
        dt_best_estimator = str(grid_dt.best_estimator_)
        print("DT accuracy:", dt_accuracy)
        print("DT config:", dt_best_estimator)
        
        param_grid_rf = {'n_estimators': [100, 500, 1000], 'max_depth' : [None, 2, 4, 8],
                        'random_state': [0], 'min_samples_leaf': [1, 2, 4, 8]}
        grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, refit = True, verbose = 0, cv = 5)
        grid_rf.fit(X_train, y_train)
        y_pred_rf = grid_rf.predict(X_test)
        rf_accuracy = accuracy_score(y_test, y_pred_rf)
        rf_best_estimator = str(grid_rf.best_estimator_)
        print("RF accuracy:", rf_accuracy)
        print("RF config:", rf_best_estimator)
        
        param_grid_mlp = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
                'solver': ['lbgfs', 'sgd', 'adam'],
                'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,), (250, 500), (250, 500, 250), (1000, 1000)],
                'alpha': [0.0001, 0.05],
                'learning_rate': ['constant', 'invscaling', 'adaptive']}  
        grid_mlp = GridSearchCV(MLPClassifier(), param_grid_mlp, refit = True, verbose = 0, cv=5)
        grid_mlp.fit(X_train, y_train)
        y_pred_mlp = grid_mlp.predict(X_test) 
        mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
        mlp_best_estimator = str(grid_mlp.best_estimator_)
        print("MLP accuracy:", mlp_accuracy)
        print("MLP config:", mlp_best_estimator)
                
        row = {"label1" : labels[i], "label2" : labels[i + j + 1], "DTaccuracy" : dt_accuracy, "RFaccuracy" : rf_accuracy, "MLPaccuracy" : mlp_accuracy,
              "DTestimator": dt_best_estimator, "RFestimator": rf_best_estimator, "MLPestimator": mlp_best_estimator}
        df = df.append(row, ignore_index=True)
        
        print()

blues classical
DT accuracy: 0.84375
DT config: DecisionTreeClassifier(max_depth=4, random_state=0, splitter='random')
RF accuracy: 0.98125
RF config: RandomForestClassifier(random_state=0)
MLP accuracy: 0.76875
MLP config: MLPClassifier(activation='tanh', hidden_layer_sizes=(250, 500),
              learning_rate='invscaling')

blues country
DT accuracy: 0.675
DT config: DecisionTreeClassifier(max_depth=2, min_samples_leaf=8, random_state=0)
RF accuracy: 0.75
RF config: RandomForestClassifier(min_samples_leaf=4, n_estimators=500, random_state=0)
MLP accuracy: 0.575
MLP config: MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50, 50),
              learning_rate='adaptive')

blues disco
DT accuracy: 0.775
DT config: DecisionTreeClassifier(max_depth=2, random_state=0)
RF accuracy: 0.86875
RF config: RandomForestClassifier(n_estimators=1000, random_state=0)
MLP accuracy: 0.625
MLP config: MLPClassifier(activation='logistic', hidden_layer_sizes=(250, 500))

blues hiphop
DT accurac

In [7]:
df.head()

Unnamed: 0,label1,label2,DTaccuracy,RFaccuracy,MLPaccuracy,DTestimator,RFestimator,MLPestimator
0,blues,classical,0.84375,0.98125,0.76875,"DecisionTreeClassifier(max_depth=4, random_sta...",RandomForestClassifier(random_state=0),"MLPClassifier(activation='tanh', hidden_layer_..."
1,blues,country,0.675,0.75,0.575,"DecisionTreeClassifier(max_depth=2, min_sample...","RandomForestClassifier(min_samples_leaf=4, n_e...","MLPClassifier(activation='tanh', hidden_layer_..."
2,blues,disco,0.775,0.86875,0.625,"DecisionTreeClassifier(max_depth=2, random_sta...","RandomForestClassifier(n_estimators=1000, rand...","MLPClassifier(activation='logistic', hidden_la..."
3,blues,hiphop,0.83125,0.90625,0.7,"DecisionTreeClassifier(max_depth=2, min_sample...",RandomForestClassifier(random_state=0),"MLPClassifier(activation='tanh', hidden_layer_..."
4,blues,jazz,0.66875,0.90625,0.58125,"DecisionTreeClassifier(max_depth=2, min_sample...",RandomForestClassifier(random_state=0),"MLPClassifier(activation='tanh', alpha=0.05, h..."


In [8]:
df.to_csv('classification_data_2_genre.csv', index=False)

In [9]:
df.mean()['DTaccuracy']

0.8013888888888889

In [10]:
df.mean()['RFaccuracy']

0.8798611111111111

In [11]:
df.mean()['MLPaccuracy']

0.6556944444444444

In [17]:
df.groupby(['DTestimator']).mean()['DTaccuracy']

DTestimator
DecisionTreeClassifier(max_depth=2, min_samples_leaf=2, random_state=0)                                               0.839583
DecisionTreeClassifier(max_depth=2, min_samples_leaf=2, random_state=0,\n                       splitter='random')    0.885417
DecisionTreeClassifier(max_depth=2, min_samples_leaf=4, random_state=0)                                               0.751786
DecisionTreeClassifier(max_depth=2, min_samples_leaf=4, random_state=0,\n                       splitter='random')    0.825000
DecisionTreeClassifier(max_depth=2, min_samples_leaf=8, random_state=0)                                               0.756250
DecisionTreeClassifier(max_depth=2, min_samples_leaf=8, random_state=0,\n                       splitter='random')    0.750000
DecisionTreeClassifier(max_depth=2, random_state=0)                                                                   0.819444
DecisionTreeClassifier(max_depth=2, random_state=0, splitter='random')                             

In [13]:
df.groupby(['RFestimator']).mean()['RFaccuracy']

RFestimator
RandomForestClassifier(max_depth=2, min_samples_leaf=4, n_estimators=1000,\n                       random_state=0)    0.743750
RandomForestClassifier(max_depth=2, n_estimators=500, random_state=0)                                                 0.837500
RandomForestClassifier(min_samples_leaf=2, n_estimators=1000, random_state=0)                                         0.825000
RandomForestClassifier(min_samples_leaf=2, random_state=0)                                                            0.781250
RandomForestClassifier(min_samples_leaf=4, n_estimators=500, random_state=0)                                          0.753125
RandomForestClassifier(min_samples_leaf=8, n_estimators=500, random_state=0)                                          0.815625
RandomForestClassifier(n_estimators=1000, random_state=0)                                                             0.860000
RandomForestClassifier(n_estimators=500, random_state=0)                                           

In [14]:
df.groupby(['MLPestimator']).mean()['MLPaccuracy']

MLPestimator
MLPClassifier(activation='identity')                                                                                                       0.456250
MLPClassifier(activation='identity', learning_rate='invscaling')                                                                           0.531250
MLPClassifier(activation='logistic', alpha=0.05,\n              hidden_layer_sizes=(1000, 1000), learning_rate='adaptive')                 0.675000
MLPClassifier(activation='logistic', alpha=0.05,\n              hidden_layer_sizes=(1000, 1000), learning_rate='invscaling')               0.750000
MLPClassifier(activation='logistic', alpha=0.05,\n              hidden_layer_sizes=(250, 500, 250), learning_rate='invscaling')            0.831250
MLPClassifier(activation='logistic', alpha=0.05, hidden_layer_sizes=(250, 500))                                                            0.568750
MLPClassifier(activation='logistic', alpha=0.05, hidden_layer_sizes=(250, 500),\n              lear