# Music Genre Classification using multiple classifiers
Team Members: Lisa Korntheuer, Jan Birkert, Adrian Desiderato, Jan Wangerin, Spyridon Spyropoulos

## Imports

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# 0. Data understanding
Data description (features, target, etc.):
- filename and length irrelevant for ML
- 57 features -> PCA?
- only numerical data except for class labels ("label")

In [None]:
# Load the feature table from the project's data folder; the path is relative
# to the directory the notebook is run from.
df = pd.read_csv('./data/features_30_sec.csv')
# Print a summary of the columns as a first sanity check.
df.info()

In [None]:
# Show the first rows of the table (last expression of the cell, so the notebook renders it).
df.head()

Correlations between features:

In [None]:
# Pairwise correlations between the feature columns.
# The columns are selected by name (the same slice that defines the model
# input X) instead of by position: a positional slice such as iloc[:, 2:-2]
# picks a different set of columns depending on whether 'label_enc' has
# already been appended to df, i.e. on the order in which cells were run.
cor = df.loc[:, 'chroma_stft_mean':'mfcc20_var'].corr()
fig, ax = plt.subplots(figsize=(12, 12))
# Draw onto the axes created above and label every row/column of the matrix.
ax = sns.heatmap(cor, square=True, xticklabels=True, yticklabels=True, ax=ax)
plt.show()

Since there are quite a few feature combinations with high correlations, PCA may be worth a try. (See Data Prep)

## 1. Data preparation
Jan W.

Data splitting

y = LabelEncoder() 

MinMax()
StandardScaler()


In [6]:
# Encode the non-numerical class labels in 'label' as integer class ids.
LabelEnc = LabelEncoder()
# y is kept as a single-column DataFrame.
y = pd.DataFrame(LabelEnc.fit_transform(df['label']))
# Store the encoded ids next to the original labels in the feature table.
df['label_enc'] = y

In [None]:
# Feature matrix: every column from 'chroma_stft_mean' through 'mfcc20_var'
# (label-based slice, both ends included).
X = df.loc[:, 'chroma_stft_mean':'mfcc20_var']

# Min-max scaled copy of the features, as array and as DataFrame.
scaler_mms = MinMaxScaler()
X_scaled_array_mms = scaler_mms.fit_transform(X)
X_scaled_mms = pd.DataFrame(X_scaled_array_mms, columns=X.columns)

# Standardized copy of the features, as array and as DataFrame.
scaler_ss = StandardScaler()
X_scaled_array_ss = scaler_ss.fit_transform(X)
X_scaled_ss = pd.DataFrame(X_scaled_array_ss, columns=X.columns)

# Show the raw and the two scaled versions one after another.
print(X, X_scaled_mms, X_scaled_ss, sep='\n')

PCA: (copied from Material Notebook 04, probably has to be adjusted later on)

In [8]:
# n_components is typically set to the number of components to keep
# (e.g. n_components=2); None keeps all of them so the whole spectrum
# can be inspected.
pca = PCA(n_components=None)
# Principal components of the standardized features.
pcs = pca.fit_transform(X_scaled_ss)

Eigenvalues:

In [None]:
# Eigenvalues first, then the share of total variance each component explains.
print(pca.explained_variance_, pca.explained_variance_ratio_, sep='\n')

Principal Components (Dot Product of Data and Eigenvectors):

In [None]:
# First five projected samples, an empty line, then the number of samples.
print(pcs[:5], '', len(pcs), sep='\n')

Scree plot with the Kaiser criterion

In [None]:
import matplotlib.ticker as ticker

# Scree plot: eigenvalue of each principal component, numbered from 1.
fig, ax = plt.subplots()

pc_values = np.arange(1, pca.n_components_ + 1)
ax.plot(pc_values, pca.explained_variance_, 'o-', linewidth=2, color='blue')
# Component numbers are integers, so only integer ticks make sense on the x axis.
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Eigenvalue')
# Red reference line at eigenvalue 1.
ax.axhline(y=1, linewidth=1, color='r')
plt.show()

According to the Kaiser criterion (keep only components with an eigenvalue above 1), a lot of dimensions could potentially be removed. The following list shows how much "information" (explained variance) is contained in the first 10, 15, 30 and 45 principal components:

In [None]:
# Share of the total variance captured by the first n principal components.
for i in (10, 15, 30, 45):
    leading_ratios = pca.explained_variance_ratio_[:i]
    print(np.sum(leading_ratios))

To fight the curse of dimensionality, some dimensions could be removed, for example the last 12 to even 27 dimensions, since about 94% of "information" is contained in the first 30 PCs.

In [13]:
# One stratified 80/20 split of the raw features. The scaled variants below are
# built from exactly these rows, so all three variants share the same samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

# Fit each scaler on the training rows only and merely apply it to the test
# rows. Splitting data that was scaled on the full dataset would let test-set
# statistics (min/max, mean/std) leak into training.
split_scaler_mms = MinMaxScaler().fit(X_train)
X_train_mms = pd.DataFrame(split_scaler_mms.transform(X_train),
                           columns=X_train.columns, index=X_train.index)
X_test_mms = pd.DataFrame(split_scaler_mms.transform(X_test),
                          columns=X_test.columns, index=X_test.index)

split_scaler_ss = StandardScaler().fit(X_train)
X_train_ss = pd.DataFrame(split_scaler_ss.transform(X_train),
                          columns=X_train.columns, index=X_train.index)
X_test_ss = pd.DataFrame(split_scaler_ss.transform(X_test),
                         columns=X_test.columns, index=X_test.index)

# The targets are identical for every feature variant; independent copies keep
# the per-variant names used by the later cells.
y_train_mms, y_test_mms = y_train.copy(), y_test.copy()
y_train_ss, y_test_ss = y_train.copy(), y_test.copy()

## 2. Model training 

Each model is trained and the quality of the classifier (accuracy) is displayed.

### 2.1 Random Forests
Spyridon 

In this section Random Forest as a classifier will be tested. In the first step all important libraries will be imported.

In [None]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.tree as tree

Random forests do not need heavy hyperparameter tuning. The structure of the model already keeps bias low (fully grown trees) and reduces variance (averaging over many trees). It is therefore enough to tune only the number of trees in the ensemble (`n_estimators`) and the splitting criterion.

In [None]:

# Random forest tuned with a small grid search over the number of trees and
# the split criterion.
rf = RandomForestClassifier(random_state=0, n_jobs=-1)
# 'log_loss' is intentionally not in the criterion list: scikit-learn documents
# it as the same Shannon-information-gain criterion as 'entropy', so it only
# repeated identical fits (and it is rejected by releases that predate the
# alias). Plain ints keep best_params_ free of NumPy scalar types.
param_grid = {
    'n_estimators': [100, 250, 500, 1000, 2000, 7000],
    'criterion': ['gini', 'entropy'],
}
grid_search_rf = GridSearchCV(rf, param_grid, n_jobs=-1, cv=2, scoring='accuracy', verbose=3, refit=True)
# y is stored as a single-column DataFrame; the estimator expects a 1-D target.
grid_search_rf.fit(X_train, y_train.values.ravel())
# refit=True: predict() uses the best configuration retrained on all of X_train.
y_pred = grid_search_rf.predict(X_test)
print(grid_search_rf.best_params_)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
# Last expression of the cell: the notebook displays the test accuracy.
grid_search_rf.score(X_test, y_test)

### 2.2 Decision trees

Jan W.

First try: post-pruning, using all features. Post-pruning is done via hyperparameter tuning with GridSearchCV.

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn import tree

# TODO: maybe use one shared variable for random_state so that all classifiers
# can be adjusted at the same time.
clf = DecisionTreeClassifier(random_state=0)

# Cost-complexity pruning path on the training data: the effective alphas and
# the matching total leaf impurities.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

# Step plot of impurity against alpha; the last path entry is left out.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set(
    xlabel="effective alpha",
    ylabel="total impurity of leaves",
    title="Total Impurity vs effective alpha for training set",
)

plt.show()

In [None]:
# Candidate pruning strengths: every alpha of the pruning path except the last
# one (presumably because it prunes the tree down to a single node - confirm).
parameters = {'ccp_alpha': ccp_alphas[:-1].tolist()}
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=parameters,
    cv=10,
    refit=True,
)
gs.fit(X_train, y_train)

# refit=True: the best tree has already been retrained on the full training set.
tree_best = gs.best_estimator_
pred = tree_best.predict(X_test)
print('Accuracy', accuracy_score(y_test, pred))

In [None]:
# Print the learned decision rules as text.
# The feature names are passed as a plain list, as the plot_tree cells do:
# older scikit-learn releases only accept a list here and fail on a pandas Index.
tree_feature_names = X.columns.to_list()
rules = export_text(tree_best, feature_names=tree_feature_names)
print(rules)
print()
print("Feature importance:\n")
# Pair every feature name with its importance in the fitted tree.
feature_importance = dict(zip(tree_feature_names, tree_best.feature_importances_))
# Ascending sort followed by a reversal: most important features first.
features_sorted = sorted(feature_importance.items(), key=lambda x: x[1])
features_sorted.reverse()
for feature in features_sorted:
    print(feature)

In [None]:
# Draw the current tree_best; nodes are colour-filled (filled=True).
fig = plt.figure(figsize=(10, 10))
text = tree.plot_tree(tree_best, feature_names=X.columns.to_list(), filled=True)

plt.show()

Maybe try pre-pruning with a lower maximum tree depth, although that probably won't lead to better results.

In [None]:
# Pre-pruning: grid search over the maximum depth and the split criterion.
# The min_samples_leaf / min_samples_split grids are currently disabled.
params = {
    'max_depth': np.arange(3, 15),
    # 'min_samples_leaf': [3, 5, 10, 15, 20],
    # 'min_samples_split': [8, 10, 12, 18, 20, 16],
    'criterion': ['gini', 'entropy'],
}
cls = DecisionTreeClassifier(random_state=0)
gs = GridSearchCV(estimator=cls, param_grid=params, scoring='accuracy', cv=10, verbose=3, n_jobs=-1)
gs.fit(X_train, y_train)

# Keep the winning configuration for the following cells.
params_optimal = gs.best_params_

print("Best Score: %f" % gs.best_score_)
print("Optimal Hyperparameter Values: ", params_optimal)

In [None]:
# Rebuild the pre-pruned tree from the hyperparameters the preceding grid search
# selected (params_optimal) rather than from hard-coded copies of one earlier
# run, which silently go stale when the data, split or grid changes.
# NOTE: run this cell directly after the pre-pruning grid search; later cells
# reuse the name params_optimal for other models.
tree_best = DecisionTreeClassifier(random_state=0, **params_optimal)
tree_best.fit(X_train, y_train)
pred = tree_best.predict(X_test)

print('Test accuracy',accuracy_score(y_test, pred))

In [None]:
# Draw the current tree_best; nodes are colour-filled (filled=True).
fig = plt.figure(figsize=(10, 10))
text = tree.plot_tree(tree_best, feature_names=X.columns.to_list(), filled=True)

plt.show()

Also try reduction of dimensions with PCA (only first 30 or so dimensions?)

### 2.3 KNN
Lisa 

In [76]:
# Hyperparameter grid for tuning KNN via cross validation
params = dict(
    n_neighbors=np.arange(1, 40),          # parameter k: 1 .. 39
    weights=['uniform', 'distance'],       # parameter weights
    metric=['euclidean', 'manhattan'],     # parameter metric
)

In [None]:

# Create KNN classifier
knn = KNeighborsClassifier()
# Use GridSearchCV to tune multiple parameters
gs = GridSearchCV(knn, params, scoring='accuracy', cv=10, verbose=3, n_jobs=-1, refit=True)
# Train on the data scaled with MinMaxScaler. The target is stored as a
# single-column DataFrame; it is flattened to 1-D (as in the random forest
# cell) so that the fits do not emit a DataConversionWarning for a
# column-vector y.
gs.fit(X_train_mms, y_train_mms.values.ravel())

In [None]:
# Best parameter combination found by the grid search held in gs.
params_optimal = gs.best_params_

# best_score_ is the cross-validated score of that combination, not a test-set score.
print("Best score: %f" % gs.best_score_)
print("Optimal hyperparameters: ", params_optimal)

In [79]:
# Choose optimal classifier to predict
# (refit=True was set on the search, so best_estimator_ is already fitted).
knn_optimal = gs.best_estimator_
# Predict on the MinMax-scaled test features, matching the scaling used for training.
y_pred = knn_optimal.predict(X_test_mms)

In [None]:
# Accuracy for tuned KNN on the held-out test split
accuracy = accuracy_score(y_test_mms, y_pred)
print('Accuracy:', accuracy)  

### 2.4 Neural Networks

## 3. Comparing Models 

AUC + ROC 
Accuracy summary

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier  # not needed in this cell any more; kept in case other cells use it

# Handle NaN values. Gaps in the test set are filled with the TRAINING means so
# that no test-set statistic is used for imputation.
train_means_mms = X_train_mms.mean()
X_train_mms = X_train_mms.fillna(train_means_mms)
X_test_mms = X_test_mms.fillna(train_means_mms)

# Binarize the output (one indicator column per class, in sorted class order,
# which is also the column order of predict_proba).
class_labels = np.unique(y)
y_test_bin = label_binarize(y_test_mms, classes=class_labels)
y_train_bin = label_binarize(y_train_mms, classes=class_labels)  # kept for later cells
n_classes = y_test_bin.shape[1]

plt.figure(figsize=(10, 10))
colors = ['red', 'blue', 'green']
linestyles = ['-', '--', '-.']
classifiers = [knn_optimal, tree_best, grid_search_rf.best_estimator_]
labels = ['KNN', 'Decision Trees', 'Random Forest']
# Every model is scored on the feature representation it was trained on:
# KNN on the MinMax-scaled features, both tree models on the raw features.
# All variants hold the same test rows (same split seed and stratification).
test_features = [X_test_mms, X_test, X_test]

for clf, X_eval, label, clr, ls in zip(classifiers, test_features, labels, colors, linestyles):
    # Use the already fitted, tuned model directly. Wrapping it in a
    # One-vs-Rest classifier and refitting would evaluate a different model
    # than the one whose accuracy is reported above.
    y_score = clf.predict_proba(X_eval)
    # Compute micro-average ROC curve and AUC
    fpr, tpr, _ = roc_curve(y_test_bin.ravel(), y_score.ravel())
    roc_auc = auc(fpr, tpr)

    # Plot the micro-average ROC curve
    plt.plot(fpr, tpr, color=clr, linestyle=ls, label='%s (AUC = %0.2f)' % (label, roc_auc))

# Add a diagonal line for reference
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)

plt.legend(loc='lower right')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.tight_layout()
# plt.savefig('./figures/roc.png', dpi=300)
plt.show()

## 4. OPTIONAL: Song import and classify

## 5. References 