In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import keras_tuner as kt
from keras.callbacks import EarlyStopping
import seaborn as sns
from IPython.core.display_functions import display
from IPython.display import Audio
from livelossplot import PlotLossesKeras

import keras
from keras import layers
from keras.models import Sequential

from keras_preprocessing.image import img_to_array

import tensorflow as tf
from keras.layers import *

import os
from tqdm.notebook import tqdm
from pathlib import Path
import shutil


from music_plots import *
from sklearn.metrics import roc_auc_score
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix


<h1> Data management </h1>

In [None]:
# Read the track metadata, genre table and per-track feature table from CSV.
# NOTE(review): `load` is not defined in this notebook; presumably it is provided
# by the wildcard `from music_plots import *` — confirm.
tracks_df = load("data/tracks.csv")
genres_df = load("data/genres.csv")
features_df = load("data/features.csv")

In [None]:
# A notebook cell only renders its final expression, so a bare
# `tracks_df.head()` in the middle of the cell is silently discarded.
# Each preview is therefore rendered explicitly.
display(tracks_df.head())
display(genres_df.head())
#features_df.head()

In [None]:
# Top-level genre of every track whose feature row has no missing values;
# tracks without a top-level genre are dropped as well.
track_genres = (
    tracks_df.xs('track', level=0, axis=1)['genre_top']
    .loc[features_df.dropna().index]
    .dropna()
)
# Keep only the feature rows that still have a label, in the same order.
features_df = features_df.loc[track_genres.index]

track_genres

In [None]:
# Class balance: number of tracks per top-level genre, as text and as a bar chart.
value_counts = track_genres.value_counts()
print(value_counts)
value_counts.plot(kind='bar')

In [None]:
# genres dictionary: genre name -> integer id, numbered in order of first appearance
genres = {genre: idx for idx, genre in enumerate(track_genres.unique())}
print(genres)

In [None]:
# Labels and features must remain row-aligned after the filtering above.
print(len(track_genres))
print(len(features_df))
assert len(track_genres) == len(features_df), "features and labels are misaligned"
# A bare expression in the middle of a cell is never rendered, so the NaN check
# is printed explicitly (True means at least one missing value remains).
print(features_df.isna().any().any())
features_df

In [None]:
# generate train and test set

# Only the first 8000 rows are used here; the 60/40 split is stratified on genre.
X_train, X_test, y_train, y_test = train_test_split(
    features_df.iloc[:8000],
    track_genres.iloc[:8000],
    test_size=0.4,
    random_state=42,
    stratify=track_genres.iloc[:8000],
)
# X_train, X_test, y_train, y_test = train_test_split(features_df, track_genres, test_size=0.4, random_state=42, stratify=track_genres)

# Standardise both splits with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Encode genre names as integers, with the mapping learned from the training labels.
lab_encoder = LabelEncoder().fit(y_train)
y_train, y_test = lab_encoder.transform(y_train), lab_encoder.transform(y_test)

In [None]:
def evaluate_classifier(x_tr, x_te, y_tr, y_te, model):
    """Fit ``model`` on the training split and print a report for the test split.

    Prints, in order: the first ten predictions, the first ten true labels,
    the per-class probability table for the test samples (only when the model
    exposes ``predict_proba``) and the classification report.

    Args:
        x_tr: Training features passed to ``model.fit``.
        x_te: Test features passed to ``model.predict``.
        y_tr: Training labels passed to ``model.fit``.
        y_te: Test labels compared against the predictions.
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba``
            is optional.

    Returns:
        None. All results are printed.
    """
    model.fit(x_tr, y_tr)
    prediction = model.predict(x_te)
    print(prediction[:10])
    print(y_te[:10])
    # Not every estimator can produce probabilities (e.g. SVC without
    # probability=True), so this part of the report is optional.
    if hasattr(model, "predict_proba"):
        proba_test = model.predict_proba(x_te)
        # Label the columns with the classes the model was fitted on, when available.
        print(pd.DataFrame(proba_test, columns=getattr(model, "classes_", None)))
    print(classification_report(y_te, prediction))

<h1>PCA</h1>

In [14]:
from sklearn.decomposition import PCA

def pca_data(train_set, test_set, n_components, to_scale=True):
    """Project a train/test pair onto principal components fitted on the train split.

    Args:
        train_set: Training features; the scaler (if used) and the PCA are fitted on it.
        test_set: Test features, transformed with the objects fitted on ``train_set``.
        n_components: Forwarded to ``PCA(n_components=...)``.
        to_scale: When true, both splits are standardised with a ``StandardScaler``
            fitted on ``train_set`` before the PCA.

    Returns:
        Tuple ``(train_projection, test_projection)``.
    """
    if to_scale:
        # Standardise with statistics taken from the training split only.
        data_scaler = StandardScaler().fit(train_set)
        train_set, test_set = (
            data_scaler.transform(split) for split in (train_set, test_set)
        )

    pca = PCA(n_components=n_components).fit(train_set)
    return pca.transform(train_set), pca.transform(test_set)

In [None]:
# PCA test
# Unscaled copy of the split (same slice, test_size and random_state as X_train / X_test).
x_data, x_test, _, _ = train_test_split(features_df.iloc[:8000], track_genres.iloc[:8000], test_size=0.4, random_state=42, stratify=track_genres.iloc[:8000])

# already scaled: X_train / X_test were standardised earlier, so the scaler is skipped.
# x holds the train projection, y the test projection (not labels).
x, y = pca_data(X_train, X_test, 2, to_scale=False)
print(x)
print(y)

# to scale: raw features, standardised inside pca_data
x, y = pca_data(x_data, x_test, 2, to_scale=True)
print(x)
print(y)

<h1>SVM</h1>

In [None]:
from sklearn.svm import SVC

# probability=True fits an internal cross-validated calibration that shuffles
# the data, so the seed is fixed to make the printed probabilities reproducible.
svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)
print("Without PCA")
evaluate_classifier(X_train, X_test, y_train, y_test, svm_model)

pca_svm_model = SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=42)
print("With PCA")
# X_train / X_test are already standardised, so pca_data must not scale them again.
pca_train, pca_test = pca_data(X_train, X_test, 2, to_scale=False)
print(pca_train, pca_test)
evaluate_classifier(pca_train, pca_test, y_train, y_test, pca_svm_model)

<h1>K-nearest neighbors</h1>

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline with the library's default settings,
# fitted and reported through the shared evaluate_classifier helper.
knn_model = KNeighborsClassifier()
evaluate_classifier(X_train, X_test, y_train, y_test, knn_model)

<h1>Random forest</h1>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic; without a fixed seed the report changes on
# every run. 42 matches the seed used for the data splits.
rf_model = RandomForestClassifier(random_state=42)
evaluate_classifier(X_train, X_test, y_train, y_test, rf_model)

<h1>Naive Bayes</h1>

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline with default settings,
# fitted and reported through the shared evaluate_classifier helper.
nb_model = GaussianNB()
evaluate_classifier(X_train, X_test, y_train, y_test, nb_model)

<h1>Neural network</h1>

In [None]:
# Split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(features_df, track_genres, train_size=0.8, random_state=42, stratify=track_genres)

# Split remaining dataset in test and validation
# The fraction is defined once and actually passed on, and the split is seeded
# so that validation/test membership is identical on every run.
# NOTE(review): this second split is not stratified; adding stratify=y_rem would
# keep class proportions but fails if a genre has a single sample in y_rem — confirm.
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=test_size, random_state=42)

print(f"Training has {len(X_train), len(y_train)}, Validation has {len(X_valid), len(y_valid)}, Testing has {len(X_test), len(y_test)}")

In [None]:
# Standardise all three splits with statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# Encode genre names as integers, with the mapping learned from the training labels.
lab_encoder = LabelEncoder()
lab_encoder.fit(y_train)

y_train = lab_encoder.transform(y_train)
y_valid = lab_encoder.transform(y_valid)
y_test = lab_encoder.transform(y_test)

# Without num_classes, to_categorical sizes the one-hot width from the largest
# label present in each array, so a split missing the highest-numbered genre
# would get fewer columns than the training targets. Fix the width explicitly.
num_classes = len(lab_encoder.classes_)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

In [None]:
# Fully connected classifier: two ReLU hidden layers and a softmax output.
# Input width and class count are read from the prepared arrays instead of
# being hard-coded, so the cell keeps working if the feature set changes.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()

model.add(Dense(256, activation='relu', input_shape=(n_features,)))
model.add(Dense(128, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Stop when the validation loss has not improved for 10 epochs and
# roll the model back to the best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stop, PlotLossesKeras()],
)

In [None]:
# Score the trained network on the held-out test split.
model.evaluate(X_test, y_test)

In [None]:
def model_builder(hp):
    """Build and compile a two-hidden-layer classifier for the hyperparameter search.

    Registers three hyperparameters on ``hp``: ``units1`` and ``units2``
    (32-512 in steps of 32) for the hidden layer widths, and ``learning_rate``
    (1e-2, 1e-3 or 1e-4) for the Adam optimizer.

    The input width and number of output classes are read from the notebook
    globals ``X_train`` and ``y_train`` (one-hot targets) at call time.

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        The compiled Keras model.
    """
    # Derived from the data rather than hard-coded so the builder follows
    # any change in the feature set or label set.
    n_features = X_train.shape[1]
    n_classes = y_train.shape[1]

    model = keras.Sequential()

    # Tune the number of units in both hidden Dense layers
    # Choose an optimal value between 32-512
    hp_units1 = hp.Int('units1', min_value=32, max_value=512, step=32)
    model.add(Dense(hp_units1, activation='relu', input_shape=(n_features,)))
    hp_units2 = hp.Int('units2', min_value=32, max_value=512, step=32)
    model.add(Dense(hp_units2, activation='relu'))

    model.add(Dense(n_classes, activation='softmax'))
    # Tune the learning rate for the optimizer
    # Choose an optimal value from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

    return model

# Hyperband search over model_builder's hyperparameters, selecting on validation accuracy.
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

# Abort a trial once its validation loss has not improved for 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# model_builder registers 'units1' and 'units2'; there is no hyperparameter
# called 'units', so both real names are reported here.
print(f"""
The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is {best_hps.get('units1')}, in the second densely-connected layer is {best_hps.get('units2')}
and the optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.
""")

In [None]:
# Build the model with the optimal hyperparameters and train it on the data for 10 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))

# Pick the epoch with the highest validation accuracy (+1 because epochs are
# counted from 1 while list indices start at 0).
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

In [None]:
# Fresh model with the best hyperparameters, trained for exactly the number
# of epochs that gave the highest validation accuracy.
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model
hypermodel.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=best_epoch,
    callbacks=[PlotLossesKeras()],
)

In [None]:
# Score the retrained, tuned network on the held-out test split.
hypermodel.evaluate(X_test, y_test)