In [1]:
# experiment with Wide, Deep, and Cross networks
# https://keras.io/examples/structured_data/wide_deep_cross_networks/

In [62]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

In [63]:
# Read in the data
df_h = pd.read_csv('C:/Users/Simon/Documents/projects/MusicLabel/data/h_list.csv', index_col=0)
df_s = pd.read_csv('C:/Users/Simon/Documents/projects/MusicLabel/data/s_list.csv', index_col=0)
df = pd.concat([df_h, df_s], ignore_index=True)

# Drop variables not used in model, and recode artist
df.drop(['title', 'album', 'sr'], axis=1, inplace=True)
artist_to_change = df.loc[df.groupby('artist')['artist'].transform('size') <= 2, 'artist']
df.loc[df['artist'].isin(artist_to_change), 'artist'] = 'Other artists'
df['artist'].fillna('Other artists', inplace = True)

In [64]:
train_splits = []
test_splits = []
split = 0.2

for _, group_data in df.groupby("label"):
    random_selection = np.random.rand(len(group_data.index)) <= (1-split)
    train_splits.append(group_data[random_selection])
    test_splits.append(group_data[~random_selection])

train_data = pd.concat(train_splits).sample(frac=1).reset_index(drop=True)
test_data = pd.concat(test_splits).sample(frac=1).reset_index(drop=True)

print(f"Train split size: {len(train_data.index)}")
print(f"Test split size: {len(test_data.index)}")

Train split size: 250
Test split size: 58


In [65]:
train_data_file = "train_data.csv"
test_data_file = "test_data.csv"

train_data.to_csv(train_data_file, index=False)
test_data.to_csv(test_data_file, index=False)

In [66]:
TARGET_FEATURE_NAME = "label"
TARGET_FEATURE_LABELS = ["0", "1"]
NUM_CLASSES = len(TARGET_FEATURE_LABELS)

NUMERIC_FEATURE_NAMES = df.select_dtypes(include=np.number).columns.tolist()[1:] # exclude label

CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "artist": list(df["artist"].unique())
}

CSV_HEADER = df.columns.tolist()

CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME] else ["NA"]
    for feature_name in df.columns.tolist()
]

In [67]:
def get_dataset_from_csv(csv_file_path, batch_size, shuffle=False):

    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=True,
        shuffle=shuffle,
    )
    return dataset.cache()

In [68]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float64
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

In [69]:
def encode_inputs(inputs, use_embedding=False):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # Create a lookup to convert string values to an integer indices.
            # Since we are not using a mask token nor expecting any out of vocabulary
            # (oov) token, we set mask_token to None and  num_oov_indices to 0.
            lookup = StringLookup(
                vocabulary=vocabulary,
                mask_token=None,
                num_oov_indices=0,
                output_mode="int" if use_embedding else "binary",
            )
            if use_embedding:
                # Convert the string input values into integer indices.
                encoded_feature = lookup(inputs[feature_name])
                embedding_dims = int(math.sqrt(len(vocabulary)))
                # Create an embedding layer with the specified dimensions.
                embedding = layers.Embedding(
                    input_dim=len(vocabulary), output_dim=embedding_dims
                )
                # Convert the index values to embedding representations.
                encoded_feature = embedding(encoded_feature)
            else:
                # Convert the string input values into a one hot encoding.
                encoded_feature = lookup(tf.expand_dims(inputs[feature_name], -1))
        else:
            # Use the numerical features as-is.
            encoded_feature = tf.expand_dims(inputs[feature_name], -1)

        encoded_features.append(encoded_feature)

    all_features = layers.concatenate(encoded_features)
    return all_features

In [70]:
def create_baseline_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)

    for units in hidden_units:
        features = layers.Dense(units)(features)
        features = layers.BatchNormalization()(features)
        features = layers.ReLU()(features)
        features = layers.Dropout(dropout_rate)(features)

    outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [73]:
def run_experiment(model):

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)
    test_dataset = get_dataset_from_csv(test_data_file, batch_size)

    print("Start training the model...")
    history = model.fit(train_dataset, epochs=num_epochs)
    print("Model training finished")

    _, accuracy = model.evaluate(test_dataset, verbose=0)

    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

In [74]:
learning_rate = 0.001
dropout_rate = 0.1
batch_size = 32
num_epochs = 20
hidden_units = [32, 32]

baseline_model = create_baseline_model()

In [35]:
run_experiment(baseline_model)

Start training the model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model training finished
Test accuracy: 60.71%


In [38]:
def create_wide_and_deep_model():

    inputs = create_model_inputs()
    wide = encode_inputs(inputs)
    wide = layers.BatchNormalization()(wide)

    deep = encode_inputs(inputs, use_embedding=True)
    for units in hidden_units:
        deep = layers.Dense(units)(deep)
        deep = layers.BatchNormalization()(deep)
        deep = layers.ReLU()(deep)
        deep = layers.Dropout(dropout_rate)(deep)

    merged = layers.concatenate([wide, deep])
    outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(merged)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


wide_and_deep_model = create_wide_and_deep_model()
run_experiment(wide_and_deep_model)

Start training the model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model training finished
Test accuracy: 71.43%


In [45]:
num_epochs = 10

run_experiment(wide_and_deep_model)

Start training the model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model training finished
Test accuracy: 73.21%


In [48]:
num_epochs = 5

run_experiment(wide_and_deep_model)

Start training the model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model training finished
Test accuracy: 66.07%


In [39]:
def create_deep_and_cross_model():

    inputs = create_model_inputs()
    x0 = encode_inputs(inputs, use_embedding=True)

    cross = x0
    for _ in hidden_units:
        units = cross.shape[-1]
        x = layers.Dense(units)(cross)
        cross = x0 * x + cross
    cross = layers.BatchNormalization()(cross)

    deep = x0
    for units in hidden_units:
        deep = layers.Dense(units)(deep)
        deep = layers.BatchNormalization()(deep)
        deep = layers.ReLU()(deep)
        deep = layers.Dropout(dropout_rate)(deep)

    merged = layers.concatenate([cross, deep])
    outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(merged)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


deep_and_cross_model = create_deep_and_cross_model()

"""
Let's run it:
"""

run_experiment(deep_and_cross_model)

Start training the model...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model training finished
Test accuracy: 62.5%


In [41]:
run_experiment(deep_and_cross_model, num_epochs=9)

Start training the model...
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
Model training finished
Test accuracy: 64.29%


In [42]:
# Improvement over basic model, wide and deep seem to work best, but still cannot get me in the 80%

In [11]:
# Try with PCA using these NN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector

In [8]:
x_train = train_data.loc[:, ~train_data.columns.isin(['label'])]
y_train = train_data.loc[:, 'label'].values

x_test = test_data.loc[:, ~test_data.columns.isin(['label'])]
y_test = test_data.loc[:, 'label'].values

In [12]:
# Run PCA and use 95% variance explained
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), make_column_selector(dtype_exclude='object')),
    ('cat', OneHotEncoder(), make_column_selector(dtype_include='object'))])
x_train_pp = preprocessor.fit_transform(x_train)

pca = PCA(.95)
x_train_pca = pca.fit_transform(x_train_pp)

In [15]:
x_test_pp = preprocessor.transform(x_test)
x_test_pca = pca.transform(x_test_pp)

In [99]:
# Combine back to a pd df to repeat the NN analysis above
train_pca = pd.concat([pd.DataFrame(x_train_pca).add_prefix('pca_'), pd.DataFrame(y_train, columns=['label'])], axis=1)
test_pca = pd.concat([pd.DataFrame(x_test_pca).add_prefix('pca_'), pd.DataFrame(y_test, columns=['label'])], axis=1)

In [100]:
TARGET_FEATURE_NAME = "label"
TARGET_FEATURE_LABELS = ["0", "1"]
NUM_CLASSES = len(TARGET_FEATURE_LABELS)

NUMERIC_FEATURE_NAMES = train_pca.select_dtypes(include=np.number).columns.tolist()[:-1] # exclude label

CSV_HEADER = train_pca.columns.tolist()

FEATURE_NAMES = NUMERIC_FEATURE_NAMES

COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME] else ["NA"]
    for feature_name in train_pca.columns.tolist()
]

In [102]:
train_data_file = "train_data_pca.csv"
test_data_file = "test_data_pca.csv"

train_pca.to_csv(train_data_file, index=False)
test_pca.to_csv(test_data_file, index=False)

In [103]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float64
        )
    return inputs

In [104]:
def encode_inputs(inputs, use_embedding=False):
    encoded_features = []
    for feature_name in inputs:
        encoded_feature = tf.expand_dims(inputs[feature_name], -1)
        encoded_features.append(encoded_feature)

    all_features = layers.concatenate(encoded_features)
    return all_features

In [148]:
hidden_units = [8, 8, 8, 8, 8]
learning_rate = 0.001
dropout_rate = 0.1
batch_size = 32
num_epochs = 100

baseline_model = create_baseline_model()
run_experiment(baseline_model)

Start training the model...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 

In [150]:
def create_wide_and_deep_model():

    inputs = create_model_inputs()
    wide = encode_inputs(inputs)
    wide = layers.BatchNormalization()(wide)

    deep = encode_inputs(inputs, use_embedding=True)
    for units in hidden_units:
        deep = layers.Dense(units)(deep)
        deep = layers.BatchNormalization()(deep)
        deep = layers.ReLU()(deep)
        deep = layers.Dropout(dropout_rate)(deep)

    merged = layers.concatenate([wide, deep])
    outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(merged)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [154]:
hidden_units = [16, 16, 16]
learning_rate = 0.001
dropout_rate = 0.1
batch_size = 32
num_epochs = 30

wide_and_deep_model = create_wide_and_deep_model()
run_experiment(wide_and_deep_model)

Start training the model...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Model training finished
Test accuracy: 69.01%


In [131]:
def create_deep_and_cross_model():

    inputs = create_model_inputs()
    x0 = encode_inputs(inputs, use_embedding=True)

    cross = x0
    for _ in hidden_units:
        units = cross.shape[-1]
        x = layers.Dense(units)(cross)
        cross = x0 * x + cross
    cross = layers.BatchNormalization()(cross)

    deep = x0
    for units in hidden_units:
        deep = layers.Dense(units)(deep)
        deep = layers.BatchNormalization()(deep)
        deep = layers.ReLU()(deep)
        deep = layers.Dropout(dropout_rate)(deep)

    merged = layers.concatenate([cross, deep])
    outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(merged)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [163]:
hidden_units = [8, 8, 8]
learning_rate = 0.001
dropout_rate = 0.1
batch_size = 32
num_epochs = 5

deep_and_cross_model = create_deep_and_cross_model()
run_experiment(deep_and_cross_model)

Start training the model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model training finished
Test accuracy: 56.34%


In [160]:
# tweaking these may improve it but doesn't seem likely to have a significant jump
# PCA w/ NN didn't really improve the performance much like in RF/xgboost

In [44]:
# Try alternative approach - instead of structured data, time series to perserve/represent music better
# Another is a bit like time series but instead of time series data, use the sepctogram image