https://towardsdatascience.com/how-to-set-started-with-tensorflow-using-keras-api-and-google-colab-5421e5e4ef56

intro tensorflow for colab
https://medium.com/@oluyaled/audio-classification-using-deep-learning-and-tensorflow-a-step-by-step-guide-5327467ee9ab

augmentation of audio data
https://towardsdatascience.com/audio-deep-learning-made-simple-part-3-data-preparation-and-augmentation-24c6e1f6b52

In [None]:
!pip install --quiet wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.3/277.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# import TensorFlow
import tensorflow as tf

# Check the version of TensorFlow you are using
print(tf.__version__)
# List the GPU devices visible to TensorFlow (an empty list means none were found)
print(tf.config.list_physical_devices('GPU'))

2.15.0
[]


In [None]:
# load required libraries
import os
import librosa
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.image import resize
from tensorflow.keras.models import load_model
#import wandb

In [None]:
from google.colab import drive
import sys

# Weights & Biases login is currently disabled
#wandb.login()

# Mount Google Drive into the Colab filesystem
drive.mount('/content/drive')
# Add the project folder on Drive to the module search path
sys.path.append('/content/drive/MyDrive/ucph')


Mounted at /content/drive


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

def create_fit_model(X_train, y_train, X_val, y_val, index, model_name, target_shape):
  """Build, train and save a 3-class classifier for one cross-validation fold.

  Args:
    X_train: Training inputs passed straight to ``model.fit``.
    y_train: Training targets; the categorical cross-entropy loss expects
      one-hot rows.
    X_val: Validation inputs, used for checkpoint selection and final scoring.
    y_val: Validation targets.
    index: Fold number, used only in the name of the saved model file.
    model_name: ``'pose'`` selects the 2D convolutional network; any other
      value selects the 1D convolutional network. Also part of the file name.
    target_shape: Shape of one input sample (without the batch dimension).

  Returns:
    The result of ``evaluate`` on the validation data for the best checkpoint
    (loss followed by accuracy).
  """
  layers = tf.keras.layers

  # every network starts with an input layer matching the sample shape
  stack = [layers.InputLayer(input_shape=target_shape)]

  if model_name == 'pose':
    # five Conv2D + batch-norm stages; dropout follows the 2nd and 5th stage
    for stage, n_filters in enumerate((64, 128, 256, 512, 1024), start=1):
      stack.append(layers.Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu'))
      stack.append(layers.BatchNormalization())
      if stage in (2, 5):
        stack.append(layers.Dropout(0.25))

    # collapse the spatial dimensions before the classification head
    stack.append(layers.GlobalAveragePooling2D())
  else:
    # single Conv1D stage: convolution -> batch norm -> ReLU -> global average pooling
    stack.extend([
        layers.Conv1D(filters=256, kernel_size=10),
        layers.BatchNormalization(),
        layers.ReLU(),
        layers.GlobalAveragePooling1D(),
    ])

  # softmax head over the 3 output classes
  stack.append(layers.Dense(units=3, activation=tf.nn.softmax))
  model = tf.keras.Sequential(stack)

  model.compile(
      optimizer=tf.keras.optimizers.Adam(0.001),
      loss='categorical_crossentropy',
      metrics=['accuracy'],
  )

  # the best epoch (highest validation accuracy) is written to this file
  checkpoint_filepath = 'best_model.h5'
  keep_best = ModelCheckpoint(
      filepath=checkpoint_filepath,
      monitor='val_accuracy',
      save_best_only=True,
      mode='max',
      verbose=1
  )

  model.fit(
      X_train,
      y_train,
      epochs=20,
      batch_size=32,
      validation_data=(X_val, y_val),
      callbacks=[keep_best],
  )

  # score the best checkpoint rather than the weights of the final epoch
  best_model = load_model(checkpoint_filepath)
  scores = best_model.evaluate(X_val, y_val, verbose=0)
  print(scores)

  # keep a per-fold copy of the best model on Drive
  best_model.save(f'/content/drive/MyDrive/ucph/CCS2/models/{model_name}_classification_model_{index}.keras')

  return scores


In [None]:
from sklearn.model_selection import KFold

def train_model(model_name, target_shape, train_df, test_df):
  """Run 10-fold cross-validation and report the fold with the best score.

  Args:
    model_name: Name of the data column to train on; also selects the
      architecture inside ``create_fit_model``.
    target_shape: Shape of a single input sample.
    train_df: Training dataframe handed to ``get_model_input_data``.
    test_df: Test dataframe handed to ``get_model_input_data``; its data is
      not used for training here.

  Returns:
    The file stem ``"{model_name}_classification_model_{k}"`` of the fold
    whose last score entry is the highest.
  """
  features, _, labels, _ = get_model_input_data(train_df, test_df, model_name)

  # fixed seed so the fold assignment is reproducible between runs
  splitter = KFold(n_splits=10, shuffle=True, random_state=42)
  folds = list(splitter.split(features))

  # one score list per fold, as returned by create_fit_model
  scores = []
  for fold, (fit_idx, val_idx) in enumerate(folds):
    fold_scores = create_fit_model(
        features[fit_idx],
        labels[fit_idx],
        features[val_idx],
        labels[val_idx],
        fold,
        model_name,
        target_shape,
    )
    scores.append(fold_scores)

  print(f"Model: {model_name}")
  for entry in scores:
    print(entry)

  # the last column of the score table decides which fold wins
  score_table = np.array(scores)
  max_index = np.argmax(score_table[:, -1])
  print(f"Best model was found to be: {model_name}_classification_model_{max_index}")
  return f"{model_name}_classification_model_{max_index}"

In [None]:
def get_person(row):
  """Return the second underscore-separated token of ``row['filename']``.

  Args:
    row: Mapping-like object (e.g. a dataframe row) with a ``'filename'`` entry.

  Returns:
    The text between the first and second underscore of the file name
    (presumably the person identifier - confirm against the naming scheme).

  Raises:
    IndexError: If the file name contains no underscore.
  """
  filename = row['filename']
  # split the file name on underscores (not the other way around)
  return filename.split("_")[1]

def convert_label(df):
  """Add an integer ``'label'`` column derived from the ``'shot'`` column.

  Known shot names are mapped to 0 (clear), 1 (slice) and 2 (smash); any
  other value becomes -1. The dataframe is modified in place and returned.
  """
  lab2id = {"clear": 0, "slice": 1, "smash": 2}

  # look every shot up individually so unknown names get -1
  df['label'] = df['shot'].apply(lambda shot: lab2id.get(shot, -1))
  return df

def add_mel_spectrogram(df):
  """Replace each entry of the ``'audio'`` column with its mel spectrogram.

  Every entry is converted to a NumPy array and passed to
  ``librosa.feature.melspectrogram`` with default settings. The dataframe is
  modified in place and returned.
  """
  def to_mel(samples):
    waveform = np.array(samples)
    return np.array(librosa.feature.melspectrogram(y=waveform))

  # note: the result overwrites the 'audio' column itself
  df["audio"] = df["audio"].apply(to_mel)
  return df

def prepare_and_load_dataset(path='/content/drive/MyDrive/ucph/CCS2/data/', num_files=10):
  """Load the train/test JSON shards and derive the model input columns.

  Steps: read ``train_{i}.json`` / ``test_{i}.json`` for ``i`` in
  ``range(num_files)``, shuffle with a fixed seed, add the integer ``label``
  column, replace ``audio`` with its mel spectrogram, normalise ``pose`` and
  build the ``comb`` column that places pose and spectrogram side by side.

  Rows whose pose is not of shape (62, 33, 4), or whose spectrogram is not of
  shape (128, 176), are dropped so that every remaining column can be stacked
  into one array.

  Args:
    path: Directory prefix (including the trailing slash) of the JSON files.
    num_files: Number of shards per split.

  Returns:
    Tuple ``(train_df, test_df)``.
  """
  pose_shape = (62, 33, 4)
  mel_shape = (128, 176)

  def load_split(split):
    # read all shards first and concatenate once instead of growing a frame
    shards = [pd.read_json(f'{path}{split}_{i}.json') for i in range(num_files)]
    return pd.concat(shards, ignore_index=True)

  def create_pose_array(pose_array):
    # returns the normalised pose, or NaN when the entry is unusable
    try:
      pose_array = np.array(pose_array)
      if pose_array.shape != pose_shape:
        return np.nan
      # statistics over the first two axes -> one mean/std per last-axis channel
      mean = np.mean(pose_array, axis=(0, 1))
      std = np.std(pose_array, axis=(0, 1))
    except (ValueError, TypeError):
      # ragged or non-numeric pose data
      return np.nan
    # a constant channel has std 0; leave it centred instead of dividing by 0
    std = np.where(std == 0, 1.0, std)
    return (pose_array - mean) / std

  def create_comb(pose, mel_spec):
    # returns a (128, 33*4 + 176) array, or NaN when either part has the wrong shape
    pose = np.asarray(pose)
    mel_spec = np.asarray(mel_spec)
    if pose.shape != pose_shape or mel_spec.shape != mel_shape:
      return np.nan
    # reshape returns a new (62, 132) view; ndarray.resize works in place and returns None
    flat_pose = pose.reshape(pose_shape[0], -1)
    padded = np.zeros((mel_shape[0], flat_pose.shape[1]))
    # pose frames go to rows 1, 3, 5, ...; every other row of the pose part stays empty
    padded[1:2 * pose_shape[0]:2] = flat_pose
    return np.concatenate((padded, mel_spec), axis=1)

  def prepare(df):
    # shuffle with a fixed seed so runs are reproducible
    df = df.sample(frac=1, random_state=42)

    # convert string labels to int and audio to mel spectrograms
    df = convert_label(df)
    df = add_mel_spectrogram(df)

    # normalise poses and drop rows without a usable pose
    df['pose'] = df['pose'].apply(create_pose_array)
    df = df.dropna(subset=['pose']).copy()

    # build the combined input and drop rows where it could not be built
    combined = [create_comb(pose, mel) for pose, mel in zip(df['pose'], df['audio'])]
    df['comb'] = pd.Series(combined, index=df.index, dtype=object)
    return df.dropna(subset=['comb'])

  train_df = prepare(load_split('train'))
  test_df = prepare(load_split('test'))

  return train_df, test_df

def get_model_input_data(train_df, test_df, column_name):
  """Turn one dataframe column into stacked model inputs plus labels.

  Args:
    train_df: Training dataframe with ``column_name`` and ``'label'`` columns.
    test_df: Test dataframe with the same columns.
    column_name: Column whose per-row arrays are stacked along a new first axis.

  Returns:
    Tuple ``(x_train, x_test, y_train_cat, y_test)``: the training labels are
    one-hot encoded over 3 classes, the test labels are returned as stored.
  """
  train_labels = train_df["label"].values
  return (
      np.stack(train_df[column_name].values),
      np.stack(test_df[column_name].values),
      to_categorical(train_labels, num_classes=3),  # one-hot training labels
      test_df["label"].values,
  )

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Class names; the position in the list is used as the integer class id
classes = ['clear', 'slice', 'smash']

# Function to preprocess and classify one sample
def test_sample(sample, model):
    """Classify a single sample with ``model``.

    Args:
        sample: Array holding one sample without a batch dimension.
        model: Object with a ``predict`` method that accepts a batch.

    Returns:
        Tuple ``(class_probabilities, predicted_class_index)`` where the
        probabilities are the first row of the prediction and the index is
        the position of its largest entry.
    """
    # the model expects a batch, so wrap the sample in a batch of size one
    batch = np.expand_dims(sample, axis=0)
    class_probabilities = model.predict(batch)[0]
    return class_probabilities, np.argmax(class_probabilities)

# test the best model for audio, pose and comb for test data
def test_model(target_shape, model_file_name, X_test, y_test):
  """Evaluate a saved model on the test set and print a summary.

  Args:
    target_shape: Unused; kept so existing callers keep working.
    model_file_name: File stem of the saved ``.keras`` model to load.
    X_test: Array of test inputs, one sample per entry along the first axis.
    y_test: Integer class labels for ``X_test``.

  Returns:
    NumPy array with the predicted class index for every test sample.
  """
  # Load the saved model
  model = load_model(f'/content/drive/MyDrive/ucph/CCS2/models/{model_file_name}.keras')

  y_test = np.asarray(y_test)

  # predict the whole test set in one batched call instead of one call per sample
  predictions_per_class = np.asarray(model.predict(X_test))
  predictions = np.argmax(predictions_per_class, axis=1)

  y_test_cat = to_categorical(y_test, num_classes=len(classes))  # one-hot labels

  macro_f1 = f1_score(y_test, predictions, average='macro')

  # one-vs-rest accuracy for each individual class
  class_accuracies = [
      accuracy_score(y_test == i, predictions == i) for i in range(len(classes))
  ]

  # Calculate overall accuracy
  overall_accuracy = accuracy_score(y_test, predictions)

  # 1 where the prediction is wrong, 0 where it is right
  misclassified = (y_test != predictions).astype(int)

  # reported as "Mean Absolute Error": the fraction of misclassified samples
  mean_abs_error = np.sum(misclassified) / y_test.shape[0]

  # reported as "Relative Error": probability assigned to the true class,
  # summed over the misclassified samples
  true_class_probability = np.sum(np.multiply(y_test_cat, predictions_per_class), axis=1)
  rel_error = np.sum(np.multiply(true_class_probability, misclassified))

  # print all the results
  print("-------------------------------------------------------------------------")
  # use the function's own argument rather than a global defined by the caller
  print(f"{model_file_name} evaluation on test set")
  print(f"Macro F1 score: {macro_f1}")
  print("Accuracy for each individual class:")
  for i, accuracy in enumerate(class_accuracies):
      print(f"Class {classes[i]}: {accuracy}")
  print(f"Overall accuracy: {overall_accuracy}")
  print(f"Mean Absolute Error: {mean_abs_error}")
  print(f"Relative Error: {rel_error}")
  return predictions

In [None]:
# Load and preprocess the data once; the training cell reuses these dataframes
train_df, test_df = prepare_and_load_dataset()

550


In [None]:
# maps each input type to the file stem of its best cross-validation model
best_models = {}

# train_df, test_df = prepare_and_load_dataset()

for model_name, target_shape in zip(['audio', 'pose', 'comb'], [(128, 176), (62, 33, 4), (128, 308)]):
  # for model_name, target_shape in zip(['pose'], [(62, 33, 4)]):
  model_file_name = train_model(model_name, target_shape, train_df, test_df)
  best_models[model_name] = model_file_name

  # only the test split is needed for the final evaluation
  _, X_test, _, y_test = get_model_input_data(train_df, test_df, model_name)
  predictions = test_model(target_shape, model_file_name, X_test, y_test)


Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.34545, saving model to best_model.h5
Epoch 2/20

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.34545 to 0.41818, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_accuracy improved from 0.41818 to 0.49091, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_accuracy improved from 0.49091 to 0.72727, saving model to best_model.h5
Epoch 5/20
Epoch 5: val_accuracy improved from 0.72727 to 0.76364, saving model to best_model.h5
Epoch 6/20
Epoch 6: val_accuracy did not improve from 0.76364
Epoch 7/20
Epoch 7: val_accuracy improved from 0.76364 to 0.81818, saving model to best_model.h5
Epoch 8/20
Epoch 8: val_accuracy improved from 0.81818 to 0.83636, saving model to best_model.h5
Epoch 9/20
Epoch 9: val_accuracy did not improve from 0.83636
Epoch 10/20
Epoch 10: val_accuracy improved from 0.83636 to 0.85455, saving model to best_model.h5
Epoch 11/20
Epoch 11: val_accuracy improved from 0.85455 to 0.87273, saving model to best_model.h5
Epoch 12/20
Epoch 12: val_accuracy did not improve from 0.87273
Epoch 13/20
Epoch 13: val_accuracy did not 

  saving_api.save_model(


Epoch 2/20
Epoch 2: val_accuracy did not improve from 0.47273
Epoch 3/20
Epoch 3: val_accuracy did not improve from 0.47273
Epoch 4/20
Epoch 4: val_accuracy did not improve from 0.47273
Epoch 5/20
Epoch 5: val_accuracy improved from 0.47273 to 0.58182, saving model to best_model.h5
Epoch 6/20
Epoch 6: val_accuracy did not improve from 0.58182
Epoch 7/20
Epoch 7: val_accuracy did not improve from 0.58182
Epoch 8/20
Epoch 8: val_accuracy did not improve from 0.58182
Epoch 9/20
Epoch 9: val_accuracy did not improve from 0.58182
Epoch 10/20
Epoch 10: val_accuracy did not improve from 0.58182
Epoch 11/20
Epoch 11: val_accuracy did not improve from 0.58182
Epoch 12/20
Epoch 12: val_accuracy did not improve from 0.58182
Epoch 13/20
Epoch 13: val_accuracy improved from 0.58182 to 0.61818, saving model to best_model.h5
Epoch 14/20
Epoch 14: val_accuracy did not improve from 0.61818
Epoch 15/20
Epoch 15: val_accuracy did not improve from 0.61818
Epoch 16/20
Epoch 16: val_accuracy did not improve

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type NoneType).