In [None]:
!pip install tensorflow_io
!pip install pydub



In [None]:
import os
import tensorflow as tf
%config Completer.use_jedi = False
# import globals
# import preprocess_data
# from contrPredCod_model import CPC, InfoNCE
# from classifier_model import Classifier
# from training import main_train_eval_loop

# globals.py

In [None]:
def initialize():
    """Create (or reset) the three module-level argument dictionaries as empty dicts."""
    global data_generator_arguments, encoder_args, ar_args

    data_generator_arguments = {}  # settings read by the CPC data generator
    encoder_args = {}              # keyword arguments for the encoder
    ar_args = {}                   # keyword arguments for the autoregressive model

# hyperparams

In [None]:
### Hyperparameters ###
# All tunable settings of the notebook live in this cell. Running it lists the CPC training
# directory, so os.listdir below raises if cpc_data_path does not exist.

# data path
cwd = '/content/drive/MyDrive/Colab Notebooks/WS2021/ANN/final'
cpc_data_path = cwd+"/data/2000songs.zip (Unzipped Files)/2000songs"  # directory with the CPC training audio
files = os.listdir(cpc_data_path)
filepaths = [os.path.join(cpc_data_path, f) for f in files]  # full path of every entry in that directory
gtzan_feature_path = cwd+'/data/gtzan/features_30_sec.csv'  # baseline features
weight_path = cwd+'/model/cpc'  # where to save weights
load_path = False  # from where to load weights (False: do not load)

# CPC data params
# TODO: call globals.init instead when this is split into python scripts and imported
initialize()  # create the global dictionaries before they are filled below
data_generator_arguments = {
    "T": 27,  # number of time-steps (audio windows) used for prediction
    "k": 3,  # number of future time-steps (audio windows) to predict
    "N": 8,  # number of samples per positive; also used as the batch size by the generator
    "full_duration": 4,  # sec; 4 * 4410 / (27 + 3) = 588 samples per window
    "original_sr": 22050,  # Hz, sampling rate of the audio files
    "desired_sr": 4410,  # Hz, sampling rate after resampling
    "filepaths": filepaths
    }

# classifier data params
split_rate = 0.8  # train_test split
batch_size_classifier = 16

# encoder params
enc_model = '1d_conv'  # alternative: 'spectrogram'
z_dim = 256  # output dim of the encoder (size of one window embedding)
encoder_args = {
    "z_dim": z_dim,
    "stride_sizes": [5,4,2,2,2],  # one entry per Conv1D layer
    "kernel_sizes": [10,8,4,4,4],  # one entry per Conv1D layer
    "n_filters": [512,512,512,512,512],  # one entry per Conv1D layer
    "activation": tf.nn.leaky_relu
    }


# AR params
# TODO: import accordingly given the model name in modularized fashion
# NOTE(review): the keys of ar_args match Transformer.__init__, while ar_model is 'GRU' and
# GRU_Autoregressive.__init__ only accepts c_dim - confirm which model these are meant for.
ar_model = 'GRU'  # alternative: 'transformer'
c_dim = 512  # size of the context embedding
ar_args = {
    'num_enc_layers': 5,
    'num_heads': 8,
    'z_dim': z_dim,
    'dff': z_dim,
    'dense_units': [z_dim, z_dim, c_dim],  # last entry is the context size
    'activation': tf.nn.tanh,
    'maximum_position_encoding': data_generator_arguments['T'],  # longest sequence the model sees
    'rate': 0.1  # dropout rate
    }

# training params (commented values are the full-run settings)
epochs_cpc = 1  #500
steps_per_epoch_cpc = 1  #100
epochs_class = 1  #1000
learning_rate = 1e-5

# Preprocess_data.py

In [None]:
import tensorflow as tf
import tensorflow_io as tfio
import tensorflow.experimental.numpy as tfnp
import numpy as np
import random
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# import globals


#### INPUT PIPELINE FUNCTIONS/GENERATORS

def decode_audio(audio_path, original_sr, desired_sr, duration, max_duration=30):
    """
    Read a wav file as mono audio and return a random excerpt of it at the desired sampling rate.

    The decoded signal is fixed to max_duration * original_sr samples (via desired_samples of the decoder),
    a random crop of duration * sr samples is taken (sr being the rate reported by the decoder) and the crop
    is resampled from original_sr to desired_sr whenever the two rates differ.

    Returns a tuple (audio, desired_sr) where audio has its channel axis squeezed away.
    """

    raw_bytes = tf.io.read_file(audio_path)
    waveform, file_sr = tf.audio.decode_wav(
        raw_bytes,
        desired_channels=1,
        desired_samples=max_duration * original_sr)

    crop_shape = (duration*file_sr, 1)
    excerpt = tf.image.random_crop(waveform, size=crop_shape)

    if desired_sr != original_sr:
        excerpt = tfio.audio.resample(excerpt, original_sr, desired_sr)

    return tf.squeeze(excerpt, axis=-1), desired_sr


def batch_data_generator():
    """
    Endless generator of CPC training batches, configured through the global dictionary
    "data_generator_arguments" with the keys:

    T:                  Number of time-steps (each being an audio window) used for prediction
    k:                  Number of time-steps (each being an audio window) to predict
    N:                  Number of samples per batch element; the batch size equals N
    original_sr:        Sampling rate of the audio files
    desired_sr:         Sampling rate the audio is resampled to
    full_duration:      Length (in seconds) of the excerpt taken from every audio file
    filepaths:          List of filepaths to wav files.

    Every batch element consists of T+k consecutive windows of one song followed by k windows from each of
    the other N-1 songs of the batch (negatives are only drawn from other files in the batch, as in
    [van den Oord et al 2018]).

    Yields tensors of shape (batch_size, T +k*N, window_size, 1)
    """

    global data_generator_arguments

    settings = data_generator_arguments
    T, k, N = settings["T"], settings["k"], settings["N"]
    original_sr, desired_sr = settings["original_sr"], settings["desired_sr"]
    duration = settings["full_duration"]
    filepaths = settings["filepaths"]
    batch_size = N

    # the excerpt has to split evenly into T+k windows
    window_size = duration*desired_sr/(T+k)
    assert not window_size%1, f"duration*sample rate and (T+k) must be divisible. Currently duration*sample_rate = {duration*desired_sr} and (T+k) = {T+k}"
    window_size = int(window_size)

    while True:

        # draw batch_size distinct files and load one resampled excerpt from each
        chosen_paths = random.sample(filepaths, batch_size)
        songs = [decode_audio(chosen, original_sr, desired_sr, duration)[0] for chosen in chosen_paths]

        batch = []
        for anchor_idx, anchor in enumerate(songs):
            # the anchor song contributes its observed and future windows: (1, T+k, window_size, 1)
            positive = tf.reshape(anchor, (1, T+k, window_size, 1))

            # every other song contributes k randomly positioned consecutive windows: (1, k, window_size, 1)
            negatives = [
                tf.reshape(
                    tensor = tf.image.random_crop(other, size = [window_size * k]),
                    shape  = (1, k, window_size, 1))
                for other_idx, other in enumerate(songs)
                if other_idx != anchor_idx
            ]

            # one batch element with shape (1, T +k*N, window_size, 1)
            batch.append(tf.concat([positive] + negatives, axis = 1))

        # stack the batch elements along the batch axis
        yield tf.concat(batch, axis= 0)



def create_cpc_ds():
    """
    Wrap batch_data_generator in a tf.data.Dataset. The generator already yields complete batches,
    so the dataset is not batched again.

    Reads the global dictionary "data_generator_arguments" with the keys:

    T:                  Number of time-steps (each being an audio window) used for prediction
    k:                  Number of time-steps (each being an audio window) to predict
    N:                  Number of samples per batch element; also the batch size
    desired_sr:         Sampling rate the audio is resampled to
    full_duration:      Length (in seconds) of the excerpt taken from every audio file

    (original_sr and filepaths are read by the generator itself.)

    Returns an endless dataset of float32 tensors with shape (N, T+k*N, window_size, 1).
    """

    global data_generator_arguments
    T = data_generator_arguments["T"]
    k = data_generator_arguments["k"]
    N = data_generator_arguments["N"]
    duration = data_generator_arguments["full_duration"]
    sr = data_generator_arguments["desired_sr"]
    batch_size = N

    # output shape of generator given the arguments
    window_size = int((duration*sr)/(T+k))
    data_shape = (batch_size, T+k*N, window_size, 1)

    train_ds = tf.data.Dataset.from_generator(
        generator = batch_data_generator,
        output_signature = tf.TensorSpec(data_shape,
                                        dtype=tf.dtypes.float32,
                                        name=None)
                                        )

    # No cache(): the generator is endless and random, so a cache could never be completed and
    # would replay the same batches if it were. prefetch stays as the final stage so that batch
    # loading overlaps with training.
    return train_ds.prefetch(tf.data.AUTOTUNE)


In [None]:
def create_tfds(inputs, targets, batch_size=None, buffer_size=None, prefetch_factor=None):
    '''
    Build a tf.data input pipeline of (input, target) pairs.
    Shuffling, batching and prefetching are applied in that order, each only if its argument is given:

    buffer_size:     shuffle buffer size
    batch_size:      number of elements per batch
    prefetch_factor: number of elements to prefetch
    '''
    pipeline = tf.data.Dataset.from_tensor_slices((inputs, targets))

    optional_stages = (
        ("shuffle", buffer_size),
        ("batch", batch_size),
        ("prefetch", prefetch_factor),
    )
    for stage_name, stage_arg in optional_stages:
        if stage_arg is not None:
            pipeline = getattr(pipeline, stage_name)(stage_arg)

    return pipeline



# Extract_Embeddings.py

In [None]:
### from a folder with embedding npy files, create a tf dataset with labels
def get_embedding_datasets(embedding_path, embedding_folder, files=None):
    """
    Build a tf.data.Dataset of (embedding, one-hot genre label) pairs from saved .npy embeddings.

    embedding_path:   directory containing the .npy files, e.g. "/content/gtzan_embeddings/"
    embedding_folder: name of that directory followed by a slash, e.g. "gtzan_embeddings/"; the part of
                      each file path after it is taken as the file name the genre is parsed from
    files:            names of the files inside embedding_path to use. If None, every .npy file in
                      embedding_path is used (sorted by name).

    File names are expected to look like "<prefix>__<genre>.<rest>.npy": the genre is the text
    between the last "__" and the first "." of the file name.

    Raises ValueError if no files are given/found or a file name contains an unknown genre.
    """
    if files is None:
        files = sorted(f for f in os.listdir(embedding_path) if f.endswith(".npy"))

    embedding_filepaths = [os.path.join(embedding_path, f) for f in files]
    if not embedding_filepaths:
        raise ValueError(f"no embedding files to load from {embedding_path!r}")

    embedding_data = np.stack([np.load(path) for path in embedding_filepaths])

    # the position in this list defines the index of the class in the one-hot label
    classes = ["blues", "reggae", "metal", "rock", "pop", "classical", "country", "disco", "jazz", "hiphop"]

    label_indices = []
    for path in embedding_filepaths:
        genre = path.split(embedding_folder)[1].split(".")[0].split("__")[-1]
        if genre not in classes:
            raise ValueError(f"cannot derive a known genre from {path!r} (got {genre!r})")
        label_indices.append(classes.index(genre))

    onehot_labels = tf.one_hot(label_indices, depth=len(classes))  # float32, one row per file

    return tf.data.Dataset.from_tensor_slices((embedding_data, onehot_labels))





In [None]:
### create embeddings (requires its own main script where all trained models are used to get embeddings)
def create_embedding_dataset(model, iterations, gtzan_filepaths, gtzan_location, save_to,
                             original_sr=22050, desired_sr=4410, duration=4, segments=30, max_duration=30):
    """
    Compute context embeddings for random excerpts of audio files and save each one as a .npy file.

    model:            object with a get_embedding(audio) method (e.g. a trained CPC model)
    iterations:       number of random excerpts taken from every audio file
    gtzan_filepaths:  list of paths to the wav files
    gtzan_location:   substring of each path that precedes the file name (e.g. "GTZAN/"); the text after
                      it is used to name the output file
    save_to:          directory the embeddings are written to (created if missing)
    original_sr:      sampling rate of the wav files in Hz
    desired_sr:       sampling rate to resample to; should match the rate CPC was trained with
    duration:         excerpt length in seconds; should match the audio length CPC was trained with
    segments:         number of windows per excerpt; should match T+k of the CPC training
    max_duration:     length in seconds every file is padded/cropped to before the excerpt is taken

    Files are written as "<save_to>/sample__<iteration>__<name>.npy".
    """
    segment_length = int(duration*desired_sr/segments)
    os.makedirs(save_to, exist_ok=True)

    for fpath in gtzan_filepaths:
        # decode and resample once per file; only the random crop differs between iterations
        audio_binary = tf.io.read_file(fpath)
        audio, _ = tf.audio.decode_wav(audio_binary, desired_channels = 1, desired_samples = max_duration * original_sr)

        if desired_sr != original_sr:
            audio = tfio.audio.resample(audio, original_sr, desired_sr)

        audio = tf.squeeze(audio, axis=-1)
        out_name = fpath.split(gtzan_location)[1].replace(".wav", ".npy")

        for i in range(iterations):
            excerpt = tf.image.random_crop(audio, size = (segments * segment_length,))
            excerpt = tf.reshape(excerpt, (1, segments, segment_length, 1))  # batch of one

            embedding = tf.squeeze(model.get_embedding(excerpt), axis= 0)  # drop the batch axis

            out_path = os.path.join(save_to, "sample__" + str(i) + "__" + out_name)
            np.save(out_path, embedding.numpy())

# encoder_models.py

In [None]:
import tensorflow as tf

class Conv1DEncoder(tf.keras.layers.Layer):
    '''
    Encodes an input 1D sequence into an audio window embedding.

    The network is a stack of (Conv1D, BatchNormalization, Activation) groups followed by
    Flatten, Dropout(0.1), Dense(z_dim) and a final Activation.

    z_dim: size of embedding

    stride_sizes: list of stride arguments for Conv1D layers
    kernel_sizes: list of kernel size arguments for Conv1D layers
    n_filters:    list of filter number arguments for Conv1D layers (its length sets the number of groups)
    activation:   activation function used in Conv1D layers and for output Dense layer. (e.g. "relu" or tf.nn.relu)

    '''

    # layer types whose behaviour depends on the training flag
    _TRAINING_AWARE_LAYERS = (tf.keras.layers.BatchNormalization, tf.keras.layers.Dropout)

    def __init__(self, z_dim, stride_sizes, kernel_sizes, n_filters, activation):
        super().__init__()

        self.enc_layers = []

        for idx in range(len(n_filters)):
            self.enc_layers.append(tf.keras.layers.Conv1D(n_filters[idx], kernel_sizes[idx], stride_sizes[idx]))
            self.enc_layers.append(tf.keras.layers.BatchNormalization())
            self.enc_layers.append(tf.keras.layers.Activation(activation))

        self.enc_layers.append(tf.keras.layers.Flatten())
        self.enc_layers.append(tf.keras.layers.Dropout(0.1))
        self.enc_layers.append(tf.keras.layers.Dense(z_dim))
        self.enc_layers.append(tf.keras.layers.Activation(activation))

    def call(self, x, training=None):
        '''
        Run x through all layers in order and return the embedding.

        x:        audio windows; assumed to be [batch, window_size, 1] when the layer is wrapped in
                  TimeDistributed, as the CPC model does - TODO confirm
        training: forwarded to the BatchNormalization and Dropout layers only
        '''
        for layer in self.enc_layers:
            # only pass the flag to layers that use it, instead of trying and catching every error
            if isinstance(layer, self._TRAINING_AWARE_LAYERS):
                x = layer(x, training=training)
            else:
                x = layer(x)

        # output: one z_dim-sized embedding per input window
        return x

# autoregressive_models.py

In [None]:
import tensorflow as tf

# TODO: use transformer
class GRU_Autoregressive(tf.keras.layers.Layer):
    '''
    GRU RNN that takes a sequence of audio window embeddings and combines them into a context embedding.
    c_dim: length of context embedding vector
    '''

    def __init__ (self, c_dim):
        # the zero-argument form avoids naming a class that does not exist in this file
        super().__init__()
        self.gru = tf.keras.layers.GRU(c_dim, name='ar_context',)


    def call (self, z_sequence):
        '''Return the GRU output for the sequence: [batch, T, z] -> [batch, c].'''
        return self.gru(z_sequence)


In [None]:
#### TRANSFORMER LAYER CLASS AND FUNCTIONS

import numpy as np
import tensorflow as tf

def get_angles(pos, i, d_model):
    """Return pos / 10000^(2*(i//2)/d_model), the angle of position pos for embedding index i."""
    exponent = (2 * (i//2)) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))


def positional_encoding(position, d_model):
    """
    Build the sinusoidal position table for `position` positions and `d_model` embedding indices.

    Even embedding indices hold the sine of the angle, odd indices the cosine.
    Returns a float32 tensor of shape (1, position, d_model).
    """
    position_column = np.arange(position)[:, np.newaxis]
    index_row = np.arange(d_model)[np.newaxis, :]
    table = get_angles(position_column, index_row, d_model)

    table[:, 0::2] = np.sin(table[:, 0::2])  # even indices: 2i
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd indices: 2i+1

    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)


def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q k^T / sqrt(depth) + mask * -1e9) v.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.

    Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor broadcastable to (..., seq_len_q, seq_len_k), or None to skip masking.
            Entries equal to 1 receive a large negative logit.

    Returns:
    output of shape (..., seq_len_q, depth_v) and attention_weights of shape (..., seq_len_q, seq_len_k)
    """

    raw_scores = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # divide by the square root of the key depth
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = raw_scores / tf.math.sqrt(key_depth)

    if mask is not None:
        logits = logits + (mask * -1e9)

    # normalise over the key axis so the weights of every query sum to 1
    attention_weights = tf.nn.softmax(logits, axis=-1)

    return tf.matmul(attention_weights, v), attention_weights


class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention: projects q, k, v, attends per head and merges the heads with a final Dense layer."""

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # every head gets an equal slice of the model dimension
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch_size, seq_len, d_model) into (batch_size, num_heads, seq_len, depth)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Return (output, attention_weights) with output of shape (batch_size, seq_len_q, d_model)."""
        n_batch = tf.shape(q)[0]

        # project, then split into heads: (batch_size, num_heads, seq_len, depth)
        q_heads = self.split_heads(self.wq(q), n_batch)
        k_heads = self.split_heads(self.wk(k), n_batch)
        v_heads = self.split_heads(self.wv(v), n_batch)

        # attended: (batch_size, num_heads, seq_len_q, depth)
        # attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
        attended, attention_weights = scaled_dot_product_attention(
            q_heads, k_heads, v_heads, mask)

        # move the head axis next to depth and merge both: (batch_size, seq_len_q, d_model)
        heads_last = tf.transpose(attended, perm=[0, 2, 1, 3])
        merged = tf.reshape(heads_last, (n_batch, -1, self.d_model))

        return self.dense(merged), attention_weights


def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
    hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([hidden, projection])
 


class Trans_EncoderLayer(tf.keras.layers.Layer):
    """One transformer encoder layer: self-attention and a feed-forward network, each followed by
    dropout, a residual connection and layer normalization."""

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Return the encoded sequence with the same shape as x: (batch_size, input_seq_len, d_model)."""
        # self-attention sub-layer (x serves as value, key and query)
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        attention_out = self.layernorm1(x + attended)

        # feed-forward sub-layer
        transformed = self.dropout2(self.ffn(attention_out), training=training)
        return self.layernorm2(attention_out + transformed)


class Transformer(tf.keras.layers.Layer):
    """Transformer encoder stack followed by a dense head that maps a sequence of embeddings to one vector."""

    def __init__(self, num_enc_layers, num_heads,
                 z_dim, dff, dense_units, activation, maximum_position_encoding, rate=0.1):
        '''
        num_enc_layers: num. transformer encoder layers to be stacked
        num_heads: num. sets of q,k,v
        z_dim: z_dim
        dff: num. units for first dense layer within encoder layer
        dense_units: list of num. units for additional dense layers, last number is c_dim
        activation: activation func. to use for additional dense layers
        maximum_position_encoding: T in our case, max length of sequence
        rate: dropout rate
        '''
        super().__init__()

        self.z_dim = z_dim
        self.num_enc_layers = num_enc_layers

        # no embedding layer: the input is already a sequence of embeddings
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.z_dim)

        self.enc_layers = [Trans_EncoderLayer(z_dim, num_heads, dff, rate)
                        for _ in range(num_enc_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

        # Dense head with a Dropout between consecutive Dense layers (none after the last one).
        # The list is completed before it becomes an attribute: deleting items from a list that is
        # already tracked as a layer attribute can stop TensorFlow from checkpointing it.
        # TODO: using 1d_conv might make more sense but too much hyperparam
        head_layers = []
        last_idx = len(dense_units) - 1
        for idx, n_units in enumerate(dense_units):
            head_layers.append(tf.keras.layers.Dense(n_units, activation))
            if idx < last_idx:
                head_layers.append(tf.keras.layers.Dropout(rate))
        self.densenet = head_layers

    def call(self, x, training=None, mask=None):
        '''
        x: (batch_size, input_seq_len, z_dim)
        training: forwarded to the dropout layers and encoder layers
        mask: optional attention mask forwarded to every encoder layer

        Returns a tensor whose last dimension is the last entry of dense_units.
        '''
        seq_len = tf.shape(x)[1]  # T

        # scale the embeddings and add the position encoding
        x *= tf.math.sqrt(tf.cast(self.z_dim, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)  # (batch_size, input_seq_len, z_dim)

        x = tf.keras.layers.Flatten()(x)  # (batch_size, input_seq_len*z_dim)
        for layer in self.densenet:
            # dispatch on the layer type rather than on its position in the list
            if isinstance(layer, tf.keras.layers.Dropout):
                x = layer(x, training=training)
            else:
                x = layer(x)

        return x  # (batch_size, c_dim)

# cpc_model.py

In [None]:
import tensorflow as tf
# from autoreg_model import Autoregressive
# from autoencoder_model import Encoder


class Predict_z(tf.keras.layers.Layer):
    '''
    Predicts K embeddings from the context embedding c_t, using a separate linear (Dense) layer per step.

    z_dim:           size of each predicted embedding
    K:               number of steps to predict
    mixed_precision: if True the predictions are collected as float16, otherwise as float32
    '''

    def __init__(self, z_dim, K, mixed_precision=False):
        super().__init__()

        self.z_dtype = tf.float16 if mixed_precision else tf.float32

        # one linear layer for each future time-step; input_dim: [batch, c_dim]
        self.transform_layers = [tf.keras.layers.Dense(z_dim) for _ in tf.range(K)]

    def call(self, c_t):
        n_steps = len(self.transform_layers)
        predictions = tf.TensorArray(self.z_dtype, size=n_steps)

        # entry `step` holds the projection of c_t for future step `step`
        for step in range(n_steps):
            predictions = predictions.write(step, self.transform_layers[step](c_t))

        # stacked as [K, batch, z]; swap the first two axes -> [batch, K, z]
        return tf.transpose(predictions.stack(), perm=[1,0,2])

def compute_f(z, z_pred):
    '''
    Compute f-scores following eq(3) in the paper to be batch (K x N) matrices.
    Computes similarity (f-)scores as the exp of the dot product of two embeddings.

    z:      embeddings to score, [batch, K, N, z]
    z_pred: predicted embeddings, [batch, K, z]

    Entry [b, k, n] of the result is exp(dot(z[b, k, n], z_pred[b, k])), so the result keeps the
    column order of z (callers place the positive sample in column 0).
    Returns a tensor of shape [batch, K, N].
    '''
    # Use the runtime shape so that an unknown (None) static batch dimension does not break the reshape.
    z_shape = tf.shape(z)                                                           # [batch, K, N, z]

    pred = tf.repeat(z_pred, repeats=z_shape[2], axis=-2)                           # -> [batch, K*N, z]
    pred = tf.reshape(pred, shape=z_shape)                                          # -> [batch, K, N, z]
    pred = tf.expand_dims(pred, axis=-1)                                            # -> [batch, K, N, z, 1]

    z = tf.expand_dims(z, axis=-2)                                                  # -> [batch, K, N, 1, z]

    dot_prod = tf.linalg.matmul(z, pred)                                            # -> [batch, K, N, 1, 1]
    dot_prod = tf.squeeze(dot_prod, axis=[-2,-1])                                   # -> [batch, K, N]

    return tf.exp(dot_prod)


class CPC(tf.keras.models.Model):
    '''
    Full Contrastive Predictive Coding Model.

    n_observations:       number of subsequent windows of audio used for prediction (T)
    n_future:             number of future audio windows to predict (K)
    n_samples:            number of candidate samples per future step (N); candidate 0 is the positive one
    z_dim:                audio window encoding size
    c_dim:                context embedding size (stored only)
    Encoder:              encoder class, instantiated with encoder_args
    Autoregressive_Model: autoregressive class, instantiated with ar_args
    Predict_z:            prediction layer class, instantiated with z_dim, K and mixed_precision
    encoder_args:         argument dictionary for Encoder
    ar_args:              argument dictionary for Autoregressive_Model
    m_precision:          passed to Predict_z as mixed_precision
    '''

    def __init__ (self, n_observations, n_future, n_samples, z_dim, c_dim, Encoder, Autoregressive_Model, Predict_z, encoder_args, ar_args, m_precision):
        super().__init__()

        self.T = n_observations
        self.K = n_future
        self.N = n_samples

        self.z = z_dim
        self.c = c_dim

        self.g_enc = Encoder(**encoder_args)
        self.g_ar = Autoregressive_Model(**ar_args)
        self.p_z = Predict_z(z_dim=self.z, K=self.K, mixed_precision = m_precision)

    def get_embedding(self, x):
        '''Encode every window of x (in inference mode) and return the context embedding of the sequence.'''
        z_t = tf.keras.layers.TimeDistributed(self.g_enc)(x, training= False)
        c_T = self.g_ar(z_t)

        return c_T

    def call(self, x, training=False):
        '''
        x: [batch, T+K*N, window_size, 1], laid out as produced by batch_data_generator: T observed
           windows, then the K future windows of the same song, then K windows from each of the other
           N-1 songs.

        Returns the f-score matrix [batch, K, N] whose first column belongs to the positive sample.
        '''
        # Obtain embeddings for all T+K*N windows
        z_t = tf.keras.layers.TimeDistributed(self.g_enc)(x, training=training)                         # -> [batch, T+K*N, z]

        # Split into current observation embeddings and (positive and negative) future embeddings
        z_obs = z_t[:, :self.T]                                                                         # -> [batch,   T, z]
        z_future = z_t[:, self.T:]                                                                      # -> [batch, N*K, z]

        # The future windows are grouped per sample (N groups of K consecutive windows), so they are
        # unpacked as [N, K] first and then transposed. Reshaping straight to [K, N] would mix steps
        # and samples, and column 0 would no longer be the positive sample for every k.
        z_future = tf.reshape(z_future, [-1, self.N, self.K, self.z])                                   # -> [batch, N, K, z]
        z_future = tf.transpose(z_future, perm=[0, 2, 1, 3])                                            # -> [batch, K, N, z]

        # Obtain context embedding vector for T encoded time-windows
        c_T = self.g_ar(z_obs)                                                                          # -> [batch, c]

        # Linearly project context vector to make predictions of the future encoded time-windows
        z_pred = self.p_z(c_T)                                                                          # -> [batch, K, z]

        # Score every candidate against the prediction of its step
        f_mat = compute_f(z_future, z_pred)                                                             # -> [batch, K, N]

        return f_mat

# InfoNCE_loss.py

In [None]:
class InfoNCE (tf.keras.losses.Loss):
    '''
    InfoNCE loss for a batch of f matrices with dim (K x N).

    The loss object is called with the f matrices only (no targets): column 0 of every matrix is
    taken as the positive sample.
    '''

    def __call__(self, f):
        # f: [batch, K, N]
        positive_scores = f[:,:,0]                   # -> [batch, K], first column is the positive k predictions
        score_totals = tf.reduce_sum(f, axis=2)      # -> [batch, K], sum over all N candidates
        per_step_losses = - tf.math.log(positive_scores / score_totals)

        # TODO: weighted avg. instead of uniform avg.
        # uniform mean over batch and K
        return tf.reduce_mean(per_step_losses, axis=None)

# classifier_model.py

In [None]:
import tensorflow as tf

### classification model
c_dim = 512 # needed as an argument

def get_classifier(c_dim, num_classes):
    """
    Build the (uncompiled) classifier that maps a context embedding to class probabilities.

    c_dim:       size of the input embedding vector
    num_classes: number of output classes (units of the softmax layer)

    Returns a tf.keras.Model named "music_classifier" with three hidden relu layers (64, 128, 256 units).
    """
    embedding_inputs = tf.keras.Input(shape=(c_dim,))  # trailing comma: a 1-tuple, not a bare int
    x = tf.keras.layers.Dense(64, activation="relu")(embedding_inputs)
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dense(256, activation="relu")(x)
    outputs = tf.keras.layers.Dense(num_classes, activation = "softmax")(x)
    model = tf.keras.Model(inputs= embedding_inputs, outputs=outputs, name="music_classifier")

    return model

# training.py

In [None]:
import tensorflow as tf

@tf.function
def train_step(model, ds, loss_function, optimizer,
               steps_per_epoch, train_loss_metric=None, mixed_precision=False):
    """
    Train model for steps_per_epoch batches taken from ds.

    model:             model called as model(batch, training=True)
    ds:                dataset yielding complete batches
    loss_function:     callable that maps the model output to a scalar loss
    optimizer:         optimizer; when mixed_precision is True it must provide get_scaled_loss and
                       get_unscaled_gradients (e.g. a LossScaleOptimizer)
    steps_per_epoch:   number of batches to train on
    train_loss_metric: optional metric that is updated with the (unscaled) loss of every batch
    mixed_precision:   whether to apply loss scaling
    """

    for batch in ds.take(steps_per_epoch):

        with tf.GradientTape() as tape:

            prediction = model(batch,training=True)
            loss = loss_function(prediction)

            # the scaled loss is only used for differentiation; `loss` stays unscaled for the metric
            if mixed_precision:
                scaled_loss = optimizer.get_scaled_loss(loss)
            else:
                scaled_loss = loss

        gradients = tape.gradient(scaled_loss, model.trainable_variables)

        if mixed_precision:
            gradients = optimizer.get_unscaled_gradients(gradients)  # undo the loss scaling

        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # update metric
        if train_loss_metric is not None:
            train_loss_metric.update_state(loss)


def eval_metric(metric, val_list):
    """Read the current value of metric, reset the metric, append the value to val_list and return it."""
    current_value = metric.result()
    metric.reset_states()
    val_list.append(current_value)
    return current_value


# TODO: eventually do training in a main loop and not with a single func
def train_cpc(epochs, model, ds_train, ds_test, loss_function, optimizer, steps_per_epoch,
                         train_loss_metric=None, train_acc_metric=None, test_loss_metric=None, test_acc_metric=None,
                         PATH=False):
    '''
    Train the model for a number of epochs and collect the training metrics after every epoch.
    No evaluation on ds_test is performed; the test lists are returned empty.

    NOTE: a second function named train_cpc is defined further down in this notebook and replaces
    this one once its cell has run.

    :param epochs: int, number of epochs to train
    :param model: tf.keras.model, model to train
    :param ds_train: tf.keras.data.dataset, dataset to train on
    :param ds_test: tf.keras.data.dataset, dataset to test on (currently unused)
    :param loss_function: tf.keras.loss, loss function to use
    :param optimizer: tf.keras.optimizer, optimizer to use
    :param steps_per_epoch: int, number of batch to feed per epoch
    :param train_loss_metric: tf.keras.metric, collected and reset after every epoch if given
    :param train_acc_metric: tf.keras.metric, collected and reset after every epoch if given
    :param test_loss_metric: tf.keras.metric (currently unused)
    :param test_acc_metric: tf.keras.metric (currently unused)
    :param PATH: str, path prefix for the saved weights ("<PATH><timestamp>.h5"); nothing is saved if falsy
    :return: tuple of 4 lists (train losses, train accuracies, test losses, test accuracies)
    '''

    # arrays to save results for all epochs
    train_losses = []
    train_accuracies = []
    test_losses = []
    test_accuracies = []

    for _ in range(epochs):
        # Train (train_step is the training function defined in this notebook)
        train_step(model, ds_train, loss_function, optimizer, steps_per_epoch, train_loss_metric)

        if train_loss_metric is not None:
            eval_metric(train_loss_metric, train_losses)

        if train_acc_metric is not None:
            eval_metric(train_acc_metric, train_accuracies)

    # Save model weights
    if PATH:
        from datetime import datetime
        # minute-resolution timestamp; strftime does not depend on the microsecond part being printed
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
        save_to = PATH + timestamp + ".h5"
        model.save_weights(save_to, overwrite=False)

    return train_losses, train_accuracies, test_losses, test_accuracies

# Formerly the main train/eval loop with a mode switch. Now only the CPC model is trained with a custom training function; the classifier is trained with model.fit().
def train_cpc(cpc_model, train_ds, loss_function, optimizer, epochs, steps_per_epoch, train_loss_metric, mixed_precision, save_to):
    """
    Train the CPC model, print the loss after every epoch and save the weights and the loss history.

    NOTE: this definition replaces the earlier function of the same name in this notebook.

    cpc_model:         model to train; must provide save_weights
    train_ds:          dataset yielding complete batches
    loss_function:     callable that maps the model output to a scalar loss
    optimizer:         optimizer passed on to train_step
    epochs:            number of epochs to train
    steps_per_epoch:   number of batches per epoch
    train_loss_metric: metric that accumulates the loss during an epoch; read and reset after each epoch
    mixed_precision:   passed on to train_step
    save_to:           path prefix; weights go to "<save_to><timestamp>.h5" and the losses to the
                       same path with the extension ".npy"

    Returns the list of per-epoch training losses.
    """
    from datetime import datetime

    train_losses = []

    for e in range(epochs):
        train_step(cpc_model, train_ds, loss_function, optimizer, steps_per_epoch, train_loss_metric, mixed_precision)

        train_losses.append(train_loss_metric.result().numpy())

        print(f"Episode:{e}    loss: {train_losses[-1]}")

        train_loss_metric.reset_states()

    # minute-resolution timestamp; strftime does not depend on the microsecond part being printed
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")

    # save model parameters to .h5 file. Can afterwards be loaded with cpc.load_weights(load_from)
    save_to = save_to + timestamp + ".h5"
    cpc_model.save_weights(save_to, overwrite=False)

    # save loss array for later visualization
    losses_array = np.array(train_losses)
    np.save(save_to.replace(".h5",".npy"),losses_array)

    return train_losses

# 

# train_classifiers.py

In [None]:
def plot_classifier_training(history, epochs, save_plot_as):
    '''
    Plot the loss and accuracy curves of a classifier training run and save the figure.

    :param history: object with a `history` dict mapping metric names to per-epoch value lists
                    (the keys "loss", "val_loss", "accuracy" and "val_accuracy" are plotted when present)
    :param epochs: int, number of epochs that were requested; kept for backward compatibility. The x-axis is
                   derived from the number of recorded values, so shorter runs plot correctly too.
    :param save_plot_as: str, path the figure is saved to
    :return: None
    '''
    curves = (
        ("loss", "train_loss"),
        ("val_loss", "val_loss"),
        ("accuracy", "train_acc"),
        ("val_accuracy", "val_acc"),
    )

    plt.style.use("ggplot")
    fig, ax = plt.subplots()

    for key, label in curves:
        values = history.history.get(key)
        if values is None:
            # e.g. no validation data was used -> no "val_*" entries
            continue
        ax.plot(np.arange(len(values)), values, label=label)

    ax.set_title("Training Loss and Accuracy")
    ax.set_xlabel("Epoch #")
    ax.set_ylabel("Loss/Accuracy")
    ax.legend()
    fig.savefig(save_plot_as)

In [None]:
# Training configuration of the downstream classifier
EPOCHS = 10000
optimizer = tf.keras.optimizers.Adam(1e-3)
loss = tf.keras.losses.CategoricalCrossentropy()
c_dim = 512

# TODO: get list of tuples (train_ds, test_ds) for each embedding  via create_embedding_dataset(filepaths ... )

#datasets = [create_embedding_dataset(...) for path in embedding_paths]

# create save_plot_as list of filenames/paths for the plots

# iterate over datasets list and do the following. also iterate (with zip) over the save_plot_as list.

# NOTE(review): `ds` and `plotname` are not defined in this cell; they have to come from the TODOs above.
train_dataset = ds[0]
test_dataset  = ds[1]
model = get_classifier(c_dim, 10)
model.compile(optimizer = optimizer,
                loss = loss,
                metrics=["accuracy"])
# No `batch_size` here: Keras rejects that argument for dataset inputs, so the batching has to be
# applied to the datasets themselves (assumes `ds` holds already batched tf.data.Datasets - TODO confirm).
history = model.fit(train_dataset, epochs = EPOCHS, validation_data = test_dataset) # add additional arguments

### plot the history
plot_classifier_training(history, EPOCHS, plotname)

# Main.py

In [None]:
### (A) Train the CPC model ###
# Data-generator settings are re-used as model hyperparameters
T = data_generator_arguments["T"]
k = data_generator_arguments["k"]
N = data_generator_arguments["N"]
batch_size = N

mixed_precision = False
if mixed_precision:
    tf.keras.mixed_precision.set_global_policy('mixed_float16') # use mixed precision training (on V100 supposedly a 3x performance boost + double memory)
else:
    tf.keras.mixed_precision.set_global_policy('float32')

# Generate a dataset
train_ds_cpc = create_cpc_ds()


# Define 3 design components
# Model

# import respective Encoder class as Encoder, import Autoregressive class as Autoregressive, import encoder_args and ar_args that are associated with them.

cpc = CPC(T, k, N, z_dim, c_dim, Encoder, Autoregressive, encoder_args, ar_args, mixed_precision)

# load trained model
if load_path:
    cpc.load_weights(load_path)

# Loss
infonce = InfoNCE()
train_loss_metric_cpc = tf.keras.metrics.Mean('train_loss_CPC')
# Optimizer
adam = tf.keras.optimizers.Adam(learning_rate)


# Training
# Feed the dataset created above (`train_ds_cpc`); the loop was previously handed the
# never-assigned name `ds_train_cpc`.
res_train_loss_cpc, *_ = main_train_eval_loop('CPC', epochs_cpc, cpc, train_ds_cpc, None, infonce, adam, steps_per_epoch=steps_per_epoch_cpc,
                                          train_loss_metric=train_loss_metric_cpc, PATH=weight_path)
######



# Baseline classifier: model, loss/metrics and training on `gtzan_train` / `gtzan_test`

# Model -- the class count is read from axis 1 of the label array of the first batch
num_classes = list(
    gtzan_train.take(1).as_numpy_iterator()
)[0][1].shape[1]
classi1 = Classifier(num_classes)

# Loss
cce = tf.keras.losses.CategoricalCrossentropy()

# Metrics, one loss/accuracy pair per split
train_loss_metric_classi1 = tf.keras.metrics.Mean('train_loss_classi1')
train_acc_metric_classi1 = tf.keras.metrics.CategoricalAccuracy('train_accuracy_classi1')
test_loss_metric_classi1 = tf.keras.metrics.Mean('test_loss_classi1')
test_acc_metric_classi1 = tf.keras.metrics.CategoricalAccuracy('test_accuracy_classi1')

# Training and testing
res_train_loss_c1, res_train_acc_c1, res_test_loss_c1, res_test_acc_c1 = main_train_eval_loop(
    'classifier',
    epochs_class,
    classi1,
    gtzan_train,
    gtzan_test,
    cce,
    adam,
    steps_per_epoch=8,  # TODO: steps for classi?
    train_loss_metric=train_loss_metric_classi1,
    train_acc_metric=train_acc_metric_classi1,
    test_loss_metric=test_loss_metric_classi1,
    test_acc_metric=test_acc_metric_classi1,
)


In [None]:
### (D,E) Classify with learned features from CPC model
# Get features using trained CPC
import tensorflow_datasets as tfds
# Shuffle with a fixed order: by default shuffle() draws a new order on every iteration, so the
# take()/skip() splits below would be cut from different permutations and share examples.
gtzan_ds = (
    tfds.load('gtzan', split='train', as_supervised=True)
    .shuffle(1024, reshuffle_each_iteration=False)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
# first `split_rate` share of the (assumed) 1000 examples for training, the remainder for testing
gtzan_ds_train = gtzan_ds.take(int(1000*split_rate))
gtzan_ds_test = gtzan_ds.skip(int(1000*split_rate))


def get_embedded_ds(ds, model):
    '''
    Embed every audio clip of a dataset with a trained model and return the result as a dataset.

    Uses the notebook-level globals T, k and d: each clip is cut to its first (T+k)*d samples
    and reshaped to (1, T+k, d, 1) before it is handed to model.get_embedding().

    :param ds: iterable of (audio, label) pairs, audio being a 1-D waveform
    :param model: object providing get_embedding(audio)
    :return: tf.data.Dataset of (embedding, label) pairs, unbatched
    '''
    n_samples = (T + k) * d

    features = []
    labels = []
    for audio, label in ds:
        # Keep only the leading window (clips may be longer) and convert to float32: the waveform
        # can arrive as int64, which the model's ops reject
        # ("Value for attr 'T' of int64 is not in the list of allowed values").
        # NOTE(review): only the dtype is changed here; if the CPC training data was amplitude-normalised,
        # the same scaling should be applied - TODO confirm.
        audio = tf.cast(audio[:n_samples], tf.float32)
        audio = tf.reshape(audio, (1, T + k, d, 1))
        features.append(model.get_embedding(audio))
        # NOTE(review): labels are passed through unchanged; if they are integer class ids they would
        # need one-hot encoding for a CategoricalCrossentropy loss - TODO confirm.
        labels.append(label)
    print("features, labels")
    print(np.array(features).shape, np.array(labels).shape)
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Embed both GTZAN splits with the CPC model and batch the resulting (feature, label) pairs
cpc_train_features = get_embedded_ds(gtzan_ds_train, cpc).batch(batch_size)
cpc_test_features = get_embedded_ds(gtzan_ds_test, cpc).batch(batch_size)

# Design components
# model: second classifier, fed with the CPC embeddings
classi2 = Classifier(num_classes)

# metrics, one loss/accuracy pair per split
train_loss_metric_classi2 = tf.keras.metrics.Mean('train_loss_classi2')
train_acc_metric_classi2 = tf.keras.metrics.CategoricalAccuracy('train_accuracy_classi2')
test_loss_metric_classi2 = tf.keras.metrics.Mean('test_loss_classi2')
test_acc_metric_classi2 = tf.keras.metrics.CategoricalAccuracy('test_accuracy_classi2')

# Training and testing
# NOTE(review): steps_per_epoch receives the classifier batch size here - looks unintended, confirm.
res_train_loss_c2, res_train_acc_c2, res_test_loss_c2, res_test_acc_c2 = main_train_eval_loop(
    'classifier',
    epochs_class,
    classi2,
    cpc_train_features,
    cpc_test_features,
    cce,
    adam,
    steps_per_epoch=batch_size_classifier,  # TODO: steps for classi?
    train_loss_metric=train_loss_metric_classi2,
    train_acc_metric=train_acc_metric_classi2,
    test_loss_metric=test_loss_metric_classi2,
    test_acc_metric=test_acc_metric_classi2,
)

[1mDownloading and preparing dataset gtzan/1.0.0 (download: 1.14 GiB, generated: 3.71 GiB, total: 4.85 GiB) to /root/tensorflow_datasets/gtzan/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…






KeyboardInterrupt: ignored

In [None]:
# Sanity check: print the first (audio, label) pair of the GTZAN dataset
for sample_audio, sample_label in gtzan_ds.take(1):
    print(sample_audio, sample_label)

In [None]:
# TODO: automatic result and figure saving
import matplotlib.pyplot as plt
# Loss (left) and accuracy (right) of the baseline classifier, one point per epoch
fig, ax = plt.subplots(ncols=2)

ax[0].plot(np.array(res_train_loss_c1), color='r', label='train')
ax[0].plot(np.array(res_test_loss_c1), color='b', label='test')
ax[0].set(title='cce loss', xlabel='epoch', ylabel='loss')
ax[0].legend()

ax[1].plot(np.array(res_train_acc_c1), color='r', label='train')
ax[1].plot(np.array(res_test_acc_c1), color='b', label='test')
ax[1].set(title='accuracy', xlabel='epoch', ylabel='accuracy')
ax[1].legend()

# keep the axis labels of the two panels from overlapping
fig.tight_layout()

In [None]:
# Block this cell forever (presumably to keep the runtime busy/alive - confirm it is still needed).
# Sleeping instead of spinning avoids burning a full CPU core; interrupt the kernel to stop it.
import time

while True:
    time.sleep(60)