In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
# import tensorflow_io as tfio
import random

%config Completer.use_jedi = False

In [4]:
# get minimal length of music sample
ds_info = pd.read_csv('data/gtzan/features_30_sec.csv')
print(ds_info)

            filename  length  chroma_stft_mean  chroma_stft_var  rms_mean  \
0    blues.00000.wav  661794          0.350088         0.088757  0.130228   
1    blues.00001.wav  661794          0.340914         0.094980  0.095948   
2    blues.00002.wav  661794          0.363637         0.085275  0.175570   
3    blues.00003.wav  661794          0.404785         0.093999  0.141093   
4    blues.00004.wav  661794          0.308526         0.087841  0.091529   
..               ...     ...               ...              ...       ...   
995   rock.00095.wav  661794          0.352063         0.080487  0.079486   
996   rock.00096.wav  661794          0.398687         0.075086  0.076458   
997   rock.00097.wav  661794          0.432142         0.075268  0.081651   
998   rock.00098.wav  661794          0.362485         0.091506  0.083860   
999   rock.00099.wav  661794          0.358401         0.085884  0.054454   

      rms_var  spectral_centroid_mean  spectral_centroid_var  \
0    0.0028

# GTZAN Dataset - Music Genre Classification

In [4]:
# get file paths
ds_info = pd.read_csv('data/features_30_sec.csv')

file_names = ds_info['filename'].values
file_names.astype(np.str)
file_paths = np.array(['data/no_genres/' + file for file in file_names])

file_path_ds = tf.data.Dataset.from_tensor_slices(file_paths)

# read files and decode them
ds = [tfio.audio.AudioIOTensor(path) for path in file_paths]

# make length for each sample the same
min_len = min([a.to_tensor().shape[0] for a in ds])

# extract np arrays from the tf.ds
ds_np = np.array([a.to_tensor().numpy()[0:min_len] for a in ds])

ds_rate = np.array([a.rate.numpy() for a in ds])
if np.all(ds_rate == ds_rate[0]):
    ds_rate = ds_rate[0]

FileNotFoundError: [Errno 2] No such file or directory: 'data/features_30_sec.csv'

In [62]:
T = 20
k = 10

N = 10
d = 1
filenames = file_paths.tolist()


# load audio
def decode_audio(audio_binary):
    audio, sample_rate = tf.audio.decode_wav(audio_binary)
    return tf.squeeze(audio, axis=-1), sample_rate

def music_generator():
    """
    assumes we have a list of all filenames/paths

    T: number of time-steps to use for the context embedding c_t

    k: number of future time-steps to predict

    N: number of samples (1 positive + N-1 negative samples)

    d: window size in seconds

    filenames: list of all filepaths to all audio files
    """
    global filenames, T, k, N, d
    while True:
        
        #randomly select positive sample filenames from list        
        samples = random.sample(filenames, N)
        positive_sample = samples[0]
        negative_samples = samples[1:]

        # positive sample
        positive_audio = tf.io.read_file(positive_sample)
        positive_audio, sample_rate = decode_audio(positive_audio)
        
        end = sample_rate * (T+k)   
        window_size = int(sample_rate * d)
        positive_audio = tf.reshape(positive_audio[:end], (T+k, window_size, 1))

        # negative samples (find a way to do it without a for loop pls)
        sample_tensors = []
        sample_tensors.append(positive_audio)

        for ns in negative_samples:
            ns = tf.io.read_file(ns)
            ns, sample_rate = decode_audio(ns)
            window_size = sample_rate * d
            end = sample_rate * (T+k)   

            ns = tf.reshape(ns[:end], (T+k,window_size,1))

            # only take the last k entries (pls double check this)
            ns = ns[T:T+k]

            sample_tensors.append(ns)

        # concatenate all tensors, making its shape (T+k*N, window_size,1)
        data = tf.concat(sample_tensors, axis= 0)

        yield data                        

In [63]:
gen = music_generator()
next(gen)

<tf.Tensor: shape=(120, 22050, 1), dtype=float32, numpy=
array([[[-0.23886108],
        [-0.41534424],
        [-0.3161621 ],
        ...,
        [ 0.02081299],
        [ 0.01882935],
        [ 0.01901245]],

       [[ 0.02401733],
        [ 0.02923584],
        [ 0.03115845],
        ...,
        [ 0.05682373],
        [ 0.06643677],
        [ 0.06881714]],

       [[ 0.06442261],
        [ 0.05459595],
        [ 0.04019165],
        ...,
        [ 0.3184204 ],
        [ 0.24853516],
        [ 0.2013855 ]],

       ...,

       [[-0.21835327],
        [-0.15158081],
        [-0.05319214],
        ...,
        [-0.15216064],
        [-0.13912964],
        [-0.12341309]],

       [[-0.11651611],
        [-0.12738037],
        [-0.14666748],
        ...,
        [ 0.04693604],
        [ 0.03289795],
        [ 0.01449585]],

       [[ 0.03625488],
        [ 0.03643799],
        [-0.03161621],
        ...,
        [ 0.0255127 ],
        [ 0.01348877],
        [-0.06192017]]], dtype=float3

In [None]:
# (batch, t+k, rate, 1)
BATCH_SIZE = 64
SAMPLE_LEN = 30
RATE = ds_rate
DS_LEN = len(ds_np)
T = 20
K = 10

In [None]:
def music_generator():
    global ds_np, RATE, SAMPLE_LEN, BATCH_SIZE, T, K, DS_LEN
    if not ds_np[0].shape[0]/RATE == SAMPLE_LEN:
        end = RATE*SAMPLE_LEN
        ds_np = np.array([a.to_tensor().numpy()[0:end] for a in ds])
    
    while True:
        idx = random.randint(0, DS_LEN)
        sample = ds_np[idx].reshape((-1, RATE))
        #sample = np.expand_dims(sample, axis=0)
        
        yield sample

In [None]:
gen = music_generator()
next(gen).shape

In [None]:
ds = tf.data.Dataset.from_generator(generator=music_generator,
                                    output_types=tf.float32,
                                    # (time_len, 30), (time_len,2)
                                    output_shapes=(SAMPLE_LEN, RATE)
                                    )
ds = ds.batch(BATCH_SIZE)

In [None]:
for i in ds.take(1):
    print(i.shape)

# in sec

delta_t = 30
time_timestep = 1


In [None]:
z_dim = 512  # latent dim z_t
c_dim = 256  # dim of g_ar output c_t

In [None]:
class InfoNCE (tf.keras.losses.Loss):
    '''
    Compute loss given batch times f matrices with dim (K x N)
    '''

    def __call__(self, f):
        # input dim: [batch, K, N]
        denominator = tf.reduce_sum(f, axis=2)  # [batch, K]
        losses = - tf.math.log(f[:,:,0] / denominator)  # first column is positive
        return tf.reduce_mean(losses, axis=1)  # [batch]. Take a mean over k timesteps

In [None]:
class Encoder (tf.keras.layers.Layer):
    '''
    g_enc: strided 1d convolution
    '''

    def __init__ (self, z_dim):
        super(Encoder, self).__init__()
        s = [5,4,2,2,2]  # stride sizes
        k = [10,8,4,4,4]  # kernel sizes
        f = [512,512,512,512,512]  # num filters

        # input dim: [batch, T+K*N, d, 1]
        self.layers = []
        for l in range(5):
            self.layers.append(tf.keras.layers.Conv1D(f[l],k[l],s[l]))
            self.layers.append(tf.keras.layers.BatchNormalization())
            self.layers.append(tf.keras.layers.LeakyReLU())
        self.layers.append(tf.keras.layers.GlobalAveragePooling1D())
        self.layers.append(tf.keras.layers.Dense(z_dim, activation='tanh'))
        # ouput dim:[batch, T+K*N, z]

    def call (self, x, training):
        
        for l in self.layers:
            try:  # batch normalization 
                x = l(x, training)
            except:
                x = l(x)
        return x  


class Autoregressive (tf.keras.layers.Layer):
    '''
    g_ar: GRU RNN
    '''

    def __init__ (self, c_dim):
        super(Autoregressive, self).__init__()
        # input dim: [batch, T, z]
        self.l = tf.keras.layers.GRU(c_dim, name='ar_context') 
        # output dim:[batch, c] since return_seq is False

    def call (self, z):
        return self.l(z) 


class Predict_z (tf.keras.layers.Layer):
    '''
    transformation of c_t, currently linear (W_k) for all future timesteps
    '''

    def __init__ (self, z_dim, K):
        super(Predict_z, self).__init__()
        
        # input_dim: [batch, c]
        self.layers = []
        for k in range(K):  # k different layers for each timestep
            self.layers.append(tf.keras.layers.Dense(z_dim)) 

    def call(self, c_t):
        # TODO: maybe size should be multidimensional
        z_pred = tf.TensorArray(tf.float32, size=len(self.layers))
        for l in tf.range(len(self.layers)):  
            z_pred = z_pred.write(l, self.layers[l](c_t))  # apply for each k
            z_pred_t = z_pred.stack()
            # [K, batch, z]
        return tf.transpose(z_pred_t, perm=[1,0,2])  # output_dim: [batch, K, z]


def compute_f (z, z_pred):
    '''
    compute f following eq(3) in the paper to be batch (K x N) matrices.
    First column is the postive sample.
    '''

    # z input dim: [batch, K, N, z], 
    z = tf.expand_dims(z, axis=-2)  # [batch, K, N, 1, z]
    
    # z_pred input dim: [batch, K, z]
    pred = tf.repeat(z_pred, repeats=z.shape[2], axis=-2)  # [batch, K*N, z]
    pred = tf.reshape(pred, shape=[z.shape[0],z.shape[1],z.shape[2],z.shape[-1]])  # [batch, K, N, z]
    pred = tf.expand_dims(pred, axis=-1)  # [batch, K, N, z, 1]

    dot_prod = tf.linalg.matmul(z, pred)  # [batch, K, N, 1, 1]
    dot_prod = tf.squeeze(dot_prod, axis=[-2,-1])  # [batch, K, N]
    dot_prod = tf.exp(dot_prod)
    return dot_prod  # output dim: [batch, K, N]


class CPC (tf.keras.models.Model):
    '''
    put everything together. Return f_k for every k
    '''

    def __init__ (self, num_time_observations, num_time_future, num_negative_samples, z_dim, c_dim):
        super(CPC, self).__init__()
        self.T = num_time_observations
        self.K = num_time_future
        self.N = num_negative_samples
        self.z = z_dim
        self.c = c_dim

        self.g_enc = Encoder(self.z)
        self.g_ar = Autoregressive(self.c)
        self.p_z = Predict_z(z_dim=self.z, K=self.K)

    def call(self, x, training=False):  
        # input dim: [batch, T+K*N, d, 1]
        print('input dim: ', x.shape)
        # Embedding
        z_t = tf.keras.layers.TimeDistributed( # dim 1 is the temporal dim 
            self.g_enc)(x, training=training)  # [batch, T+K*N, z]
        print('embedding dim: ', z_t.shape)    
        

        # Split current observation embeddings and future embeddings
        z_obs = z_t[:, :self.T]  # t = {0,...,T}, dim: [batch, T, z]
        z_future = z_t[:, self.T:]  # t = {T+1,,,T+K} for N samples, dim:[batch, K*N, z]
        z_future = tf.reshape(z_future, [-1, self.K, self.N, self.z])  # [batch, K, N, z]
        print('embedding obs:', z_obs.shape)
        print('embedding pred:', z_future.shape)

        # Predict embeddings
        c_T = self.g_ar(z_obs)  # [batch, c]
        print('context:', c_T.shape)
        z_pred = self.p_z(c_T)  # [batch, K, z]
        print('transformed_context:', z_pred.shape)

        # Compute f matrices
        f_mat = compute_f(z_future, z_pred)  # [batch, K, N]

        return f_mat

In [None]:
batch = 1
T = 8
K = 3
N = 5
d = 1000

data = np.random.rand(batch, T+K*N, d, 1)
data = tf.constant(data)
print('input shape:', data.shape)

cpc = CPC(T, K, N, z_dim, c_dim)
f_mat = cpc(data)
