In [17]:
import tensorflow as tf
import numpy as np
from bregman.suite import *

In [18]:
k = 2
segment_size = 50
max_iterations = 100

In [19]:
chromo = tf.placeholder(tf.float32)
max_freqs = tf.argmax(chromo, 0)

def get_chromogram(audio_file):
    F = Chromagram(audio_file, nfft=16384, wfft=8192, nhop=2205)
    return F.X

def extract_feature_vector(sess, chromo_data):
    num_features, num_samples = np.shape(chromo_data)
    freq_vals = sess.run(max_freqs, feed_dict={chromo: chromo_data})
    hist, bins = np.histogram(freq_vals, bins=range(num_features + 1))
    return hist.astype(float) / num_samples

def get_dataset(sess, audio_file):
    chromo_data = get_chromogram(audio_file)
    print('chromo_data', np.shape(chromo_data))
    chromo_length = np.shape(chromo_data)[1]
    xs = []
    for i in range(chromo_length // segment_size):
        chromo_segment = chromo_data[:, i*segment_size:(i + 1) * segment_size]
        x = extract_feature_vector(sess, chromo_segment)
        if len(xs) == 0:
            xs = x
        else:
            xs = np.vstack((xs, x))
    return xs

In [20]:
def initial_cluster_centroids(X, k):
    return X[0:k, :]

def assign_cluster(X, centroids):
    expanded_vectors = tf.expand_dims(X, 0)
    expanded_centroids = tf.expand_dims(centroids, 1)
    distances = tf.reduce_sum(tf.square(tf.subtract(expanded_vectors, expanded_centroids)), 2)
    mins = tf.argmin(distances, 0)
    return mins

def recompute_centroids(X, Y):
    sums = tf.unsorted_segment_sum(X, Y, k)
    counts = tf.unsorted_segment_sum(tf.ones_like(X), Y, k)
    return sums / counts

In [22]:
with tf.Session() as sess:
    # https://archive.org/details/BEETHOVENViolinSonataNo.7-NEWTRANSFER/03.Iii.Scherzo.mp3
    X = get_dataset(sess, '03_Iii_Scherzo.wav')
    print(np.shape(X))
    centroids = initial_cluster_centroids(X, k)
    i, converged = 0, False
    while not converged and i < max_iterations:
        i += 1
        Y = assign_cluster(X, centroids)
        centroids = sess.run(recompute_centroids(X, Y))
        if i % 50 == 0:
            print('iteration', i)
    segments = sess.run(Y)
    for i in range(len(segments)):
        seconds = (i * segment_size) / float(10)
        min, sec = divmod(seconds, 60)
        time_str = '{}m {}s'.format(min, sec)
        print(time_str, segments[i])

  self.STFT = P.zeros((self.nfft / 2 + 1, num_frames), dtype='complex')
  mxnorm = P.empty(self._cqtN)  # Normalization coefficients
  for i in P.arange(self._cqtN)])


chromo_data (12, 3605)
(72, 12)
iteration 50
iteration 100
0.0m 0.0s 0
0.0m 5.0s 0
0.0m 10.0s 0
0.0m 15.0s 0
0.0m 20.0s 0
0.0m 25.0s 0
0.0m 30.0s 0
0.0m 35.0s 0
0.0m 40.0s 1
0.0m 45.0s 1
0.0m 50.0s 1
0.0m 55.0s 1
1.0m 0.0s 0
1.0m 5.0s 0
1.0m 10.0s 0
1.0m 15.0s 0
1.0m 20.0s 0
1.0m 25.0s 0
1.0m 30.0s 1
1.0m 35.0s 1
1.0m 40.0s 1
1.0m 45.0s 1
1.0m 50.0s 1
1.0m 55.0s 0
2.0m 0.0s 0
2.0m 5.0s 0
2.0m 10.0s 0
2.0m 15.0s 0
2.0m 20.0s 0
2.0m 25.0s 0
2.0m 30.0s 0
2.0m 35.0s 0
2.0m 40.0s 0
2.0m 45.0s 0
2.0m 50.0s 0
2.0m 55.0s 1
3.0m 0.0s 1
3.0m 5.0s 1
3.0m 10.0s 1
3.0m 15.0s 1
3.0m 20.0s 0
3.0m 25.0s 0
3.0m 30.0s 1
3.0m 35.0s 1
3.0m 40.0s 0
3.0m 45.0s 1
3.0m 50.0s 1
3.0m 55.0s 1
4.0m 0.0s 1
4.0m 5.0s 1
4.0m 10.0s 1
4.0m 15.0s 0
4.0m 20.0s 1
4.0m 25.0s 1
4.0m 30.0s 0
4.0m 35.0s 0
4.0m 40.0s 0
4.0m 45.0s 0
4.0m 50.0s 0
4.0m 55.0s 0
5.0m 0.0s 0
5.0m 5.0s 0
5.0m 10.0s 1
5.0m 15.0s 1
5.0m 20.0s 1
5.0m 25.0s 1
5.0m 30.0s 1
5.0m 35.0s 0
5.0m 40.0s 0
5.0m 45.0s 0
5.0m 50.0s 0
5.0m 55.0s 0
