### K-means spherical clustering
Some functions in this code are based on work by Dan Stowell; attribution will be added to the relevant docstrings as necessary.

His work is from here: https://core.ac.uk/reader/30341728 (shorter paper) and https://peerj.com/articles/488/ (longer paper) and code is located here https://dfzljdn9uc3pi.cloudfront.net/2014/488/1/oskmeans.py.txt

#### Notes 
- Mel spectrograms used directly as features

#### Outline of work
##### Preprocessing step
- Resample to standard 44.1kHz
- Spectrogram (frame size of 1024 samples with Hamming windowing and no overlap)
- Calculate Mel spectrogram for each file (which are directly used as features)
- High-pass filtering (filter spectral energy below 500 Hz to reduce environmental noise)
- RMS normalization in each spectrogram
- Spectral median noise reduction (http://sabiod.univ-tln.fr/NIPS4B2013_book.pdf aka common median-based thresholding)
- PCA-whiten the data: https://dfzljdn9uc3pi.cloudfront.net/2014/488/1/oskmeans.py.txt

##### Classification
- Apply spherical k-means clustering

In [1]:
import os 
import pydub
from pydub import AudioSegment
from scipy.io import wavfile
import samplerate
import librosa
from scipy import signal
import numpy as np
from librosa import feature
from librosa import filters
from scipy.signal import butter, filtfilt
from scipy import ndimage
import librosa.display
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pydub
from pydub import AudioSegment
import tempfile
import soundfile as sf
import scipy
from scipy.io.wavfile import write
import glob

#### Functions to use

In [2]:
def read_mp3(f, normalized=False):
    """
    Decode an audio file (typically MP3) into a numpy array using pydub.

    Inputs:
        f = filename (anything pydub.AudioSegment.from_file accepts)
        normalized = if True, return float32 samples divided by the full-scale
                     value of the file's sample width (2**15 for 16-bit audio)
    Output:
        (y, a.frame_rate): the samples and the sampling rate in Hz.
        y has shape (n_frames,) for mono audio and (n_frames, n_channels)
        for multi-channel audio.
    """
    a = pydub.AudioSegment.from_file(f)
    y = np.array(a.get_array_of_samples())
    # The samples come back interleaved; de-interleave any multi-channel
    # file, not only stereo.
    if a.channels > 1:
        y = y.reshape((-1, a.channels))
    if normalized:
        # sample_width is in bytes, so this is 2**15 for 16-bit audio (the
        # previously hard-coded value) and scales correctly for other widths.
        full_scale = 2 ** (8 * a.sample_width - 1)
        return np.float32(y) / full_scale, a.frame_rate
    return y, a.frame_rate

def resample(filepath, des_sr=44100):
    """
    Load an .mp3 or .wav file and resample it to des_sr.

    filepath = path to desired file
    des_sr = desired sampling rate in Hz

    Returns the resampled data produced by samplerate.resample with the
    'sinc_best' converter.

    Raises ValueError if the file extension is neither .wav nor .mp3
    (previously this case printed a message and then failed with an
    UnboundLocalError on the return statement).
    """
    filename = os.path.basename(filepath)
    print(filename)

    # Compare the extension case-insensitively so .Mp3 / .Wav are accepted too.
    extension = os.path.splitext(filename)[1].lower()
    if extension == ".mp3":
        data, sr = read_mp3(filepath)
    elif extension == ".wav":
        sr, data = wavfile.read(filepath)
    else:
        raise ValueError("Not a valid file type (not .wav or .mp3): %s" % filename)

    return samplerate.resample(data, des_sr / sr, 'sinc_best')

# Reference: https://stackoverflow.com/questions/39032325/python-high-pass-filter
def butter_highpass(cutoff, fs, order=5):
    """
    Design a digital Butterworth high-pass filter.

    cutoff = cutoff frequency, in the same units as fs
    fs = sampling rate
    order = filter order

    Returns the (b, a) coefficient pair produced by scipy.signal.butter.
    """
    # butter() expects the cutoff as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    return butter(order, cutoff / nyquist, btype='high', analog=False)

def butter_highpass_filter(data, cutoff, fs, order=5):
    """
    High-pass filter data with the Butterworth design from butter_highpass.

    The coefficients are applied with scipy.signal.filtfilt and the filtered
    array is returned.
    """
    numerator, denominator = butter_highpass(cutoff, fs, order=order)
    return filtfilt(numerator, denominator, data)

def high_pass_filter(data, sr, cutoff=500, order=5):
    """
    High-pass filter an audio signal to reduce low-frequency noise.

    data = input signal
    sr = sampling rate of data in Hz; it must be the rate the signal was
         actually recorded or resampled at, otherwise the effective cutoff
         is scaled by the mismatch
    cutoff = cutoff frequency in Hz (defaults to the 500 Hz used in the
             preprocessing outline)
    order = Butterworth filter order

    Returns the filtered signal.
    """
    filtered_signal = butter_highpass_filter(data, cutoff, sr, order=order)
    return filtered_signal

# Referenced librosa: https://github.com/librosa/librosa/blob/main/librosa/feature/spectral.py
def make_mel_spectrogram(input_data, sample_rate, n_fft=1024):
    """
    Compute a mel spectrogram that is used directly as the feature matrix.

    input data: (Time series of measurement values)
    sample_rate: sampling rate of input_data in Hz
    n_fft: frame size in samples (default 1024)
    output = S : np.ndarray [shape=(n_mels, t)]

    Frames are Hamming-windowed, n_fft samples long and do not overlap, as
    described in the preprocessing outline.
    """
    # nperseg and noverlap must be given explicitly: with a string window
    # scipy defaults to nperseg=256 and noverlap=nperseg // 8, which yields
    # short overlapping frames zero-padded to nfft rather than
    # non-overlapping n_fft-sample frames.
    _, _, Sxx = signal.spectrogram(input_data, fs=sample_rate, window='hamming',
                                   nperseg=n_fft, noverlap=0, nfft=n_fft,
                                   mode='magnitude')
    # Sxx is spectrogram of x and last axis of Sxx is segment times.
    # Keyword arguments keep this call valid on librosa versions where
    # filters.mel no longer accepts positional arguments.
    mel_basis = filters.mel(sr=sample_rate, n_fft=n_fft)
    return np.dot(mel_basis, Sxx)

def median_denoising(data):
    """
    Smooth data with scipy's median filter:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.median_filter.html

    A window of size 3 is used along every axis of data.
    NOTE(review): this replaces each value by its neighbourhood median; it
    does not subtract a per-band median as the preprocessing outline
    describes - confirm which behaviour is intended.
    """
    return ndimage.median_filter(data, size=3)
    
def convert_mp3_to_wav(mp3_path, sr=44100, mono=True, overwrite=False, dtype='float32'):
    """
    Convert an MP3 file to a WAV file written next to it (same name, .wav).

    Parts of code from
    https://github.com/bill317996/Audio-to-midi/blob/master/cfp.py

    mp3_path: path to the .mp3 file (extension checked case-insensitively)
    sr: target sampling rate in Hz; a falsy value keeps the file's own rate.
        Need to change sample rate to 44.1kHz if using audiomoths.
    mono: if True (default), multi-channel audio is averaged down to one
          channel before writing
    overwrite: if False and the .wav already exists, nothing is done
    dtype: dtype of the samples that are written and returned

    Returns (x, fs): the written samples and their sampling rate.
    Returns None when the .wav already exists and overwrite is False, so
    callers that unpack the result should pass overwrite=True.
    """
    # in case there is an .MP3
    assert mp3_path.lower().endswith('.mp3'), 'filename indicates not mp3'
    wav_path_to_write = os.path.splitext(mp3_path)[0] + '.wav'
    if not overwrite and os.path.exists(wav_path_to_write):
        return

    mp3 = AudioSegment.from_file(mp3_path)

    # mkstemp returns an OPEN file descriptor; close it straight away so it
    # is not leaked (one descriptor per converted file otherwise).
    fd, temp_path = tempfile.mkstemp()
    os.close(fd)
    try:
        mp3.export(temp_path, format='wav')
        del mp3
        x, fs = sf.read(temp_path)
    finally:
        # Remove the temporary file even if export/read fails.
        os.remove(temp_path)

    if mono and len(x.shape) > 1:
        x = np.mean(x, axis=1)
    if sr and sr != fs:
        x = scipy.signal.resample_poly(x, sr, fs)
    if sr:
        fs = sr
    x = x.astype(dtype)
    write(wav_path_to_write, fs, x)
    return x, fs

#TODO: output a mel spectrogram with and without median noise to cross check

In [3]:
def find_n_second_multiples_and_filenames(dir_path, n, des_sr):
    """
    Split every WAV file in a directory into consecutive n-second clips.

    dir_path (path): path of a directory with desired files
    n (int): split data into n-second clips
    des_sr (int): desired sampling rate in Hz. Currently unused: resampling
        is disabled, so clips keep the sampling rate of their source file.

    Returns (wav_data, allfilenames, timestamp), three lists with one entry
    per clip and always of equal length:
        wav_data[k]     = samples of clip k
        allfilenames[k] = name of the file clip k was cut from
        timestamp[k]    = [start_second, end_second] of clip k in that file

    Files shorter than n seconds yield no clip, and trailing samples that do
    not fill a whole clip are dropped. Hidden files and anything that is not
    a .wav file (e.g. .DS_Store, .mp3) are skipped.

    #TODO: reduce the numpy array (wav_data)
    """
    if n <= 0:
        raise ValueError("n must be a positive number of seconds")

    wav_data = []
    allfilenames = []
    timestamp = []

    # sorted() makes the clip order reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(dir_path)):
        # Skip non-WAV entries explicitly. Falling through used to reuse the
        # previous file's sr/data (or hit a NameError on the first entry).
        if filename.startswith(".") or not filename.lower().endswith(".wav"):
            continue

        filepath = os.path.join(dir_path, filename)
        print(filepath)

        sr, data = wavfile.read(filepath)
        print('data', data)
        print('sr', sr)

        num_samples_in_n_seconds = int(n * sr)
        # Integer division covers all three cases: 0 clips for short files,
        # 1 clip for an exact fit, several clips plus a dropped remainder
        # otherwise. Every clip gets a timestamp so the lists stay aligned.
        num_of_n_clips = len(data) // num_samples_in_n_seconds

        for i in range(num_of_n_clips):
            start = i * num_samples_in_n_seconds
            allfilenames.append(filename)
            timestamp.append([i * n, (i + 1) * n])
            wav_data.append(data[start:start + num_samples_in_n_seconds])

    return wav_data, allfilenames, timestamp

##### Preprocessing step
- Resample to standard 44.1kHz
- Spectrogram (frame size of 1024 samples with Hamming windowing and no overlap)
- Calculate Mel spectrogram for each file (which are directly used as features)
- High-pass filtering (filter spectral energy below 500 Hz to reduce environmental noise)
- RMS normalization in each spectrogram
- Spectral median noise reduction (http://sabiod.univ-tln.fr/NIPS4B2013_book.pdf aka common median-based thresholding)
##### Processing step
- PCA-whiten the data: https://dfzljdn9uc3pi.cloudfront.net/2014/488/1/oskmeans.py.txt

In [4]:
# Directory of recordings to cluster; the commented-out lines are alternative datasets.
# dir_path = '/Volumes/Elements/Test/Data/'
dir_path = '/Volumes/Elements/Mixed_AM_Dataset2/'
# dir_path = '/Volumes/Elements/Madre_de_Dios_Xeno_Canto_Birdcalls/'

# Shell command to remove macOS metadata files from the dataset directory:
# find . -name ".DS_Store" -delete

# Split every file into 10-second clips.
# NOTE(review): the resampling lines inside find_n_second_multiples_and_filenames
# are commented out, so the 44100 passed here is not applied and clips keep
# the sampling rate of their source file - confirm this is intended.
shorter_clips, file_names, timestamp = find_n_second_multiples_and_filenames(dir_path, 10, 44100)
print(file_names)
print(len(file_names))

/Volumes/Elements/Mixed_AM_Dataset2/20190610_022000.WAV
data [141  44 171 ...  68  99 178]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190610_033000.WAV
data [  43   42  -87 ... -290 -192 -191]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190610_043000.WAV
data [-61  36  19 ... 106 105  72]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190611_031000.WAV
data [ 747  935 1250 ...  805  928 1035]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190611_035000.WAV
data [   -55     74     89 ... -12197 -12264 -12282]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190611_041000.WAV
data [  93 -148 -243 ...  566  659  575]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190611_044000.WAV
data [ 898  813  856 ...  662  258 -176]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190611_050000.WAV
data [ -93   36  115 ... -280 -342 -260]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190611_054000.WAV
data [ -443  -264  -150 ... -1027 -1053 -1015]
sr 384000
/Volumes/Elements/Mixed_AM_Data

data [  -8    9   24 ...   87 -138 -137]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190621_173000.WAV
data [-183 -198 -117 ... -339 -257  -95]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190621_174000.WAV
data [ 27  58 -23 ... 158 189 140]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190621_181000.WAV
data [ -31 -142 -141 ... -155 -138 -281]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190621_185000.WAV
data [   0   64   -1 ...   63 -210 -144]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190621_190000.WAV
data [-102 -149 -100 ...   90   57  152]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190623_074000.WAV
data [-364 -282 -232 ...  293  227  225]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190623_075000.WAV
data [145 160  95 ...  72 103  54]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190623_081000.WAV
data [  88  -89  -40 ... -296 -278 -132]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190623_082000.WAV
data [-36 -51 -82 ...  41  40  39]
sr 384000
/Vol

data [214 212 114 ... -81 -32 -79]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190628_100000.WAV
data [111 110 141 ...  28  11  26]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190628_102000.WAV
data [219 297 279 ... 249  87 166]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190628_104000.WAV
data [ -14   19   50 ...  -61 -156 -187]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190628_105000.WAV
data [2454 2489 2412 ...  965  784  668]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190628_111000.WAV
data [-431 -236  -58 ... -241 -175 -174]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190630_025000.WAV
data [ 276  354  464 ... -139 -170 -217]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190630_034000.WAV
data [ 104   55 -250 ...  -56  -55 -102]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190630_042000.WAV
data [-331 -361 -455 ...  109  156   91]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190630_044000.WAV
data [ 565  610  654 ...  100 -109 -300]
sr 384000
/Vol

data [  13  156  267 ... -151  -86  -21]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_073000.WAV
data [ -76 -123 -106 ...   26  105  184]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_083000.WAV
data [-101 -132  -67 ...  -30  -29   36]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_084000.WAV
data [-104   -7   90 ...  -28  -11  -10]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_094000.WAV
data [ -47  -30  -13 ...  -34 -145 -144]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_095000.WAV
data [ 134  101    4 ... -165 -148 -179]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_115000.WAV
data [-63 -78 -61 ... -29 -12   5]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_124000.WAV
data [ -78 -125  -92 ...  122  105  104]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_135000.WAV
data [245 227 129 ... 102 149 148]
sr 384000
/Volumes/Elements/Mixed_AM_Dataset2/20190707_144000.WAV
data [542 555 568 ... -12 -27 -10]
sr 384000
/Vol

In [5]:
print(shorter_clips)
print(len(shorter_clips))
print(timestamp)
print(len(timestamp))

[array([141,  44, 171, ...,  98, 129, 128], dtype=int16), array([143, 190, 221, ..., 150, 197, 148], dtype=int16), array([115,  98,  65, ...,  20,  83,  66], dtype=int16), array([ 65,  64,  15, ...,   1, -16,  49], dtype=int16), array([ 80,  31,  14, ...,  95,  30, -19], dtype=int16), array([ -82, -113,  -64, ...,   68,   99,  178], dtype=int16), array([ 43,  42, -87, ..., -41,  -8, -23], dtype=int16), array([-38, -21, -20, ...,  48,  79, 126], dtype=int16), array([109, 156, 123, ..., 130, 129,  96], dtype=int16), array([ 63,  46,  29, ..., 134, 229,  99], dtype=int16), array([ 18, -79, -78, ...,  87, 102,   5], dtype=int16), array([  52,   99,   66, ..., -290, -192, -191], dtype=int16), array([ -61,   36,   19, ...,   17,  -16, -111], dtype=int16), array([-142, -157, -108, ..., -114,  -97,  -80], dtype=int16), array([-79, -62, -77, ...,  79,  -2,  -1], dtype=int16), array([-16,  17,   0, ..., -26,   7,   6], dtype=int16), array([37, 52,  3, ..., 32, 47, -2], dtype=int16), array([ 47, 

In [None]:
# Build one flattened feature vector per clip:
# high-pass filter -> mel spectrogram -> RMS normalization -> median filtering.
final_d = []

for clip in shorter_clips:
    print("shorter_clips[i].shape", clip.shape)
    # NOTE(review): 44100 is hard-coded here, but the loading cell reports
    # sr 384000 for the recordings and resampling is disabled - confirm the
    # rate passed to the filter and the spectrogram.
    mel = make_mel_spectrogram(high_pass_filter(clip, 44100), 44100)
    # http://man.hubwiz.com/docset/LibROSA.docset/Contents/Resources/Documents/generated/librosa.feature.rmse.html
    # frame_length=254 presumably makes 254 // 2 + 1 match the number of
    # mel bands - TODO confirm.
    frame_rms = librosa.feature.rms(y=None, S=mel, frame_length = 254)
    denoised = median_denoising(mel / frame_rms)
    # Transpose before flattening so the vector is ordered frame by frame.
    final_d.append(denoised.T.flatten())
#     print(final_d)
    
# print(final_d)
# print(len(final_d))
# print(final_d[0].shape)


shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[

shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[

shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[

shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[

shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[

shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)
shorter_clips[i].shape (3840000,)


# Clustering
Graph elbow method and visualization of clusters

In [None]:
def cluster_plot(pos, title='', without_labels=False, width=20, height=20, savepath=''):
    """
    Scatter-plot the first two columns of pos, coloured by cluster.

    pos: 2-D array with one row per clip; columns 0 and 1 are plotted
    title: plot title
    without_labels: if False, each point is annotated with its row index
    width, height: figure size passed to plt.subplots
    savepath: if not empty, the figure is also saved to this path

    Cluster membership is read from the module-level variable `clusters`,
    which must hold one label per row of pos.
    """
    plt.rc('font', size=15)

    xs, ys = pos[:, 0], pos[:, 1]

    # Label points by row index of pos itself rather than by the length of
    # the global shorter_clips, so the function only depends on its input.
    labels = range(len(xs))

    # Data frame with the 2-D coordinates, the cluster numbers and labels
    cluster_df = pd.DataFrame(dict(x=xs, y=ys, clusters=clusters, labels=labels))

    fig, ax = plt.subplots(figsize=(width, height))
    ax.margins(0.05)
    ax.set_title(title)

    # Axis styling does not depend on the group, so it is set once.
    # Booleans are required here: the string 'off' is truthy on current
    # Matplotlib and would leave the ticks visible. The y axis uses `right`
    # (the original passed `top`, which belongs to the x axis).
    ax.set_aspect('auto')
    ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

    for name, group in cluster_df.groupby('clusters'):
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, mec='none', label=name)

    # A single legend call: a second ax.legend() would replace the first
    # and drop numpoints.
    ax.legend(numpoints=1, bbox_to_anchor=(1.1, 1.05))

    # Add labels
    if not without_labels:
        for row in cluster_df.itertuples(index=False):
            ax.text(row.x, row.y, row.labels, size=10)

    if savepath != '':
        plt.savefig(savepath, dpi=200, bbox_inches="tight")

    plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Pairwise cosine distance between the per-clip feature vectors, then
# PCA-whitening of that distance matrix.
# NOTE(review): n_components cannot exceed the number of clips - confirm 134
# suits the dataset in use.
n_components = 134
dist = 1 - cosine_similarity(final_d)
# `mds` is a PCA object despite its name.
mds = PCA(n_components, random_state=1, whiten=True)
# NOTE(review): scikit-learn documents fit_transform as returning
# (n_samples, n_components); the earlier comment had the axes swapped - confirm.
pos = mds.fit_transform(dist)
X = pos

# Spherical K-means clustering
# https://github.com/jasonlaska/spherecluster
from spherecluster import SphericalKMeans
n_clusters = 134
# One cluster label per row of X; read as a global by cluster_plot.
clusters = SphericalKMeans(n_clusters).fit_predict(X)

#cluster_plot(X, 'PCA Cluster Plot', without_labels=True, savepath='/Volumes/Elements/Madre_de_Dios_Xeno_Canto_Birdcalls/Clusters_XenoCanto/pca.png')

In [None]:
cluster_plot(X, 'PCA Cluster Plot', without_labels=True, savepath='/Volumes/Elements/Madre_de_Dios_Xeno_Canto_Birdcalls/Clusters_XenoCanto/pca_no_downsampling.png')



In [None]:
print(clusters)
print(len(clusters))

### Clustering k-means elbow

In [None]:
from kneed import KneeLocator

# Elbow method: fit spherical k-means for k = 1..99 and record the inertia
# of each fit.
# NOTE(review): the upper bound 100 must not exceed the number of clips in X.
Sum_of_squared_distances = []
x = range(1, 100)
for i in x:
    kmeans = SphericalKMeans(n_clusters = i).fit(X)
    Sum_of_squared_distances.append(kmeans.inertia_)

# Locate the knee of the decreasing, convex inertia curve.
kn = KneeLocator(list(x), Sum_of_squared_distances, curve='convex', direction='decreasing')
print(kn.knee)

# Plot the recorded inertias (the original referenced an undefined name
# `Error` here and raised NameError).
plt.plot(x, Sum_of_squared_distances)
plt.title('Elbow method')
plt.xlabel('No of clusters')
plt.ylabel('Error')
plt.show()

# Cluster Interpretation

In [None]:
def get_clip_indexes(cluster_label, clusters):
    """Return the indexes of all entries of clusters equal to cluster_label."""
    return [i for i, label in enumerate(clusters) if label == cluster_label]

def play_random_clip_in_cluster(index, clusters, option='cluster'):
    """
    Play a random clip from a cluster.

    index: a cluster label when option == 'cluster', or a clip index when
           option == 'clip' (the cluster containing that clip is then used)
    clusters: cluster label of every clip
    option: 'cluster' or 'clip'
    """
    assert option in ('cluster', 'clip')

    # Resolve the cluster to draw from
    cluster_label = find_cluster(index, clusters) if option == 'clip' else index

    members = get_clip_indexes(cluster_label, clusters)
    clip_i = members[random.randint(0, len(members) - 1)]
    print('Playing clip index %d ' % clip_i)
    play_clip(clip_i)
        
# Returns the cluster label that a clip belongs to
def find_cluster(clip_index, clusters):
    """Return clusters[clip_index], the label stored at position clip_index."""
    return clusters[clip_index]

# Plays an audio clip given the clip index
def play_clip(clip_index):
    """
    Start playback of shorter_clips[clip_index] through sounddevice.

    Reads the module-level variables shorter_clips, sd and sample_rate, so
    the cells that define them must have been run first.
    NOTE(review): sample_rate should be the rate the clips were recorded at
    for playback at the original speed - confirm against the loaded data.
    """
    samples = shorter_clips[clip_index]
    sd.play(samples, sample_rate)

def list_flatten(l):
    """Flatten one level of nesting: return the items of every sublist of l, in order."""
    return [item for sublist in l for item in sublist]

def save_clip(filename, wav_array, sr=44100):
    """
    Save a clip as a WAV file.

    filename: output path WITHOUT extension; '.wav' is appended
    wav_array: samples to write
    sr: sampling rate in Hz written to the file header. Defaults to 44100;
        pass the clip's real sampling rate when it differs, otherwise the
        file plays back at the wrong speed.
    """
    write(filename + '.wav', sr, wav_array)
    
def save_clips_to_dir(shorter_clips, dirname):
    """Save every clip to dirname as '<position>.wav', numbered from 0."""
    for position, samples in enumerate(shorter_clips):
        target = dirname + f'/{position}'
        save_clip(target, samples)

In [None]:
# Prints: (Cluster number, count of clips in cluster)
# `random` and `sd` are used by play_random_clip_in_cluster and play_clip,
# so this cell has to run before those functions are called.
from collections import Counter
import random
import sounddevice as sd

Counter(clusters).most_common()

In [None]:
# sample_rate is read as a global by play_clip.
# NOTE(review): the loading cell reports sr 384000 for the recordings and
# resampling is commented out - confirm 44100 is the intended playback rate.
sample_rate = 44100
play_random_clip_in_cluster(0, clusters) # Play random clip in a cluster

In [None]:
sd.stop() # Stop playing

In [None]:
play_clip(10)

In [None]:
sd.stop() # Stop playing

## Save clusters of audio files

#### When not xeno canto:

In [None]:
# List the members of every cluster for datasets whose file names carry no
# species information (cluster_set therefore stays empty).
cluster_list_ordered = []

for i in range(len(np.unique(clusters))):
    cluster_set = set()
    members = get_clip_indexes(i, clusters)
    for j in members:
        cluster_list_ordered.append(i)
        print("get_clip_indexes(i, clusters)", members)
        print("file_names[j]", file_names[j])
        print("len(file_names)", len(file_names))
        name_parts = os.path.splitext(file_names[j])
        print("os.path.splitext(file_names[j])", name_parts)
        print(" os.path.splitext(file_names[j])[0]", name_parts[0])
        print()
    print(cluster_set)

print(file_names)
# cluster_list_ordered is grouped by cluster, not in clip order.
print(cluster_list_ordered)
# print(num_bird_species)   

#### If xeno canto:

In [None]:
# For xeno-canto file names: collect the distinct species names found in
# every cluster. The species is the part of the file name (without
# extension) after the first "-".
num_bird_species = []
cluster_list_ordered = []

for i in range(len(np.unique(clusters))):
    cluster_set = set()
    members = get_clip_indexes(i, clusters)
    for j in members:
        cluster_list_ordered.append(i)
        print("get_clip_indexes(i, clusters)", members)
        print("file_names[j]", file_names[j])
        print("len(file_names)", len(file_names))
        name_parts = os.path.splitext(file_names[j])
        print("os.path.splitext(file_names[j])", name_parts)
        print(" os.path.splitext(file_names[j])[0]", name_parts[0])
        print()
        bird_species = name_parts[0].split("-",1)[1].strip()
        cluster_set.add(bird_species)
    num_bird_species.append(len(cluster_set))
    print(cluster_set)

print(file_names)
# cluster_list_ordered is grouped by cluster, not in clip order.
print(cluster_list_ordered)
# print(num_bird_species)   

## Evaluation Metrics

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

# n_clusters defined above
kmeans = SphericalKMeans(n_clusters).fit(X)
print("Sum of squared distances of samples to their closest cluster center:", kmeans.inertia_)

# NOTE(review): silhouette_score is called with its default metric;
# consider metric='cosine' to match spherical k-means.
print("Silhouette score:", silhouette_score(X, kmeans.labels_))

# davies_bouldin_score computes the Davies-Bouldin index (lower is better),
# not the Dunn index, so the label names the value actually printed.
print("Davies-Bouldin index:", davies_bouldin_score(X, kmeans.labels_))

## Save as .csv

In [None]:
## pandas dataframe ##
import pandas as pd

# One row per clip. file_names and timestamp are in clip order, so the
# cluster column must be too: clusters[k] is the label of clip k.
# cluster_list_ordered is grouped by cluster and would pair each file with
# the wrong label.
full_data = {
    'File name': file_names,
    'Time in clip (seconds)': timestamp,
    'Cluster number': list(clusters)
}

df = pd.DataFrame(full_data, columns = ['File name', 'Time in clip (seconds)', 'Cluster number'])

df.to_csv(r'/Users/yoo-jin/Desktop/073120_export_dataframe_kmeanspherical_xenocanto_notdownsampled.csv',index = False, header=True)

print(df.head())

# Optional (save clip)

In [None]:
dirname = '/Volumes/Elements/Test/Clusters/'
# dirname = '/Volumes/Elements/Test/Clusters/'

# Write the audio of every clip into a folder named after its cluster.
# Iterating over the labels that actually occur avoids empty folders and
# missed clusters when the labels are not contiguous.
for i in np.unique(clusters):
    cluster_dir = dirname + 'cluster_%d' % i
    # exist_ok lets the cell be re-run without failing on existing folders.
    os.makedirs(cluster_dir, exist_ok=True)
    for j in get_clip_indexes(i, clusters):
        # Save the audio samples (shorter_clips), not the flattened feature
        # vector (final_d), which is not a waveform.
        # NOTE(review): clips cut from the same file share file_names[j], so
        # clips of one file that land in the same cluster overwrite each other.
        save_clip(cluster_dir + '/%s' % file_names[j], shorter_clips[j])

## Analyze clusters 
Find the number of distinct bird species in each cluster, and build the list of cluster numbers corresponding to each file name.

In [None]:
# Count the distinct species per cluster. The species is the part of the
# file name (without extension) after the first "-".
num_bird_species = []
cluster_list_ordered = []

for i in range(len(np.unique(clusters))):
    cluster_set = set()
    members = get_clip_indexes(i, clusters)
    for j in members:
        cluster_list_ordered.append(i)
        stem = os.path.splitext(file_names[j])[0]
        bird_species = stem.split("-",1)[1].strip()
        cluster_set.add(bird_species)
    num_bird_species.append(len(cluster_set))
    print(cluster_set)

print(file_names)
# cluster_list_ordered is grouped by cluster, not in clip order.
print(cluster_list_ordered)
# print(num_bird_species)        

In [None]:
# Bar chart: number of distinct bird species found in every cluster
print(len(num_bird_species))

cluster_num = range(len(num_bird_species))
x, y = cluster_num, num_bird_species

# Draw the bars, then add the title and the axis labels
plt.bar(x, y)
plt.title('Number of Bird Species in Every Cluster')
plt.xlabel('Cluster Number')
plt.ylabel('Number of Bird Species')

# Mean species count over all clusters
average_num_birds = sum(num_bird_species)/len(num_bird_species)
print("Average number of birds over clusters:", average_num_birds)

# Visualization

### (Optional) testing convert_mp3_to_wav function with one file
This is an extra step that I made to quickly check and troubleshoot the convert_mp3_to_wav function

In [None]:
# Convert a single recording; overwrite=True guarantees that (x, fs) is
# returned even when the .wav already exists.
dir_path = '/Volumes/Elements/Madre_de_Dios_Xeno_Canto_Birdcalls/'
file_path = os.path.join(dir_path, 'XC431125 - Rufous Twistwing - Cnipodectes superrufus.mp3')
# Alternative test file:
# file_path = dir_path + 'XC91323 - White-eyed Parakeet - Psittacara leucophthalmus.mp3'

x, fs = convert_mp3_to_wav(file_path, overwrite=True)

In [None]:
# Graphs to sanity check the output of convert_mp3_to_wav against an online
# mp3 to wav converter

fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.plot(x)

print(x.shape)

# This file is from a random mp3 to wav convert I found here
# https://online-audio-converter.com/ and then downloaded on Desktop
check_dir_path = '/Users/yoo-jin/Desktop/XC431125 - Rufous Twistwing - Cnipodectes superrufus.wav'
x_test, fs_test = sf.read(check_dir_path)

# Average the channels only when the reference file is multi-channel;
# sf.read returns a 1-D array for mono files and np.mean(axis=1) would raise.
if x_test.ndim > 1:
    x_test = np.mean(x_test, axis = 1)
print(x_test.shape)

ax2.plot(x_test)


### Optional: For K-means clustering
The function, find_optimal_clusters is used to find the optimal number of clusters for K-means clustering. Take a look at this link for more information: https://hlab.stanford.edu/brian/number_of_clusters_.html.

In [None]:
import warnings

# dir_path = '/Volumes/Elements/Madre_de_Dios_Xeno_Canto_Birdcalls/'
dir_path = '/Volumes/Elements/Test/'
# NOTE(review): this variable shadows the imported `samplerate` module that
# resample() relies on; it is kept because a later cell reads it.
samplerate = None
wav_chunks = []
bird_names_and_lengths = []
num_files = 0

# TO DO: deal with warning, don't suppress
# Configured once, before the loop. `warnings` was never imported, so these
# calls used to raise NameError inside the try block and every file was
# reported as "(failed)" after its data had already been appended.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

for file in glob.glob(os.path.join(dir_path, '*.mp3')):
    num_files = num_files+1
    try:
        data, rate = convert_mp3_to_wav(file, overwrite=True)
        # Input only bird species
        bird_species = file.split(' - ')[2][:-4].replace(' ', '_')
    except Exception as e:
        # Best effort: report the file and carry on with the next one.
        print('(failed) ' + file)
        print('\t' + str(e))
    else:
        # Record audio and metadata together so both stay in step.
        samplerate = rate
        wav_chunks.append(data)
        clip_len = len(data) / samplerate
        bird_names_and_lengths.append([bird_species, clip_len])

# Concatenate once at the end. This avoids comparing an ndarray with []
# and re-copying the growing array on every iteration.
wav_data = np.concatenate(wav_chunks) if wav_chunks else np.array([])

print(bird_names_and_lengths)
    
# with warnings.catch_warnings():
#     warnings.simplefilter("ignore")
#     fxn()

### Optional: list all bird species included in the file

In [None]:
# Collect every distinct species name found in the mp3 file names,
# in order of first appearance.
dir_path = '/Volumes/Elements/Test/'
list_of_species = []

for f in glob.glob(os.path.join(dir_path, '*.mp3')):
    # Species = text after the first " - " of the path without extension
    bird_name = os.path.splitext(f)[0].split(" - ", 1)[1].strip()
    if bird_name not in list_of_species:
        list_of_species.append(bird_name)

print(list_of_species)
# print(len(list_of_species))

In [None]:
# Summary of the concatenated audio built in the cell above.
print("shape of wav_data", len(wav_data))
# `sr` is not defined at module level; the sampling rate recorded by the
# concatenation cell is stored in `samplerate`.
print("sample rate", samplerate)
print("type of wav_data", type(wav_data))
print('sample rate = %d' % samplerate)
print(wav_data.shape)
# Total duration in seconds = number of samples / sampling rate
length = wav_data.shape[0] / samplerate
print('length = %.1fs' % length)

In [None]:
# loop through all the cluster folders
from os import listdir
from os.path import isfile, join

# dirname = '/Volumes/Elements/Madre_de_Dios_Xeno_Canto_Birdcalls/Clusters_XenoCanto/'
dirname = '/Volumes/Elements/Test/Clusters/'
directory_list = os.listdir(dirname)

final_bird_list = []   # one set of species per cluster folder
num_bird_species = []  # number of distinct species per cluster folder

for d in directory_list:
    path = dirname + d
    # Skip hidden entries (e.g. .DS_Store) and stray files, so they no longer
    # have to be deleted by hand before running this cell.
    if d.startswith('.') or not os.path.isdir(path):
        continue
    file_list = [f for f in os.listdir(path) if not f.startswith('.')]

    cluster_set = set()
    for f in file_list:
        # NOTE(review): find_bird_species_from_file_name is not defined in
        # this notebook; it is assumed to return a set of species names -
        # confirm where it comes from.
        cluster_set = cluster_set | find_bird_species_from_file_name(f)

    num_bird_species.append(len(cluster_set))
    # Store this folder's species set. Appending num_bird_species itself
    # added the same list object on every iteration.
    final_bird_list.append(cluster_set)

print(num_bird_species)