## Importing Important Libraries and Python Flies

In [1]:
import numpy as np
import audio_metadata as am
import soundfile as sf                                                      
import scipy.io
import glob, os
from scipy import signal
from scipy.fft import fftshift
import matplotlib.pyplot as plt
from IPython.display import Audio
%matplotlib inline

# Import self-implemented functions
from Distortion import distortion, normalize
from Convolution import convolution
from WeinerFilter import wiener

## Dataset Preparation

In [2]:
metadata = []
for filename in os.listdir("./Data"):
    if filename.endswith(('.wav', 'flac', 'mp3')):
        # Extract metadata information from audio files
        d = am.load(os.path.join('./Data', filename))['tags']
        x, sr_d = sf.read(os.path.join('./Data', filename))
        print(filename, len(x))
        # Create segments of 10 seconds from the audio file
        for i in range(0, len(x), (10 * sr_d)):
            if ((i + (10 * sr_d)) >= len(x)):
                data = x[i:]
            else:
                data = x[i : (i + (10 * sr_d))]
            if (len(np.shape(data)) > 1):
                data = np.mean(data, axis = 1)
            # Create 5 copies and apply different effects to each
            for n in range(5):
                # append metadata
                metadata.append([d['title'][0], d['artist'][0], d['album'][0]])
                # save cropped audio
                sf.write(os.path.join('./Data/Train', filename[:-5] + str(i // (10 * sr_d)) + '-' + str(n) + '.wav'), data, sr_d)
                # Choose a random impulse response to convolve the audio signal with
                r = np.random.randint(0, 7)
                for irfilename in os.listdir("./Data/Impulses"):
                    if (irfilename == "{}.wav".format(str(r))):
                        sr, ir = scipy.io.wavfile.read("./Data/Impulses/{}.wav".format(str(r)))
                        print(r, sr)
                        if (len(np.shape(ir)) > 1):
                            ir = np.mean(ir, axis = 1)
                        sf.write(os.path.join('./Data/Impulses/{}.wav'.format(str(r))), ir, sr)
                # Convolve only 4 out of 5 times
                # Also add distortion depending on randomly generated probability
                if (n != 0):
                    convolution(os.path.join('./Data/Train', filename[:-5] + str(i // (10 * sr_d)) + '-' + str(n) + '.wav'), os.path.join('./Data/Impulses', str(r) + '.wav'), os.path.join('./Data/Train', filename[:-5] + str(i // (10 * sr_d)) + '-' + str(n) + '.wav'))
                    sr_d1, y = scipy.io.wavfile.read(os.path.join('./Data/Train', filename[:-5] + str(i // (10 * sr_d)) + '-' + str(n) + '.wav'))
                    # add white noise
                    y = (y + (10 * np.random.random((len(y), ))))
                    # add distortions
                    p_td = np.random.random()
                    p_cd = np.random.random()
                    print(p_td, p_cd)
                    if (p_td > 0.75):
                        print("tanh distortion")
                        y = distortion(x = y, gain = 2e-4, mode = 'tanh')
                    if (p_cd > 0.6):
                        print("clipping distortion")
                        y = distortion(x = y, gain = 4, mode = 'clipping')
                    scipy.io.wavfile.write(os.path.join('./Data/Train', filename[:-5] + str(i // (10 * sr_d)) + '-' + str(n) + '.wav'), sr_d1, y)
print(metadata)

    Ignoring ``TBPM``.
    Numeric text frame values must consist only of digits.
    


01 - Arijit Singh - Tum Hi Ho.wav 11641812
0 44100
2 48000
0.050583259167371275 0.20555625818173595
2 48000
0.08040902494495661 0.47919310177947694
4 48000
0.6256299378517755 0.20844640599674358
1 48000
0.18914038469721017 0.6010949108795447
clipping distortion
2 48000
0 44100
0.7842357616750265 0.978837063677233
tanh distortion
clipping distortion
3 48000
0.9729342978314508 0.6468390578849468
tanh distortion
clipping distortion
2 48000
0.562044277471443 0.403885960754222
4 48000
0.7373822944224337 0.5372669213498573
2 48000
3 48000
0.46826360583006554 0.7151828035262394
clipping distortion
4 48000
0.11144897177007274 0.6626716455251367
clipping distortion
1 48000
0.5071459831772576 0.8696758871816276
clipping distortion
2 48000
0.7088717697862945 0.987766864266225
clipping distortion
5 48000
6 96000
0.16524377656758316 0.744352605640346
clipping distortion


    invalid value encountered in sqrt


3 48000
0.9098534852448127 0.793212954843084
tanh distortion
clipping distortion
1 48000
0.41582047342455064 0.08483968511387052
0 44100
0.7111578193147292 0.870657091153813
clipping distortion
5 48000
1 48000
0.12518898236283516 0.3011634244239685
0 44100
0.4066689027276337 0.27367844599181124
6 96000
0.2535135651421858 0.3140798456042796
0 44100
0.8901458960055474 0.382051131345019
tanh distortion
1 48000
2 48000
0.7571213288614655 0.921511582819672
tanh distortion
clipping distortion
6 96000
0.1810373087411593 0.21295656686488817
3 48000
0.9404001692315583 0.1459575847833109
tanh distortion
3 48000
0.3457941980132937 0.956245287375521
clipping distortion
5 48000
5 48000
0.4684758072825751 0.7651732849418403
clipping distortion
4 48000
0.24715042017735012 0.8293285710454814
clipping distortion
3 48000
0.4109184951963538 0.8833743153369557
clipping distortion
3 48000
0.34280394268203684 0.8275133729309859
clipping distortion
3 48000
0 44100
0.8333551728351349 0.9576285224873484
tanh d

    invalid value encountered in sqrt


0.4391808688226453 0.7320260069594434
clipping distortion
4 48000
0.7487607902896316 0.134580100952869
6 96000
0.731500224232235 0.6473756716961802
clipping distortion
0 44100
3 48000
0.14161516291498022 0.6355355793839632
clipping distortion
4 48000
0.7885733133829724 0.16049000868727858
tanh distortion
1 48000
0.3321171259318052 0.47821806346989537
0 44100
0.5106544310495486 0.07549287202480948
6 96000
1 48000
0.7274079000343892 0.1684668972876906
5 48000
0.00838631654632116 0.11292404669357203
6 96000
0.5288334389941173 0.7851816084350514
clipping distortion
3 48000
0.10974592790001692 0.6938733760763287
clipping distortion
2 48000
5 48000
0.1410449021681962 0.8215565127072733
clipping distortion
3 48000
0.9182748716297597 0.982194534477979
tanh distortion
clipping distortion
2 48000
0.22508116339775697 0.7343354267077162
clipping distortion
5 48000
0.052619422291764195 0.15838399436602613
0 44100
4 48000
0.4797452690111079 0.9235284590431299
clipping distortion
0 44100
0.9838601007

0.45774102230563185 0.9801587309600821
clipping distortion
2 48000
0.4738602253806976 0.248444605622674
6 96000
2 48000
0.5133380395498316 0.9977583255905681
clipping distortion
3 48000
0.33773572298112986 0.16785572799177295
4 48000
0.8038137858830707 0.5201203571695794
tanh distortion
2 48000
0.08518745656749793 0.43592328668831093
6 96000
3 48000
0.18736573037913729 0.370801650398916
1 48000
0.8996264250131331 0.31392786483995216
tanh distortion
3 48000
0.36038015864451645 0.8593866575418205
clipping distortion
0 44100
0.5482181519040099 0.045091387734984
6 96000
6 96000
0.19366625867574794 0.07479385249383541
6 96000
0.43927611749117745 0.17068612602281674
6 96000
0.0926549750400304 0.5924325862379651
4 48000
0.9321736658819045 0.021582868030076696
tanh distortion
5 48000
1 48000
0.10757640639657107 0.2709976105477594
5 48000
0.339155364388519 0.8950044255743898
clipping distortion
6 96000
0.6547213670929329 0.4858150705445836
1 48000
0.0916203305021388 0.6326439707076768
clipping 

2 48000
0.23929191410778117 0.3195686317723089
5 48000
6 96000
0.8599790336871976 0.5060514204004732
tanh distortion
6 96000
0.16908990796517864 0.7252097248635629
clipping distortion
1 48000
0.017771920137395147 0.39591440276166967
5 48000
0.4491512614128509 0.8949552614283974
clipping distortion
1 48000
0 44100
0.8169735616952803 0.6395401396701593
tanh distortion
clipping distortion
5 48000
0.7053266745705369 0.14124259198456757
3 48000
0.15773285695791073 0.798796200170871
clipping distortion
0 44100
0.6773914000355533 0.22108586198860403
0 44100
3 48000
0.4329401295768899 0.5242681367878703
4 48000
0.7749251648903717 0.9138150459593769
tanh distortion
clipping distortion
4 48000
0.2829296441959881 0.7254599631358265
clipping distortion
6 96000
0.23845798273574892 0.9296321028453229
clipping distortion
3 48000
6 96000
0.8408884213980269 0.8865483323967137
tanh distortion
clipping distortion
4 48000
0.28626809245867046 0.08864820398356799
3 48000
0.8043357431145585 0.124266988616522

1 48000
0.5710242596848507 0.34897356323944506
4 48000
0.6201677043423486 0.8056081581095791
clipping distortion
3 48000
4 48000
0.6073790413284073 0.11650577738501966
5 48000
0.6546940450413165 0.0813151428310287
1 48000
0.5687055080884409 0.798677117386151
clipping distortion
2 48000
0.30658875411429864 0.7422807165130075
clipping distortion
4 48000
0 44100
0.8454757673157247 0.7341876543506373
tanh distortion
clipping distortion
3 48000
0.25351962958586327 0.4457351158519939
6 96000
0.5008023200985712 0.9110098256535523
clipping distortion
2 48000
0.9126166705196208 0.3014105382385287
tanh distortion
5 48000
1 48000
0.5129720997805994 0.4066177800565409
1 48000
0.6951748917417565 0.42244464052800745
1 48000
0.9699103450131663 0.31660726586533616
tanh distortion
4 48000
0.8095959849591431 0.6262899639399861
tanh distortion
clipping distortion
2 48000
5 48000
0.08524229681014661 0.630919228841186
clipping distortion
4 48000
0.4318269728774018 0.09727171788588251
1 48000
0.536365835061

    Ignoring ``TBPM``.
    Numeric text frame values must consist only of digits.
    


02 - Tu Hai.wav 10389416
3 48000
5 48000
0.3735903547804539 0.48595820082160857
3 48000
0.9217567809556726 0.972442879063634
tanh distortion
clipping distortion
5 48000
0.6403988266498121 0.8043236217101629
clipping distortion
0 44100
0.4873010468928072 0.23104142914050263
3 48000
1 48000
0.39255955802176534 0.7435633331968053
clipping distortion
5 48000
0.5091784765460604 0.8411923707488309
clipping distortion
2 48000
0.5640240628865401 0.3062862107000317
2 48000
0.6551505862531125 0.10194005821644947
2 48000
1 48000
0.5199343616849958 0.4128013711392099
0 44100
0.5575010486129736 0.1289051887608902
5 48000
0.5941158856990533 0.09600028802205918
3 48000
0.5440871650771163 0.6949725344873077
clipping distortion
6 96000
4 48000
0.5072031714838302 0.0416389757732889
5 48000
0.44441953218762853 0.8407611787419937
clipping distortion
6 96000
0.46185965083091407 0.05295976228621191
1 48000
0.9668620574057784 0.4849334114711279
tanh distortion
4 48000
3 48000
0.11343587361521157 0.4132877892

0.7150980749226032 0.45145958206505943
3 48000
2 48000
0.11303716525022578 0.26426451824912756
3 48000
0.044104324819217666 0.42799772636101086
4 48000
0.16128446809809793 0.7704271087207789
clipping distortion
0 44100
0.08122945645720125 0.9816809171060662
clipping distortion
5 48000
3 48000
0.6821400634121442 0.2997946363363396
5 48000
0.29940508980825065 0.1448849449295817
5 48000
0.5612874003929691 0.5650232825157812
6 96000
0.9153604604775726 0.8439243011213802
tanh distortion
clipping distortion
5 48000
2 48000
0.45183396642972473 0.03890957510161097
3 48000
0.09022084050143031 0.5194417570901649
5 48000
0.10696886769179736 0.8236531223024449
clipping distortion
3 48000
0.6097007243106034 0.8292434964921023
clipping distortion
3 48000
4 48000
0.6728465455948623 0.9478276869624918
clipping distortion
3 48000
0.4862141725035011 0.002019539163986672
3 48000
0.22848757861057478 0.19895494704300276
5 48000
0.5805771427341742 0.3766161570491784
4 48000
3 48000
0.9753923616007146 0.6433

5 48000
0.015575493684874542 0.8308746967612736
clipping distortion
2 48000
0.6787128258420424 0.1840904392976862
3 48000
0.8770516714639564 0.9594924689206045
tanh distortion
clipping distortion
5 48000
0 44100
0.169294242299727 0.5563233046574327
2 48000
0.12877907058035154 0.6550541972867671
clipping distortion
3 48000
0.40010773568646485 0.4345704755580957
0 44100
0.8247290322986921 0.1588682343499318
tanh distortion
[['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho', 'Arijit Singh', 'Aashiqui 2 (OST)'], ['Tum Hi Ho',

In [39]:
print(np.shape(metadata))

(1015, 3)


## Audio Fingerprinting

In [4]:
def DFT(x, N):
    '''
    This function performs the N-point DFT of the 1D signal x.
    
    Returns: N-point DFT of signal x as a 1D numpy array.
    
    x: Given 1D signal.
    N: Size of DFT to be performed on x.
    '''
    
    # find the length of the given signal
    L = len(x)
    # n goes from 0 to (N - 1)
    n = np.arange(N)
    # define k variable
    k = n.reshape((N, 1))
    # pad signal wirh zeros if length of signal is less than number of points of DFT
    if (L < N):
        x = np.append(x, np.zeros(N - L))
    # perform the DFT
    # exponential term in the expression
    e = np.exp(-1j * 2 * np.pi * k * n / N)
    # multiplication with signal and summation
    X = np.dot(e, x[:N])
    # return the DFT signal
    return X

In [5]:
def FFT(x, N):
    '''
    This function performs the N-point FFT of the 1D signal x.
    
    Returns: N-point FFT of signal x as a 1D numpy array.
    
    x: Given 1D signal.
    N: Size of FFT to be performed on x.
    '''
    
    # raise error if N is not a power of 2
    if ((N % 2) > 0):
        raise ValueError("N must be a power of 2")
    # find the length of the given signal
    L = len(x)
    # n goes from 0 to (N - 1)
    n = np.arange(N)
    # pad signal wirh zeros if length of signal is less than number of points of DFT
    if (L < N):
        x = np.append(x, np.zeros(N - L))
    # no more divisions can be done
    if (N <= 2):
        return DFT(x, N)
    x = x[:N]
    # calculate FFT of the even terms
    X_even = FFT(x[::2], int(N / 2))
    # calculate FFT of the odd terms
    X_odd = FFT(x[1::2], int(N / 2))
    # the exponential term to be multiplied to the FFT of the odd term
    e = np.exp(-1j * 2 * np.pi * np.arange(N) / N)
    # combine the terms
    return np.concatenate([(X_even + np.multiply(e[:int(N / 2)], X_odd)), (X_even + np.multiply(e[int(N / 2):], X_odd))])

In [6]:
def IDFT(X, N):
    '''
    This function performs the N-point IDFT of the 1D signal X.
    
    Returns: N-point IDFT of signal X as a 1D numpy array.
    
    X: Given 1D signal.
    N: Size of IDFT to be performed on X.
    '''
    
    # find the length of the given signal
    L = len(X)
    # n goes from 0 to (N - 1)
    n = np.arange(N)
    # define k variable
    k = n.reshape((N, 1))
    # pad signal wirh zeros if length of signal is less than number of points of IDFT
    if (L < N):
        X = np.append(X, np.zeros(N - L))
    # perform the IDFT
    # exponential term in the expression
    e = np.exp(1j * 2 * np.pi * k * n / N)
    # multiplication with signal and summation
    x = np.dot(e, X[:N])
    # return the IDFT signal
    return (x / N)

In [7]:
def IFFT(X, N):
    '''
    This function performs the N-point IFFT of the 1D signal X.
    
    Returns: N-point IFFT of signal X as a 1D numpy array.
    
    X: Given 1D signal.
    N: Size of IFFT to be performed on X.
    '''
    
    # raise error if N is not a power of 2
    if ((N % 2) > 0):
        raise ValueError("N must be a power of 2")
    # find the length of the given signal
    L = len(X)
    # n goes from 0 to (N - 1)
    n = np.arange(N)
    # pad signal wirh zeros if length of signal is less than number of points of IDFT
    if (L < N):
        X = np.append(X, np.zeros(N - L))
    # no more divisions can be done
    if (N <= 2):
        return IDFT(X, N)
    X = X[:N]
    # calculate IFFT of the even terms
    x_even = IFFT(X[::2], int(N / 2))
    # calculate IFFT of the odd terms
    x_odd = IFFT(X[1::2], int(N / 2))
    # the exponential term to be multiplied to the IFFT of the odd term
    e = np.exp(1j * 2 * np.pi * np.arange(N) / N)
    # combine the terms
    return np.concatenate([((x_even + np.multiply(e[:int(N / 2)], x_odd)) / 2), ((x_even + np.multiply(e[int(N / 2):], x_odd)) / 2)])

### Create Acoustic Fingerprints

In [32]:
fingerprints = np.zeros((1015, 300))
song = 0
for filename in os.listdir("./Data/Train"):
    if filename.endswith(('.wav')):
        x, sr_d = sf.read(os.path.join('./Data/Train', filename))
        # Use Weiner filter to remove the noise
        x = weiner(x, 50)
        print(filename, len(x))
        l = len(x)
        n = (l // 30)
        arr = []
        # Segment the audio data into 30 segments
        for i in range(30):
            # Take the Short-term Fourier Transform
            X = FFT(x[(i * n) : ((i + 1) * n)], 1024)
            X_abs = np.absolute(X)
            # Isolate dominant 10 frequencies
            fingerprint = np.array([(np.argpartition(X_abs, -10)[-10:] * (sr_d / 1024))])
            # Append to fingerprints
            arr.append(fingerprint)
        fingerprints[song] = np.array(arr).ravel()
        song = (song + 1)
print(fingerprints)

01 - Arijit Singh - Tum Hi H0-0.wav 441000
01 - Arijit Singh - Tum Hi H0-1.wav 441000
01 - Arijit Singh - Tum Hi H0-2.wav 441000
01 - Arijit Singh - Tum Hi H0-3.wav 441000
01 - Arijit Singh - Tum Hi H0-4.wav 441000
01 - Arijit Singh - Tum Hi H1-0.wav 441000
01 - Arijit Singh - Tum Hi H1-1.wav 441000
01 - Arijit Singh - Tum Hi H1-2.wav 441000
01 - Arijit Singh - Tum Hi H1-3.wav 441000
01 - Arijit Singh - Tum Hi H1-4.wav 441000
01 - Arijit Singh - Tum Hi H10-0.wav 441000
01 - Arijit Singh - Tum Hi H10-1.wav 441000
01 - Arijit Singh - Tum Hi H10-2.wav 441000
01 - Arijit Singh - Tum Hi H10-3.wav 441000
01 - Arijit Singh - Tum Hi H10-4.wav 441000
01 - Arijit Singh - Tum Hi H11-0.wav 441000
01 - Arijit Singh - Tum Hi H11-1.wav 441000
01 - Arijit Singh - Tum Hi H11-2.wav 441000
01 - Arijit Singh - Tum Hi H11-3.wav 441000
01 - Arijit Singh - Tum Hi H11-4.wav 441000
01 - Arijit Singh - Tum Hi H12-0.wav 441000
01 - Arijit Singh - Tum Hi H12-1.wav 441000
01 - Arijit Singh - Tum Hi H12-2.wav 44100

01 - No Time To Die13-4.wav 441000
01 - No Time To Die14-0.wav 441000
01 - No Time To Die14-1.wav 441000
01 - No Time To Die14-2.wav 441000
01 - No Time To Die14-3.wav 441000
01 - No Time To Die14-4.wav 441000
01 - No Time To Die15-0.wav 441000
01 - No Time To Die15-1.wav 441000
01 - No Time To Die15-2.wav 441000
01 - No Time To Die15-3.wav 441000
01 - No Time To Die15-4.wav 441000
01 - No Time To Die16-0.wav 441000
01 - No Time To Die16-1.wav 441000
01 - No Time To Die16-2.wav 441000
01 - No Time To Die16-3.wav 441000
01 - No Time To Die16-4.wav 441000
01 - No Time To Die17-0.wav 441000
01 - No Time To Die17-1.wav 441000
01 - No Time To Die17-2.wav 441000
01 - No Time To Die17-3.wav 441000
01 - No Time To Die17-4.wav 441000
01 - No Time To Die18-0.wav 441000
01 - No Time To Die18-1.wav 441000
01 - No Time To Die18-2.wav 441000
01 - No Time To Die18-3.wav 441000
01 - No Time To Die18-4.wav 441000
01 - No Time To Die19-0.wav 441000
01 - No Time To Die19-1.wav 441000
01 - No Time To Die1

01. Natural3-4.wav 441000
01. Natural4-0.wav 441000
01. Natural4-1.wav 441000
01. Natural4-2.wav 441000
01. Natural4-3.wav 441000
01. Natural4-4.wav 441000
01. Natural5-0.wav 441000
01. Natural5-1.wav 441000
01. Natural5-2.wav 441000
01. Natural5-3.wav 441000
01. Natural5-4.wav 441000
01. Natural6-0.wav 441000
01. Natural6-1.wav 441000
01. Natural6-2.wav 441000
01. Natural6-3.wav 441000
01. Natural6-4.wav 441000
01. Natural7-0.wav 441000
01. Natural7-1.wav 441000
01. Natural7-2.wav 441000
01. Natural7-3.wav 441000
01. Natural7-4.wav 441000
01. Natural8-0.wav 441000
01. Natural8-1.wav 441000
01. Natural8-2.wav 441000
01. Natural8-3.wav 441000
01. Natural8-4.wav 441000
01. Natural9-0.wav 441000
01. Natural9-1.wav 441000
01. Natural9-2.wav 441000
01. Natural9-3.wav 441000
01. Natural9-4.wav 441000
02 - Dirty Walk0-0.wav 441000
02 - Dirty Walk0-1.wav 441000
02 - Dirty Walk0-2.wav 441000
02 - Dirty Walk0-3.wav 441000
02 - Dirty Walk0-4.wav 441000
02 - Dirty Walk1-0.wav 441000
02 - Dirty Wal

02 - Tu Ha2-4.wav 441000
02 - Tu Ha20-0.wav 441000
02 - Tu Ha20-1.wav 441000
02 - Tu Ha20-2.wav 441000
02 - Tu Ha20-3.wav 441000
02 - Tu Ha20-4.wav 441000
02 - Tu Ha21-0.wav 441000
02 - Tu Ha21-1.wav 441000
02 - Tu Ha21-2.wav 441000
02 - Tu Ha21-3.wav 441000
02 - Tu Ha21-4.wav 441000
02 - Tu Ha22-0.wav 441000
02 - Tu Ha22-1.wav 441000
02 - Tu Ha22-2.wav 441000
02 - Tu Ha22-3.wav 441000
02 - Tu Ha22-4.wav 441000
02 - Tu Ha23-0.wav 246416
02 - Tu Ha23-1.wav 246416
02 - Tu Ha23-2.wav 246416
02 - Tu Ha23-3.wav 246416
02 - Tu Ha23-4.wav 246416
02 - Tu Ha3-0.wav 441000
02 - Tu Ha3-1.wav 441000
02 - Tu Ha3-2.wav 441000
02 - Tu Ha3-3.wav 441000
02 - Tu Ha3-4.wav 441000
02 - Tu Ha4-0.wav 441000
02 - Tu Ha4-1.wav 441000
02 - Tu Ha4-2.wav 441000
02 - Tu Ha4-3.wav 441000
02 - Tu Ha4-4.wav 441000
02 - Tu Ha5-0.wav 441000
02 - Tu Ha5-1.wav 441000
02 - Tu Ha5-2.wav 441000
02 - Tu Ha5-3.wav 441000
02 - Tu Ha5-4.wav 441000
02 - Tu Ha6-0.wav 441000
02 - Tu Ha6-1.wav 441000
02 - Tu Ha6-2.wav 441000
02 - 

1. Calum Scott - Biblical2-3.wav 441000
1. Calum Scott - Biblical2-4.wav 441000
1. Calum Scott - Biblical20-0.wav 441000
1. Calum Scott - Biblical20-1.wav 441000
1. Calum Scott - Biblical20-2.wav 441000
1. Calum Scott - Biblical20-3.wav 441000
1. Calum Scott - Biblical20-4.wav 441000
1. Calum Scott - Biblical21-0.wav 441000
1. Calum Scott - Biblical21-1.wav 441000
1. Calum Scott - Biblical21-2.wav 441000
1. Calum Scott - Biblical21-3.wav 441000
1. Calum Scott - Biblical21-4.wav 441000
1. Calum Scott - Biblical22-0.wav 426598
1. Calum Scott - Biblical22-1.wav 426598
1. Calum Scott - Biblical22-2.wav 426598
1. Calum Scott - Biblical22-3.wav 426598
1. Calum Scott - Biblical22-4.wav 426598
1. Calum Scott - Biblical3-0.wav 441000
1. Calum Scott - Biblical3-1.wav 441000
1. Calum Scott - Biblical3-2.wav 441000
1. Calum Scott - Biblical3-3.wav 441000
1. Calum Scott - Biblical3-4.wav 441000
1. Calum Scott - Biblical4-0.wav 441000
1. Calum Scott - Biblical4-1.wav 441000
1. Calum Scott - Biblical

In [40]:
print(np.shape(fingerprints))

(1015, 300)


Now, these fingerprints will serve as inputs and metadata will serve as outputs for our ML model.

### Create the Dataframe for Training

In [33]:
import pandas as pd

In [36]:
# Normalize the inputs
fing = ((fingerprints - np.min(fingerprints)) * (1 / (np.max(fingerprints) - np.min(fingerprints))))

In [79]:
dfI = pd.DataFrame(fing, columns = ['I{}'.format(i) for i in range(np.shape(fing)[1])])
dfO = pd.DataFrame(metadata, columns = ['O{}'.format(i) for i in range(np.shape(metadata)[1])])
df = pd.concat([dfI, dfO], axis = 1)
df.head()

Unnamed: 0,I0,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,I293,I294,I295,I296,I297,I298,I299,O0,O1,O2
0,0.154023,0.154921,0.157166,0.151329,0.153574,0.153125,0.152676,0.152227,0.151778,0.459375,...,0.454885,0.454435,0.00449,0.005838,0.005389,0.004041,0.453986,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
1,0.000898,0.458926,0.458477,0.001347,0.00449,0.009879,0.454885,0.00494,0.449945,0.455334,...,0.455783,0.453986,0.00449,0.455334,0.005389,0.005838,0.454435,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
2,0.000898,0.458926,0.458477,0.001347,0.00449,0.009879,0.454885,0.00494,0.449945,0.455334,...,0.455783,0.453986,0.00449,0.455334,0.005389,0.005838,0.454435,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
3,0.004041,0.455783,0.456232,0.00449,0.454885,0.005389,0.003592,0.00494,0.454435,0.455334,...,0.453986,0.455783,0.004041,0.454435,0.005389,0.00449,0.455334,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
4,0.00449,0.455334,0.449496,0.010328,0.015268,0.444556,0.005389,0.454435,0.010777,0.449047,...,0.017513,0.442311,0.0,0.016615,0.443209,0.443658,0.016166,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)


In [80]:
dfO

Unnamed: 0,O0,O1,O2
0,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
1,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
2,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
3,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
4,Tum Hi Ho,Arijit Singh,Aashiqui 2 (OST)
...,...,...,...
1010,Biblical,Calum Scott,Bridges
1011,Biblical,Calum Scott,Bridges
1012,Biblical,Calum Scott,Bridges
1013,Biblical,Calum Scott,Bridges


In [81]:
# One-Hot-Encode the outputs
one_hot0 = pd.get_dummies(dfO['O0'])
one_hot1 = pd.get_dummies(dfO['O1'])
one_hot2 = pd.get_dummies(dfO['O2'])
dfO = dfO.drop('O0', axis = 1)
dfO = dfO.drop('O1', axis = 1)
dfO = dfO.drop('O2', axis = 1)
dfO = dfO.join(one_hot0, lsuffix = 'title_', rsuffix = 'title_')
dfO = dfO.join(one_hot1, lsuffix = 'artist_', rsuffix = 'artist_')
dfO = dfO.join(one_hot2, lsuffix = 'album_', rsuffix = 'album_')
dfO.head()

Unnamed: 0,24K Magicalbum_,Biblical,Dirty Walk,Fadedalbum_,Get Ready,Natural,No Time To Diealbum_,The Lonely Pine-Tree,Tu Hai,Tum Hi Ho,...,Imagine Dragons,24K Magicalbum_.1,Aashiqui 2 (OST),Birdman (Alejandro González Iñárritu's Original Motion Picture Soundtrack),Bridges,Fadedalbum_.1,From My Book of Melodies,Mohenjo Daro,No Time To Diealbum_.1,Origins (Deluxe)
0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [188]:
print(dfO.columns.values.tolist())

['24K Magicalbum_', 'Biblical', 'Dirty Walk', 'Fadedalbum_', 'Get Ready', 'Natural', 'No Time To Diealbum_', 'The Lonely Pine-Tree', 'Tu Hai', 'Tum Hi Ho', 'A.R. Rahman, Sanah Moidutty', 'Alan Walker', 'Alma Deutscher', 'Antonio Sanchez', 'Arijit Singh', 'Billie Eilish', 'Bruno Mars', 'Calum Scott', 'Imagine Dragons', '24K Magicalbum_', 'Aashiqui 2 (OST)', "Birdman (Alejandro González Iñárritu's Original Motion Picture Soundtrack)", 'Bridges', 'Fadedalbum_', 'From My Book of Melodies', 'Mohenjo Daro', 'No Time To Diealbum_', 'Origins (Deluxe)']


### Create Artificial Neural Network

In [62]:
import tensorflow as tf
from tensorflow import keras

In [86]:
tf.convert_to_tensor(dfI)

<tf.Tensor: shape=(1015, 300), dtype=float64, numpy=
array([[0.15402309, 0.15492119, 0.15716642, ..., 0.00538856, 0.00404142,
        0.45398644],
       [0.00089809, 0.45892595, 0.45847691, ..., 0.00538856, 0.00583761,
        0.45443548],
       [0.00089809, 0.45892595, 0.45847691, ..., 0.00538856, 0.00583761,
        0.45443548],
       ...,
       [0.02020711, 0.43961694, 0.01032808, ..., 0.44051503, 0.03278043,
        0.01930902],
       [0.00269428, 0.45712977, 0.00134714, ..., 0.45847691, 0.00134714,
        0.00269428],
       [0.45712977, 0.45847691, 0.00269428, ..., 0.00089809, 0.45892595,
        0.459375  ]])>

In [87]:
tf.convert_to_tensor(dfO)

<tf.Tensor: shape=(1015, 28), dtype=uint8, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=uint8)>

In [90]:
normalizer = tf.keras.layers.Normalization(axis = -1)
normalizer.adapt(dfI)

In [162]:
def get_basic_model():
    model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(300, activation = 'relu'),
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dense(256, activation = 'relu'),
    tf.keras.layers.Dense(256, activation = 'relu'),
    tf.keras.layers.Dense(128, activation = 'relu'),
    tf.keras.layers.Dense(64, activation = 'relu'),
    tf.keras.layers.Dense(32, activation = 'relu'),
    tf.keras.layers.Dense(28)
    ])

    model.compile(optimizer = 'adam',
                loss = tf.keras.losses.BinaryCrossentropy(from_logits = True),
                metrics = ['accuracy'])
    return model

In [163]:
model = get_basic_model()
model.summary()
model.fit(dfI, dfO, epochs = 200, batch_size = 50)

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization_4 (Normalizat  (None, 300)              601       
 ion)                                                            
                                                                 
 dense_142 (Dense)           (None, 300)               90300     
                                                                 
 dense_143 (Dense)           (None, 128)               38528     
                                                                 
 dense_144 (Dense)           (None, 256)               33024     
                                                                 
 dense_145 (Dense)           (None, 256)               65792     
                                                                 
 dense_146 (Dense)           (None, 128)               32896     
                                                     

Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x1968a790a60>

### Create Inference Function

In [164]:
ip = dfI.iloc[0].to_numpy()
op = dfO.iloc[0].to_numpy()
acc, loss = model.evaluate(np.reshape(ip, (1, -1)), np.reshape(op, (1, -1)))



In [171]:
ip = dfI.iloc[0].to_numpy()
op = dfO.iloc[0].to_numpy()
y_hat = model.predict(dfI)



In [187]:
y_norm = np.absolute(y_hat) / (np.max(y_hat) - np.min(y_hat))
pred = np.array(y_norm[(np.argpartition(y_norm, -3)[-3:])])
print(np.array([(np.argpartition(pred[0][0], -3)[-3:])]))

[[16 19  0]]


Here, we can see that we get inference outputs as indices 0, 16 and 19, which correspond to the outputs:

1. <b>Title:</b> 24K Magic
2. <b>Artist:</b> Bruno Mars
3. <b>Album:</b> 24K Magic

In [183]:
x, fs = sf.read(os.path.join('./Data', 'audio.wav'))
print(filename, len(x))
signal = x[:(10 * fs)]
l = len(x)
n = (l // 30)
arr = []
for i in range(30):
    X = FFT(x[(i * n) : ((i + 1) * n)], 1024)
    X_abs = np.absolute(X)
    fingerprint = np.array([(np.argpartition(X_abs, -10)[-10:] * (sr_d / 1024))])
    arr.append(fingerprint)
fing = ((arr - np.min(arr)) * (1 / (np.max(arr) - np.min(arr))))
model.evaluate(np.reshape(fing, (1, -1)))

1. Calum Scott - Biblical9-4.wav 441000


[0.0, 0.0]

Link for GitHub repo: [link](https://github.com/PrashantP2k/Robust-Music-Recognition)