# Part 1

## 1.1

In [1]:
#part1
import numpy as np      
import matplotlib.pyplot as plt 
import scipy.io.wavfile 
import subprocess
import librosa
import librosa.display
import IPython.display as ipd

from pathlib import Path, PurePath   
from tqdm.notebook import tqdm

import random
import pandas as pd
import os
import difflib
from scipy.spatial import distance


#part3
from random import randint

### Utility functions

In [2]:
def convert_mp3_to_wav(audio:str) -> str:
    """Convert an input MP3 audio track into a WAV file.

    The WAV file is written next to the MP3, with the trailing "mp3" replaced
    by "wav". If that file already exists, ffmpeg is not invoked again.

    Args:
        audio (str): Path of an input audio track.

    Returns:
        str: The WAV filename when ``audio`` ends in "mp3"; otherwise ``audio``
        is returned unchanged.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ffmpeg executable cannot be found.
    """
    if audio[-3:] != "mp3":
        return audio

    wav_audio = audio[:-3] + "wav"
    if not Path(wav_audio).exists():
        # Pass an argument list instead of a shell string: paths containing
        # spaces or shell metacharacters reach ffmpeg intact and are never
        # interpreted by a shell.
        subprocess.check_output(["ffmpeg", "-i", audio, wav_audio])
    return wav_audio

def plot_spectrogram_and_picks(track:np.ndarray, sr:int, peaks:np.ndarray, onset_env:np.ndarray) -> None:
    """Plot the onset strength envelope with its selected peaks above a spectrogram.

    The top panel shows the onset strength over time with a vertical line at
    every selected peak; the bottom panel shows the log-frequency spectrogram
    of the track. Both panels share the time axis. Frame indices are converted
    to seconds using the module-level ``HOP_SIZE``.

    Args:
        track (np.ndarray): Audio time series.
        sr (int): Sampling rate.
        peaks (np.ndarray): Indices of the selected peaks (positions in ``onset_env``).
        onset_env (np.ndarray): Vector containing the onset strength envelope.
    """
    frame_ids = np.arange(len(onset_env))
    times = librosa.frames_to_time(frame_ids, sr=sr, hop_length=HOP_SIZE)

    plt.figure()

    # Bottom panel: spectrogram in dB relative to the maximum magnitude.
    spectrogram_ax = plt.subplot(2, 1, 2)
    magnitude = np.abs(librosa.stft(track))
    spectrogram_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    librosa.display.specshow(spectrogram_db, y_axis='log', x_axis='time')

    # Top panel: onset envelope and picked peaks, sharing the time axis.
    plt.subplot(2, 1, 1, sharex=spectrogram_ax)
    plt.plot(times, onset_env, alpha=0.8, label='Onset strength')
    peak_times = times[peaks]
    plt.vlines(peak_times, 0, onset_env.max(),
               color='r', alpha=0.8, label='Selected peaks')
    plt.legend(frameon=True, framealpha=0.8)
    plt.axis('tight')
    plt.tight_layout()
    plt.show()

def load_audio_picks(audio, duration, hop_size):
    """Load an audio track and pick the peaks of its onset strength envelope.

    Args:
        audio (string, int, pathlib.Path or file-like object): Audio source, passed to ``librosa.load``.
        duration (int): Passed to ``librosa.load`` as ``duration`` (how much of the track to load).
        hop_size (int): Hop length used to compute the onset strength envelope.

    Returns:
        tuple: Returns the audio time series (track) and sampling rate (sr), a vector containing the onset strength envelope
        (onset_env), and the indices of peaks in onset_env (peaks).

    Raises:
        Exception: Any error raised while loading or analysing the track is
            reported on stdout and then re-raised unchanged.
    """
    try:
        track, sr = librosa.load(audio, duration=duration)
        # Keyword arguments are used throughout: recent librosa releases accept
        # these parameters by keyword only, and older ones accept both forms.
        onset_env = librosa.onset.onset_strength(y=track, sr=sr, hop_length=hop_size)
        peaks = librosa.util.peak_pick(onset_env, pre_max=10, post_max=10,
                                       pre_avg=10, post_avg=10, delta=0.5, wait=0.5)
    except Exception as e:
        print('An error occurred processing ', str(audio))
        print(e)
        # Re-raise: there is no valid result to return, and falling through
        # would only fail later on unbound local variables.
        raise

    return track, sr, onset_env, peaks

### Settings

In [3]:
N_TRACKS = 1413  # expected number of tracks; used as the progress-bar total during MP3 -> WAV conversion
HOP_SIZE = 512  # hop length used for the onset strength envelope and for frame -> time conversion
DURATION = 30  # passed to load_audio_picks as the amount of each track to load (presumably seconds; confirm against librosa.load)
THRESHOLD = 0 # TODO: to be tuned!

In [4]:
# Root folder of the dataset; audio files are matched two directory levels below it ("*/*/<file>").
data_folder = Path("Part1/archive/mp3s-32k/")
# Path.glob returns lazy, single-use generators: the folder is only scanned when they are
# iterated, and each of them can be consumed exactly once.
mp3_tracks = data_folder.glob("*/*/*.mp3")
tracks = data_folder.glob("*/*/*.wav") 

### Preprocessing

In [5]:
# Convert every MP3 found under data_folder to WAV; convert_mp3_to_wav skips files whose WAV already exists.
# total=N_TRACKS is passed explicitly because mp3_tracks is a generator, so tqdm cannot read its length.
for track in tqdm(mp3_tracks, total=N_TRACKS):
    convert_mp3_to_wav(str(track))

  0%|          | 0/1413 [00:00<?, ?it/s]

### Store peaks values and tracks titles

In [6]:
all_peaks=[]  # all_peaks[j]: peak indices of song j
titles=[]     # titles[j]: title of song j (path relative to data_folder, without the .wav suffix)

# List the WAV files here, sorted: the cell can then be re-run (a glob generator
# is exhausted after one pass) and song indices are stable across runs.
tracks=sorted(data_folder.glob("*/*/*.wav"))

# `with` closes titles.txt even if one of the tracks fails to load.
with open('titles.txt','w') as t:
    for audio in tqdm(tracks):
        track, sr, onset_env, peaks = load_audio_picks(audio, DURATION, HOP_SIZE)
        all_peaks.append(peaks)
        # Derive the title from the path itself instead of stripping a hard-coded
        # prefix string; as_posix() keeps '/' separators on every platform.
        title=audio.relative_to(data_folder).with_suffix('').as_posix()
        titles.append(title)
        t.write(title+'\n')

0it [00:00, ?it/s]

## 1.2

### Create our Database

Create a dictionary whose keys are all the peak values found across the dataset, with every value set to 0. We will use it to map the peaks previously stored for each song into the space $\mathbb{R}^{k}$, where $k$ is the total number of distinct peaks. Each resulting vector contains only 0's and 1's, each entry indicating whether the corresponding peak appears in that song (0 = False, 1 = True).

In [7]:
# Vocabulary of peaks: one key per distinct peak value seen in any song, all mapped to 0.
# Keys keep the order in which the peaks are first encountered.
dic={peak: 0 for song_peaks in all_peaks for peak in song_peaks}

The following function maps a peaks array (representing the peaks of a single song) into the space $\mathbb{R}^{k}$ and returns the resulting vector as a numpy array.

In [8]:
def transform01(array,dic):
    """Map the peaks of one song onto a 0/1 vector over the peak vocabulary.

    Args:
        array: Peaks of a single song.
        dic (dict): Vocabulary whose keys are all known peaks. It is not
            modified; only its keys and their order are used.

    Returns:
        np.ndarray: One entry per key of ``dic``, in the same order; 1 if the
        peak occurs in ``array`` and 0 otherwise.
    """
    # Work on a fresh all-zero copy so repeated calls never see each other's marks.
    indicator=dict.fromkeys(dic, 0)

    for peak in array:
        # Peaks outside the vocabulary are ignored: adding them as new keys
        # would make this vector longer than, and misaligned with, the others.
        if peak in indicator:
            indicator[peak]=1

    return np.asarray(list(indicator.values()))

In [9]:
# peaks_01[j] is the 0/1 vector of song j over the peak vocabulary `dic`.
peaks_01=[transform01(song_peaks,dic) for song_peaks in all_peaks]

In [10]:
print(len(peaks_01),len(peaks_01[0])) # (number of songs, k); the recorded run shows 1413 vectors of length k=1287

1413 1287


Now that a 01 vector has been obtained for each song, it is used to produce a signature as follows.\
Repeat $n_{permutation}=300$ times:

1.   Perform a random shuffling over the elements of the 01 vector;
1.   Store the index of the first "1" element into the signature for the song;

By doing this we reduce the dimension of each vector from $k=1287$ to $n_{permutation}=300$, which allows for faster operations on the vectors.

In [11]:
def first1(array):
    """Return the index of the first element equal to 1 in a np.array.

    Raises:
        IndexError: If no element of ``array`` equals 1.
    """
    one_positions = np.where(array==1)[0]
    return one_positions[0]

In [12]:
def minHash(array,n_perms=300):
    """Compute the signature of a 0/1 vector from ``n_perms`` random shuffles.

    The vector is shuffled ``n_perms`` times in a row; after every shuffle the
    index of the first element equal to 1 is recorded.

    The global ``random`` generator is re-seeded with 42 on every call, so
    vectors of the same length always go through the same sequence of
    shuffles, which is what makes their signatures comparable.

    Args:
        array: 0/1 vector of one song. It is left unmodified.
        n_perms (int): Number of shuffles, i.e. the signature length.

    Returns:
        list: ``n_perms`` indices, one per shuffle.
    """
    random.seed(42)

    # Shuffle a private copy: random.shuffle works in place and would
    # otherwise scramble the caller's stored vector.
    shuffled=np.array(array)

    indexes=[]
    for _ in range(n_perms):
        random.shuffle(shuffled)
        indexes.append(first1(shuffled))
    return indexes

In [13]:
# Apply minHash to every 0/1 vector. new_indexes is a list of np.array:
# new_indexes[j] is the signature of song j, i.e. column j of the signature matrix.
# 300 permutations are used to reduce the dimension by a factor of ~4.
new_indexes=[
    np.asarray(minHash(song_vector, n_perms=300))
    for song_vector in tqdm(peaks_01)
]

  0%|          | 0/1413 [00:00<?, ?it/s]

In [14]:
print(len(new_indexes[0]),len(new_indexes)) # (signature length, number of songs); the recorded run shows 1413 signatures of length 300

300 1413


So now we have the list 'new_indexes', which contains the columns of the signature matrix; the matrix has shape (300, 1413).

### Create Buckets for faster matching with queries

In this section we divide the signature matrix into bands, each band containing $bandwidth=10$ rows. Then we map each band through a hash function to a value $v \in \{0, \dots, 1422\}$ (the hash is taken modulo the prime 1423). The results are stored in a dictionary in which $data[v]$ is the list of indices of the songs that have a band hashing to $v$; we will use these indices to retrieve the songs' titles later on when trying to match queries.

In [15]:
# Hash function used to map a band to a bucket; the modulus is the prime number 1423.
def rands(n,prime):
    """Return ``n`` pseudo-random integer coefficients in ``[0, prime-1]``.

    The global ``random`` generator is re-seeded with 42 on every call, so the
    same ``n`` and ``prime`` always yield the same coefficients.
    """
    random.seed(42)
    return [random.randint(0, prime-1) for _ in range(n)]

def hashf(A,a,prime):
    """Hash the values ``A`` with coefficients ``a``: the sum of the element-wise products, modulo ``prime``."""
    coefficients=np.asarray(a)
    values=np.asarray(A)
    weighted_total=np.multiply(coefficients, values).sum()
    return weighted_total%prime

def mapHash(A,bandwidth=10):
    """Hash one band ``A`` of a signature to a bucket key.

    ``bandwidth`` coefficients are generated with ``rands`` and combined with
    ``A`` by ``hashf``, using the prime 1423 as modulus.
    """
    prime=1423
    coefficients=rands(bandwidth,prime)
    return hashf(A,coefficients,prime)

In [16]:
def Bucket1(array, data, bandwidth=10):
    """Fill ``data`` with the buckets of every signature in ``array``.

    Each signature is cut into consecutive bands of ``bandwidth`` values; every
    band is hashed with ``mapHash`` and the index of the signature is appended
    to the list stored under that hash value.

    Args:
        array: Sequence of signatures; ``array[j]`` is the signature of song j.
        data (dict): Bucket dictionary, updated in place.
        bandwidth (int): Number of signature values per band.

    Returns:
        dict: The same ``data`` object, mapping hash value -> list of song indices.
    """
    for song_index in tqdm(range(len(array))):
        signature=array[song_index]
        for band_start in range(0,len(signature),bandwidth):
            band=signature[band_start:band_start+bandwidth]
            key=mapHash(band,bandwidth)
            # setdefault creates the bucket on first use, then appends either way.
            data.setdefault(key,[]).append(int(song_index))
    return data

In [17]:
# Bucket dictionary: hash value of a signature band -> list of indices of the songs
# having a band with that hash value. It starts empty and is filled by Bucket1.
data=Bucket1(new_indexes,{},bandwidth=10)

  0%|          | 0/1413 [00:00<?, ?it/s]

### Prepare to compare queries with dataset

Define a similarity score to compare the query with the elements in our dataset and a function to retrieve the title of a song given its index.

In [18]:
def sim_score(a1,a2):
    """Return how similar two sequences are, as the ``difflib.SequenceMatcher`` ratio (a float between 0 and 1)."""
    matcher=difflib.SequenceMatcher(a=a1,b=a2)
    return matcher.ratio()

    
def retrieveTitle(index):
    """Return the title of a song given its index.

    The title is read from ``titles.txt`` in the current working directory,
    which holds one title per line. The line is returned exactly as stored,
    including its trailing newline.

    Args:
        index (int): Index of the song, i.e. the position of its line in the file.

    Returns:
        str: The line at position ``index``.

    Raises:
        IndexError: If the file has no line at position ``index``.
    """
    # `with` closes the file even when reading or indexing raises.
    with open('titles.txt') as f:
        return f.readlines()[index]

For each of the query songs we do the following:
1. Get the values of its peaks
1. Transform them into a 01 vector
1. Apply minHash to retrieve a vector we called $sign\_colq$
1. Map to buckets
1. Compare the vector $sign\_colq$ with the possible vectors saved in the buckets (actually their index $index\,song$ is saved)
1. Pick the song with highest similarity value

In [19]:
# Query clips to identify against the database built above.
q_tracks=[f'Part1/queries/track{n}.wav' for n in range(1, 11)]
bandwidth=10  # must equal the bandwidth used when `data` was filled by Bucket1

for q_track in q_tracks:
    track, sr, onset_env, peakis = load_audio_picks(q_track, DURATION, HOP_SIZE) #point 1
    q_01=transform01(peakis,dic)                                                 #point 2
    sign_colq=minHash(q_01,n_perms=300)                                          #point 3

    # point 4: hash every band of the query signature to its bucket key
    keys=[mapHash(sign_colq[i:i+bandwidth],bandwidth)
          for i in range(0,len(sign_colq),bandwidth)]

    # point 5: gather the songs sharing at least one bucket with the query.
    # dict.fromkeys removes duplicates while keeping first-seen order, so every
    # candidate is scored once; buckets the database never filled are skipped.
    candidates=dict.fromkeys(value for key in keys for value in data.get(key,()))

    query_signature=np.asarray(sign_colq)
    best=-1   # reset for every query so a previous match can never leak into this one
    score=-1
    for value in candidates:
        similarity=sim_score(query_signature,new_indexes[value])
        if similarity>score:
            best=value
            score=similarity

    if best!=-1:                                                                 #point 6
        print(('Requested song of track {} should be {}').format(q_track.replace('Part1/queries/','').replace('.wav','')
                                                                 ,retrieveTitle(best)))
    else:
        print('something went wrong...')

Requested song of track track1 should be aerosmith/Aerosmith/03-Dream_On

Requested song of track track2 should be queen/The_Works/06-I_Want_To_Break_Free

Requested song of track track3 should be u2/October/07-October

Requested song of track track4 should be beatles/The_White_Album_Disc_1/04-Ob-La-Di_Ob-La-Da

Requested song of track track5 should be radiohead/OK_Computer/06-Karma_Police

Requested song of track track6 should be led_zeppelin/Led_Zeppelin_II/05-Heartbreaker

Requested song of track track7 should be fleetwood_mac/Rumours/05-Go_Your_Own_Way

Requested song of track track8 should be green_day/American_Idiot/01-American_Idiot

Requested song of track track9 should be depeche_mode/Some_Great_Reward/06-Somebody

Requested song of track track10 should be steely_dan/Katy_Lied/01-Black_Friday



# Part 3

You are given a list of integers, A, and another integer s. Write an algorithm that outputs all the pairs in A whose sum equals s.
For example, if
A = [7, -2, 8, 2, 6, 4, -7, 2, 1, 3, -3] and s = 4
the algorithm should output: (7, -3), (-2, 6), (2, 2), (3, 1).

The easy way to solve the problem would be to implement a brute-force method, which has complexity $O(n^2)$.\
We implemented an algorithm that first sorts the list ($O(n \log n)$) and then sweeps it once with two indexes (more details in the code below), solving the problem in $O(n \log n)$ overall.

In [20]:
def findPairs(A,s):
    """Return the pairs of elements of ``A`` whose sum equals ``s``.

    The list is sorted (O(n log n)) and then swept once with two indexes moving
    towards each other (O(n)). Every element is used in at most one pair.

    Args:
        A (list): Integers to search. The list itself is not modified.
        s (int): Target sum.

    Returns:
        list: Tuples ``(smaller, larger)`` with ``smaller + larger == s``,
        ordered by increasing ``smaller``.
    """
    # The sweep below is only correct on sorted data; sorting a copy also
    # leaves the caller's list untouched.
    ordered=sorted(A)
    pairs=[]
    start=0
    end=len(ordered)-1

    while start<end:
        total=ordered[start]+ordered[end]
        if total==s:
            pairs.append((ordered[start],ordered[end]))
            # Both elements are consumed by this pair, so move both indexes.
            start+=1
            end-=1
        elif total<s:
            start+=1   # sum too small: take a larger left element
        else:
            end-=1     # sum too large: take a smaller right element

    return pairs

In [21]:
N=10
for i in range(5):
    A=[]
    if i!=0: N*=10
    for i in range(N):
        A.append(randint(-50, 50))
    print(N)
    %timeit pairs=findPairs(A,10)

10
1.28 µs ± 21 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
100
12.5 µs ± 436 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
1000
137 µs ± 2.13 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
10000
1.43 ms ± 56.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
100000
13.7 ms ± 81.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
