In [None]:
import torch
import torchaudio
import os
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, Audio
import warnings
warnings.filterwarnings('ignore')

# Target sample rate in Hz. Audio loaded at any other rate is resampled to
# this value, and the mel spectrogram is computed at this rate.
SAMPLE_RATE = 16000

# Function to convert AAC to WAV and save in a new folder
def convert_aac_to_wav(aac_file_path):
    """Transcode an audio file to WAV with ffmpeg and return the WAV path.

    The output is written to ``./wav_files`` (created on demand) under the
    input's base name with its extension replaced by ``.wav``. An existing
    file of that name is overwritten (ffmpeg is invoked with ``-y``).

    Args:
        aac_file_path: Path of the source audio file. Despite the parameter
            name, callers in this file also pass ``.wav`` and ``.mp3`` paths.

    Returns:
        Path of the WAV file that was written.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    import subprocess

    output_folder = "./wav_files"  # New folder to save WAV files
    os.makedirs(output_folder, exist_ok=True)

    # splitext copes with extensions of any length (or none); slicing off the
    # last three characters only produced a sane name for 3-letter extensions.
    stem = os.path.splitext(os.path.basename(aac_file_path))[0]
    output_wav_path = os.path.join(output_folder, stem + ".wav")

    completed = subprocess.run(
        ['ffmpeg', '-y', '-i', aac_file_path, output_wav_path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
    )
    if completed.returncode != 0:
        # Fail here rather than letting the caller trip over a missing WAV or
        # silently load a stale one left behind by an earlier run.
        details = completed.stderr.decode(errors="replace").strip()
        raise RuntimeError(
            f"ffmpeg failed to convert {aac_file_path!r} "
            f"(exit code {completed.returncode}): {details[-500:]}"
        )
    return output_wav_path

# Function to generate audio embedding
def generate_audio_embedding(audio_path):
    """Return a 1-D mel-spectrogram embedding for a single audio file.

    The file is first converted to WAV, loaded, resampled to SAMPLE_RATE if
    needed, and mixed down to one channel. Its mel spectrogram is converted
    to decibels and averaged over the last axis.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The embedding as a NumPy array.
    """
    # torchaudio is given a WAV copy of the file rather than the original.
    wav_path = convert_aac_to_wav(audio_path)
    signal, native_rate = torchaudio.load(wav_path)

    # Bring the clip to the shared rate so embeddings are comparable.
    if native_rate != SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(native_rate, SAMPLE_RATE)
        signal = resampler(signal)

    # Mix multi-channel audio down to mono by averaging the channels.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # Mel spectrogram, then amplitude -> decibel scale.
    to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE)
    to_db = torchaudio.transforms.AmplitudeToDB()
    mel_db = to_db(to_mel(signal))

    # Average over the last axis and drop the remaining singleton dimensions.
    return mel_db.mean(dim=-1).squeeze().numpy()

# Function for semantic search
def semantic_search(query_embedding, audio_folder, top_n=1):
    """Rank the audio files in a folder by similarity to a query embedding.

    Every file in ``audio_folder`` whose name ends in .wav, .mp3 or .aac is
    embedded and compared with ``query_embedding`` by cosine similarity.

    Args:
        query_embedding: Embedding of the query audio.
        audio_folder: Folder containing the candidate audio files.
        top_n: Number of best matches to return.

    Returns:
        A list of ``(filename, similarity)`` tuples, highest similarity
        first, truncated to ``top_n`` entries.
    """
    audio_extensions = ('.wav', '.mp3', '.aac')
    scored = []
    for entry in os.listdir(audio_folder):
        if not entry.endswith(audio_extensions):
            continue
        candidate = generate_audio_embedding(os.path.join(audio_folder, entry))
        # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
        score_matrix = cosine_similarity([query_embedding], [candidate])
        scored.append((entry, score_matrix[0][0]))
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Example usage
query_audio_folder = "/usr/local/datasetsDir/audio-dataset/queries"  # Path to the query audio
audio_folder = "/usr/local/datasetsDir/audio-dataset/data"  # Path to the folder containing audio files

# For each query clip: play it, embed it, then print and play its best match(es).
# os.listdir returns entries in arbitrary order, so sort for reproducible output.
for query_filename in sorted(os.listdir(query_audio_folder)):
    if not query_filename.endswith(('.wav', '.mp3', '.aac')):
        continue
    query_audio_path = os.path.join(query_audio_folder, query_filename)
    print("Query Audio: ", query_filename)
    display(Audio(filename=query_audio_path, autoplay=True))
    query_embedding = generate_audio_embedding(query_audio_path)
    results = semantic_search(query_embedding, audio_folder)
    # Distinct names here keep the inner loop from rebinding the outer loop variable.
    for match_filename, similarity in results:
        print(f"Matching Audio: {match_filename}, Similarity: {similarity:.2f}")
        display(Audio(filename=os.path.join(audio_folder, match_filename), autoplay=True))
