In [2]:
import os
import librosa
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, RepeatVector
from sklearn.preprocessing import MinMaxScaler

2023-12-18 10:25:52.252163: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-18 10:25:52.274646: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 10:25:52.427036: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-18 10:25:52.428094: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
directory = 'Audio_Work/birth_inference_minutes_wav/output_2023-11-04_06-31-01'

In [4]:
# Choosing the sample rate for the audio files 
#TODO:
# 1. Automate the sampling rate selection.

def analyze_sampling_rates(directory):
    sampling_rates = []
    for file in os.listdir(directory):
        if file.endswith('.wav'):
            path = os.path.join(directory, file)
            _, sr = librosa.load(path, sr=None, duration=5)  # Load only a part  of the file.
            sampling_rates.append(sr)
    return sampling_rates

def choose_optimal_sampling_rate(sampling_rates):
    # Choosing the most common sampling rate 
    optimal_sr = max(set(sampling_rates), key=sampling_rates.count)
    return optimal_sr

sampling_rates = analyze_sampling_rates(directory)
optimal_sr = choose_optimal_sampling_rate(sampling_rates) # (44100) This will be constant for all the files

print("Sampling Rates:", sampling_rates)
print("Optimal Sampling Rate:", optimal_sr)

Sampling Rates: [44100, 44100, 44100, 44100, 44100, 44100, 44100, 44100, 44100, 44100, 44100, 44100, 44100, 44100, 44100]
Optimal Sampling Rate: 44100


In [5]:
# Choosing the optimal_duration for the audio files 
# Should ideally be 1 minute ( but its better to check)
# TODO:
#       1.Consider  np.median(durations) for a median duration

def get_optimal_duration(directory, sr=22050):
    durations = []
    for file in os.listdir(directory):
        if file.endswith('.wav'):
            path = os.path.join(directory, file)
            data, _ = librosa.load(path, sr=sr)
            durations.append(len(data) / sr)
    return min(durations)  

In [6]:
def load_and_preprocess_audio(directory, sr):
    duration = get_optimal_duration(directory, sr)
    max_length = int(sr * duration)
    audio_data = []
    for file in os.listdir(directory):
        if file.endswith('.wav'):
            path = os.path.join(directory, file)
            data, _ = librosa.load(path, sr=sr, duration=duration)
            if len(data) > max_length:
                data = data[:max_length]
            else:
                padding = max_length - len(data)
                data = np.pad(data, (0, int(padding)), 'constant')
            audio_data.append(data)
    return np.array(audio_data)


audio_data = load_and_preprocess_audio(directory=directory, sr=optimal_sr)

In [7]:
def reshape_data(data, time_steps=10):
    reshaped_data = np.array([data[i:i+time_steps] for i in range(len(data) - time_steps)])
    return reshaped_data.reshape(reshaped_data.shape[0], time_steps, -1)

time_steps = 10 
reshaped_audio_data = reshape_data(audio_data, time_steps)


In [8]:
def build_model(input_shape):
    model = Sequential([
        # Encoder Start
        LSTM(32, input_shape=input_shape, return_sequences=True), # returns sequences for the next layer (Uncompressed) 128
        LSTM(16, return_sequences=False), #  No return sequences (compressed) 64
        # Encoder End

        RepeatVector(input_shape[0]), # Bottleneck

        # Decoder Start
        LSTM(16, return_sequences=True), # returns sequences 64
        LSTM(32, return_sequences=True), #  returns sequences 128
        TimeDistributed(Dense(input_shape[1])) # Reconstructs the original time series data
        # Decoder End
    ])
    model.compile(optimizer='adam', loss='mse') # Compiles the model using Adam optimizer and MSE loss function
    return model


model = build_model(input_shape=reshaped_audio_data.shape[1:])


2023-12-18 10:25:55.838316: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-12-18 10:25:55.838658: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [11]:
def train_model(model, data, epochs=50, batch_size=64):
    model.fit(data, data, epochs=epochs, batch_size=batch_size)

train_model(model, reshaped_audio_data)


Epoch 1/50


2023-12-18 10:26:24.245349: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (mklcpu) ran out of memory trying to allocate 504.49MiB (rounded to 528998400)requested by op GatherV2
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-12-18 10:26:24.245610: I tensorflow/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for mklcpu
2023-12-18 10:26:24.245622: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-12-18 10:26:24.245624: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-12-18 10:26:24.245626: I tensorflow/tsl/framework/bfc_allocator.cc:1046] Bin (1024): 	

ResourceExhaustedError: Graph execution error:

OOM when allocating tensor with shape[10,5,2644992] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[{{node transpose_0}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[sequential/lstm/PartitionedCall]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_11268]

In [10]:
def detect_anomalies(model, data, threshold=0.1):
    predictions = model.predict(data)
    anomalies = []
    for i in range(len(data)):
        loss = np.mean(np.abs(data[i] - predictions[i]))
        if loss > threshold:
            anomalies.append(i)
    return anomalies

# anomalies = detect_anomalies(model, reshaped_audio_data)
# print("Anomalies found at indices:", anomalies)
