In [129]:
import pyaudio
import wave
import time
import numpy as np
import noisereduce as nr

In [130]:
import librosa
import sklearn

In [131]:
import os

In [132]:
import tensorflow as tf

In [133]:
class CFG:
    device = 'CPU'
    seed = 42
    
    # Input image size and batch size
    img_size = [318, 216]
    batch_size = 128
    upsample_thr = 30 # min sample of each class (upsample)
    cv_filter = True # always keeps low sample data in train
    
    # Audio duration, sample rate, and length
    duration = 10 # second
    sample_rate = 16000
    audio_len = duration*sample_rate
    
    # STFT parameters
    nfft = 1024
    window = 1024
    hop_length = 512
    fmin = 20
    fmax = 8000
    normalize = True
    nmels = 216
    
    # Inference batch size, test time augmentation, and drop remainder
    infer_bs = 2
    tta = 1
    drop_remainder = True
    
    # Number of epochs, model name, and number of folds
    epochs = 25
    model_name = 'EfficientNetB1'
    fsr = False # reduce stride of stem block
    num_fold = 5
    
    # Selected folds for training and evaluation
    selected_folds = [0]

    # Pretraining, neck features, and final activation function
    pretrain = 'imagenet'
    neck_features = 0
    final_act = 'softmax'
    
    # Learning rate, optimizer, and scheduler
    lr = 1e-3
    scheduler = 'cos'
    optimizer = 'Adam' # AdamW, Adam
    
    # Loss function and label smoothing
    loss = 'CCE' # BCE, CCE
    label_smoothing = 0.05 # label smoothing
    
    # Data augmentation parameters
    augment=True
    spec_augment_prob = 0.60
    
    # Time Freq masking
    freq_mask_prob=0.50
    num_freq_masks=1
    freq_mask_param=10
    time_mask_prob=0.50
    num_time_masks=2
    time_mask_param=25

    # Audio Augmentation Settings
    audio_augment_prob = 0.5
    
    mixup_prob = 0.65
    mixup_alpha = 0.5
    
    cutmix_prob = 0.65
    cutmix_alpha = 2.5
    
    timeshift_prob = 0.0
    
    gn_prob = 0.35

    class_names = sorted(set(os.listdir('/home/saarthak/Downloads/research/spectrogram dataset/train')))
    num_classes = len(class_names)
    class_labels = list(range(num_classes))
    label2name = dict(zip(class_labels, class_names))
    name2label = {v:k for k,v in label2name.items()}
    
    # Training Settings
    target_col = ['target']
    tab_cols = ['filename']
    monitor = 'auc'
    debug = True
    verbose = 0


    # PyAudio
    chunk = 1024  # Samples to read per frame
    format = pyaudio.paInt16  # Audio format
    channels = 1   

In [134]:
def build_model(N_CLASSES=264):
    inp = tf.keras.layers.Input(shape=(318, 216, 3))
#     x = tf.keras.layers.LayerNormalization(name='batch_norm')(inp)
    x = tf.keras.layers.Conv2D(8, kernel_size=(7,7), activation='tanh', padding='same', name='conv2d_tanh')(inp)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_1')(x)
    
    x = tf.keras.layers.Conv2D(16, kernel_size=(5,5), activation='relu', padding='same', name='conv2d_relu_1')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_2')(x)
    
    x = tf.keras.layers.Conv2D(16, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_2')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_3')(x)
    
    x = tf.keras.layers.Conv2D(32, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_3')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_4')(x)
    
    x = tf.keras.layers.Conv2D(32, kernel_size=(3,3), activation='relu', padding='same', name='conv2d_relu_4')(x)
    x = tf.keras.layers.MaxPooling2D(pool_size=(2,2), padding='same', name='max_pool_2d_5')(x)
    
    x = tf.keras.layers.Conv2D(64, kernel_size=(2,2), activation='relu', padding='same', name='conv2d_relu_5')(x)
    x = tf.keras.layers.Flatten(name='flatten')(x)
    
    x = tf.keras.layers.Dropout(rate=0.2, name='dropout')(x)
    x = tf.keras.layers.Dense(128, activation='relu', activity_regularizer=tf.keras.regularizers.l2(0.001), name='dense')(x)
    
    o = tf.keras.layers.Dense(N_CLASSES, activation='softmax', name='softmax')(x)
    model = tf.keras.Model(inputs=inp, outputs=o, name='2d_convolution')
    return model

In [135]:
model = build_model()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=CFG.lr),
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=CFG.label_smoothing), 
              metrics=['accuracy'])

In [136]:
model.load_weights('/home/saarthak/Documents/birdclef23/2024-05-16__04-58-15/cp.weights.h5')



In [137]:

# Define parameters for audio capture
FORMAT = pyaudio.paInt16  # Audio format (16-bit resolution)
CHANNELS = 1  # Number of channels (1 for mono, 2 for stereo)
RATE = 16000  # Sample rate (samples per second)
CHUNK = 1024  # Number of audio frames per buffer
RECORD_SECONDS = 10  # Duration of each recording in seconds
OUTPUT_FILENAME = "output.wav"  # Output filename


In [138]:
from scipy.io import wavfile

In [139]:
def wav_ops(audio, data):
    bytes_wav = bytes()
    # Save the recorded data as a WAV file
    with wave.open('output.wav', 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(audio.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(data)
    rate, data = wavfile.read("output.wav")
    reduced_noise = nr.reduce_noise(y=data, sr=rate)
    wavfile.write("output.wav", rate, reduced_noise)
    return reduced_noise

In [140]:
from sklearn import preprocessing

In [141]:
def Spec2Img(spec, num_channels=3):
    # If the original image has 1 channel, convert it to a 3 channel image by repeating the same image across channel axis
    if num_channels > 1:
        img = tf.tile(spec[..., tf.newaxis], [1, 1, num_channels])
    else:
        img = spec[..., tf.newaxis]
    return img

In [142]:
def get_melspectrogram(audio):
    spec = librosa.feature.melspectrogram(y=audio, sr=CFG.sample_rate, 
                                       n_mels=CFG.nmels,
                                       n_fft=CFG.nfft,
                                       hop_length=CFG.hop_length,
                                       fmax=CFG.fmax,
                                       fmin=CFG.fmin,
                                       )
    spec = librosa.power_to_db(spec, ref=np.max)
    # print(spec.shape)
    normalised_db = preprocessing.minmax_scale(spec)
    return normalised_db

In [143]:

def record_audio():
    audio = pyaudio.PyAudio()

    # Open stream
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

    print("Listening...")

    frames = []

    for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = np.frombuffer(stream.read(CHUNK))
        if np.isnan(data).any():
            data = np.nan_to_num(data)
        frames.append(data)

    print("Finished.")
    # # print(frames)
    # print(type(frames[0]), frames[0].shape)

    data = np.concatenate(frames)

    # Stop and close the stream
    stream.stop_stream()
    stream.close()
    audio.terminate()
    
    reduced_noise = wav_ops(audio, data)
    reduced_noise = reduced_noise.astype('float32')
    # reduced_noise = np.expand_dims(reduced_noise, axis=-1)
    spec = get_melspectrogram(reduced_noise)
    spec_tensor = tf.convert_to_tensor(spec)
    spec_tensor = tf.transpose(spec_tensor)
    spec_tensor = Spec2Img(spec_tensor)
    spec_tensor = tf.image.resize_with_crop_or_pad(
        spec_tensor, 318, 216
    )
    spec_tensor = tf.expand_dims(spec_tensor, axis = 0)
    output = model.predict(spec_tensor)
    print(output)
    


In [144]:

# Main loop to record audio every 10 seconds
while True:
    record_audio()
    time.sleep(20 - RECORD_SECONDS)  # Wait for the remaining time to make it 10 seconds total


Listening...
Finished.
[[8.64459260e-04 9.08913091e-03 2.86971510e-04 1.66294037e-03
  1.38167001e-04 4.84880205e-04 8.24150105e-04 9.72185167e-04
  4.31471213e-04 1.84404515e-02 6.91391295e-04 3.80631682e-05
  1.37498864e-04 5.55342471e-04 4.88913711e-03 4.54133675e-02
  4.46443853e-04 7.15099799e-04 5.30494493e-04 9.12372416e-05
  5.79478918e-04 4.77342494e-03 1.14030023e-04 8.82263994e-05
  2.93016341e-03 5.00452355e-04 1.03020291e-04 2.04454325e-02
  1.98587566e-03 4.68381440e-05 1.22497911e-02 1.13026050e-04
  5.19907662e-05 4.95412340e-03 1.74448434e-02 2.20477134e-02
  4.72693355e-05 6.06958696e-04 4.06068470e-03 2.62921036e-04
  7.38924777e-04 9.96794086e-03 1.22329472e-02 3.44861735e-04
  3.82643455e-04 2.25974189e-04 3.11797230e-05 1.40886055e-04
  8.35569855e-03 4.03604330e-03 2.00332855e-04 3.91955973e-06
  9.83919366e-04 3.34785815e-04 3.98593947e-05 1.24758780e-02
  1.12750125e-03 1.68467127e-03 2.87966104e-04 4.32427507e-03
  8.88353406e-06 1.35275058e-03 2.73692273e-02 

KeyboardInterrupt: 