In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import librosa
# import librosa.display
import torchaudio
import matplotlib.pyplot as plt
from IPython.display import Audio

In [2]:
# Load the taxonomy table and the per-recording training metadata.
df_taxonomy = pd.read_csv('birdclef-2025/taxonomy.csv')
df_train_base = pd.read_csv('birdclef-2025/train.csv')

# Attach class_name to every recording through primary_label (inner join),
# then keep an independent copy holding only the columns used further down.
df_all = df_train_base.merge(
    df_taxonomy[['class_name', 'primary_label']],
    on='primary_label',
    how='inner',
)
train_df = df_all.loc[
    :,
    ['primary_label', 'filename', 'scientific_name', 'class_name', 'latitude', 'longitude'],
].copy()

In [3]:
# Collect the labels whose class is 'Aves', then flag every row carrying one of them.
aves_labels = train_df['primary_label'][train_df['class_name'].eq('Aves')].unique()
train_df['is_ave'] = train_df['primary_label'].isin(aves_labels)

In [None]:
# Load the Silero VAD model and its helper functions through torch.hub.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad') # type: ignore

# Select the helpers by position rather than unpacking the unused ones into `_`:
# assigning to `_` in a notebook rebinds IPython's "last output" variable for
# the rest of the session.
get_speech_timestamps = utils[0]
read_audio = utils[2]

def highpass_filter(clean_audio, sr=32000, cutoff=800, order=6):
  """Apply a causal Butterworth high-pass filter to an audio signal.

  Args:
    clean_audio: Array-like signal, filtered along its last axis.
    sr: Sampling rate of the signal in Hz.
    cutoff: Cut-off frequency in Hz.
    order: Order of the Butterworth filter.

  Returns:
    The filtered signal, as returned by ``scipy.signal.lfilter``.
  """
  from scipy import signal

  # butter() takes the cut-off as a fraction of the Nyquist frequency (sr / 2).
  cutoff_fraction = cutoff / (0.5 * sr)
  numerator, denominator = signal.butter(order, cutoff_fraction, btype='high', analog=False) # type: ignore
  return signal.lfilter(numerator, denominator, clean_audio)

def _non_speech_audio(wav, sr):
  """Return `wav` with every VAD-detected speech segment cut out."""
  speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sr)

  kept = []
  prev_end = 0
  for ts in speech_timestamps:
    if prev_end < ts['start']:
      kept.append(wav[prev_end:ts['start']])
    # Always move past the speech segment, even when no gap preceded it.
    # Otherwise a clip that starts with speech would have that speech kept.
    prev_end = max(prev_end, ts['end'])

  # Keep whatever follows the last speech segment.
  if prev_end < len(wav):
    kept.append(wav[prev_end:])

  # torch.concat raises on an empty list (whole clip detected as speech),
  # so return an empty waveform in that case.
  return torch.concat(kept) if kept else wav[:0]


def _drop_first_speech(wav, sr, lead_in=100):
  """Trim `wav` around the first VAD-detected speech segment.

  If that segment starts within the first `lead_in` samples, everything after
  its end is kept; otherwise only the audio before its start is kept. With no
  detected speech the waveform is returned unchanged.
  """
  if len(wav) == 0:
    return wav

  speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sr)
  if not speech_timestamps:
    return wav

  first = speech_timestamps[0]
  if first['start'] <= lead_in:
    return wav[first['end']:]
  return wav[:first['start']]


def _filter_voice(file_path, tramos=False, high_pass=False, audio_base=False):
  """Load an audio file as mono and remove detected human speech from it.

  Relies on the module-level `model` and `get_speech_timestamps` (Silero VAD).

  Args:
    file_path: Path handed to ``torchaudio.load``.
    tramos: If truthy, cut out every detected speech segment and concatenate
      what remains. Otherwise trim around the first detected speech segment
      only; that trimming is applied twice in a row.
    high_pass: If truthy, run the result through ``highpass_filter`` using the
      sampling rate of the loaded file.
    audio_base: If truthy, return the mono waveform untouched. Neither the VAD
      step nor the high-pass filter is applied, even when `high_pass` is set.

  Returns:
    numpy.ndarray with the processed mono signal.
  """
  wav, sr = torchaudio.load(file_path)
  wav = wav.mean(dim=0)  # downmix to mono when the file has several channels

  if audio_base:
    return wav.numpy()

  if tramos:
    clean_audio = _non_speech_audio(wav, sr).numpy()
  else:
    # Two passes: the second one handles a speech segment that becomes the
    # "first" one after the initial trim.
    clean_audio = _drop_first_speech(_drop_first_speech(wav, sr), sr).numpy()

  if high_pass:
    # Use the file's own sampling rate, not highpass_filter's 32 kHz default.
    clean_audio = highpass_filter(clean_audio=clean_audio, sr=sr)

  return clean_audio

def ext_chunks(audio_clean, sr, time):
  """Split a 1-D signal into fixed-length chunks with circular padding.

  The signal is cut into consecutive chunks of ``sr * time`` samples. When the
  last chunk is incomplete it is filled by wrapping around to the start of the
  signal, repeating the signal as many times as needed, so every chunk has
  exactly the same length even for clips shorter than one chunk.

  Args:
    audio_clean: 1-D array-like of samples.
    sr: Sampling rate in Hz.
    time: Chunk duration in seconds (int or float).

  Returns:
    numpy.ndarray of shape ``(n_chunks, n_samples)``; ``(0, n_samples)`` when
    the input is empty.

  Raises:
    ValueError: If ``sr * time`` does not yield at least one sample.
  """
  n_samples = int(round(sr * time))
  if n_samples <= 0:
    raise ValueError(f"sr * time must be positive, got {sr} * {time}")

  audio = np.asarray(audio_clean)
  if audio.size == 0:
    return np.empty((0, n_samples), dtype=audio.dtype)

  n_chunks = -(-len(audio) // n_samples)  # ceiling division
  # np.resize fills the enlarged array with repeated copies of the input,
  # which is exactly the circular padding wanted for the last chunk.
  return np.resize(audio, (n_chunks, n_samples))

def spectogram(array_audio, n_fft=2048):
    """Return the dB-scaled magnitude spectrogram of an audio signal.

    Args:
        array_audio: Either a numpy array of samples, or an object exposing
            ``dim()``, ``mean(dim=0)`` and ``numpy()`` (presumably a torch
            tensor as returned by ``torchaudio.load``). Multi-dimensional
            tensors are averaged over their first dimension before the STFT.
        n_fft: FFT window size forwarded to ``librosa.stft`` for both input
            types.

    Returns:
        The STFT magnitude converted with ``librosa.amplitude_to_db``, using
        the maximum magnitude as the reference.
    """
    if isinstance(array_audio, np.ndarray):
        samples = array_audio
    else:
        # Only downmix when there is a leading dimension to average over;
        # mean(dim=0) on a 1-D tensor would reduce the signal to one scalar.
        wav = array_audio.mean(dim=0) if array_audio.dim() > 1 else array_audio
        samples = wav.numpy()

    magnitude = np.abs(librosa.stft(samples, n_fft=n_fft))
    return librosa.amplitude_to_db(magnitude, ref=np.max)


def _extract_features(audio, sr=32000):
    """Summarise one audio clip as a flat feature vector.

    Every frame-level descriptor is reduced to its mean and variance. The
    returned vector is laid out as:
      * the per-coefficient means of 13 MFCCs, then their variances;
      * (mean, variance) pairs, in this order, for: spectral centroid,
        spectral bandwidth, 85 % spectral roll-off, onset strength (used as a
        proxy for spectral flux), spectral flatness, zero-crossing rate and
        RMS energy.

    Args:
        audio: Audio samples passed as ``y`` to the librosa extractors.
        sr: Sampling rate in Hz, forwarded to the extractors that accept it.

    Returns:
        1-D numpy array with the concatenated statistics.
    """
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

    # Order matters: it must match the column names built for the feature table.
    descriptors = (
        librosa.feature.spectral_centroid(y=audio, sr=sr),
        librosa.feature.spectral_bandwidth(y=audio, sr=sr),
        librosa.feature.spectral_rolloff(y=audio, sr=sr, roll_percent=0.85),
        librosa.onset.onset_strength(y=audio, sr=sr),  # proxy for spectral flux
        librosa.feature.spectral_flatness(y=audio),
        librosa.feature.zero_crossing_rate(y=audio),
        librosa.feature.rms(y=audio),
    )
    summary = [stat for series in descriptors for stat in (series.mean(), series.var())]

    return np.concatenate([mfcc.mean(axis=1), mfcc.var(axis=1), summary])
# Column names, in the same order as the vector returned by _extract_features:
# 13 MFCC means, 13 MFCC variances, then a (mean, var) pair per descriptor.
mfcc_cols = [f'mfcc_{idx}_{stat}' for stat in ('mean', 'var') for idx in range(1, 14)]
extra_cols = [
    f'{descriptor}_{stat}'
    for descriptor in (
        'spec_centroid', 'spec_bandwidth', 'rolloff',
        'flux', 'flatness', 'zcr', 'rms',
    )
    for stat in ('mean', 'var')
]
column_names = [*mfcc_cols, *extra_cols]


Using cache found in C:\Users\overm/.cache\torch\hub\snakers4_silero-vad_master


In [5]:
# model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad') # type: ignore
# (get_speech_timestamps, _, read_audio, *_) = utils


# filename ='audio_test/audio.mp3'
# wav, sr = torchaudio.load(filename)
# print('Rate: ', sr)
# wav = wav.mean(dim=0) 

# def fun_clean(wav, sr):
#   speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=sr)
#   print(speech_timestamps)
#   if len(speech_timestamps) != 0:
#     # print('True')
#     # No puede ser 0, ya que si el audio empieza con unos pocos segundos en 0, no significa que la persona no empiece hablando
#     if speech_timestamps[0]['start']<=100: 
#       clean_audio = (wav[speech_timestamps[0]['end']:])
    
#     else:
#       clean_audio = (wav[0:speech_timestamps[0]['start']])
      
#   else:
#     clean_audio = wav

#   return clean_audio

# clean_audio = fun_clean(wav=wav, sr=sr)
# # clean_audio = fun_clean(wav=clean_audio, sr=sr)

In [6]:
# Smoke-test the preprocessing chain on a single 'Insecta' recording
# (position 15 among the insect rows of train_df).
# NOTE(review): `id` shadows the Python builtin; the name is kept because
# other cells refer to it.
filename = train_df.loc[train_df['class_name'] == 'Insecta', 'filename']
id = f'birdclef-2025/train_audio/{filename.iloc[15]}'
clean_audio = highpass_filter(_filter_voice(file_path=id, tramos=False))
chunks = ext_chunks(audio_clean=clean_audio, sr=32000, time=5)
# futures = _extract_features(audio=chunks[0], sr=32000)



In [35]:
def process_audio_file(row, sr=32000, chunk_duration=5, max_chunks=3, 
                       tramos=False, apply_highpass=True, audio_base = False):
    """Turn one recording into a table of per-chunk audio features.

    The file named by ``row['filename']`` (under ``birdclef-2025/train_audio/``)
    is preprocessed with ``_filter_voice``, split with ``ext_chunks`` and each
    chunk is summarised by ``_extract_features``.

    Args:
        row: Mapping/Series providing ``'filename'`` and ``'is_ave'``.
        sr: Sampling rate used for chunking and feature extraction.
        chunk_duration: Chunk length in seconds.
        max_chunks: Accepted but currently not used; every chunk is processed.
        tramos: Forwarded to ``_filter_voice``.
        apply_highpass: Forwarded to ``_filter_voice`` as ``high_pass``.
        audio_base: Forwarded to ``_filter_voice``.

    Returns:
        DataFrame with one row per chunk (columns ``column_names`` plus
        ``'is_ave'``). If anything fails while processing, the error is printed
        and an empty DataFrame is returned instead.
    """
    filename = row['filename']
    is_ave = row['is_ave']
    audio_path = f'birdclef-2025/train_audio/{filename}'

    try:
        # Preprocess, then extract one feature vector per chunk.
        clean_audio = _filter_voice(file_path=audio_path, tramos=tramos, high_pass=apply_highpass, audio_base=audio_base)
        feature_rows = [
            _extract_features(chunk, sr=sr)
            for chunk in ext_chunks(audio_clean=clean_audio, sr=sr, time=chunk_duration)
        ]

        features_df = pd.DataFrame(feature_rows, columns=column_names)
        features_df['is_ave'] = is_ave
    except Exception as e:
        print(f"Error procesando {filename}: {e}")
        return pd.DataFrame()  # best effort: an empty frame signals the failure

    return features_df


In [36]:
# Take up to SAMPLES_PER_SPECIES recordings per bird species (all of them when
# a species has fewer, instead of letting sample() raise) and keep every
# non-bird recording.
SAMPLES_PER_SPECIES = 6

aves_df = train_df[train_df['is_ave']]
aves_sampled = (
    aves_df.groupby('scientific_name')
    .apply(lambda x: x.sample(n=min(len(x), SAMPLES_PER_SPECIES), random_state=42))
    .reset_index(drop=True)
)
no_aves_df = train_df[~train_df['is_ave']]
subset_df = pd.concat([aves_sampled, no_aves_df], ignore_index=True)
subset_df['is_ave'].value_counts()

False    916
True     876
Name: is_ave, dtype: int64

In [44]:
import warnings

# Silence library warnings during the long extraction loop; this affects the
# whole kernel session from here on.
warnings.filterwarnings('ignore')

id_labels = subset_df[['filename', 'is_ave']]
full_data = []
count = 0
# audio_base=True makes _filter_voice return the raw mono audio, so neither
# the VAD step nor the high-pass filter is applied in this run.
for count, (i, row) in enumerate(id_labels.iterrows(), start=1):
    df_chunk = process_audio_file(row, tramos= False, audio_base=True)
    if not df_chunk.empty:
        full_data.append(df_chunk)
    # Report progress every 100 files (and at the end) instead of on every file.
    if count % 100 == 0 or count == len(id_labels):
        print(f'{count}/{len(id_labels)}')

# NOTE: df_all is rebound here; it previously held the merged metadata table.
if full_data:
    df_all = pd.concat(full_data, ignore_index=True).round(3)
else:
    # pd.concat raises ValueError on an empty list (every file failed or there
    # was nothing to process); fall back to an empty table with the same columns.
    print('Ningún audio produjo características; df_all queda vacío.')
    df_all = pd.DataFrame(columns=column_names + ['is_ave'])


Base
1
Base
2
Base
3
Base
4
Base
5
Base


KeyboardInterrupt: 

In [27]:
# Write df_all to CSV without the index column. At this point df_all is
# expected to be the per-chunk feature table built by the extraction loop.
df_all.to_csv('datos_futures_M2.csv', index=False)

In [28]:
# Read the saved feature CSV back from disk.
hola = pd.read_csv('datos_futures_M2.csv')

In [30]:
# Display the reloaded table as the cell output.
hola

Unnamed: 0,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,...,rolloff_var,flux_mean,flux_var,flatness_mean,flatness_var,zcr_mean,zcr_var,rms_mean,rms_var,is_ave
0,-514.152,-104.246,-73.625,37.189,-60.273,9.274,-8.646,12.241,7.273,-5.562,...,211385.351,0.872,0.177,0.013,0.000,0.378,0.001,0.011,0.0,True
1,-521.421,-117.332,-98.145,28.401,-70.413,-1.317,-16.426,13.676,3.305,-4.069,...,96073.414,0.853,0.083,0.013,0.000,0.367,0.001,0.006,0.0,True
2,-514.620,-103.264,-84.702,40.686,-58.327,1.522,-9.620,18.308,4.356,-6.838,...,81151.517,0.844,0.096,0.013,0.000,0.359,0.001,0.008,0.0,True
3,-518.867,-108.016,-80.760,38.109,-64.648,3.070,-8.248,12.461,4.449,-1.577,...,138945.623,0.842,0.081,0.015,0.000,0.366,0.001,0.008,0.0,True
4,-305.381,-65.616,-125.595,-69.095,-80.614,-24.220,-34.174,-11.167,-6.205,1.677,...,1105116.383,1.180,2.774,0.053,0.001,0.301,0.002,0.020,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11894,-684.402,-105.954,-79.119,-58.086,-44.975,-16.182,-13.316,8.358,1.273,15.249,...,182342.388,0.895,0.837,0.168,0.004,0.554,0.006,0.001,0.0,False
11895,-738.785,-100.427,-75.193,-57.889,-45.670,-19.087,-16.296,6.257,-0.161,13.765,...,64514.528,0.870,0.677,0.188,0.003,0.565,0.002,0.000,0.0,False
11896,-779.732,-75.572,-62.785,-46.755,-35.421,-12.938,-10.766,6.830,0.704,10.988,...,210335.378,1.003,4.973,0.215,0.005,0.523,0.007,0.000,0.0,False
11897,-722.778,-127.528,-69.195,-54.447,-54.165,-1.483,-28.090,20.979,-5.028,14.033,...,745619.364,0.900,0.854,0.067,0.013,0.513,0.000,0.002,0.0,False


In [31]:
# Load and display a second feature file. No visible cell writes
# 'datos_futures_M1.csv', so it presumably comes from an earlier run
# (TODO confirm how it was produced).
hola = pd.read_csv('datos_futures_M1.csv')
hola

Unnamed: 0,mfcc_1_mean,mfcc_2_mean,mfcc_3_mean,mfcc_4_mean,mfcc_5_mean,mfcc_6_mean,mfcc_7_mean,mfcc_8_mean,mfcc_9_mean,mfcc_10_mean,...,rolloff_var,flux_mean,flux_var,flatness_mean,flatness_var,zcr_mean,zcr_var,rms_mean,rms_var,is_ave
0,-514.152,-104.246,-73.625,37.189,-60.273,9.274,-8.646,12.241,7.273,-5.562,...,211385.351,0.872,0.177,0.013,0.000,0.378,0.001,0.011,0.0,True
1,-521.421,-117.332,-98.145,28.401,-70.413,-1.317,-16.426,13.676,3.305,-4.069,...,96073.414,0.853,0.083,0.013,0.000,0.367,0.001,0.006,0.0,True
2,-514.620,-103.264,-84.702,40.686,-58.327,1.522,-9.620,18.308,4.356,-6.838,...,81151.517,0.844,0.096,0.013,0.000,0.359,0.001,0.008,0.0,True
3,-518.867,-108.016,-80.760,38.109,-64.648,3.070,-8.248,12.461,4.449,-1.577,...,138945.623,0.842,0.081,0.015,0.000,0.366,0.001,0.008,0.0,True
4,-305.381,-65.616,-125.595,-69.095,-80.614,-24.220,-34.174,-11.167,-6.205,1.677,...,1105116.383,1.180,2.774,0.053,0.001,0.301,0.002,0.020,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13798,-608.551,-28.464,-61.727,-41.879,-38.963,-15.905,-16.149,-5.074,-1.576,2.768,...,1578010.731,1.123,5.104,0.150,0.010,0.368,0.015,0.003,0.0,False
13799,-722.778,-127.528,-69.195,-54.447,-54.165,-1.483,-28.090,20.979,-5.028,14.033,...,745619.364,0.900,0.854,0.067,0.013,0.513,0.000,0.002,0.0,False
13800,-727.223,-102.914,-66.539,-52.135,-49.877,-5.278,-24.148,15.954,-5.332,10.780,...,852866.604,0.919,2.054,0.103,0.016,0.485,0.007,0.001,0.0,False
13801,-447.384,-20.705,-105.636,-82.166,-63.640,-40.951,-41.238,-14.648,-16.457,-1.164,...,327367.496,0.948,0.211,0.074,0.001,0.222,0.001,0.003,0.0,False


In [13]:
# wav, sr = torchaudio.load(id)
# O = spectogram(wav)
# clean_audio = _filter_voice(file_path=id, tramos=False)
# D = spectogram(clean_audio)
# high_pass = _filter_voice(file_path=id, tramos=False, high_pass = True)
# D_2 = spectogram(high_pass)

In [14]:
# plt.figure(figsize=(10, 10))

# plt.subplot(3, 1, 1)
# librosa.display.specshow(O, sr=sr, x_axis='time', y_axis='log', cmap='viridis')
# plt.colorbar(format='%+2.0f dB')
# plt.title("Original")


# plt.subplot(3, 1, 2)
# librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log', cmap='viridis')
# plt.colorbar(format='%+2.0f dB')
# plt.title("Audio VAD")

# plt.subplot(3, 1, 3)
# librosa.display.specshow(D_2, sr=sr, x_axis='time', y_axis='log', cmap='viridis')
# plt.title("Filtrado (Highpass)")
# plt.colorbar(format='%+2.0f dB')

# plt.tight_layout()
# plt.show()

# print("Audio Oiginal: ")
# display(Audio(filename=id))
# print('Audio VAD')
# display(Audio(data=clean_audio, rate=32000))
# print("Audio Pasa Altas: ")
# display(Audio(data=high_pass, rate=32000))


##### Separator (Deezer)
Esto logra separar perfectamente la voz humana y cataloga el resto como ruido. Sin embargo, al igual que con silero-vad, el audio de ruido contendría tanto la especie como las respiraciones humanas. Además, en el fondo del audio de ruido se alcanza a escuchar el eco de la grabación, correspondiente a la voz humana.

In [15]:
# from spleeter.separator import Separator

# id = f'birdclef-2025/train_audio/{filename.iloc[1]}'

# # Inicializa el modelo de 2 fuentes
# separator = Separator('spleeter:2stems')

# # Separar el archivo
# separator.separate_to_file(id, 'output')


In [16]:
# wav, sr = torchaudio.load('output\\CSA36389\\accompaniment.wav')
# wav = wav.mean(dim=0) 
# dta = np.abs(librosa.stft(wav.numpy()))
# new = librosa.amplitude_to_db(dta, ref=np.max)

# plt.figure(figsize=(10, 5))

# librosa.display.specshow(new, sr=sr, x_axis='time', y_axis='log', cmap='viridis')
# plt.title("Filtrado (Highpass)")
# plt.colorbar(format='%+2.0f dB')

# plt.tight_layout()
# plt.show()
# display(Audio('output\\CSA36389\\accompaniment.wav'))

In [17]:
# wav, sr = torchaudio.load('output\\CSA36389\\vocals.wav')
# wav = wav.mean(dim=0) 
# dta = np.abs(librosa.stft(wav.numpy()))
# new = librosa.amplitude_to_db(dta, ref=np.max)

# plt.figure(figsize=(10, 5))

# librosa.display.specshow(new, sr=sr, x_axis='time', y_axis='log', cmap='viridis')
# plt.title("Filtrado (Highpass)")
# plt.colorbar(format='%+2.0f dB')

# plt.tight_layout()
# plt.show()
# display(Audio('output\\CSA36389\\vocals.wav'))