In [128]:
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import multiprocessing as mp
from tqdm.notebook import tqdm

In [129]:
df_meta = pd.read_csv("./birdclef-2024/train_metadata.csv")

In [130]:
df_meta

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,asbfly,[],['call'],39.2297,118.1987,Muscicapa dauurica,Asian Brown Flycatcher,Matt Slaymaker,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/134896,asbfly/XC134896.ogg
1,asbfly,[],['song'],51.4030,104.6401,Muscicapa dauurica,Asian Brown Flycatcher,Magnus Hellström,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/164848,asbfly/XC164848.ogg
2,asbfly,[],['song'],36.3319,127.3555,Muscicapa dauurica,Asian Brown Flycatcher,Stuart Fisher,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/175797,asbfly/XC175797.ogg
3,asbfly,[],['call'],21.1697,70.6005,Muscicapa dauurica,Asian Brown Flycatcher,vir joshi,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/207738,asbfly/XC207738.ogg
4,asbfly,[],['call'],15.5442,73.7733,Muscicapa dauurica,Asian Brown Flycatcher,Albert Lastukhin & Sergei Karpeev,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/209218,asbfly/XC209218.ogg
...,...,...,...,...,...,...,...,...,...,...,...,...
24454,zitcis1,[],[''],43.5925,4.5434,Cisticola juncidis,Zitting Cisticola,Chèvremont Fabian,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://xeno-canto.org/845747,zitcis1/XC845747.ogg
24455,zitcis1,[],[''],43.5925,4.5434,Cisticola juncidis,Zitting Cisticola,Chèvremont Fabian,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/845817,zitcis1/XC845817.ogg
24456,zitcis1,[],[''],51.1207,4.5607,Cisticola juncidis,Zitting Cisticola,Wim Jacobs,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://xeno-canto.org/856176,zitcis1/XC856176.ogg
24457,zitcis1,[],[''],41.5607,-8.4236,Cisticola juncidis,Zitting Cisticola,Jorge Leitão,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://xeno-canto.org/856723,zitcis1/XC856723.ogg


In [131]:
df_train = df_meta[["primary_label", "filename"]]

In [132]:
def create_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])

    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    return model

    

In [133]:
def extract_audio_features_with_path(args):
    ogg_file_path, max_length = args
    return extract_audio_features(ogg_file_path, max_length)

def extract_audio_features(ogg_file_path, max_length=22050*5):
    y, sr = librosa.load(ogg_file_path, sr=None)
    
    # Ensure the audio is of fixed length
    if len(y) < max_length:
        y = np.pad(y, (0, max_length - len(y)), 'constant')
    else:
        y = y[:max_length]
    
    # Extract features
    features = {}

    # Mel spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    S_dB = librosa.power_to_db(S, ref=np.max)
    features['mel_spectrogram'] = S_dB

    # MFCC
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    features['mfcc'] = mfcc

    # Chroma feature
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    features['chroma'] = chroma

    # Spectral contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    features['spectral_contrast'] = spectral_contrast

    # Tonnetz
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
    features['tonnetz'] = tonnetz

    # Spectral centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    features['spectral_centroid'] = spectral_centroid

    # Spectral bandwidth
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    features['spectral_bandwidth'] = spectral_bandwidth

    # Spectral rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    features['spectral_rolloff'] = spectral_rolloff

    # Zero crossing rate
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    features['zero_crossing_rate'] = zero_crossing_rate

    # RMS
    rms = librosa.feature.rms(y=y)
    features['rms'] = rms

    return features


In [134]:

# Example usage
audio_features = extract_audio_features('./birdclef-2024/train_audio/asbfly/XC49755.ogg')

# Convert features to a dictionary of DataFrames for better visualization
features_df = {key: pd.DataFrame(value) for key, value in audio_features.items()}

# Display the extracted features
for feature_name, df in features_df.items():
    print(f"\nFeature: {feature_name}")
    display(df.head())  # Using display() from IPython.display for better visualization in Jupyter


Feature: mel_spectrogram


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,-80.0,-80.0,-80.0,-80.0,-71.131088,-64.629059,-64.798882,-65.517334,-64.943108,-66.595901,...,-68.542099,-62.939053,-61.75354,-64.483032,-63.943794,-68.828384,-75.564575,-80.0,-69.7453,-58.308228
1,-80.0,-80.0,-80.0,-80.0,-78.152901,-77.86232,-80.0,-80.0,-80.0,-80.0,...,-66.897995,-64.690155,-63.297081,-61.861908,-62.23333,-65.24144,-66.423111,-74.839699,-68.645683,-57.374542
2,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,...,-61.738266,-63.308044,-63.542503,-68.342384,-66.267365,-65.165871,-68.677277,-73.797104,-69.288116,-58.136024
3,-80.0,-80.0,-80.0,-80.0,-78.352684,-80.0,-80.0,-80.0,-80.0,-79.861069,...,-64.366867,-71.845642,-70.448776,-73.537903,-71.473465,-71.168381,-72.104736,-69.305649,-67.361259,-57.354889
4,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,...,-73.9179,-80.0,-76.070633,-72.207756,-73.334656,-75.337105,-80.0,-70.59935,-68.497894,-58.325111



Feature: mfcc


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,-695.373413,-695.373413,-695.35437,-680.230347,-643.505615,-619.225769,-591.797607,-570.970276,-563.205383,-558.302429,...,-306.195435,-323.671509,-334.408813,-339.167542,-338.049377,-331.991302,-328.520233,-321.373627,-314.663361,-300.228394
1,0.0,0.0,0.000993,-1.203629,-5.267553,-6.745506,-6.516578,-5.824792,-6.088688,-6.661309,...,-41.372093,-18.021332,-13.957712,-14.097721,-14.342066,-13.82428,-19.953529,-27.281433,-24.620096,-22.640423
2,0.0,0.0,-0.026895,-17.48748,-48.594494,-65.481293,-84.124863,-95.908485,-98.228455,-101.852127,...,-174.743835,-175.672638,-179.184937,-179.671829,-181.474518,-182.583771,-186.912994,-188.352554,-176.718735,-133.41774
3,0.0,0.0,-0.002972,2.80594,12.572548,14.241904,11.57168,11.357303,12.281723,13.987467,...,65.480469,54.011528,45.415066,47.022861,52.940224,50.3647,52.326881,65.002045,67.655022,71.527222
4,0.0,0.0,0.026674,8.966082,11.52891,8.192678,3.49878,-2.270911,-9.443663,-8.712378,...,-91.412827,-79.573364,-68.684639,-62.239941,-48.788322,-53.536591,-59.001015,-60.851425,-54.280022,-25.470638



Feature: chroma


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,0.708295,0.640073,0.70726,0.544579,0.313167,0.284966,0.254466,0.247745,0.324839,0.466124,...,0.114438,0.319022,0.370808,0.189577,0.042638,0.022667,0.024571,0.025156,0.057726,0.065114
1,0.59552,0.761191,0.617285,0.474223,0.340369,0.308571,0.460019,0.379475,0.303072,0.324227,...,0.102003,0.304017,0.35448,0.219087,0.042779,0.023521,0.017317,0.014781,0.018537,0.01597
2,0.703772,0.855086,1.0,0.511553,0.417635,0.389911,0.582626,0.537955,0.582327,0.628322,...,0.233476,0.346289,0.495336,0.2912,0.052698,0.023576,0.022504,0.014497,0.009743,0.012317
3,0.729621,0.740412,0.929025,0.534931,0.51665,0.398323,0.351357,0.400521,0.466655,0.598876,...,0.588484,0.842825,0.87366,0.598802,0.114322,0.032365,0.0447,0.024819,0.015485,0.013818
4,0.966631,0.828872,0.708723,0.531631,0.500283,0.516987,0.690588,0.737944,0.609317,0.782331,...,0.807133,0.98946,0.916617,0.499404,0.099249,0.059439,0.052222,0.020962,0.019204,0.018052



Feature: spectral_contrast


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,8.354793,11.05597,18.828681,13.490245,10.935963,17.73794,25.388246,23.359664,17.671976,16.888773,...,11.537984,17.709943,20.112632,13.965468,9.895126,7.356802,12.945463,12.272881,1.991081,0.512265
1,5.655448,9.460604,13.58938,8.253302,6.355038,13.855677,5.488939,9.860909,11.35225,20.051979,...,10.357248,7.098002,13.164112,9.920544,10.515016,9.414235,11.883525,5.564786,6.395963,1.3319
2,7.175265,17.008937,10.132582,9.868057,10.725284,9.491105,13.383158,10.775962,15.808067,12.061553,...,10.227426,11.273117,17.308825,12.270414,16.327901,13.583436,14.425934,16.306118,14.946777,10.478192
3,13.181861,18.4871,7.492485,13.55892,11.697548,16.7218,12.77015,18.760126,14.912207,12.216167,...,10.045255,14.306967,15.011778,14.692581,13.983972,16.31101,12.78402,19.127118,17.829591,11.865277
4,11.863674,11.875513,12.481405,14.705043,15.351461,15.150303,13.763269,11.776819,11.068494,17.25326,...,18.722942,19.851819,14.89324,17.568424,17.89555,21.664876,21.846788,22.297705,16.560607,15.399926



Feature: tonnetz


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,0.05355,0.046576,0.069057,-0.033937,0.008213,0.029023,0.030646,-0.002728,0.043914,0.068861,...,0.000652,-0.044043,-0.096815,-0.092799,-0.093778,-0.024391,0.022639,0.041814,-0.040458,0.034045
1,0.006453,-0.01578,0.014258,0.041921,0.020128,0.015061,0.021392,-0.037868,-0.028053,0.050517,...,-0.01172,-0.085404,0.00319,-0.08578,0.012057,-0.011803,-0.030609,0.036463,-0.000963,-0.023262
2,0.05975,0.059457,0.07442,0.06872,0.029005,-0.084037,0.049583,-0.014404,0.035721,0.032192,...,0.006173,0.045641,0.028443,-0.021292,-0.084611,0.07851,0.001111,-0.084613,0.032052,-0.095086
3,0.063891,0.044799,0.006855,0.047675,0.045305,0.079124,0.070752,0.129191,0.09102,0.04713,...,-0.062414,-0.093077,0.102964,0.102695,-0.054047,0.059749,-0.066663,0.001458,0.000603,-0.094087
4,-0.00368,-0.011251,0.000642,-0.045971,0.009795,-0.042154,0.01835,0.020736,0.005233,-0.031041,...,-0.034227,-0.024232,-0.041018,-0.120868,-0.063487,0.012695,-0.023469,-0.047408,-0.042899,-0.044499



Feature: spectral_centroid


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,7677.594522,6883.725464,5391.344823,4644.819335,4256.43529,4117.689942,4004.929086,3984.257218,4061.659922,4006.887847,...,5067.518515,4680.967893,4378.585674,4230.923206,3971.862278,3890.484845,3928.581672,4022.806,4128.249837,4278.453318



Feature: spectral_bandwidth


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,4317.705768,4241.620262,3764.47843,3102.94098,2587.063436,2375.034378,2266.970457,2138.180219,2141.049613,2112.659167,...,1282.843863,1538.289848,1647.334525,1606.466814,1534.741298,1504.182266,1461.898212,1388.453856,1453.93369,1711.389557



Feature: spectral_rolloff


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,13000.0,12156.25,10078.125,7750.0,6421.875,6343.75,6312.5,6078.125,6203.125,6015.625,...,6109.375,6093.75,5968.75,5796.875,5703.125,5515.625,5375.0,5375.0,5609.375,6031.25



Feature: zero_crossing_rate


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,0.223633,0.297852,0.361328,0.295898,0.250488,0.234863,0.235352,0.245117,0.246582,0.242676,...,0.323242,0.3125,0.288086,0.258301,0.239258,0.218262,0.212891,0.219727,0.186035,0.133789



Feature: rms


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,1.3e-05,3.1e-05,7.3e-05,0.000177,0.00027,0.000456,0.000616,0.000727,0.000869,0.000953,...,0.090907,0.066837,0.030106,0.029994,0.035177,0.04654,0.061427,0.067807,0.066481,0.057839


In [135]:
def aggregate_features(features):
    aggregated_features = {}
    for key, value in features.items():
        aggregated_features[key] = {
            'mean': np.mean(value, axis=1),
            'std': np.std(value, axis=1),
            'min': np.min(value, axis=1),
            'max': np.max(value, axis=1)
        }
    return aggregated_features

def format_features(aggregated_features):
    formatted_features = []
    for key in aggregated_features:
        for stat in aggregated_features[key]:
            formatted_features.extend(aggregated_features[key][stat])
    return np.array(formatted_features)

In [136]:
ogg_file_path = './birdclef-2024/train_audio/asbfly/XC49755.ogg'
max_length = 22050 * 5  # For example, 5 seconds at a sample rate of 22050 Hz

# Extract and aggregate features
features = extract_audio_features(ogg_file_path, max_length)
aggregated_features = aggregate_features(features)

# Format features for model input
formatted_features = format_features(aggregated_features)

print(formatted_features.shape)
print(formatted_features)


(684,)
[-6.78951569e+01 -7.15780640e+01 -7.08037949e+01 -7.23653183e+01
 -7.57353821e+01 -7.60923462e+01 -7.72794800e+01 -7.69549332e+01
 -7.69535065e+01 -7.48166962e+01 -7.59738770e+01 -7.50231094e+01
 -7.48816452e+01 -7.21270599e+01 -7.18847809e+01 -6.99784622e+01
 -6.92440033e+01 -6.64677048e+01 -6.55641098e+01 -6.29078407e+01
 -6.35348587e+01 -6.19041100e+01 -6.14968414e+01 -6.00429764e+01
 -5.98880310e+01 -5.87049522e+01 -5.84746971e+01 -5.72715187e+01
 -5.75240135e+01 -5.58471870e+01 -5.55498352e+01 -5.40977554e+01
 -5.38592224e+01 -5.26902885e+01 -5.27443390e+01 -5.14852333e+01
 -5.19623528e+01 -5.10422897e+01 -5.14217949e+01 -5.06474075e+01
 -5.04771423e+01 -4.96308174e+01 -5.02442932e+01 -4.85827980e+01
 -4.86090584e+01 -4.86892738e+01 -4.79537125e+01 -4.74637413e+01
 -4.74799500e+01 -4.70840187e+01 -4.66258774e+01 -4.66038132e+01
 -4.59247398e+01 -4.57369804e+01 -4.46376686e+01 -4.42834740e+01
 -4.41352196e+01 -4.43094559e+01 -4.40298653e+01 -4.37026901e+01
 -4.32755165e+01 -

In [137]:
import numpy as np
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def extract_and_format_features(file_path, max_length=22050*5):
    try:
        features = extract_audio_features(file_path, max_length)
        aggregated_features = aggregate_features(features)
        formatted_features = format_features(aggregated_features)
        return formatted_features
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def process_batch(file_paths, labels, max_length):
    X_batch = []
    y_batch = []
    for file_path, label in zip(file_paths, labels):
        formatted_features = extract_and_format_features(file_path, max_length)
        if formatted_features is not None:
            X_batch.append(formatted_features)
            y_batch.append(label)
    return X_batch, y_batch

def load_data(file_paths, labels, max_length=22050*5, batch_size=1000):
    X = []
    y = []

    # Split the data into batches
    batches = [(file_paths[i:i + batch_size], labels[i:i + batch_size])
               for i in range(0, len(file_paths), batch_size)]

    # Process batches in parallel using ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_batch, batch_files, batch_labels, max_length) 
                   for batch_files, batch_labels in batches]
        
        for future in tqdm(as_completed(futures), total=len(futures)):
            X_batch, y_batch = future.result()
            X.extend(X_batch)
            y.extend(y_batch)

    return np.array(X), np.array(y)

# Assuming df_train is already loaded in your notebook
file_paths = df_train['filename'].apply(lambda x: "./birdclef-2024/train_audio/" + x).tolist()
labels = df_train['primary_label'].astype('category').cat.codes.tolist()

# Load data
X, y =  pd.read_csv('X.csv', header=None), pd.read_csv('y.csv', header=None) #load_data(file_paths, labels)

print(f"Loaded {X.shape[0]} files.")
print(X.shape)
print(y.shape)


Loaded 24459 files.
(24459, 684)
(24459, 1)


In [138]:
# save x and y as csv

# np.savetxt('X.csv', X, delimiter=',')
# np.savetxt('y.csv', y, delimiter=',')
print(X.shape)
print(y.shape)


(24459, 684)
(24459, 1)


In [139]:
# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert labels to categorical
y = to_categorical(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X.shape, y.shape)
# Define the neural network model
def create_model(input_shape):
    model = Sequential([
        Dense(256, input_shape=(input_shape,), activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(y_train.shape[1], activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Create and train the model
model = create_model(X_train.shape[1])
# model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=200, batch_size=256, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

(24459, 684) (24459, 182)
Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0218 - loss: 5.0534 - val_accuracy: 0.1034 - val_loss: 4.3542
Epoch 2/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0795 - loss: 4.3151 - val_accuracy: 0.1547 - val_loss: 3.9201
Epoch 3/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1203 - loss: 3.9716 - val_accuracy: 0.1868 - val_loss: 3.7069
Epoch 4/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1425 - loss: 3.7976 - val_accuracy: 0.1999 - val_loss: 3.5576
Epoch 5/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1642 - loss: 3.6386 - val_accuracy: 0.2255 - val_loss: 3.4657
Epoch 6/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1847 - loss: 3.5475 - val_accuracy: 0.2265 - val_loss: 3.3832
Epoch 7/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━

In [54]:
model.save('./models/26_05_2024_14-17/bird_species_classifier_model.keras')

In [58]:
# Predict probabilities for each class
predictions = model.predict(X_test)


[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 793us/step


In [60]:
# test the model

predictions = model.predict(X_test)
print(predictions)
print(np.argmax(predictions, axis=1))
print(np.argmax(y_test, axis=1))
print(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1))
print(np.mean(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)))

[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 700us/step
[[6.77345997e-06 3.61297367e-04 9.99722444e-03 ... 4.80583822e-06
  1.78343207e-13 2.68834672e-04]
 [4.24841885e-03 2.23327857e-02 1.57664064e-04 ... 7.53935456e-05
  5.43763292e-07 4.25365070e-05]
 [1.19067394e-04 5.29814046e-04 1.08128644e-08 ... 4.94190090e-07
  5.58729380e-06 6.24294003e-08]
 ...
 [4.08890657e-03 6.24113530e-03 3.37525353e-06 ... 3.06950795e-07
  3.32492964e-05 3.94365750e-04]
 [1.92772562e-03 6.30501006e-03 4.82090429e-04 ... 7.24603655e-04
  5.35062514e-04 5.27585275e-04]
 [3.43275652e-03 1.14992205e-02 1.04471333e-02 ... 2.62938003e-04
  2.07573539e-05 6.70732604e-03]]
[ 20 143 100 ...  81  41   9]
[ 85 143 100 ...  81  41  82]
[False  True  True ...  True  True False]
0.31643499591169255


In [75]:
def load_unlabeled_data(file_paths, max_length=22050*5, batch_size=10):
    X = []

    # Split the data into batches
    batches = [file_paths[i:i + batch_size] for i in range(0, len(file_paths), batch_size)]

    # Process batches in parallel using ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_batch_unlabeled, batch_files, max_length) for batch_files in batches]
        
        for future in tqdm(as_completed(futures), total=len(futures)):
            X_batch = future.result()
            X.extend(X_batch)

    return np.array(X)

def process_batch_unlabeled(file_paths, max_length):
    X_batch = []
    for file_path in file_paths:
        formatted_features = extract_and_format_features(file_path, max_length)
        if formatted_features is not None:
            X_batch.append(formatted_features)
    return X_batch


In [76]:
import os
# Load the scaler you fitted earlier
scaler = StandardScaler()

# Assuming the scaler was saved previously:
# scaler = joblib.load('scaler.pkl')  # Uncomment if you have saved the scaler



# Load the unlabeled data from './birdclef-2024/unlabeled_soundscapes/'

unlabeled_file_paths = ['./birdclef-2024/unlabeled_soundscapes/' + file for file in os.listdir('./birdclef-2024/unlabeled_soundscapes/')]



X_unlabeled = load_unlabeled_data(unlabeled_file_paths)


  0%|          | 0/845 [00:00<?, ?it/s]

[[-3.21212883e+01 -2.87777729e+01 -3.02512302e+01 ...  2.24155467e-03
   2.17884989e-03  1.60560049e-02]
 [-2.36302452e+01 -2.00415802e+01 -2.05173740e+01 ...  1.29544584e-03
   6.01685420e-03  1.10199349e-02]
 [-1.94876842e+01 -1.36277876e+01 -1.23970518e+01 ...  7.98544555e-04
   1.37835192e-02  2.12538913e-02]
 ...
 [-1.60226192e+01 -9.08266449e+00 -1.18834248e+01 ...  5.80647145e-04
   3.04474146e-03  6.13697013e-03]
 [-2.33688107e+01 -1.95451126e+01 -1.99254322e+01 ...  7.61987932e-04
   1.41835643e-03  5.15813008e-03]
 [-4.09209938e+01 -3.70874748e+01 -3.48483086e+01 ...  1.53483571e-02
   1.25856474e-02  8.58121291e-02]]


NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [81]:

print(X_unlabeled.shape)
# Standardize the features
X_unlabeled = scaler.transform(X_unlabeled)

(8444, 684)


In [84]:
print(X_unlabeled)

predictions = model.predict(X_unlabeled)

[[-3.21476203e+01 -2.88147565e+01 -3.03063772e+01 ...  6.98346394e-03
   7.17909965e-03  2.09998116e-02]
 [-2.36496142e+01 -2.00677206e+01 -2.05555130e+01 ...  6.03257134e-03
   1.10998611e-02  1.59468060e-02]
 [-1.95036562e+01 -1.36459674e+01 -1.24210021e+01 ...  5.53315760e-03
   1.90339952e-02  2.62151778e-02]
 ...
 [-1.60357498e+01 -9.09520304e+00 -1.19064776e+01 ...  5.31415845e-03
   8.06366206e-03  1.10474206e-02]
 [-2.33879653e+01 -1.95706369e+01 -1.99625369e+01 ...  5.49641614e-03
   6.40220802e-03  1.00652888e-02]
 [-4.09545418e+01 -3.71347721e+01 -3.49114880e+01 ...  2.01565377e-02
   1.78102943e-02  9.09905157e-02]]
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 867us/step


In [86]:

# Generate row IDs based on the filenames (remove path and extension)
row_ids = [os.path.splitext(os.path.basename(file_path))[0] for file_path in unlabeled_file_paths]

# Ensure the number of row_ids matches the number of predictions
assert len(row_ids) == predictions.shape[0], "Number of row_ids must match the number of predictions"

# Prepare the column names
species_columns = df_train['primary_label'].astype('category').cat.categories.tolist()
columns = ['row_id'] + species_columns

# Combine row_ids with predictions
results = []
for row_id, probs in zip(row_ids, predictions):
    results.append([row_id] + probs.tolist())

# Convert to DataFrame
df_predictions = pd.DataFrame(results, columns=columns)

# Save predictions to CSV
df_predictions.to_csv('predictions.csv', index=False)
df_predictions["row_id"]  = df_predictions["row_id"].apply(lambda x: "soundscape_" + x)
print(df_predictions.head())

                  row_id  asbfly  ashdro1  ashpri1  ashwoo2       asikoe2  \
0  soundscape_1000170626     0.0      0.0      0.0      0.0  0.000000e+00   
1  soundscape_1000308629     0.0      0.0      0.0      0.0  4.282066e-37   
2  soundscape_1000389428     0.0      0.0      0.0      0.0  0.000000e+00   
3  soundscape_1000424265     0.0      0.0      0.0      0.0  0.000000e+00   
4  soundscape_1000450112     0.0      0.0      0.0      0.0  0.000000e+00   

   asiope1  aspfly1  aspswi1  barfly1  ...  whbwoo2  whcbar1  whiter2  whrmun  \
0      0.0      0.0      0.0      0.0  ...      0.0      0.0      0.0     0.0   
1      0.0      0.0      0.0      0.0  ...      0.0      0.0      0.0     0.0   
2      0.0      0.0      0.0      0.0  ...      0.0      0.0      0.0     0.0   
3      0.0      0.0      0.0      0.0  ...      0.0      0.0      0.0     0.0   
4      0.0      0.0      0.0      0.0  ...      0.0      0.0      0.0     0.0   

   whtkin2  woosan  wynlau1       yebbab1  yebbul3

# TEST 2

In [2]:
import pandas as pd

In [3]:
X_2 = pd.read_csv('X.csv', header=None)
y_2 = pd.read_csv('y.csv', header=None)

X_2 = X_2.to_numpy()
y_2 = y_2.to_numpy()

In [144]:

def get_spectrogram(audio, sr=22050, n_mels=128, fmax=8000):
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels, fmax=fmax)
    return spectrogram

In [142]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# import Maxpooling2D and Conv2D, concatenate, model, input
from tensorflow.keras.layers import MaxPooling2D, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, concatenate


def create_model(input_shape):
    model = Sequential([
        Dense(256, input_shape=(input_shape,), activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(y_train.shape[1], activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


def build_cnn_model(input_shape=(128, 128, 1), num_meta_features=2, num_classes=10):
    # Image input branch
    img_input = Input(shape=input_shape, name='img_input')
    x = Conv2D(32, (3, 3), activation='relu')(img_input)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)
    x = Flatten()(x)
    
    # Metadata input branch
    meta_input = Input(shape=(num_meta_features,), name='meta_input')
    y = Dense(256, activation='relu')(meta_input)
    y = Dropout(0.5)(y)
    y = Dense(128, activation='relu')(y)
    y = Dropout(0.5)(y)
    y = Dense(64, activation='relu')(y)
    
    # Concatenate the outputs of the image and metadata branches
    combined = concatenate([x, y])
    z = Dense(256, activation='relu')(combined)
    z = Dense(num_classes, activation='softmax')(z)
    
    model = Model(inputs=[img_input, meta_input], outputs=z)
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model


In [143]:
model_cnn = build_cnn_model(input_shape=(128, 128, 1), num_meta_features=2, num_classes=df_train['primary_label'].nunique())
model_cnn.summary()


In [145]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Conv2D, MaxPooling2D, Flatten, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [146]:

# Load X and y from CSV files
X = pd.read_csv('X.csv', header=None).values
y = pd.read_csv('y.csv', header=None).values

print(f"Loaded {X.shape[0]} files.")
print(X.shape)
print(y.shape)

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Convert labels to categorical
y = to_categorical(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X.shape, y.shape)

# Define the CNN model with metadata input
def build_cnn_model(input_shape=(128, 128, 1), num_meta_features=2, num_classes=10):
    # Image input branch
    img_input = Input(shape=input_shape, name='img_input')
    x = Conv2D(32, (3, 3), activation='relu')(img_input)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Dropout(0.25)(x)
    x = Flatten()(x)
    
    # Metadata input branch
    meta_input = Input(shape=(num_meta_features,), name='meta_input')
    y = Dense(256, activation='relu')(meta_input)
    y = Dropout(0.5)(y)
    y = Dense(128, activation='relu')(y)
    y = Dropout(0.5)(y)
    y = Dense(64, activation='relu')(y)
    
    # Concatenate the outputs of the image and metadata branches
    combined = concatenate([x, y])
    z = Dense(256, activation='relu')(combined)
    z = Dense(num_classes, activation='softmax')(z)
    
    model = Model(inputs=[img_input, meta_input], outputs=z)
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

# Assuming you have the spectrograms stored as a 4D numpy array with shape (num_samples, 128, 128, 1)
# If you have the spectrograms in another format, you need to load or preprocess them accordingly.

# Example usage
# Let's assume you have the spectrograms stored in an array called `spectrograms`
# For demonstration, we'll create dummy spectrogram data
# spectrograms = np.random.rand(X.shape[0], 128, 128, 1)  # Dummy data, replace with actual spectrograms

# Dummy data for demonstration
spectrograms = np.random.rand(X.shape[0], 128, 128, 1)

# Number of metadata features
num_meta_features = X.shape[1]

# Number of classes
num_classes = y.shape[1]

# Build the model
model = build_cnn_model(input_shape=(128, 128, 1), num_meta_features=num_meta_features, num_classes=num_classes)

# Train the model
history = model.fit(
    {'img_input': spectrograms, 'meta_input': X},
    y,
    epochs=200,
    batch_size=256,
    validation_split=0.2
)

# Evaluate the model
loss, accuracy = model.evaluate({'img_input': spectrograms, 'meta_input': X_test}, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Visualize training history
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

Loaded 24459 files.
(24459, 684)
(24459, 1)
(24459, 684) (24459, 182)
Epoch 1/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 528ms/step - accuracy: 0.0388 - loss: 4.5253 - val_accuracy: 0.0000e+00 - val_loss: 8.0123
Epoch 2/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 521ms/step - accuracy: 0.1314 - loss: 3.6763 - val_accuracy: 0.0070 - val_loss: 9.7762
Epoch 3/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 527ms/step - accuracy: 0.1761 - loss: 3.4023 - val_accuracy: 0.0090 - val_loss: 10.9418
Epoch 4/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 550ms/step - accuracy: 0.1999 - loss: 3.2561 - val_accuracy: 0.0055 - val_loss: 11.5770
Epoch 5/200
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 620ms/step - accuracy: 0.2158 - loss: 3.1728 - val_accuracy: 0.0080 - val_loss: 12.1268
Epoch 6/200
[1m39/77[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m22s[0m 599ms/step - accuracy: 0.2