In [138]:
import librosa
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.model_selection import train_test_split
import os

In [139]:
# 1. Parse the text file for labeled segments
def parse_text_file(file_path):
    segments = []
    def is_float(value):
        """Check if a string can be converted to a float."""
        try:
            float(value)
            return True
        except ValueError:
            return False

    with open(file_path, 'r') as f:
        lines = f.readlines()[1:]  # Skip the first line (header), if needed
        
        for line in lines:
            # Strip leading/trailing spaces and check if the line is empty
            line = line.strip()
            if not line:
                continue  
            
            # Split the line into parts (by spaces or tabs)
            parts = line.split()
            
            # Check if there are exactly three parts and if the first two parts are floats
            if len(parts) == 3 and is_float(parts[0]) and is_float(parts[1]):
                start, end, label = float(parts[0]), float(parts[1]), parts[2]
                segments.append((start, end, label))
            else:
                print(f"Skipping line due to incorrect format: {line}")
    
    return segments

In [140]:
# 2. Extract corresponding audio segments
def extract_audio_segments(audio_path, segments):
    y, sr = librosa.load(audio_path, sr=None)
    audio_segments = []
    labels = []
    
    for start, end, label in segments:
        start_sample = int(start * sr)
        end_sample = int(end * sr)
        segment = y[start_sample:end_sample]
        audio_segments.append(segment)
        labels.append(1 if label == "bee" else 0)  # 1 for "bee", 0 for "nobee"
    
    return audio_segments, labels

In [141]:
# 3. Preprocess audio segments (MFCC feature extraction)
def preprocess_audio_segments(audio_segments, sr):
    mfcc_features = []
    
    for segment in audio_segments:
        # Extract MFCC features for each segment
        mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=40)
        mfcc_features.append(mfcc)
    
    return mfcc_features

In [142]:
# 4. Pad audio features to ensure equal length
def pad_audio_segments(mfcc_features):
    # Find the maximum length (number of time steps) across all MFCC segments
    max_length = max([mfcc.shape[1] for mfcc in mfcc_features])
    padded_mfccs = []
    
    for mfcc in mfcc_features:
        # Pad with zeros to ensure all segments have the same number of time steps
        pad_width = max_length - mfcc.shape[1]
        if pad_width > 0:
            # Pad only along the time steps axis (second dimension)
            mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
        padded_mfccs.append(mfcc)
    
    return np.array(padded_mfccs)


In [143]:
# 5. Create a CNN model for binary classification
def create_model(input_shape):
    model = Sequential([
        Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Conv1D(128, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [144]:
audio_path = 'CF003 - Active - Day - (216).wav'  
text_file_path = 'CF003 - Active - Day - (216).lab'

# Step 1: Parse the text file for labeled segments
segments = parse_text_file(text_file_path)

# Step 2: Extract corresponding audio segments
audio_segments, labels = extract_audio_segments(audio_path, segments)

# Step 3: Preprocess audio segments (MFCC feature extraction)
sr = 22050  # Sampling rate, librosa defaults to 22050
mfcc_features = preprocess_audio_segments(audio_segments, sr)

# Step 4: Pad audio features to ensure equal length
padded_mfcc_features = pad_audio_segments(mfcc_features)

X = np.array(padded_mfcc_features)
y = np.array(labels)

input_shape = (X.shape[1], X.shape[2])  # (time steps, MFCC coefficients)
model = create_model(input_shape)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Skipping line due to incorrect format: .


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - accuracy: 0.7500 - loss: 0.4194 - val_accuracy: 1.0000 - val_loss: 0.3117
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step - accuracy: 1.0000 - loss: 0.0055 - val_accuracy: 1.0000 - val_loss: 0.0576
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step - accuracy: 1.0000 - loss: 4.6698e-04 - val_accuracy: 1.0000 - val_loss: 0.0128
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 363ms/step - accuracy: 1.0000 - loss: 3.7098e-05 - val_accuracy: 1.0000 - val_loss: 0.0033
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326ms/step - accuracy: 1.0000 - loss: 4.0653e-06 - val_accuracy: 1.0000 - val_loss: 8.8012e-04
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step - accuracy: 1.0000 - loss: 6.4584e-07 - val_accuracy: 1.0000 - val_loss: 2.7412e-04
Epoch 7/10
[1m1/1

<keras.src.callbacks.history.History at 0x1b08e2de860>

In [145]:
test_audio_path = 'CF003 - Active - Day - (216).wav'  
test_text_file_path = 'CF003 - Active - Day - (216).lab' 

# Step 1: Parse the text file for labeled segments
test_segments = parse_text_file(test_text_file_path)

# Step 2: Extract corresponding audio segments
test_audio_segments, test_labels = extract_audio_segments(test_audio_path, test_segments)

# Step 3: Preprocess test audio segments (MFCC feature extraction)
test_mfcc_features = preprocess_audio_segments(test_audio_segments, sr=22050)

# Step 4: Pad test audio features to ensure equal length
padded_test_mfcc_features = pad_audio_segments(test_mfcc_features)

X_test = np.array(padded_test_mfcc_features)
y_test = np.array(test_labels)

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")

# Test the model on the test dataset and get predictions
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")

# Make predictions on the test data
predictions = model.predict(X_test)

# Improved output formatting with confidence levels
y_pred = (predictions > 0.5).astype(int)

for i in range(len(y_test)):
    confidence = predictions[i][0]  # Confidence level for each prediction
    if y_pred[i][0] == 1:
        print(f"Audio segment {i+1}: Bee detected with confidence {confidence:.2f}")
    else:
        print(f"Audio segment {i+1}: No bee detected with confidence {1-confidence:.2f}")

Skipping line due to incorrect format: .
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 1.0000 - loss: 9.7023e-07
Test Loss: 9.70233372754592e-07, Test Accuracy: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 1.0000 - loss: 9.7023e-07
Test Loss: 9.70233372754592e-07, Test Accuracy: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
Audio segment 1: Bee detected with confidence 1.00
Audio segment 2: No bee detected with confidence 1.00
Audio segment 3: Bee detected with confidence 1.00
Audio segment 4: No bee detected with confidence 1.00
Audio segment 5: Bee detected with confidence 1.00
