In [1]:
pip install patool

Collecting patool
  Downloading patool-4.0.1-py2.py3-none-any.whl.metadata (4.5 kB)
Downloading patool-4.0.1-py2.py3-none-any.whl (86 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.5/86.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patool
Successfully installed patool-4.0.1


In [2]:
import patoolib

In [3]:
patoolib.extract_archive('/content/archive (1).zip')

INFO patool: Extracting /content/archive (1).zip ...
INFO:patool:Extracting /content/archive (1).zip ...
INFO patool: running /usr/bin/7z x -aou -o./Unpack_cf6y5q3c -- "/content/archive (1).zip"
INFO:patool:running /usr/bin/7z x -aou -o./Unpack_cf6y5q3c -- "/content/archive (1).zip"
INFO patool: ... /content/archive (1).zip extracted to `donateacry_corpus'.
INFO:patool:... /content/archive (1).zip extracted to `donateacry_corpus'.


'donateacry_corpus'

In [4]:
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import os

In [5]:
data_dir = "/content/donateacry_corpus"
categories = ["belly_pain", "burping", "discomfort", "hungry", "tired"]

In [6]:
data = []
labels = []

# Load audio data and label
def load_audio_data(data_dir, categories):
    for label, category in enumerate(categories):
        category_path = os.path.join(data_dir, category)
        for file_name in os.listdir(category_path):
            file_path = os.path.join(category_path, file_name)
            try:
                # Load audio file and extract MFCC features
                audio, sr = librosa.load(file_path, sr=None)
                mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
                mfcc_scaled = np.mean(mfcc.T, axis=0)  # Take the mean across time
                data.append(mfcc_scaled)
                labels.append(label)
            except Exception as e:
                print(f"Error loading {file_name}: {e}")

In [7]:
load_audio_data(data_dir, categories)

# Convert data to numpy arrays
data = np.array(data)
labels = np.array(labels)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42, stratify=labels)

# Normalize the data
X_train = X_train / np.max(np.abs(X_train))
X_test = X_test / np.max(np.abs(X_test))
X_train = X_train.reshape(-1, 40, 1)
X_test = X_test.reshape(-1, 40, 1)

In [10]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import torchaudio
from torchaudio.transforms import Resample
from tqdm import tqdm

class Wav2Vec2Extractor:
    def __init__(self, device='cpu'):
        """
        Initialize the Wav2Vec2 extractor using facebook/wav2vec2-base.
        """
        self.device = torch.device(device if device in ['cpu', 'cuda'] else 'cpu')
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
        self.model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(self.device)
        # Use the sampling rate from the processor if available; otherwise default to 16000.
        self.target_sampling_rate = getattr(self.processor, "sampling_rate", 16000)

    def preprocess_audio(self, audio_path):
        """
        Load and preprocess the audio file.
        Returns:
            waveform (Tensor): Audio waveform.
            fs (int): Sample rate.
        """
        try:
            waveform, fs = torchaudio.load(audio_path)
        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            return None, None

        # Convert multi-channel audio to mono if needed.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Resample if the sample rate is not the target.
        if fs != self.target_sampling_rate:
            resampler = Resample(orig_freq=fs, new_freq=self.target_sampling_rate)
            waveform = resampler(waveform)
            fs = self.target_sampling_rate
        return waveform, fs

    def extract_features(self, audio_path):
        """
        Extract features from a single audio file using facebook/wav2vec2-base.
        Returns:
            avg_embeddings (np.array): Average embedding over time dimension.
        """
        waveform, fs = self.preprocess_audio(audio_path)
        if waveform is None:
            return None
        # Remove extra dimensions and add a batch dimension
        audio_np = np.squeeze(waveform.numpy().astype(np.float32))
        audio_np = np.expand_dims(audio_np, axis=0)
        inputs = self.processor(audio_np, sampling_rate=fs, return_tensors="pt")
        inputs = inputs.to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            embeddings = outputs.last_hidden_state.cpu().numpy()
        # Compute the average over the time dimension (axis 1)
        avg_embeddings = np.mean(embeddings.squeeze(), axis=0)
        return avg_embeddings

    def extract_folder(self, data_dir, categories, output_file):
        data_records = []
        filenames = []

        for category in categories:
            category_path = os.path.join(data_dir, category)
            if not os.path.exists(category_path):
                print(f"Skipping {category_path}, directory not found.")
                continue

            for filename in tqdm(os.listdir(category_path), desc=f"Processing {category}"):
                if filename.lower().endswith(".wav"):
                    file_path = os.path.join(category_path, filename)
                    features = self.extract_features(file_path)
                    if features is not None:
                        data_records.append(features)
                        filenames.append(f"{category}/{filename}")

        if not data_records:
            print("No features extracted.")
            return

        df = pd.DataFrame(data_records)
        df.insert(0, 'filename', filenames)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"Saved all features to {output_file}")


# Paths

data_dir = "/content/donateacry_corpus"
output_csv = "/content/wav2vec-features.csv"
categories = ["belly_pain", "burping", "discomfort", "hungry", "tired"]

# Extract Features
extractor = Wav2Vec2Extractor(device='cuda' if torch.cuda.is_available() else 'cpu')
extractor.extract_folder(data_dir, categories, output_csv)

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]


Processing belly_pain:   0%|          | 0/16 [00:00<?, ?it/s][A
Processing belly_pain:   6%|▋         | 1/16 [00:06<01:41,  6.74s/it][A
Processing belly_pain:  12%|█▎        | 2/16 [00:10<01:10,  5.05s/it][A
Processing belly_pain:  19%|█▉        | 3/16 [00:13<00:52,  4.02s/it][A
Processing belly_pain:  25%|██▌       | 4/16 [00:15<00:38,  3.24s/it][A
Processing belly_pain:  31%|███▏      | 5/16 [00:17<00:30,  2.78s/it][A
Processing belly_pain:  38%|███▊      | 6/16 [00:19<00:25,  2.51s/it][A
Processing belly_pain:  44%|████▍     | 7/16 [00:21<00:21,  2.34s/it][A
Processing belly_pain:  50%|█████     | 8/16 [00:23<00:18,  2.34s/it][A
Processing belly_pain:  56%|█████▋    | 9/16 [00:26<00:17,  2.54s/it][A
Processing belly_pain:  62%|██████▎   | 10/16 [00:28<00:14,  2.36s/it][A
Processing belly_pain:  69%|██████▉   | 11/16 [00:30<00:11,  2.22s/it][A
Processing belly_pain:  75%|███████▌  | 12/16 [00:32<00:08,  2.14s/it][A
Processing belly_pain:  81%|████████▏ | 13/16 [00:34<00

Saved all features to /content/wav2vec-features.csv


In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# Load embeddings
embeddings_path = "/content/wav2vec-features.csv"
df = pd.read_csv(embeddings_path)

# Extract filenames and labels
df['label'] = df['filename'].apply(lambda x: x.split('/')[0])  # Extract category from filename

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Extract features (X) and labels (y)
X = df.drop(columns=['filename', 'label']).values
y = df['label'].values

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Reshape input for FCN (add a channel dimension)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

num_classes = len(np.unique(y))
# Define FCN model
fcn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# Compile the model
fcn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
fcn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)
# Predict on test data
y_pred_proba = fcn_model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

# Compute Accuracy and F1-score
fcn_acc = accuracy_score(y_test, y_pred)
fcn_f1 = f1_score(y_test, y_pred, average="weighted")

print(f"FCN - Accuracy: {fcn_acc:.4f}, F1-score: {fcn_f1:.4f}")
results = pd.DataFrame({
    "Feature Extractor": ["WAV2VEC2"],
    "FCN_Acc": [fcn_acc],
    "FCN_F1": [fcn_f1]
})

results.to_csv("/content/fcn_results.csv", index=False)
print("Results saved successfully.")


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 195ms/step - accuracy: 0.6499 - loss: 1.5426 - val_accuracy: 0.8370 - val_loss: 1.2771
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step - accuracy: 0.8505 - loss: 1.1344 - val_accuracy: 0.8370 - val_loss: 0.7348
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 144ms/step - accuracy: 0.8469 - loss: 0.6661 - val_accuracy: 0.8370 - val_loss: 0.7321
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 175ms/step - accuracy: 0.8398 - loss: 0.7321 - val_accuracy: 0.8370 - val_loss: 0.6762
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 104ms/step - accuracy: 0.8143 - loss: 0.7560 - val_accuracy: 0.8370 - val_loss: 0.6769
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 104ms/step - accuracy: 0.8283 - loss: 0.7106 - val_accuracy: 0.8370 - val_loss: 0.6730
Epoch 7/20
[1m12/12[0m [32m━━━━━━━━━

In [13]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# Load embeddings
embeddings_path = "/content/wav2vec-features.csv"
df = pd.read_csv(embeddings_path)

# Extract labels from filename
df['label'] = df['filename'].apply(lambda x: x.split('/')[0])  # Extract category from filename

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Extract features (X) and labels (y)
X = df.drop(columns=['filename', 'label']).values
y = df['label'].values

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Reshape input for CNN (add a channel dimension)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

num_classes = len(np.unique(y))
# Define CNN model
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# Compile the model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)
# Predict on test data
y_pred_proba = cnn_model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

# Compute Accuracy and F1-score
cnn_acc = accuracy_score(y_test, y_pred)
cnn_f1 = f1_score(y_test, y_pred, average="weighted")

print(f"CNN - Accuracy: {cnn_acc:.4f}, F1-score: {cnn_f1:.4f}")
results = pd.DataFrame({
    "Feature Extractor": ["Wav2Vec2"],
    "CNN_Acc": [cnn_acc],
    "CNN_F1": [cnn_f1]
})

results.to_csv("/content/cnn_results.csv", index=False)
print("CNN results saved successfully.")


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 140ms/step - accuracy: 0.7436 - loss: 0.9459 - val_accuracy: 0.8370 - val_loss: 0.7557
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step - accuracy: 0.8189 - loss: 0.7690 - val_accuracy: 0.8370 - val_loss: 0.6867
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step - accuracy: 0.8463 - loss: 0.6269 - val_accuracy: 0.8370 - val_loss: 0.6879
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 101ms/step - accuracy: 0.8189 - loss: 0.7023 - val_accuracy: 0.8370 - val_loss: 0.7186
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 98ms/step - accuracy: 0.8374 - loss: 0.6465 - val_accuracy: 0.8370 - val_loss: 0.6910
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 101ms/step - accuracy: 0.8207 - loss: 0.6521 - val_accuracy: 0.8370 - val_loss: 0.6854
Epoch 7/20
[1m12/12[0m [32m━━━━━━━━━━━━

In [14]:
import os
import numpy as np
import pandas as pd
import torch
import torchaudio
from transformers import AutoProcessor, UniSpeechSatModel
from torchaudio.transforms import Resample
from tqdm import tqdm

class UniSpeechSATExtractor:
    def __init__(self, device='cpu'):
        """
        Initialize the UniSpeech-SAT extractor using microsoft/unispeech-sat-base-100h-libri-ft.
        """
        self.device = torch.device(device if device in ['cpu', 'cuda'] else 'cpu')
        self.processor = AutoProcessor.from_pretrained("microsoft/unispeech-sat-base-100h-libri-ft")
        self.model = UniSpeechSatModel.from_pretrained("microsoft/unispeech-sat-base-100h-libri-ft").to(self.device)
        # Define the target sampling rate (typically 16000 Hz for this model)
        self.target_sampling_rate = 16000

    def preprocess_audio(self, audio_path, target_rate):
        """
        Load and preprocess the audio file.
        Converts multi-channel audio to mono and resamples to the target sampling rate if necessary.

        Returns:
            waveform (Tensor): The preprocessed audio waveform.
            fs (int): The sample rate after resampling.
        """
        try:
            waveform, fs = torchaudio.load(audio_path)
        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            return None, None

        # Convert multi-channel to mono if needed.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Resample if fs is not the target_rate.
        if fs != target_rate:
            resampler = Resample(orig_freq=fs, new_freq=target_rate)
            waveform = resampler(waveform)
            fs = target_rate
        return waveform, fs

    def extract_features(self, audio_path):
        """
        Extract features from a single audio file using microsoft/unispeech-sat-base-100h-libri-ft.
        Returns:
            avg_embeddings (np.array): The average encoder output computed over the time dimension.
        """
        sample_rate = self.target_sampling_rate
        waveform, fs = self.preprocess_audio(audio_path, sample_rate)
        if waveform is None:
            return None

        try:
            # Remove extra dimensions and add a batch dimension if needed.
            audio_np = np.squeeze(waveform.numpy().astype(np.float32))
            audio_np = np.expand_dims(audio_np, axis=0)
            inputs = self.processor(audio_np, sampling_rate=fs, return_tensors="pt")
            inputs = inputs.to(self.device)
            input_features = inputs.input_values  # UniSpeech-SAT uses "input_values"

            with torch.no_grad():
                outputs = self.model(input_features)
            # Average the encoder outputs over the time dimension (axis 1)
            avg_embeddings = outputs.last_hidden_state.squeeze().mean(axis=0).cpu().numpy()
            return avg_embeddings

        except Exception as e:
            print(f"Error processing {audio_path}: {e}")
            return None

    def extract_folder(self, data_dir, categories, output_file):
        data_records = []
        filenames = []

        for category in categories:
            category_path = os.path.join(data_dir, category)
            if not os.path.exists(category_path):
                print(f"Skipping {category_path}, directory not found.")
                continue

            for filename in tqdm(os.listdir(category_path), desc=f"Processing {category}"):
                if filename.lower().endswith(".wav"):
                    file_path = os.path.join(category_path, filename)
                    features = self.extract_features(file_path)
                    if features is not None:
                        data_records.append(features)
                        filenames.append(f"{category}/{filename}")

        if not data_records:
            print("No features extracted.")
            return

        df = pd.DataFrame(data_records)
        df.insert(0, 'filename', filenames)
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"Saved all features to {output_file}")


# Paths

data_dir = "/content/donateacry_corpus"
output_csv = "/content/infant_cry_unispeech_features.csv"
categories = ["belly_pain", "burping", "discomfort", "hungry", "tired"]

# Extract Features
extractor = UniSpeechSATExtractor(device='cuda' if torch.cuda.is_available() else 'cpu')
extractor.extract_folder(data_dir, categories, output_csv)


preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/512 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]


Processing belly_pain:   0%|          | 0/16 [00:00<?, ?it/s][A
Processing belly_pain:   6%|▋         | 1/16 [00:04<01:09,  4.62s/it][A
Processing belly_pain:  12%|█▎        | 2/16 [00:08<00:59,  4.23s/it][A
Processing belly_pain:  19%|█▉        | 3/16 [00:10<00:43,  3.34s/it][A
Processing belly_pain:  25%|██▌       | 4/16 [00:12<00:33,  2.81s/it][A
Processing belly_pain:  31%|███▏      | 5/16 [00:14<00:27,  2.52s/it][A
Processing belly_pain:  38%|███▊      | 6/16 [00:16<00:23,  2.35s/it][A
Processing belly_pain:  44%|████▍     | 7/16 [00:18<00:20,  2.24s/it][A
Processing belly_pain:  50%|█████     | 8/16 [00:21<00:20,  2.51s/it][A
Processing belly_pain:  56%|█████▋    | 9/16 [00:24<00:16,  2.39s/it][A
Processing belly_pain:  62%|██████▎   | 10/16 [00:26<00:13,  2.28s/it][A
Processing belly_pain:  69%|██████▉   | 11/16 [00:28<00:10,  2.19s/it][A
Processing belly_pain:  75%|███████▌  | 12/16 [00:30<00:08,  2.13s/it][A
Processing belly_pain:  81%|████████▏ | 13/16 [00:32<00

Saved all features to /content/infant_cry_unispeech_features.csv


In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# Load embeddings
embeddings_path = "/content/infant_cry_unispeech_features.csv"
df = pd.read_csv(embeddings_path)

# Extract filenames and labels
df['label'] = df['filename'].apply(lambda x: x.split('/')[0])  # Extract category from filename

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Extract features (X) and labels (y)
X = df.drop(columns=['filename', 'label']).values
y = df['label'].values

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Reshape input for FCN (add a channel dimension)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

num_classes = len(np.unique(y))
# Define FCN model
fcn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# Compile the model
fcn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
fcn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)
# Predict on test data
y_pred_proba = fcn_model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

# Compute Accuracy and F1-score
fcn_acc = accuracy_score(y_test, y_pred)
fcn_f1 = f1_score(y_test, y_pred, average="weighted")

print(f"FCN - Accuracy: {fcn_acc:.4f}, F1-score: {fcn_f1:.4f}")
results = pd.DataFrame({
    "Feature Extractor": ["Trillson"],
    "FCN_Acc": [fcn_acc],
    "FCN_F1": [fcn_f1]
})

results.to_csv("/content/fcn_results_2.csv", index=False)
print("Results saved successfully.")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 232ms/step - accuracy: 0.6668 - loss: 1.5562 - val_accuracy: 0.8370 - val_loss: 1.3345
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 108ms/step - accuracy: 0.8284 - loss: 1.2201 - val_accuracy: 0.8370 - val_loss: 0.7833
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 106ms/step - accuracy: 0.8220 - loss: 0.7680 - val_accuracy: 0.8370 - val_loss: 0.7022
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 105ms/step - accuracy: 0.8263 - loss: 0.7667 - val_accuracy: 0.8370 - val_loss: 0.6612
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 265ms/step - accuracy: 0.8217 - loss: 0.7570 - val_accuracy: 0.8370 - val_loss: 0.6619
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 189ms/step - accuracy: 0.8349 - loss: 0.6932 - val_accuracy: 0.8370 - val_loss: 0.6562
Epoch 7/20
[1m12/12[0m [3



[1m1/3[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 109ms/step



[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
FCN - Accuracy: 0.8370, F1-score: 0.7627
Results saved successfully.


In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# Load embeddings
embeddings_path = "/content/infant_cry_unispeech_features.csv"
df = pd.read_csv(embeddings_path)

# Extract labels from filename
df['label'] = df['filename'].apply(lambda x: x.split('/')[0])  # Extract category from filename

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Extract features (X) and labels (y)
X = df.drop(columns=['filename', 'label']).values
y = df['label'].values

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Reshape input for CNN (add a channel dimension)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

num_classes = len(np.unique(y))
# Define CNN model
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),

    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),

    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# Compile the model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
cnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)
# Predict on test data
y_pred_proba = cnn_model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

# Compute Accuracy and F1-score
cnn_acc = accuracy_score(y_test, y_pred)
cnn_f1 = f1_score(y_test, y_pred, average="weighted")

print(f"CNN - Accuracy: {cnn_acc:.4f}, F1-score: {cnn_f1:.4f}")
results = pd.DataFrame({
    "Feature Extractor": ["Unispeech"],
    "CNN_Acc": [cnn_acc],
    "CNN_F1": [cnn_f1]
})

results.to_csv("/content/cnn_results_2.csv", index=False)
print("CNN results saved successfully.")


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 124ms/step - accuracy: 0.8024 - loss: 1.0255 - val_accuracy: 0.8370 - val_loss: 0.7967
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 163ms/step - accuracy: 0.8171 - loss: 0.8071 - val_accuracy: 0.8370 - val_loss: 0.6937
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 102ms/step - accuracy: 0.8639 - loss: 0.6402 - val_accuracy: 0.8370 - val_loss: 0.6873
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step - accuracy: 0.8505 - loss: 0.6943 - val_accuracy: 0.8370 - val_loss: 0.6526
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step - accuracy: 0.8390 - loss: 0.7232 - val_accuracy: 0.8370 - val_loss: 0.6561
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 101ms/step - accuracy: 0.8151 - loss: 0.7624 - val_accuracy: 0.8370 - val_loss: 0.6519
Epoch 7/20
[1m12/12[0m [32m━━━━━━━━━━━