In [1]:
import os
import pandas as pd
import numpy as np
import librosa as lr
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import torch

import tensorflow as tf
from tensorflow import keras

# Execution environment: "kaggle" (hosted notebook) or "local" (repository checkout).
mode = "kaggle"

# Defaults so that every path variable exists before the per-mode overrides.
input_dir = ""
features_dir = ""
output_dir = ""

if mode == "local":
    input_dir = "../../"
    # Without this assignment MELSPECT_DIR below raised NameError in local mode.
    features_dir = "../../extracted_features"
    output_dir = ""

if mode == "kaggle":
    input_dir = "/kaggle/input/depression-audio/daic-woz-dataset"
    features_dir = "/kaggle/input/depression-audio/extracted_features"
    output_dir = "/kaggle/working"

# Derived locations used by the rest of the notebook.
DATASET_DIR = f"{input_dir}/extracted_audio"
DATAINFO_DIR = f"{input_dir}/dataset_info"
MELSPECT_DIR = f"{features_dir}/mel_spectograms"

# Ask TensorFlow once for the GPU device name; a falsy result means no GPU was
# found, in which case we fall back to the CPU device string.
device_name = tf.test.gpu_device_name()
if device_name:
    print("GPU is available")
else:
    print("GPU is not available")
    device_name = "CPU:0"

2024-11-03 00:30:43.559011: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU is not available


# Loading dataset


In [None]:
# The stacked training spectrogram segments were saved with torch; load them and
# hand them to TensorFlow via an intermediate NumPy array.
train_spect_path = os.path.join(MELSPECT_DIR, "train_stacked_seg_spect.pkl")
train_features = tf.convert_to_tensor(
    torch.load(train_spect_path, weights_only=True).numpy()
)
train_features.shape

In [None]:
# Labels for the training features, stored next to the spectrograms as a .npy file.
train_labels_path = os.path.join(MELSPECT_DIR, "train_labels.npy")
train_labels = np.load(train_labels_path)
train_labels.shape

# Model Training


## Basic CNN Model


In [10]:
batch_size = 32
n_channels = 1

# Input is a single-channel 128 x 4096 image, followed by four identical
# stages (3x3 conv -> 2x2 max-pool -> batch-norm) with doubling filter counts.
cnn_layers = [keras.layers.Input(shape=(128, 4096, 1))]
for n_filters in (32, 64, 128, 256):
    cnn_layers.extend(
        [
            keras.layers.Conv2D(n_filters, kernel_size=3, activation="relu"),
            keras.layers.MaxPooling2D(pool_size=(2, 2)),
            keras.layers.BatchNormalization(),
        ]
    )

# Classification head: global pooling, one hidden layer with dropout, and a
# single sigmoid unit for the binary target.
cnn_layers.extend(
    [
        keras.layers.GlobalAveragePooling2D(),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

cnn_basic = keras.models.Sequential(cnn_layers)

cnn_basic.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Train the basic CNN on the full training set, then persist it to output_dir.
cnn_basic_path = os.path.join(output_dir, "cnn_basic.keras")
h = cnn_basic.fit(
    x=train_features,
    y=train_labels,
    epochs=50,
    batch_size=batch_size,
)
cnn_basic.save(cnn_basic_path)

## ResNet Model


In [None]:
from tensorflow.keras.applications import ResNet50

# ResNet50 is built here for 3-channel input, so the single-channel
# mel spectrograms are tiled three times along the last axis.
# NOTE(review): assumes train_features has a trailing channel axis of size 1,
# i.e. shape (n, 128, 4096, 1) - confirm against the feature-extraction notebook.
train_features_3ch = np.repeat(train_features, 3, axis=-1)

# ImageNet-pretrained backbone without its classification head.
base_model = ResNet50(weights="imagenet", include_top=False, input_shape=(128, 4096, 3))

# Custom binary-classification head on top of the backbone.
x = base_model.output
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dense(256, activation="relu")(x)
x = keras.layers.Dropout(0.5)(x)
predictions = keras.layers.Dense(1, activation="sigmoid")(x)

# Create the final model
resnet_model = keras.Model(inputs=base_model.input, outputs=predictions)

# Compile the model
resnet_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train on the 3-channel features. Fitting on the original 1-channel tensor
# (as this cell did before) does not match the (128, 4096, 3) model input.
h_resnet = resnet_model.fit(train_features_3ch, train_labels, batch_size=16, epochs=50)

# Save the model
resnet_model.save(os.path.join(output_dir, "resnet50_audio_classification.keras"))

In [None]:
def plot_history(h, title):
    """Plot the per-epoch accuracy and loss curves of a training run on one figure.

    Args:
        h: Object exposing a ``history`` mapping with "accuracy" and "loss"
            sequences (e.g. the value returned by ``model.fit``).
        title: Title drawn above the plot.
    """
    plt.figure(figsize=(10, 5))
    # Both metrics share the same axes; the legend tells them apart.
    for metric in ("accuracy", "loss"):
        plt.plot(h.history[metric], label=metric)
    plt.xlabel("Epoch")
    plt.title(title)
    plt.legend()
    plt.show()

## HuBERT Model


In [13]:
import os
import pandas as pd
import pandas as pd
import numpy as np

# import librosa as lr
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pickle

# import torch

# import tensorflow as tf
# from tensorflow import keras

from datasets import load_dataset

# Environment selector for the HuBERT section: "local" or "kaggle".
mode = "local"

# Fallback values used when `mode` matches neither branch below.
input_dir, output_dir = "", ""

if mode == "local":
    input_dir, output_dir = "../..", ""
    features_dir = "../../extracted_features"
elif mode == "kaggle":
    input_dir = "/kaggle/input/depression-audio/daic-woz-dataset"
    features_dir = "/kaggle/input/depression-audio/extracted_features"
    output_dir = "/kaggle/working"

# Derived locations: raw audio, split/label tables, and mel-spectrogram features.
DATASET_DIR = "/".join((input_dir, "extracted_audio"))
DATAINFO_DIR = "/".join((input_dir, "dataset_info"))
MELSPECT_DIR = "/".join((features_dir, "mel_spectograms"))

# check if gpu is available
# if tf.test.gpu_device_name():
#     print("GPU is available")
#     device_name = tf.test.gpu_device_name()
# else:
#     print("GPU is not available")
#     device_name = 'CPU:0'

In [14]:
# Read the three split tables (train is the augmented variant) from DATAINFO_DIR.
train_df, dev_df, test_df = (
    pd.read_csv(os.path.join(DATAINFO_DIR, csv_name))
    for csv_name in ("train_split_augmented.csv", "dev_split.csv", "test_split.csv")
)

In [15]:
def insert_audio_id(x):
    """Turn a participant ID into the matching audio file name.

    Args:
        x: Participant ID, either plain (``301``) or with a single
            underscore-separated suffix (``"301_2"``).

    Returns:
        ``"<id>_AUDIO.wav"`` for a plain ID, or ``"<id>_AUDIO_<suffix>.wav"``
        for a suffixed one.

    Raises:
        ValueError: If the ID contains more than one underscore. Previously this
            case fell through and returned ``None``, which silently produced
            missing file names in the split tables.
    """
    parts = str(x).split("_")
    if len(parts) == 1:
        return f"{x}_AUDIO.wav"
    if len(parts) == 2:
        return f"{parts[0]}_AUDIO_{parts[1]}.wav"
    raise ValueError(
        f"Unexpected participant ID format {x!r}: expected '<id>' or '<id>_<suffix>'"
    )


def modify_info_files(df, split_type):
    """Rewrite participant IDs as audio file names and save the table.

    The frame is modified in place: the current ``Participant_ID`` values are
    copied to a new ``original_id`` column, then ``Participant_ID`` is replaced
    by the result of ``insert_audio_id``. The updated table is written to
    ``<DATAINFO_DIR>/<split_type>_split_new.csv`` without the index.

    Args:
        df: Split table with a ``Participant_ID`` column.
        split_type: Split name used as the output file prefix.
    """
    participant_ids = df["Participant_ID"]
    df["original_id"] = participant_ids
    df["Participant_ID"] = participant_ids.apply(insert_audio_id)
    target_csv = os.path.join(DATAINFO_DIR, f"{split_type}_split_new.csv")
    df.to_csv(target_csv, index=False)

In [16]:
# Rewrite IDs and save the "*_split_new.csv" table for each split, in order.
for split_df, split_name in ((train_df, "train"), (dev_df, "dev"), (test_df, "test")):
    modify_info_files(split_df, split_name)

In [17]:
def create_metadata(df, split_type):
    """Write a ``metadata.csv`` (file_name, label) into a split's audio folder.

    Args:
        df: Split table providing ``Participant_ID`` (used as the file name)
            and ``PHQ_Binary`` (used as the label).
        split_type: Sub-folder of ``DATASET_DIR`` that receives the file.
    """
    column_names = {"Participant_ID": "file_name", "PHQ_Binary": "label"}
    metadata = df[list(column_names)].rename(columns=column_names)
    destination = os.path.join(DATASET_DIR, split_type, "metadata.csv")
    metadata.to_csv(destination, index=False)

In [18]:
# Emit a metadata.csv into each split's audio folder, in order.
for split_df, split_name in ((train_df, "train"), (dev_df, "dev"), (test_df, "test")):
    create_metadata(split_df, split_name)

In [2]:
# Expose the pretrained checkpoint name through the MODEL environment variable.
os.environ.update(MODEL="ntu-spml/distilhubert")

In [27]:
# Load each split's audio folder with the Hugging Face "audiofolder" builder.
# Paths are derived from DATASET_DIR (instead of a hard-coded "../../...") so the
# cell follows the `mode` configuration and also works on Kaggle.
dataset = load_dataset("audiofolder", data_dir=os.path.join(DATASET_DIR, "train"))
dev_dataset = load_dataset("audiofolder", data_dir=os.path.join(DATASET_DIR, "dev"))
test_dataset = load_dataset("audiofolder", data_dir=os.path.join(DATASET_DIR, "test"))

dataset

Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/57 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 56
    })
})

In [None]:
# Each separately loaded folder exposes its rows under the "train" key; attach
# them to the main dict under their real split names.
for split_name, split_dict in (("dev", dev_dataset), ("test", test_dataset)):
    dataset[split_name] = split_dict["train"]

# Playground


In [7]:
# Playground: reload only the dev split for ad-hoc inspection.
# NOTE: this rebinds `dataset`, replacing the train/dev/test dict built earlier
# in the notebook; re-run the loading cells before using `dataset` for training.
# The path is derived from DATASET_DIR so it follows the `mode` configuration.
dataset = load_dataset("audiofolder", data_dir=os.path.join(DATASET_DIR, "dev"))
dataset

Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 56
    })
})

In [6]:
# Inspect one example: a dict holding "audio" (file path, decoded sample array,
# sampling_rate) and the integer "label".
dataset["train"][1]

{'audio': {'path': '/home/dell/Preet/Comding/ML_Projects/depression_detection/extracted_audio/dev/301_AUDIO.wav',
  'array': array([ 2.74658203e-04,  9.15527344e-05,  2.74658203e-04, ...,
         -3.96728516e-04, -2.44140625e-04,  7.93457031e-04]),
  'sampling_rate': 22050},
 'label': 0}

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import torchaudio
import numpy as np
from tqdm import tqdm

In [None]:
class AudioDataset(Dataset):
    """Map-style dataset serving fixed-length, overlapping segments of audio files.

    Every file is loaded once at construction time to measure its length (after
    resampling to ``self.sampling_rate``) and to enumerate segment boundaries.
    Segments are then cut lazily in ``__getitem__``. Files shorter than one
    segment contribute no segments. Only the first channel of each file is used.
    """

    def __init__(
        self, audio_paths, labels, feature_extractor, segment_duration=30, overlap=5
    ):
        """
        Args:
            audio_paths: List of paths to audio files
            labels: List of labels for each audio file
            feature_extractor: HuggingFace feature extractor
            segment_duration: Duration of each segment in seconds
            overlap: Overlap between segments in seconds

        Raises:
            ValueError: If ``segment_duration`` is not positive, or ``overlap``
                is negative or leaves no positive hop between segments.
        """
        self.audio_paths = audio_paths
        self.file_labels = labels
        self.feature_extractor = feature_extractor
        self.segment_duration = segment_duration
        self.overlap = overlap
        self.sampling_rate = 16000  # Standard sampling rate

        # Segment length and hop, in samples.
        self.segment_size = int(self.sampling_rate * segment_duration)
        self.hop_size = int(self.sampling_rate * (segment_duration - overlap))

        if self.segment_size <= 0:
            raise ValueError(
                f"segment_duration must be positive, got {segment_duration!r}"
            )
        # A zero or negative hop would never advance through the file.
        if overlap < 0 or self.hop_size <= 0:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < segment_duration, "
                f"got overlap={overlap!r}, segment_duration={segment_duration!r}"
            )

        # Create segment indices for all files
        self.segments = self._create_segment_indices()

    def _load_waveform(self, audio_path):
        """Load ``audio_path`` and resample it to ``self.sampling_rate`` if needed."""
        waveform, sample_rate = torchaudio.load(audio_path)
        if sample_rate != self.sampling_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.sampling_rate)
            waveform = resampler(waveform)
        return waveform

    def _create_segment_indices(self):
        """Create a list of (file_idx, start_idx, end_idx) for all segments"""
        segments = []

        for file_idx, audio_path in enumerate(self.audio_paths):
            total_length = self._load_waveform(audio_path).shape[1]

            # The last valid start is total_length - segment_size; the "+ 1"
            # makes that bound inclusive so a segment ending exactly at the end
            # of the file is kept (it used to be dropped).
            last_start = total_length - self.segment_size
            for start_idx in range(0, last_start + 1, self.hop_size):
                segments.append((file_idx, start_idx, start_idx + self.segment_size))

        return segments

    def __len__(self):
        """Total number of segments across all files."""
        return len(self.segments)

    def __getitem__(self, idx):
        """Return the model inputs, label and bookkeeping indices for segment ``idx``."""
        file_idx, start_idx, end_idx = self.segments[idx]

        # The whole file is re-loaded (and resampled) on every access.
        waveform = self._load_waveform(self.audio_paths[file_idx])

        # First channel only, cut to the segment boundaries.
        segment = waveform[0, start_idx:end_idx].numpy()

        # Process through feature extractor
        inputs = self.feature_extractor(
            segment, sampling_rate=self.sampling_rate, return_tensors="pt", padding=True
        )

        return {
            # Drop only the leading batch axis added by the feature extractor.
            "input_values": inputs.input_values.squeeze(0),
            "label": torch.tensor(self.file_labels[file_idx], dtype=torch.long),
            "file_idx": file_idx,  # Keep track of which file this segment came from
            "segment_idx": idx,
        }

In [None]:
class AudioClassifier:
    """Thin inference wrapper that scores a whole audio file segment by segment."""

    def __init__(self, model, device):
        """
        Args:
            model: Classifier whose call returns an object with a ``logits``
                attribute shaped (batch, classes).
            device: Device the input batches are moved to before the forward pass.
        """
        self.model = model
        self.device = device

    def predict_file(
        self,
        audio_path,
        feature_extractor,
        segment_duration=30,
        overlap=5,
        batch_size=8,
    ):
        """Predict depression probability for a full audio file using segments

        Args:
            audio_path: Path of the audio file to score.
            feature_extractor: Feature extractor handed to ``AudioDataset``.
            segment_duration: Duration of each segment in seconds.
            overlap: Overlap between segments in seconds.
            batch_size: Number of segments per forward pass.

        Returns:
            Dict with ``mean_probability`` and ``max_probability`` (aggregates
            over segments) and ``segment_probabilities`` (array with the class-1
            probability of each segment).

        Raises:
            ValueError: If the file yields no segments (e.g. it is shorter than
                ``segment_duration``).
        """
        # Create dataset for single file; the label is a placeholder and is not used.
        dataset = AudioDataset(
            [audio_path],
            [0],
            feature_extractor,
            segment_duration=segment_duration,
            overlap=overlap,
        )
        # Fail with an actionable message instead of an opaque np.concatenate error.
        if len(dataset) == 0:
            raise ValueError(
                f"No segments could be extracted from {audio_path!r}; the audio "
                f"may be shorter than segment_duration={segment_duration}s"
            )
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        segment_predictions = []
        self.model.eval()

        with torch.no_grad():
            for batch in loader:
                input_values = batch["input_values"].to(self.device)
                outputs = self.model(input_values)
                probabilities = torch.softmax(outputs.logits, dim=1)
                segment_predictions.append(
                    probabilities[:, 1].cpu().numpy()
                )  # Probability of depression

        # Aggregate predictions from all segments
        segment_predictions = np.concatenate(segment_predictions)

        # Two aggregation strategies are reported; callers choose which to use.
        mean_prediction = np.mean(segment_predictions)
        max_prediction = np.max(segment_predictions)

        return {
            "mean_probability": mean_prediction,
            "max_probability": max_prediction,
            "segment_probabilities": segment_predictions,
        }

In [None]:
def _collect_file_predictions(file_predictions, file_indices, logits):
    """Append each segment's class-1 probability to the list of its source file."""
    # Detach so the bookkeeping does not hold on to the autograd graph, and move
    # the whole batch to the host in one transfer.
    probs = torch.softmax(logits.detach(), dim=1)[:, 1].cpu().tolist()
    for file_idx, prob in zip(torch.as_tensor(file_indices).tolist(), probs):
        file_predictions.setdefault(file_idx, []).append(prob)


def _file_level_accuracy(file_predictions, file_labels):
    """Share of files whose mean segment probability, thresholded at 0.5, matches the label."""
    if not file_predictions:
        return float("nan")
    correct_files = sum(
        int(np.mean(predictions) > 0.5) == int(file_labels[file_idx])
        for file_idx, predictions in file_predictions.items()
    )
    return correct_files / len(file_predictions)


def _evaluate_file_accuracy(model, loader, device):
    """Run ``model`` in eval mode over ``loader`` and return its file-level accuracy."""
    model.eval()
    file_predictions = {}
    with torch.no_grad():
        for batch in loader:
            outputs = model(batch["input_values"].to(device))
            _collect_file_predictions(
                file_predictions, batch["file_idx"], outputs.logits
            )
    return _file_level_accuracy(file_predictions, loader.dataset.file_labels)


def train_model(model, train_loader, val_loader, device, num_epochs=5):
    """Fine-tune ``model`` on audio segments and report file-level accuracy per epoch.

    Segment probabilities produced during the training pass are grouped by source
    file; a file counts as correct when the mean class-1 probability of its
    segments, thresholded at 0.5, equals its label.

    Args:
        model: Model whose call returns an object with a ``logits`` attribute.
        train_loader: Loader yielding dicts with "input_values", "label" and
            "file_idx"; its ``dataset`` must expose ``file_labels``.
        val_loader: Optional loader with the same batch format. When not
            ``None``, file-level accuracy on it is computed after every epoch
            (the model is then left in eval mode after the final epoch).
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with one dict per epoch containing "epoch", "train_loss",
        "train_file_accuracy" and, when ``val_loader`` is given,
        "val_file_accuracy".

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    if len(train_loader) == 0:
        raise ValueError("train_loader yields no batches; nothing to train on")

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    criterion = torch.nn.CrossEntropyLoss()

    history = []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        # Segment probabilities per file, rebuilt from scratch every epoch.
        file_predictions = {}

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_values = batch["input_values"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_values)
            loss = criterion(outputs.logits, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            _collect_file_predictions(
                file_predictions, batch["file_idx"], outputs.logits
            )

        mean_loss = train_loss / len(train_loader)
        file_accuracy = _file_level_accuracy(
            file_predictions, train_loader.dataset.file_labels
        )
        epoch_stats = {
            "epoch": epoch + 1,
            "train_loss": mean_loss,
            "train_file_accuracy": file_accuracy,
        }
        message = (
            f"Epoch {epoch+1}: Train Loss: {mean_loss:.4f}, "
            f"File-level Accuracy: {file_accuracy*100:.2f}%"
        )

        if val_loader is not None:
            val_accuracy = _evaluate_file_accuracy(model, val_loader, device)
            epoch_stats["val_file_accuracy"] = val_accuracy
            message += f", Val File-level Accuracy: {val_accuracy*100:.2f}%"

        print(message)
        history.append(epoch_stats)

    return history

In [None]:
# Re-read the rewritten train table, in which Participant_ID holds audio file
# names, and derive the per-file paths and binary labels from it.
train_df = pd.read_csv(os.path.join(DATAINFO_DIR, "train_split_new.csv"))
train_audio_dir = os.path.join(DATASET_DIR, "train")
train_paths = [
    os.path.join(train_audio_dir, file_name)
    for file_name in train_df["Participant_ID"]
]
train_labels = train_df["PHQ_Binary"].values

In [24]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model and feature extractor
model_name = "ntu-spml/distilhubert"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    num_labels=2,
    label2id={"not_depressed": 0, "depressed": 1},
    id2label={0: "not_depressed", 1: "depressed"},
).to(device)

# Create datasets with segmentation
train_dataset = AudioDataset(
    train_paths,
    train_labels,
    feature_extractor,
    segment_duration=90,  # 90 seconds per segment
    overlap=20,  # 20 seconds overlap
)

# val_dataset = AudioDataset(
#     val_paths,
#     val_labels,
#     feature_extractor,
#     segment_duration=90,
#     overlap=20
# )

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Train model
train_model(model, train_loader, device)
# train_model(model, train_loader, val_loader, device)

array([0, 0, 0, 0, 0])