In [1]:
import os
import pandas as pd
import numpy as np
import librosa as lr
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import torch
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

mode = "kaggle"

input_dir = ""
output_dir = ""

if mode == "local":
    input_dir = "../../"
    output_dir = ""

if mode == "kaggle":
    input_dir = "/kaggle/input/depression-audio/daic-woz-dataset"
    features_dir = "/kaggle/input/depression-audio/extracted_features"
    output_dir = "/kaggle/working"

DATASET_DIR = f"{input_dir}/extracted_audio"
DATAINFO_DIR = f"{input_dir}/dataset_info"
MELSPECT_DIR = f"{features_dir}/mel_spectograms"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [35]:
print(torch.__version__)
print(torchaudio.__version__)

2.4.0+cpu
2.4.0+cpu


# Pytorch Model Training

## Dataset Loading

In [2]:
class SpectogramDataset(Dataset):
    def __init__(self, split_type, transform=None, target_transform=None):
        self.data = torch.load(os.path.join(MELSPECT_DIR, f"{split_type}_stacked_seg_spect.pkl"),
                               weights_only=True)
        self.labels = np.load(os.path.join(MELSPECT_DIR, f"{split_type}_labels.npy"))
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        data = self.data[index].to(torch.float32)
        label = self.labels[index]

        if torch.cuda.is_available():
            data = data.to("cuda")
            label = torch.tensor(label).to("cuda")
        return data, label

In [None]:
trainset = SpectogramDataset("train")
devset = SpectogramDataset("dev")
testset = SpectogramDataset("test")

In [None]:
train_loader = DataLoader(trainset, batch_size=16, shuffle=True)
dev_loader = DataLoader(devset, batch_size=64, shuffle=True)
test_loader = DataLoader(testset, batch_size=64, shuffle=True)

In [5]:
train_features, train_labels = next(iter(train_loader))
train_features.shape, train_labels.shape

(torch.Size([16, 128, 4096]), torch.Size([16]))

## Prelimnary CNN Model

In [24]:
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.pool = nn.MaxPool2d(kernel_size=(2, 2))
        self.gap = nn.AdaptiveAvgPool2d((1,1))
        
        # Block 1
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3, 3), padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=64, affine=False)
        
        # Block 2
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), padding=1)
        self.bn2 = nn.BatchNorm2d(num_features=128, affine=False)
        
        # Block 3
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(5, 5), padding=1)
        self.bn3 = nn.BatchNorm2d(num_features=256, affine=False)

        # Block 4
        # self.conv4 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(5, 5), padding=1)
        # self.bn4 = nn.BatchNorm2d(num_features=512, affine=False)

        # Block 5
        # self.conv5 = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=(7, 7), padding=1)
        # self.bn5 = nn.BatchNorm2d(num_features=1024, affine=False)
        
        # Linear Layers
        self.fc1 = nn.Linear(in_features=256*1*1, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=1)  # binary classifier
    
    def forward(self, x):
        # CNN -> ReLU -> Pool -> BN
        x = self.bn1(self.pool(F.relu(self.conv1(x))))
        # print(x.shape)
        x = self.bn2(self.pool(F.relu(self.conv2(x))))
        # print(x.shape)
        x = self.bn3(self.pool(F.relu(self.conv3(x))))
        # x = self.bn4(self.pool(F.relu(self.conv4(x))))
        # x = self.bn5(self.pool(F.relu(self.conv5(x))))

        # print(x.shape)
        x = self.gap(x)
        # print(x.shape)
        
        # Flatten for linear layers
        x = x.view(x.size(0), -1)  # Flatten to [batch_size, 256*1*1]
        
        # # Fully connected layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # Binary output with sigmoid for probability # no need of sigmoid because we will use sigmoid with loss
        
        return x

In [18]:
cnn_basic = CNN()
cnn_basic.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(cnn_basic.parameters(), lr=0.005, momentum=0.9)

In [13]:
def get_model_size(model):
    model_size = 0
    for param in model.parameters():
        model_size += param.numel() * param.element_size()
    model_size_MB = model_size / (1024 ** 2)  # Convert to MB
    print(f"Model size: {model_size_MB:.2f} MB")
    return model_size_MB

# Example usage
model_size_MB = get_model_size(cnn_basic)

Model size: 3.51 MB


In [14]:
def get_gpu_mem_stats():
    stat = torch.cuda.memory_allocated(0) / (1024*1024*1024) # in GB
    return stat

In [None]:
for epoch in range(50):
    print(f"epoch: {epoch}")
    
    # stat = get_gpu_mem_stats()
    # print(f"Before data loading: {stat} GB")
    running_loss = 0.0
    for i, data in enumerate(train_loader, start=0):  

        # stat = get_gpu_mem_stats()
        # print(f"After data loading: {stat} GB")
        
        inputs,labels = data
        inputs = torch.unsqueeze(inputs, 1)
        labels = torch.unsqueeze(labels, 1)
        # print(inputs.shape)
        # print(inputs.is_cuda, labels.is_cuda)

        # zero gradient parameters
        optimizer.zero_grad()
        
        # fw pass + bw pass + optimize
        outputs = cnn_basic(inputs)
        # print(outputs.dtype, labels.dtype)
        # print(outputs.shape, labels.shape)
        # stat = get_gpu_mem_stats()
        # print(f"After first fw: {stat} GB")

        loss = criterion(outputs, labels.float())
        # stat = get_gpu_mem_stats()
        # print(f"After loss: {stat} GB")
        
        loss.backward()
        # stat = get_gpu_mem_stats()
        # print(f"After first bw: {stat} GB")
        
        optimizer.step()
        # stat = get_gpu_mem_stats()
        # print(f"After first fw: {stat} GB")

        del outputs
        torch.cuda.empty_cache()

        # print statistics
        running_loss += loss.item()
        if i % 20 == 0:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 5:.3f}')
            running_loss = 0.0

print('Finished Training')

epoch: 0
[1,     1] loss: 0.139
[1,    21] loss: 2.779
Finished Training


# Tensorflow Training

In [1]:
import os
import pandas as pd
import numpy as np
import librosa as lr
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

import torch

import tensorflow as tf
from tensorflow import keras

mode = "kaggle"

input_dir = ""
output_dir = ""

if mode == "local":
    input_dir = "../../"
    output_dir = ""

if mode == "kaggle":
    input_dir = "/kaggle/input/depression-audio/daic-woz-dataset"
    features_dir = "/kaggle/input/depression-audio/extracted_features"
    output_dir = "/kaggle/working"

DATASET_DIR = f"{input_dir}/extracted_audio"
DATAINFO_DIR = f"{input_dir}/dataset_info"
MELSPECT_DIR = f"{features_dir}/mel_spectograms"

# check if gpu is available
if tf.test.gpu_device_name():
    print("GPU is available")
    device_name = tf.test.gpu_device_name()
else:
    print("GPU is not available")
    device_name = 'CPU:0'

GPU is available


## Data Loading

In [2]:
train_features = torch.load(os.path.join(MELSPECT_DIR, f"train_stacked_seg_spect.pkl"), weights_only=True)
train_features = train_features.numpy()
train_features = tf.convert_to_tensor(train_features)
train_features.shape

TensorShape([822, 128, 4096])

In [3]:
train_labels = np.load(os.path.join(MELSPECT_DIR, f"train_labels.npy"))
train_labels.shape

(822,)

## Basic CNN Model

In [4]:
batch_size = 16
n_channels = 1

cnn_basic = keras.models.Sequential([
    keras.layers.Input(shape=(128, 4096, 1)),
    keras.layers.Conv2D(32, kernel_size=3, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(64, kernel_size=3, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(128, kernel_size=3, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(256, kernel_size=3, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=(2, 2)),
    keras.layers.BatchNormalization(),
    keras.layers.GlobalAveragePooling2D(),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1, activation='sigmoid')
])

cnn_basic.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [5]:
h = cnn_basic.fit(train_features, train_labels, batch_size=batch_size, epochs=50)
cnn_basic.save(os.path.join(output_dir, "cnn_basic.keras"))

Epoch 1/50


I0000 00:00:1730575678.075817     100 service.cc:145] XLA service 0x7b110c113c30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1730575678.075879     100 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
2024-11-02 19:28:09.090331: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[64,32,3,3]{3,2,1,0}, u8[0]{0}) custom-call(f32[16,32,63,2047]{3,2,1,0}, f32[16,64,61,2045]{3,2,1,0}), window={size=3x3}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardFilter", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...
2024-11-02 19:28:09.624816: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 1.534648384s
Trying algorithm eng0{} for conv (f32[64,32,

[1m 1/52[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m16:50[0m 20s/step - accuracy: 0.4375 - loss: 0.7392

I0000 00:00:1730575694.114259     100 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 295ms/step - accuracy: 0.4795 - loss: 0.7365
Epoch 2/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 133ms/step - accuracy: 0.5216 - loss: 0.7062
Epoch 3/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 132ms/step - accuracy: 0.4872 - loss: 0.7220
Epoch 4/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 133ms/step - accuracy: 0.5111 - loss: 0.7083
Epoch 5/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 133ms/step - accuracy: 0.4883 - loss: 0.7317
Epoch 6/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 132ms/step - accuracy: 0.4918 - loss: 0.7249
Epoch 7/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 132ms/step - accuracy: 0.5523 - loss: 0.6903
Epoch 8/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 133ms/step - accuracy: 0.4595 - loss: 0.7109
Epoch 9/50
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━

# Playground