In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2024-11-05 22:48:41.413287: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Execution environment; selects the base directories used below.
mode = "local"

if mode == "local":
    features_dir = "../../extracted_features"
    info_dir = "../../dataset_info"
else:
    # Fail here with a clear message instead of a NameError on
    # `features_dir` / `info_dir` a few lines further down.
    raise ValueError(f"Unsupported mode {mode!r}; only 'local' is configured")

# Spectrogram images (one sub-folder per split) and the split CSV files.
DATASET_DIR = features_dir + "/spect_images"
DATAINFO_DIR = info_dir

In [3]:
def create_dataframe(csv_path, img_dir):
    """Build a (filepath, label) table for one data split.

    Reads the split CSV, derives each row's image path as
    ``<img_dir>/<Participant_ID without extension>.png`` and keeps only the
    rows whose image actually exists on disk.

    Args:
        csv_path: Path to a CSV with ``Participant_ID`` and ``PHQ_Binary`` columns.
        img_dir: Directory containing the ``.png`` images of this split.

    Returns:
        DataFrame with columns ``filepath`` and ``PHQ_Binary`` (label as ``str``),
        re-indexed from 0.
    """
    df = pd.read_csv(csv_path)
    # str() so that purely numeric IDs (parsed as ints by pandas) also work.
    df["filepath"] = df["Participant_ID"].apply(
        lambda x: os.path.join(img_dir, f"{str(x).split('.')[0]}.png")
    )
    # The filter result must be assigned back, otherwise rows pointing at
    # missing images survive and break image loading later on.
    # astype(bool) keeps the mask boolean even for an empty frame.
    exists_mask = df["filepath"].map(os.path.exists).astype(bool)
    df = df.loc[exists_mask].copy()
    df["PHQ_Binary"] = df["PHQ_Binary"].astype(str)
    return df[["filepath", "PHQ_Binary"]].reset_index(drop=True)

In [4]:
# One (filepath, label) dataframe per split; the CSV name and the image
# sub-folder both follow the split name.
train_df, dev_df, test_df = (
    create_dataframe(f"{DATAINFO_DIR}/{split}_split_new.csv", f"{DATASET_DIR}/{split}")
    for split in ("train", "dev", "test")
)

In [5]:
# Sanity check: (rows, columns) of the train / dev / test dataframes.
train_df.shape, dev_df.shape, test_df.shape

((274, 2), (56, 2), (56, 2))

In [None]:
# Number of images per batch; read by create_tf_dataset when batching.
BATCH_SIZE = 16

In [None]:
def load_image(image_path, label):
    """Read one PNG from disk and return it resized to 512x512 with its label.

    The file is decoded with 3 channels; the label is passed through untouched.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [512, 512])
    return resized, label

def create_tf_dataset(df, batch_size=None, shuffle=False, seed=None):
    """Turn a (filepath, PHQ_Binary) dataframe into a batched tf.data pipeline.

    Args:
        df: DataFrame with a ``filepath`` column and a ``PHQ_Binary`` column
            whose values can be cast to ``int``.
        batch_size: Examples per batch. Defaults to the notebook-level
            ``BATCH_SIZE`` (looked up at call time).
        shuffle: If True, shuffle the examples (reshuffled every epoch) before
            decoding. Off by default, which matches the previous behaviour;
            typically wanted for the training split only.
        seed: Optional seed for the shuffle.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    filepaths = df['filepath'].values
    labels = df['PHQ_Binary'].values.astype(int)
    dataset = tf.data.Dataset.from_tensor_slices((filepaths, labels))

    if shuffle:
        # Shuffle the cheap (path, label) pairs, not the decoded images.
        dataset = dataset.shuffle(
            buffer_size=max(len(filepaths), 1),
            seed=seed,
            reshuffle_each_iteration=True,
        )

    # tf.data.AUTOTUNE is the stable name of tf.data.experimental.AUTOTUNE.
    dataset = dataset.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
# Batched image pipelines for the three splits, in train / dev / test order.
train_dataset, dev_dataset, test_dataset = (
    create_tf_dataset(split_df) for split_df in (train_df, dev_df, test_df)
)

In [27]:
# Peek at the first example of the first training batch.
# take(1) yields exactly one batch, so the loop body runs at most once.
for image_batch, label_batch in train_dataset.take(1):
    sample_image, sample_label = image_batch[0].numpy(), label_batch[0].numpy()
    print("Sample Image Shape:", sample_image.shape)
    print("Sample Label:", sample_label)

Sample Image Shape: (512, 512, 3)
Sample Label: 0


2024-11-04 23:36:42.315941: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 100663296 exceeds 10% of free system memory.
2024-11-04 23:36:42.380420: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 100663296 exceeds 10% of free system memory.


# VGG16

In [None]:
# Load the VGG16 convolutional base pre-trained on ImageNet (no classifier head)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(512, 512, 3))

# Freeze the pre-trained base: in this first stage only the new head is
# trained. Layers are trainable by default, so without this the whole of
# VGG16 would be updated with Adam's default learning rate.
for layer in base_model.layers:
    layer.trainable = False

x = base_model.output

# Add custom layers on top of the base model
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(256, activation='relu')(x)
x = keras.layers.Dropout(0.5)(x)
x = keras.layers.Dense(1, activation='sigmoid')(x)  # single probability for the binary label

# Create the final model
model = keras.Model(inputs=base_model.input, outputs=x)

# NOTE(review): the input pipeline feeds raw 0-255 pixels; the ImageNet weights
# were presumably trained with VGG16's own preprocess_input - confirm whether
# that preprocessing should be applied before the base model.

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# The datasets are already batched, so len(dataset) is the number of batches
# per epoch. Dividing it by the batch size again (as before) shrank each epoch
# to a fraction of the data and could make validation_steps 0.
batch_size = 16
train_steps = len(train_dataset)  # batches per training epoch (informational)
val_steps = len(dev_dataset)      # batches per validation pass (informational)

# Fit the model; without explicit step counts Keras iterates each dataset fully.
vgg_h = model.fit(
    train_dataset,
    validation_data=dev_dataset,
    epochs=10
)

In [None]:
# Fine-tune only the last few VGG layers.
# Everything before them is frozen explicitly so this cell does what it says
# regardless of the trainable state left behind by earlier cells.
for layer in base_model.layers[:-4]:
    layer.trainable = False
for layer in base_model.layers[-4:]:  # Last 4 layers
    layer.trainable = True

# Re-compile the model with a lower learning rate for fine-tuning
# (changes to `trainable` only take effect after compile).
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Fine-tune the model. The datasets are finite and already batched, so Keras
# runs one full pass per epoch without explicit step counts.
fine_tune_history = model.fit(
    train_dataset,
    validation_data=dev_dataset,
    epochs=10
)


In [None]:
# Score the trained model on the held-out test split and report both metrics.
test_loss, test_accuracy = model.evaluate(test_dataset)
for metric_name, metric_value in (("Loss", test_loss), ("Accuracy", test_accuracy)):
    print(f"Test {metric_name}: {metric_value}")

In [None]:
# Save the model
# `output_dir` is not defined by any earlier cell of this notebook, so the
# original cell failed with a NameError on a fresh kernel.
# NOTE(review): the location below is a placeholder - point it at the intended
# model directory.
output_dir = "../../saved_models"
os.makedirs(output_dir, exist_ok=True)  # make sure the target directory exists
model.save(f"{output_dir}/fine_tuned_vgg16_audio_classification.keras")

In [None]:
def plot_history(history):
    """Plot training vs. validation curves side by side.

    Args:
        history: Object whose ``.history`` mapping holds the per-epoch series
            ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.

    The left panel shows accuracy, the right panel shows loss; the figure is
    displayed with ``plt.show()``.
    """
    # (key in history.history, human-readable name) for each panel, left to right.
    panels = (("accuracy", "Accuracy"), ("loss", "Loss"))

    plt.figure(figsize=(12, 6))

    for position, (key, pretty_name) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[key], label=f"Training {pretty_name}")
        plt.plot(history.history[f"val_{key}"], label=f"Validation {pretty_name}")
        plt.xlabel('Epoch')
        plt.ylabel(pretty_name)
        plt.legend()
        plt.title(f"Training and Validation {pretty_name}")

    plt.show()

# DistilHuBERT

In [17]:
# Execution environment; selects the base directories used below.
mode = "local"

if mode == "local":
    features_dir = "../../extracted_features"
    info_dir = "../../dataset_info"
else:
    # Fail here with a clear message instead of a NameError on `info_dir` below.
    raise ValueError(f"Unsupported mode {mode!r}; only 'local' is configured")

# From here on DATASET_DIR points at the raw audio, not the spectrogram images.
DATASET_DIR = "../../extracted_audio"
DATAINFO_DIR = info_dir

In [18]:
from datasets import load_dataset, Audio
from transformers import AutoFeatureExtractor
import numpy as np

In [19]:
# Load each split folder with the "audiofolder" builder (train / dev / test order).
dataset, val_dataset, test_dataset = (
    load_dataset("audiofolder", data_dir=f"{DATASET_DIR}/{split}")
    for split in ("train", "dev", "test")
)

Resolving data files:   0%|          | 0/275 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

In [6]:
# Each folder was loaded on its own, so its data sits under the "train" key;
# re-home dev and test inside the main `dataset` dictionary.
for split_name, split_source in (("dev", val_dataset), ("test", test_dataset)):
    dataset[split_name] = split_source["train"]

In [None]:
# Inspect the first training example.
dataset["train"][0] # dict -> audio => {path, array, sampling_rate}, label

{'audio': {'path': '/home/dell/Preet/Comding/ML_Projects/depression_detection/extracted_audio/train/302_AUDIO.wav',
  'array': array([0.00210571, 0.00170898, 0.00140381, ..., 0.00039673, 0.00067139,
         0.00085449]),
  'sampling_rate': 22050},
 'label': 0}

In [None]:
model_id = "ntu-spml/distilhubert"

# Feature extractor matching the checkpoint; configured to normalize the
# waveform and to also return an attention mask.
extractor_options = {"do_normalize": True, "return_attention_mask": True}
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_options)

In [9]:
# The audio files must be resampled to the sampling rate the feature extractor
# (and therefore the model) expects; read that rate from the extractor.

target_sr = feature_extractor.sampling_rate
target_sr

16000

In [None]:
# Re-declare the audio column with the target sampling rate so examples are
# returned at `target_sr` when accessed.
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_sr))

In [None]:
# Re-inspect the first training example to confirm the new sampling rate.
dataset["train"][0]

{'audio': {'path': '/home/dell/Preet/Comding/ML_Projects/depression_detection/extracted_audio/train/302_AUDIO.wav',
  'array': array([ 0.00167072,  0.00172269,  0.00113971, ..., -0.00035031,
          0.0007129 ,  0.00066517]),
  'sampling_rate': 16000},
 'label': 0}

In [None]:
# Feature scaling helps the model learn; as a baseline, look at the mean and
# variance of one raw (un-normalized) waveform first.
first_example = dataset["train"][0]
sample = first_example["audio"]["array"]

mean, var = np.mean(sample), np.var(sample)

mean, var

(-2.3990588478507866e-05, 4.126452968996767e-05)

In [33]:
# Run the same waveform through the feature extractor and recompute the statistics.
inputs = feature_extractor(sample, sampling_rate=target_sr, return_tensors="np")

print("Input keys:", inputs.keys())

normalized_values = inputs["input_values"]
mean, var = normalized_values.mean(), normalized_values.var()

mean, var

Input keys: dict_keys(['input_values', 'attention_mask'])


(4.7639994e-09, 0.9975836)

After feature extraction the mean is close to 0 and the variance is close to 1.

Is it necessary to truncate the audio files? **TODO:** check whether truncation is actually needed.

In [6]:
# Two-file scratch dataset for experimenting with the chunking logic.
temp_files = [f"{DATASET_DIR}/train/{participant}_AUDIO.wav" for participant in (302, 303)]

temp_dataset = (
    load_dataset("audiofolder", data_files=temp_files)
    # Manually add labels to the temp_dataset: row 0 -> 0, row 1 -> 1
    .map(lambda _example, row: {"label": [0, 1][row]}, with_indices=True)
    .cast_column("audio", Audio(sampling_rate=16000))
)

# Verify the labels have been added
print(temp_dataset)

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 2
    })
})


In [10]:
CHUNK_DURATION = 5 * 60  # 5 minutes in seconds
# 16000 is the sampling rate the audio is cast to (CHUNK_DURATION * target_sr).
CHUNK_SAMPLES = CHUNK_DURATION * 16000

def preprocess_function(examples, chunk_samples=None):
    """Split every recording of a batch into fixed-length, zero-padded chunks.

    Args:
        examples: Batch mapping with ``'audio'`` (sequence of dicts holding the
            waveform under ``'array'``) and ``'label'`` (one label per recording).
        chunk_samples: Chunk length in samples. Defaults to the notebook-level
            ``CHUNK_SAMPLES`` (looked up at call time).

    Returns:
        Dict with ``'input_values'`` (list of 1-D arrays, each exactly
        ``chunk_samples`` long) and ``'labels'`` (the recording's label repeated
        once per chunk). A recording with no samples contributes no chunks.

    Raises:
        ValueError: If ``chunk_samples`` is not positive.
    """
    if chunk_samples is None:
        chunk_samples = CHUNK_SAMPLES
    if chunk_samples <= 0:
        raise ValueError(f"chunk_samples must be positive, got {chunk_samples}")

    audio_chunks = []
    labels = []

    for audio, label in zip(examples['audio'], examples['label']):
        # asarray so plain lists are accepted as well as numpy arrays.
        audio_array = np.asarray(audio['array'])

        # Stepping by chunk_samples visits ceil(len / chunk_samples) start offsets.
        for start_idx in range(0, audio_array.shape[0], chunk_samples):
            chunk = audio_array[start_idx:start_idx + chunk_samples]

            # Only the final chunk can be short; right-pad it with zeros.
            if len(chunk) < chunk_samples:
                chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), 'constant')

            audio_chunks.append(chunk)
            labels.append(label)

    return {'input_values': audio_chunks, 'labels': labels}

# Chunk the scratch dataset batch-wise; the original columns are removed so only
# the keys returned by preprocess_function remain.
prp_dataset = temp_dataset.map(preprocess_function, batched=True, remove_columns=temp_dataset["train"].column_names)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

3491201 4800000
1
9910405 4800000
3


In [11]:
# Number of samples in the first chunk (should equal CHUNK_SAMPLES after padding).
len(prp_dataset["train"][0]["input_values"])

4800000

In [16]:
# Label attached to the chunk at index 3.
prp_dataset["train"][3]["labels"]

1

# Trying an Iterable (Streaming) Dataset

In [21]:
# streaming=True to load audio files on-the-fly
dataset, val_dataset, test_dataset = (
    load_dataset("audiofolder", data_dir=f"{DATASET_DIR}/{split}", streaming=True)
    for split in ("train", "dev", "test")
)

Resolving data files:   0%|          | 0/275 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

In [23]:
# Each folder was loaded on its own, so its data sits under the "train" key;
# re-home dev and test inside the main `dataset` dictionary.
for split_name, split_source in (("dev", val_dataset), ("test", test_dataset)):
    dataset[split_name] = split_source["train"]


In [25]:
model_id = "ntu-spml/distilhubert"

# Extractor configured to normalize the waveform and return an attention mask.
extractor_kwargs = dict(do_normalize=True, return_attention_mask=True)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_kwargs)

# The audio must be resampled to the rate the extractor expects.
target_sr = feature_extractor.sampling_rate
target_sr

16000

In [26]:
# Re-declare the audio column with the target sampling rate for the streamed splits.
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_sr))

In [29]:
# Chunk the streamed training split batch-wise; the original columns are removed
# so only the keys returned by preprocess_function remain.
itrt_dataset = dataset["train"].map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

In [None]:
# Pull a single example from the stream to check the chunk shape.
# Iterating the mapped dataset yields one example dict at a time, so the value
# is looked up by key; `example[0]` raised a KeyError.
for example in itrt_dataset:
    # asarray: the value may come back as a list or an ndarray.
    print(np.asarray(example["input_values"]).shape)
    break