In [None]:
import os
import torchaudio
import pandas as pd
import torch
from tqdm import tqdm
import opensmile
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from transformers import AutoFeatureExtractor, AutoModel
from transformers import WhisperFeatureExtractor, WhisperModel
from multiprocessing import Pool
from functools import partial

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Dataset location.
# Set dataType to 'train' or 'test' to choose which "<dataType>_audios" folder is read.
dataType = "test"
path = "/home/siavash/Downloads/FinalExplainedAcousticNIH/"
audio_folder = "".join((path, dataType, "_audios"))
# original_sampling_rate is not referenced elsewhere in the visible code
# (presumably the native rate of the recordings); target_sampling_rate is the
# rate every waveform is resampled to before feature extraction.
original_sampling_rate = 48000
target_sampling_rate = 16000

def load_and_resample_audio(file_path, target_sampling_rate):
    """Load an audio file as a mono waveform at ``target_sampling_rate``.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sampling_rate: Sampling rate (Hz) of the returned waveform.

    Returns:
        A ``(waveform, original_length)`` tuple. ``waveform`` is the
        (resampled, channel-averaged) tensor with singleton dimensions
        squeezed away; ``original_length`` is the duration in seconds,
        computed from the sample count and rate of the file as loaded.
    """
    waveform, original_rate = torchaudio.load(file_path)
    original_length = waveform.shape[-1] / original_rate

    if original_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=original_rate, new_freq=target_sampling_rate)
        waveform = resampler(waveform)

    # Average multi-channel recordings down to a single channel. Without this
    # a stereo file stays 2-D after squeeze(), so the downstream extractors
    # would receive a different shape than they do for mono files.
    if waveform.dim() > 1 and waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    return waveform.squeeze(), original_length

# Generate model for embedding extraction using OpenSMILE
def audio_embeddings_model(model_name):
    """Build an OpenSMILE extractor that returns functionals for a feature set.

    Args:
        model_name: ``"compare"`` (ComParE_2016) or ``"egemaps"`` (eGeMAPSv02).

    Returns:
        An ``opensmile.Smile`` instance configured at the Functionals level.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "compare":
        feature_set = opensmile.FeatureSet.ComParE_2016
    elif model_name == "egemaps":
        feature_set = opensmile.FeatureSet.eGeMAPSv02
    else:
        # An unknown name used to fall through to `return model` with `model`
        # unbound, which surfaced as a confusing UnboundLocalError.
        raise ValueError(
            f"Unknown OpenSMILE model name {model_name!r}; expected 'compare' or 'egemaps'"
        )
    return opensmile.Smile(
        feature_set=feature_set,
        feature_level=opensmile.FeatureLevel.Functionals,
    )

def opensmile_worker(audio, model_name, sampling_rate):
    """Compute the OpenSMILE feature vector of a single waveform.

    A new extractor is built through ``audio_embeddings_model`` on every call.

    Args:
        audio: Waveform tensor; converted with ``.numpy()`` before processing.
        model_name: Name forwarded to ``audio_embeddings_model``.
        sampling_rate: Sampling rate passed to ``process_signal``.

    Returns:
        The values of the frame returned by ``process_signal``, flattened
        to a 1-D array.
    """
    extractor = audio_embeddings_model(model_name)
    signal = audio.numpy()
    feature_frame = extractor.process_signal(signal, sampling_rate)
    return feature_frame.values.flatten()

# Parallel embedding extraction function for OpenSMILE features
def audio_embeddings_parallel(audio_list, model_name, sampling_rate, num_processes=40):
    """Run ``opensmile_worker`` over every waveform in a process pool.

    Args:
        audio_list: Sequence of waveforms, one per recording.
        model_name: OpenSMILE feature-set name forwarded to the worker.
        sampling_rate: Sampling rate forwarded to the worker.
        num_processes: Size of the ``multiprocessing.Pool``.

    Returns:
        A list with one feature vector per input, in input order
        (``Pool.imap`` yields results in submission order).
    """
    extract_one = partial(opensmile_worker, model_name=model_name, sampling_rate=sampling_rate)
    with Pool(num_processes) as pool:
        # The results must be consumed while the pool is still open.
        results = pool.imap(extract_one, audio_list)
        progress = tqdm(results, total=len(audio_list), desc="Extracting OpenSMILE features")
        embeddings_list = list(progress)
    return embeddings_list

# Generic function to extract embeddings from a Transformer-based audio model
def extract_embeddings_from_transformer_model(audio_list, model_name, sampling_rate):
    """Return one mean-pooled embedding per waveform from a pretrained model.

    The feature extractor and model are loaded with ``from_pretrained``, the
    model is moved to the module-level ``device`` and put in eval mode.

    Args:
        audio_list: Sequence of waveform tensors.
        model_name: Identifier passed to ``from_pretrained``.
        sampling_rate: Sampling rate given to the feature extractor.

    Returns:
        A list of NumPy arrays: ``last_hidden_state`` averaged over
        dimension 1 and squeezed, one per input waveform.
    """
    print(f"Extracting embeddings using {model_name}...")
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    def embed(waveform):
        # Pre-process one waveform, move its tensors to the model's device
        # and pool the final hidden states without tracking gradients.
        encoded = feature_extractor(waveform.numpy(), sampling_rate=sampling_rate, return_tensors="pt")
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden = model(**on_device).last_hidden_state
            pooled = torch.mean(hidden, dim=1).cpu().numpy()
        return pooled.squeeze()

    return [embed(waveform) for waveform in tqdm(audio_list, desc=f"Extracting {model_name} embeddings")]

# Function to extract encoder embeddings from Whisper
def extract_whisper_embeddings(audio_list, model_name, sampling_rate):
    """Return one mean-pooled Whisper encoder embedding per waveform.

    Only ``model.encoder`` is run; the decoder is never called.

    Args:
        audio_list: Sequence of waveform tensors.
        model_name: Whisper checkpoint identifier passed to ``from_pretrained``.
        sampling_rate: Sampling rate given to the feature extractor.

    Returns:
        A list of NumPy arrays: the encoder's ``last_hidden_state`` averaged
        over dimension 1 and squeezed, one per input waveform.
    """
    print(f"Extracting embeddings using {model_name} (Whisper) ...")
    feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
    model = WhisperModel.from_pretrained(model_name).to(device)
    model.eval()

    def embed(waveform):
        # Pre-process one waveform, move its tensors to the model's device
        # and pool the encoder output without tracking gradients.
        encoded = feature_extractor(waveform.numpy(), sampling_rate=sampling_rate, return_tensors="pt")
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden = model.encoder(**on_device).last_hidden_state
            pooled = torch.mean(hidden, dim=1).cpu().numpy()
        return pooled.squeeze()

    return [embed(waveform) for waveform in tqdm(audio_list, desc=f"Extracting {model_name} embeddings")]

# Function to extract additional features
def extract_additional_features(audio, sampling_rate):
    """Summarise a waveform with averaged librosa descriptors.

    Args:
        audio: Waveform tensor; converted with ``.numpy()``.
        sampling_rate: Sampling rate passed to the librosa functions.

    Returns:
        A 1-D array made of the axis-1 means of the MFCC (``n_mfcc=13``),
        chroma, spectral centroid, spectral rolloff (85%) and zero-crossing
        rate matrices, followed by the mean pitch.
    """
    signal = audio.numpy()

    # Descriptor matrices, in the order they appear in the output vector.
    frame_features = [
        librosa.feature.mfcc(y=signal, sr=sampling_rate, n_mfcc=13),
        librosa.feature.chroma_stft(y=signal, sr=sampling_rate),
        librosa.feature.spectral_centroid(y=signal, sr=sampling_rate),
        librosa.feature.spectral_rolloff(y=signal, sr=sampling_rate, roll_percent=0.85),
        librosa.feature.zero_crossing_rate(y=signal),
    ]
    pitches, _ = librosa.core.piptrack(y=signal, sr=sampling_rate)

    if pitches.size == 0:
        # Nothing to average over; fall back to 0.0.
        mean_pitch = 0.0
    else:
        # Largest piptrack value in each column, averaged over all columns.
        mean_pitch = np.mean(np.max(pitches, axis=0))

    averaged = [np.mean(matrix, axis=1) for matrix in frame_features]
    return np.hstack(averaged + [mean_pitch])

# Parallel extraction for additional features
def extract_additional_features_parallel(audio_list, sampling_rate, num_processes=40):
    """Run ``extract_additional_features`` over every waveform in a process pool.

    Args:
        audio_list: Sequence of waveforms, one per recording.
        sampling_rate: Sampling rate forwarded to the worker.
        num_processes: Size of the ``multiprocessing.Pool``.

    Returns:
        A list with one feature vector per input, in input order.
    """
    extract_one = partial(extract_additional_features, sampling_rate=sampling_rate)
    with Pool(num_processes) as pool:
        # The results must be consumed while the pool is still open.
        results = pool.imap(extract_one, audio_list)
        progress = tqdm(results, total=len(audio_list), desc="Extracting additional features")
        additional_features_list = list(progress)
    return additional_features_list

# Load and process MP3 files.
# os.listdir() returns names in arbitrary order, so sort the paths to make the
# row order of the exported feature table reproducible between runs.
audio_files = sorted(
    os.path.join(audio_folder, file)
    for file in os.listdir(audio_folder)
    if file.endswith(".mp3")
)
if not audio_files:
    # Fail before any extraction or model loading starts; without this check
    # the script only breaks much later, when the first embedding is indexed.
    raise FileNotFoundError(f"No .mp3 files found in {audio_folder}")

# Each item is a (waveform, duration_in_seconds) pair.
data = [load_and_resample_audio(file, target_sampling_rate) for file in tqdm(audio_files)]
audio_list = [item[0] for item in data]
audio_lengths = [item[1] for item in data]

# Averaged librosa descriptors, one row per recording.
additional_features_list = extract_additional_features_parallel(audio_list, target_sampling_rate)

additional_feature_columns = (
    [f"mfcc_{i}" for i in range(1, 14)]
    + [f"chroma_{i}" for i in range(1, 13)]
    + ["spectral_centroid", "spectral_rolloff", "zero_crossing_rate", "pitch"]
)
additional_features_df = pd.DataFrame(additional_features_list, columns=additional_feature_columns)

# This instance is only used for its column names; the pool workers build
# their own extractors.
egemaps_model = audio_embeddings_model("egemaps")
print("Extracting eGeMAPS")
egemaps_features = audio_embeddings_parallel(audio_list, "egemaps", target_sampling_rate)
wav2vec2_model_name = "facebook/wav2vec2-large-960h-lv60-self"
whisper_model_name = "openai/whisper-medium"
print("Extracting Wav2Vec2 ")
wav2vec2_features = extract_embeddings_from_transformer_model(audio_list, wav2vec2_model_name, target_sampling_rate)
print("Extracting Whisper")
whisper_features = extract_whisper_embeddings(audio_list, whisper_model_name, target_sampling_rate)

# Create DataFrame for all features
features_df = pd.DataFrame(egemaps_features, columns=egemaps_model.column_names)
features_df["Wav2Vec2_embeddings"] = list(wav2vec2_features)
features_df["Whisper_embeddings"] = list(whisper_features)
features_df["file_name"] = [os.path.basename(file) for file in audio_files]
features_df["augmentation_type"] = "original"  # tag as original
features_df = pd.concat([features_df, additional_features_df], axis=1)

# Expand each embedding vector into one scalar column per dimension
# (Embedding1_* for Wav2Vec2, Embedding2_* for Whisper).
# NOTE: the misspelled name `features_df_exctracted` is kept on purpose,
# because the export cell that follows refers to it.
embeddings_features_df = pd.DataFrame(
    features_df['Wav2Vec2_embeddings'].tolist(),
    columns=[f'Embedding1_{i+1}' for i in range(len(features_df['Wav2Vec2_embeddings'][0]))],
)
features_df_exctracted = pd.concat(
    [features_df.drop(columns=['Wav2Vec2_embeddings']), embeddings_features_df], axis=1
)
embeddings_features_df = pd.DataFrame(
    features_df_exctracted['Whisper_embeddings'].tolist(),
    columns=[f'Embedding2_{i+1}' for i in range(len(features_df_exctracted['Whisper_embeddings'][0]))],
)
features_df_exctracted = pd.concat(
    [features_df_exctracted.drop(columns=['Whisper_embeddings']), embeddings_features_df], axis=1
)


In [None]:
# Write the extracted feature table to Features/<dataType>_audio_features.csv.
# Create the output folder first: DataFrame.to_csv does not create missing
# directories, so the export would otherwise fail after the whole extraction
# has already run.
os.makedirs("Features", exist_ok=True)
features_df_exctracted.to_csv("Features/"+dataType+"_audio_features.csv", index=False)

In [None]:
import pandas as pd
# Load the metadata table of the processed recordings.
data = pd.read_csv("/home/siavash/Downloads/FinalExplainedAcousticNIH/processed_30seconds.csv")

# Normalise the non-missing 'sex' values to 'm' / 'f': values whose lowercase
# form starts with 'm' become 'm', everything else becomes 'f'.
# The former chained form `data[mask]['sex'] = ...` inside a bare
# `except: pass` was removed: it assigned into a temporary copy, so it never
# changed `data`, and the bare except hid any real error.
# startswith() replaces `[0]` so that an empty string maps to 'f' instead of
# raising IndexError.
has_sex = data['sex'].notna()
data.loc[has_sex, 'sex'] = data.loc[has_sex, 'sex'].apply(
    lambda x: 'm' if x.lower().startswith('m') else 'f'
)


In [2]:
# data = data.iloc[:200]

In [None]:
import os
import torchaudio
import pandas as pd
import torch
from tqdm import tqdm
import opensmile
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from transformers import AutoFeatureExtractor, AutoModel
from transformers import WhisperFeatureExtractor, WhisperModel
from multiprocessing import Pool
from functools import partial

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Dataset location.
# dataType was used to switch between train/test feature extraction; this cell
# does not build an audio folder from it, because the file paths come from the
# 'processed_path' column of the pre-loaded 'data' dataframe.
dataType = "full"
path = "/home/siavash/Downloads/FinalExplainedAcousticNIH/"
# original_sampling_rate is not referenced elsewhere in the visible code
# (presumably the native rate of the recordings); target_sampling_rate is the
# rate every waveform is resampled to before feature extraction.
original_sampling_rate = 48000
target_sampling_rate = 16000

def load_and_resample_audio(file_path, target_sampling_rate):
    """Load an audio file as a mono waveform at ``target_sampling_rate``.

    Args:
        file_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sampling_rate: Sampling rate (Hz) of the returned waveform.

    Returns:
        A ``(waveform, original_length)`` tuple. ``waveform`` is the
        (resampled, channel-averaged) tensor with singleton dimensions
        squeezed away; ``original_length`` is the duration in seconds,
        computed from the sample count and rate of the file as loaded.
    """
    waveform, original_rate = torchaudio.load(file_path)
    original_length = waveform.shape[-1] / original_rate

    needs_resampling = original_rate != target_sampling_rate
    if needs_resampling:
        waveform = torchaudio.transforms.Resample(
            orig_freq=original_rate, new_freq=target_sampling_rate
        )(waveform)

    # Collapse multi-channel audio to one channel by averaging over dim 0.
    is_multichannel = waveform.dim() > 1 and waveform.shape[0] > 1
    if is_multichannel:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    mono = waveform.squeeze()
    return mono, original_length



# Generate model for embedding extraction using OpenSMILE
def audio_embeddings_model(model_name):
    """Build an OpenSMILE extractor that returns functionals for a feature set.

    Args:
        model_name: ``"compare"`` (ComParE_2016) or ``"egemaps"`` (eGeMAPSv02).

    Returns:
        An ``opensmile.Smile`` instance configured at the Functionals level.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "compare":
        feature_set = opensmile.FeatureSet.ComParE_2016
    elif model_name == "egemaps":
        feature_set = opensmile.FeatureSet.eGeMAPSv02
    else:
        # An unknown name used to fall through to `return model` with `model`
        # unbound, which surfaced as a confusing UnboundLocalError.
        raise ValueError(
            f"Unknown OpenSMILE model name {model_name!r}; expected 'compare' or 'egemaps'"
        )
    return opensmile.Smile(
        feature_set=feature_set,
        feature_level=opensmile.FeatureLevel.Functionals,
    )

def opensmile_worker(audio, model_name, sampling_rate):
    """Compute the OpenSMILE feature vector of a single waveform.

    A new extractor is built through ``audio_embeddings_model`` on every call.

    Args:
        audio: Waveform tensor; converted with ``.numpy()`` before processing.
        model_name: Name forwarded to ``audio_embeddings_model``.
        sampling_rate: Sampling rate passed to ``process_signal``.

    Returns:
        The values of the frame returned by ``process_signal``, flattened
        to a 1-D array.
    """
    extractor = audio_embeddings_model(model_name)
    signal = audio.numpy()
    feature_frame = extractor.process_signal(signal, sampling_rate)
    return feature_frame.values.flatten()

# Parallel embedding extraction function for OpenSMILE features
def audio_embeddings_parallel(audio_list, model_name, sampling_rate, num_processes=40):
    """Run ``opensmile_worker`` over every waveform in a process pool.

    Args:
        audio_list: Sequence of waveforms, one per recording.
        model_name: OpenSMILE feature-set name forwarded to the worker.
        sampling_rate: Sampling rate forwarded to the worker.
        num_processes: Size of the ``multiprocessing.Pool``.

    Returns:
        A list with one feature vector per input, in input order
        (``Pool.imap`` yields results in submission order).
    """
    extract_one = partial(opensmile_worker, model_name=model_name, sampling_rate=sampling_rate)
    with Pool(num_processes) as pool:
        # The results must be consumed while the pool is still open.
        results = pool.imap(extract_one, audio_list)
        progress = tqdm(results, total=len(audio_list), desc="Extracting OpenSMILE features")
        embeddings_list = list(progress)
    return embeddings_list

# Generic function to extract embeddings from a Transformer-based audio model
def extract_embeddings_from_transformer_model(audio_list, model_name, sampling_rate):
    """Return one mean-pooled embedding per waveform from a pretrained model.

    The feature extractor and model are loaded with ``from_pretrained``, the
    model is moved to the module-level ``device`` and put in eval mode.

    Args:
        audio_list: Sequence of waveform tensors.
        model_name: Identifier passed to ``from_pretrained``.
        sampling_rate: Sampling rate given to the feature extractor.

    Returns:
        A list of NumPy arrays: ``last_hidden_state`` averaged over
        dimension 1 and squeezed, one per input waveform.
    """
    print(f"Extracting embeddings using {model_name}...")
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    def embed(waveform):
        # Pre-process one waveform, move its tensors to the model's device
        # and pool the final hidden states without tracking gradients.
        encoded = feature_extractor(waveform.numpy(), sampling_rate=sampling_rate, return_tensors="pt")
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden = model(**on_device).last_hidden_state
            pooled = torch.mean(hidden, dim=1).cpu().numpy()
        return pooled.squeeze()

    return [embed(waveform) for waveform in tqdm(audio_list, desc=f"Extracting {model_name} embeddings")]

# Function to extract encoder embeddings from Whisper
def extract_whisper_embeddings(audio_list, model_name, sampling_rate):
    """Return one mean-pooled Whisper encoder embedding per waveform.

    Only ``model.encoder`` is run; the decoder is never called.

    Args:
        audio_list: Sequence of waveform tensors.
        model_name: Whisper checkpoint identifier passed to ``from_pretrained``.
        sampling_rate: Sampling rate given to the feature extractor.

    Returns:
        A list of NumPy arrays: the encoder's ``last_hidden_state`` averaged
        over dimension 1 and squeezed, one per input waveform.
    """
    print(f"Extracting embeddings using {model_name} (Whisper) ...")
    feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
    model = WhisperModel.from_pretrained(model_name).to(device)
    model.eval()

    def embed(waveform):
        # Pre-process one waveform, move its tensors to the model's device
        # and pool the encoder output without tracking gradients.
        encoded = feature_extractor(waveform.numpy(), sampling_rate=sampling_rate, return_tensors="pt")
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden = model.encoder(**on_device).last_hidden_state
            pooled = torch.mean(hidden, dim=1).cpu().numpy()
        return pooled.squeeze()

    return [embed(waveform) for waveform in tqdm(audio_list, desc=f"Extracting {model_name} embeddings")]

# Function to extract additional features
def extract_additional_features(audio, sampling_rate):
    """Summarise a waveform with averaged librosa descriptors.

    Args:
        audio: Waveform tensor; converted with ``.numpy()``.
        sampling_rate: Sampling rate passed to the librosa functions.

    Returns:
        A 1-D array made of the axis-1 means of the MFCC (``n_mfcc=13``),
        chroma, spectral centroid, spectral rolloff (85%) and zero-crossing
        rate matrices, followed by the mean pitch.
    """
    signal = audio.numpy()

    # Descriptor matrices, in the order they appear in the output vector.
    frame_features = [
        librosa.feature.mfcc(y=signal, sr=sampling_rate, n_mfcc=13),
        librosa.feature.chroma_stft(y=signal, sr=sampling_rate),
        librosa.feature.spectral_centroid(y=signal, sr=sampling_rate),
        librosa.feature.spectral_rolloff(y=signal, sr=sampling_rate, roll_percent=0.85),
        librosa.feature.zero_crossing_rate(y=signal),
    ]
    pitches, _ = librosa.core.piptrack(y=signal, sr=sampling_rate)

    if pitches.size == 0:
        # Nothing to average over; fall back to 0.0.
        pitch_value = 0.0
    else:
        # Largest piptrack value in each column, averaged over all columns.
        pitch_value = np.mean(np.max(pitches, axis=0))

    # Keep the pitch as a one-element array so every stacked part is 1-D.
    mean_pitch = np.array([pitch_value])

    averaged = [np.mean(matrix, axis=1) for matrix in frame_features]
    return np.hstack(averaged + [mean_pitch])

# Parallel extraction for additional features
def extract_additional_features_parallel(audio_list, sampling_rate, num_processes=40):
    """Run ``extract_additional_features`` over every waveform in a process pool.

    Args:
        audio_list: Sequence of waveforms, one per recording.
        sampling_rate: Sampling rate forwarded to the worker.
        num_processes: Size of the ``multiprocessing.Pool``.

    Returns:
        A list with one feature vector per input, in input order.
    """
    extract_one = partial(extract_additional_features, sampling_rate=sampling_rate)
    with Pool(num_processes) as pool:
        # The results must be consumed while the pool is still open.
        results = pool.imap(extract_one, audio_list)
        progress = tqdm(results, total=len(audio_list), desc="Extracting additional features")
        additional_features_list = list(progress)
    return additional_features_list

# Load and process audio files from 'data' dataframe
# (the paths come from the 'processed_path' column of the pre-loaded table).
audio_files = data["processed_path"].tolist()

# Each item is a (waveform, duration_in_seconds) pair.
audio_data = [load_and_resample_audio(audio_path, target_sampling_rate) for audio_path in tqdm(audio_files)]
audio_list = [waveform for waveform, _ in audio_data]
audio_lengths = [duration for _, duration in audio_data]

# Averaged librosa descriptors, one row per recording, with column names for
# the 13 MFCC, 12 chroma and 4 scalar values.
additional_features_list = extract_additional_features_parallel(audio_list, target_sampling_rate)

mfcc_columns = [f"mfcc_{i}" for i in range(1, 14)]
chroma_columns = [f"chroma_{i}" for i in range(1, 13)]
scalar_columns = ["spectral_centroid", "spectral_rolloff", "zero_crossing_rate", "pitch"]
additional_feature_columns = mfcc_columns + chroma_columns + scalar_columns
additional_features_df = pd.DataFrame(additional_features_list, columns=additional_feature_columns)

# This instance is only used for its column names; the pool workers build
# their own extractors.
egemaps_model = audio_embeddings_model("egemaps")
print("Extracting eGeMAPS")
egemaps_features = audio_embeddings_parallel(audio_list, "egemaps", target_sampling_rate)
wav2vec2_model_name = "facebook/wav2vec2-large-960h-lv60-self"
whisper_model_name = "openai/whisper-medium"
print("Extracting Wav2Vec2 ")
wav2vec2_features = extract_embeddings_from_transformer_model(
    audio_list, wav2vec2_model_name, target_sampling_rate
)
print("Extracting Whisper")
whisper_features = extract_whisper_embeddings(audio_list, whisper_model_name, target_sampling_rate)

# Create DataFrame for all features
features_df = pd.DataFrame(egemaps_features, columns=egemaps_model.column_names)
features_df["Wav2Vec2_embeddings"] = list(wav2vec2_features)
features_df["Whisper_embeddings"] = list(whisper_features)
features_df["file_name"] = list(map(os.path.basename, audio_files))
features_df["augmentation_type"] = "original"  # tag as original
features_df = pd.concat([features_df, additional_features_df], axis=1)

# Expand the Wav2Vec2 vectors into one scalar column per dimension.
wav2vec2_column = features_df["Wav2Vec2_embeddings"]
wav2vec2_names = [f'Embedding1_{i+1}' for i in range(len(wav2vec2_column[0]))]
embeddings_features_df = pd.DataFrame(wav2vec2_column.tolist(), columns=wav2vec2_names)
features_df_extracted = pd.concat(
    [features_df.drop(columns=["Wav2Vec2_embeddings"]), embeddings_features_df],
    axis=1,
)

# Same expansion for the Whisper vectors.
whisper_column = features_df_extracted["Whisper_embeddings"]
whisper_names = [f'Embedding2_{i+1}' for i in range(len(whisper_column[0]))]
embeddings_features_df = pd.DataFrame(whisper_column.tolist(), columns=whisper_names)
features_df_extracted = pd.concat(
    [features_df_extracted.drop(columns=["Whisper_embeddings"]), embeddings_features_df],
    axis=1,
)

# Add the extracted features to the original 'data' dataframe.
# Rows are matched by position (both sides get a fresh 0..n-1 index), which
# relies on the features being in the same order as the rows of 'data'.
# 'file_name' and 'augmentation_type' are left out of the merged table.
extracted_only = features_df_extracted.drop(columns=["file_name", "augmentation_type"])
data = pd.concat(
    [data.reset_index(drop=True), extracted_only.reset_index(drop=True)],
    axis=1,
)

In [5]:
# Write the merged metadata + feature table to the current working directory.
# NOTE(review): no `index=False` is passed here, unlike the per-split export
# earlier in the notebook, so the row index is written as an extra first
# column — confirm that downstream readers expect it.
data.to_csv("Full_data_features.csv")