# Husky dataset ‚Äî load train and test
This notebook loads `dogvoicedataset/husky` train and test labels and builds full file paths.

In [9]:
from pathlib import Path
import pandas as pd
from IPython.display import display

# Paths (notebook is in `model/` so dataset is at ../dogvoicedataset)
ROOT = Path('..')
DATASET_DIR = ROOT / 'dogvoicedataset'
HUSKY_DIR = DATASET_DIR / 'husky'
TRAIN_CSV = DATASET_DIR / 'husky_train_labels.csv'
TEST_CSV = DATASET_DIR / 'husky_test_labels.csv'

print('Training CSV:', TRAIN_CSV)
print('Test CSV:', TEST_CSV)

# Read CSVs
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

print('Train columns:', list(train_df.columns))
print('Test columns :', list(test_df.columns))


Training CSV: ../dogvoicedataset/husky_train_labels.csv
Test CSV: ../dogvoicedataset/husky_test_labels.csv
Train columns: ['audio_id', 'arousal', 'valence']
Test columns : ['audio_id', 'arousal', 'valence']


In [10]:
# Helper: create a full path column from likely filename/column names
def add_full_path(df, split='train'):
    df = df.copy()
    # Candidate filename columns
    candidates = ['filepath', 'file', 'filename', 'wav', 'path']
    col = None
    for c in candidates:
        if c in df.columns:
            col = c
            break
    if col is None:
        # fallback: assume first column is the filename
        col = df.columns[0]
    df['full_path'] = df[col].astype(str).apply(lambda p: str(HUSKY_DIR / split / p))
    return df

train_df = add_full_path(train_df, 'train')
test_df = add_full_path(test_df, 'test')

# Check existence
train_df['exists'] = train_df['full_path'].apply(lambda p: Path(p).exists())
test_df['exists']  = test_df['full_path'].apply(lambda p: Path(p).exists())

print('Train shape, files exist:', train_df.shape, train_df['exists'].sum())
print('Test  shape, files exist:', test_df.shape, test_df['exists'].sum())

display(train_df.head())
display(test_df.head())


Train shape, files exist: (600, 5) 0
Test  shape, files exist: (100, 5) 0


Unnamed: 0,audio_id,arousal,valence,full_path,exists
0,husky_train_00000,High,Positive,../dogvoicedataset/husky/train/husky_train_00000,False
1,husky_train_00001,High,Positive,../dogvoicedataset/husky/train/husky_train_00001,False
2,husky_train_00002,Low,Neutral,../dogvoicedataset/husky/train/husky_train_00002,False
3,husky_train_00003,Low,Neutral,../dogvoicedataset/husky/train/husky_train_00003,False
4,husky_train_00004,Medium,Positive,../dogvoicedataset/husky/train/husky_train_00004,False


Unnamed: 0,audio_id,arousal,valence,full_path,exists
0,husky_test_00000,Low,Negative,../dogvoicedataset/husky/test/husky_test_00000,False
1,husky_test_00001,Low,Negative,../dogvoicedataset/husky/test/husky_test_00001,False
2,husky_test_00002,Low,Negative,../dogvoicedataset/husky/test/husky_test_00002,False
3,husky_test_00003,Low,Negative,../dogvoicedataset/husky/test/husky_test_00003,False
4,husky_test_00004,Low,Negative,../dogvoicedataset/husky/test/husky_test_00004,False


Next: if some `exists` values are False, verify the filenames in the CSV match the files under `dogvoicedataset/husky/train` and `dogvoicedataset/husky/test`.
You can then proceed to load audio files (e.g., with `librosa.load`) or extract features.

In [12]:
import sys
# Compatibility shim: Python 3.12 removes distutils from the stdlib.
# Some packages (including TensorFlow) still import distutils ‚Äî
# map the name to setuptools' vendored copy when missing.
try:
    import distutils
except Exception:
    try:
        import setuptools._distutils as distutils
        sys.modules['distutils'] = distutils
    except Exception:
        # If setuptools._distutils isn't available, let the import fail later
        pass

import numpy as np
import librosa
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

print("TensorFlow version:", tf.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))

ModuleNotFoundError: No module named 'distutils'

## Audio Preprocessing Utilities

Load and preprocess .wav files to 16kHz mono audio and extract YAMNet embeddings.

In [None]:
def load_and_resample_audio(filepath, target_sr=16000):
    """
    Load audio file and resample to target sample rate (mono).
    
    Args:
        filepath (str): Path to .wav file
        target_sr (int): Target sample rate (default 16kHz for YAMNet)
    
    Returns:
        np.ndarray: Audio waveform resampled to target_sr
    """
    try:
        audio, sr = librosa.load(filepath, sr=None, mono=True)
        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        return audio
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return None

# Test: load one audio file
sample_audio = load_and_resample_audio(train_df.iloc[0]['full_path'])
if sample_audio is not None:
    print(f"Sample audio shape: {sample_audio.shape}, duration: {len(sample_audio) / 16000:.2f}s")
else:
    print("Failed to load sample audio")

## Label Parsing and Encoding

Parse label columns and encode them numerically for training.

In [None]:
# Define label columns (adjust based on your actual CSV column names)
VALENCE_COL = 'valence'  # Expected values: negative, neutral, positive
AROUSAL_COL = 'arousal'  # Expected values: low, medium, high

# Check actual columns and infer if needed
print("Train DataFrame columns:", list(train_df.columns))
print("Test DataFrame columns:", list(test_df.columns))

# If columns don't exist, show first few rows for manual inspection
if VALENCE_COL not in train_df.columns or AROUSAL_COL not in train_df.columns:
    print("\n‚ö†Ô∏è  Label columns not found. Showing data for inspection:")
    display(train_df.head())
    print("\nüìù Update VALENCE_COL and AROUSAL_COL based on your actual column names")

In [None]:
# Encode labels
valence_encoder = LabelEncoder()
arousal_encoder = LabelEncoder()

train_df['valence_encoded'] = valence_encoder.fit_transform(train_df[VALENCE_COL])
train_df['arousal_encoded'] = arousal_encoder.fit_transform(train_df[AROUSAL_COL])

test_df['valence_encoded'] = valence_encoder.transform(test_df[VALENCE_COL])
test_df['arousal_encoded'] = arousal_encoder.transform(test_df[AROUSAL_COL])

print("Valence classes:", valence_encoder.classes_)
print("Arousal classes:", arousal_encoder.classes_)
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("\nTrain labels sample:")
print(train_df[[VALENCE_COL, AROUSAL_COL, 'valence_encoded', 'arousal_encoded']].head())

## Load YAMNet Model and Extract Embeddings

Load the pre-trained YAMNet model and extract embeddings for each audio file.

In [None]:
import tensorflow_hub as hub

# Load YAMNet model from TensorFlow Hub
# YAMNet is a pre-trained audio event classifier that produces embeddings
yamnet_model_url = 'https://tfhub.dev/google/yamnet/1'
yamnet = hub.load(yamnet_model_url)

print(f"YAMNet model loaded from {yamnet_model_url}")
print(f"YAMNet embedding dimension: 1024")

def extract_yamnet_embedding(audio_waveform, sample_rate=16000):
    """
    Extract YAMNet embedding from audio waveform.
    
    Args:
        audio_waveform (np.ndarray): Audio samples at sample_rate
        sample_rate (int): Sample rate of audio (default 16kHz for YAMNet)
    
    Returns:
        np.ndarray: YAMNet embedding (1024-dim vector, mean-pooled across time)
    """
    # Ensure waveform is float32 and in the correct shape
    audio_tensor = tf.cast(audio_waveform, tf.float32)
    
    # Get embeddings from YAMNet
    _, embeddings, _ = yamnet(audio_tensor)
    
    # Mean-pool embeddings across time dimension to get a single vector per audio
    embedding = tf.reduce_mean(embeddings, axis=0).numpy()
    return embedding

# Test on one sample
test_audio = load_and_resample_audio(train_df.iloc[0]['full_path'])
if test_audio is not None:
    test_embedding = extract_yamnet_embedding(test_audio)
    print(f"\nTest embedding shape: {test_embedding.shape}")
    print(f"Embedding stats - Mean: {test_embedding.mean():.4f}, Std: {test_embedding.std():.4f}")

## Extract Embeddings for All Data

Efficiently extract YAMNet embeddings for all training and test samples.

In [None]:
def extract_embeddings_for_dataframe(df, max_files=None):
    """
    Extract YAMNet embeddings for all audios in a dataframe.
    
    Args:
        df (pd.DataFrame): Dataframe with 'full_path' column
        max_files (int): Limit number of files to process (for testing)
    
    Returns:
        np.ndarray: Array of embeddings (N, 1024)
    """
    embeddings = []
    paths = df['full_path'].values
    n_files = min(len(paths), max_files) if max_files else len(paths)
    
    for i, filepath in enumerate(paths[:n_files]):
        if (i + 1) % max(1, n_files // 5) == 0:
            print(f"  Processed {i + 1}/{n_files} files...")
        
        audio = load_and_resample_audio(filepath)
        if audio is not None:
            embedding = extract_yamnet_embedding(audio)
            embeddings.append(embedding)
        else:
            # Use zero embedding as fallback for failed files
            embeddings.append(np.zeros(1024))
    
    return np.array(embeddings)

# Extract embeddings for train and test sets
print("Extracting embeddings for training set...")
train_embeddings = extract_embeddings_for_dataframe(train_df)
print(f"Train embeddings shape: {train_embeddings.shape}")

print("\nExtracting embeddings for test set...")
test_embeddings = extract_embeddings_for_dataframe(test_df)
print(f"Test embeddings shape: {test_embeddings.shape}")

## Build Multi-Task Classifier

Create a neural network with shared embeddings baseline and two task-specific output heads.

In [None]:
def build_multi_task_model(embedding_dim=1024, num_valence_classes=3, num_arousal_classes=3):
    """
    Build a multi-task neural network with shared base and task-specific heads.
    
    Args:
        embedding_dim (int): Dimension of input embeddings (YAMNet = 1024)
        num_valence_classes (int): Number of valence classes (default 3)
        num_arousal_classes (int): Number of arousal classes (default 3)
    
    Returns:
        tf.keras.Model: Compiled multi-task model with two outputs
    """
    # Input layer for embeddings
    embedding_input = tf.keras.Input(shape=(embedding_dim,), name='embedding_input')
    
    # Shared base (dense layers)
    x = tf.keras.layers.Dense(512, activation='relu')(embedding_input)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.3)(x)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    
    # Task 1: Valence prediction head
    valence_out = tf.keras.layers.Dense(64, activation='relu')(x)
    valence_out = tf.keras.layers.Dropout(0.2)(valence_out)
    valence_out = tf.keras.layers.Dense(num_valence_classes, activation='softmax', name='valence_output')(valence_out)
    
    # Task 2: Arousal prediction head
    arousal_out = tf.keras.layers.Dense(64, activation='relu')(x)
    arousal_out = tf.keras.layers.Dropout(0.2)(arousal_out)
    arousal_out = tf.keras.layers.Dense(num_arousal_classes, activation='softmax', name='arousal_output')(arousal_out)
    
    # Build model with two outputs
    model = tf.keras.Model(inputs=embedding_input, outputs=[valence_out, arousal_out])
    
    # Compile with separate losses and metrics for each task
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss={
            'valence_output': 'sparse_categorical_crossentropy',
            'arousal_output': 'sparse_categorical_crossentropy'
        },
        loss_weights={'valence_output': 1.0, 'arousal_output': 1.0},
        metrics={
            'valence_output': ['accuracy'],
            'arousal_output': ['accuracy']
        }
    )
    
    return model

# Build the model
model = build_multi_task_model()
print("Multi-task model built successfully!")
model.summary()

## Train the Multi-Task Model

Train on the training set with validation on a held-out split.

In [None]:
# Prepare training data
X_train = train_embeddings
y_valence_train = train_df['valence_encoded'].values
y_arousal_train = train_df['arousal_encoded'].values

# Split into train/validation
X_train_split, X_val_split, y_val_train, y_val_val, y_arom_train, y_arom_val = train_test_split(
    X_train, y_valence_train, y_arousal_train,
    test_size=0.2, random_state=42, stratify=y_valence_train
)

print(f"Train split: {X_train_split.shape}")
print(f"Validation split: {X_val_split.shape}")

# Train the model
history = model.fit(
    X_train_split,
    {'valence_output': y_val_train, 'arousal_output': y_arom_train},
    validation_data=(
        X_val_split,
        {'valence_output': y_val_val, 'arousal_output': y_arom_val}
    ),
    epochs=50,
    batch_size=32,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6
        )
    ]
)

print("\n‚úÖ Training completed!")

## Evaluate on Test Set

Measure accuracy and other metrics on the held-out test set.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Evaluate on test set
X_test = test_embeddings
y_valence_test = test_df['valence_encoded'].values
y_arousal_test = test_df['arousal_encoded'].values

# Get predictions
valence_preds, arousal_preds = model.predict(X_test)
valence_pred_classes = np.argmax(valence_preds, axis=1)
arousal_pred_classes = np.argmax(arousal_preds, axis=1)

print("=" * 60)
print("TEST SET EVALUATION")
print("=" * 60)

# Valence metrics
print("\nüìä VALENCE Metrics:")
print(f"  Accuracy: {accuracy_score(y_valence_test, valence_pred_classes):.4f}")
print(f"  Precision (macro): {precision_score(y_valence_test, valence_pred_classes, average='macro', zero_division=0):.4f}")
print(f"  Recall (macro): {recall_score(y_valence_test, valence_pred_classes, average='macro', zero_division=0):.4f}")
print(f"  F1 (macro): {f1_score(y_valence_test, valence_pred_classes, average='macro', zero_division=0):.4f}")

print("\nValence Classification Report:")
print(classification_report(y_valence_test, valence_pred_classes, 
                          target_names=valence_encoder.classes_, zero_division=0))

# Arousal metrics
print("\nüìä AROUSAL Metrics:")
print(f"  Accuracy: {accuracy_score(y_arousal_test, arousal_pred_classes):.4f}")
print(f"  Precision (macro): {precision_score(y_arousal_test, arousal_pred_classes, average='macro', zero_division=0):.4f}")
print(f"  Recall (macro): {recall_score(y_arousal_test, arousal_pred_classes, average='macro', zero_division=0):.4f}")
print(f"  F1 (macro): {f1_score(y_arousal_test, arousal_pred_classes, average='macro', zero_division=0):.4f}")

print("\nArousal Classification Report:")
print(classification_report(y_arousal_test, arousal_pred_classes, 
                          target_names=arousal_encoder.classes_, zero_division=0))

print("\n‚úÖ Evaluation completed!")

# Optional: Plot confusion matrices
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Valence confusion matrix
cm_valence = confusion_matrix(y_valence_test, valence_pred_classes)
im1 = axes[0].imshow(cm_valence, cmap='Blues')
axes[0].set_title('Valence Confusion Matrix')
axes[0].set_xticks(range(len(valence_encoder.classes_)))
axes[0].set_yticks(range(len(valence_encoder.classes_)))
axes[0].set_xticklabels(valence_encoder.classes_)
axes[0].set_yticklabels(valence_encoder.classes_)
axes[0].set_ylabel('True Label')
axes[0].set_xlabel('Predicted Label')
for i in range(len(valence_encoder.classes_)):
    for j in range(len(valence_encoder.classes_)):
        axes[0].text(j, i, str(cm_valence[i, j]), ha='center', va='center', color='white')

# Arousal confusion matrix
cm_arousal = confusion_matrix(y_arousal_test, arousal_pred_classes)
im2 = axes[1].imshow(cm_arousal, cmap='Greens')
axes[1].set_title('Arousal Confusion Matrix')
axes[1].set_xticks(range(len(arousal_encoder.classes_)))
axes[1].set_yticks(range(len(arousal_encoder.classes_)))
axes[1].set_xticklabels(arousal_encoder.classes_)
axes[1].set_yticklabels(arousal_encoder.classes_)
axes[1].set_ylabel('True Label')
axes[1].set_xlabel('Predicted Label')
for i in range(len(arousal_encoder.classes_)):
    for j in range(len(arousal_encoder.classes_)):
        axes[1].text(j, i, str(cm_arousal[i, j]), ha='center', va='center', color='white')

plt.tight_layout()
plt.show()

## Inference on New Audio

Run inference on a new .wav file to predict valence and arousal.

In [None]:
def predict_vocalization(audio_path, model, valence_encoder, arousal_encoder):
    """
    Predict valence and arousal for a new audio file.
    
    Args:
        audio_path (str): Path to .wav file
        model: Trained multi-task model
        valence_encoder: Fitted label encoder for valence classes
        arousal_encoder: Fitted label encoder for arousal classes
    
    Returns:
        dict: Prediction results with class labels and confidence scores
    """
    # Load and preprocess audio
    audio = load_and_resample_audio(audio_path)
    if audio is None:
        return None
    
    # Extract embedding
    embedding = extract_yamnet_embedding(audio)
    embedding_batch = np.expand_dims(embedding, axis=0)  # Add batch dimension
    
    # Get predictions
    valence_probs, arousal_probs = model.predict(embedding_batch, verbose=0)
    
    # Decode predictions
    valence_class_idx = np.argmax(valence_probs[0])
    arousal_class_idx = np.argmax(arousal_probs[0])
    
    valence_class = valence_encoder.classes_[valence_class_idx]
    arousal_class = arousal_encoder.classes_[arousal_class_idx]
    
    return {
        'valence': {
            'class': valence_class,
            'confidence': float(valence_probs[0][valence_class_idx]),
            'all_probs': {valence_encoder.classes_[i]: float(p) 
                         for i, p in enumerate(valence_probs[0])}
        },
        'arousal': {
            'class': arousal_class,
            'confidence': float(arousal_probs[0][arousal_class_idx]),
            'all_probs': {arousal_encoder.classes_[i]: float(p) 
                         for i, p in enumerate(arousal_probs[0])}
        }
    }

# Test inference on a sample from test set
test_audio_path = test_df.iloc[0]['full_path']
print(f"Testing inference on: {test_audio_path}")
print(f"True labels - Valence: {test_df.iloc[0][VALENCE_COL]}, Arousal: {test_df.iloc[0][AROUSAL_COL]}\n")

result = predict_vocalization(test_audio_path, model, valence_encoder, arousal_encoder)

if result:
    print("üîÆ Predictions:")
    print(f"\n  Valence: {result['valence']['class']} (confidence: {result['valence']['confidence']:.2%})")
    print(f"    Probabilities: {result['valence']['all_probs']}")
    
    print(f"\n  Arousal: {result['arousal']['class']} (confidence: {result['arousal']['confidence']:.2%})")
    print(f"    Probabilities: {result['arousal']['all_probs']}")
else:
    print("‚ùå Failed to process audio file")

## Save and Load the Trained Model

Save the trained model and encoders for future inference.

In [None]:
import pickle

# Save model
model_path = 'yamnet_multitask_model.h5'
model.save(model_path)
print(f"‚úÖ Model saved to {model_path}")

# Save label encoders
encoders_path = 'label_encoders.pkl'
with open(encoders_path, 'wb') as f:
    pickle.dump({
        'valence_encoder': valence_encoder,
        'arousal_encoder': arousal_encoder
    }, f)
print(f"‚úÖ Label encoders saved to {encoders_path}")

# To load later:
# loaded_model = tf.keras.models.load_model(model_path)
# with open(encoders_path, 'rb') as f:
#     encoders = pickle.load(f)
#     valence_encoder = encoders['valence_encoder']
#     arousal_encoder = encoders['arousal_encoder']