In [None]:
# Colab-only: mount Google Drive into the VM filesystem at /content/drive.
# The dataset directory used by this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install git+https://github.com/snakers4/silero-vad.git
!pip install torch librosa pandas scikit-learn pydub
!pip install git+https://github.com/openai/whisper.git


In [None]:
import os
import re

import librosa
import numpy as np
import pandas as pd
import torch
import whisper
from pydub import AudioSegment
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from vad import VoiceActivityDetector  # NOTE(review): not referenced in the cells shown; confirm this module is installed

# --- Models ---------------------------------------------------------------
# Whisper "base" checkpoint, used for transcription.
whisper_model = whisper.load_model("base")

# Silero VAD fetched through torch.hub; torch is limited to a single thread
# before the model is loaded.
torch.set_num_threads(1)
vad_model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    trust_repo=True,
)
# Only the first and third helpers of the returned bundle are used here.
get_speech_timestamps, read_audio = utils[0], utils[2]

# --- Data location --------------------------------------------------------
BASE_DIR = "/content/drive/MyDrive/icaasp_paper/dementiabank"

# Get first 15 .mp3 files
def get_mp3_files(base_dir, limit=15):
    """Collect ``.mp3`` paths under ``base_dir`` in a reproducible order.

    The tree is walked top-down; directory and file names are sorted at every
    level so that the "first ``limit`` files" are the same on every run
    (``os.walk`` itself yields entries in filesystem order).

    Args:
        base_dir: Root directory to search recursively.
        limit: Maximum number of paths to return. ``None`` means no limit;
            zero or a negative value yields an empty list.

    Returns:
        A list of at most ``limit`` paths whose file name ends in ``.mp3``
        (case-sensitive), each joined onto the directory it was found in.
    """
    if limit is not None and limit <= 0:
        return []

    mp3_files = []
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".mp3"):
                mp3_files.append(os.path.join(root, file))
                if limit is not None and len(mp3_files) >= limit:
                    return mp3_files
    return mp3_files

# Convert mp3 to wav (for analysis)
def convert_mp3_to_wav(mp3_path):
    """Decode an MP3 file and write it next to the source as a WAV file.

    The output path is built by swapping only the final extension, so
    directory names containing ".mp3" are left untouched and a source such as
    ``clip.MP3`` (or one with no extension) is never overwritten by the export.

    Args:
        mp3_path: Path to the input MP3 file.

    Returns:
        Path of the WAV file that was written.
    """
    audio = AudioSegment.from_mp3(mp3_path)
    stem, _ = os.path.splitext(mp3_path)
    wav_path = stem + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path

# Extract features
def extract_audio_features(wav_path):
    """Load a recording and compute duration, tempo and mean pitch.

    Args:
        wav_path: Path to the audio file, loaded with librosa's default
            settings.

    Returns:
        A tuple ``(y, sr, duration, tempo, pitch)`` where ``y`` and ``sr`` are
        the samples and sampling rate returned by ``librosa.load``,
        ``duration`` is in seconds, ``tempo`` is the beat-tracker estimate and
        ``pitch`` is the mean of the YIN f0 track searched between 50 and
        300 Hz. ``tempo`` and ``pitch`` are plain Python floats.
    """
    y, sr = librosa.load(wav_path)
    duration = librosa.get_duration(y=y, sr=sr)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    # Depending on the librosa version, tempo is a scalar or a 1-element
    # array; an array would end up as an object column in the DataFrame.
    tempo = float(np.atleast_1d(tempo)[0])
    pitch = float(np.mean(librosa.yin(y, fmin=50, fmax=300)))
    return y, sr, duration, tempo, pitch

# Count pauses using Silero VAD
def count_pauses(wav_path, min_pause_sec=0.3, sampling_rate=16000):
    """Count the silent gaps between consecutive speech segments.

    Speech segments come from the module-level Silero VAD helpers
    (``read_audio``, ``get_speech_timestamps``, ``vad_model``). Segment
    boundaries are sample indices, so each gap is divided by the sampling
    rate to express it in seconds.

    Args:
        wav_path: Path to the audio file to analyse.
        min_pause_sec: A gap must be strictly longer than this many seconds
            to count as a pause.
        sampling_rate: Rate, in Hz, at which the audio is read and analysed.
            NOTE(review): confirm which rates the VAD model supports before
            changing the default.

    Returns:
        Number of gaps longer than ``min_pause_sec``; 0 when fewer than two
        speech segments are detected.
    """
    wav = read_audio(wav_path, sampling_rate=sampling_rate)
    speech_timestamps = get_speech_timestamps(wav, vad_model, sampling_rate=sampling_rate)

    # Pair every segment with its successor and measure the silence between.
    return sum(
        1
        for previous, current in zip(speech_timestamps, speech_timestamps[1:])
        if (current['start'] - previous['end']) / sampling_rate > min_pause_sec
    )

# Transcribe using Whisper
def transcribe_whisper(audio_path):
    """Transcribe ``audio_path`` with the module-level Whisper model.

    Returns the value stored under the ``'text'`` key of the model's result.
    """
    return whisper_model.transcribe(audio_path)['text']

# Run pipeline
# Build one feature record per recording. A failure on one file is reported
# and skipped so the remaining files are still processed.
audio_data = []
for mp3_path in get_mp3_files(BASE_DIR):
    try:
        wav_path = convert_mp3_to_wav(mp3_path)
        y, sr, duration, tempo, pitch = extract_audio_features(wav_path)
        pauses = count_pauses(wav_path)
        transcript = transcribe_whisper(wav_path)
        # Words per second; guarded against zero-length audio.
        speech_rate = len(transcript.split()) / duration if duration > 0 else 0

        audio_data.append({
            "file": mp3_path,
            "duration": float(duration),
            "speech_rate": float(speech_rate),
            # The beat tracker may return a 1-element array rather than a
            # scalar; store a plain float so the column stays numeric.
            "tempo": float(np.atleast_1d(tempo)[0]),
            "pitch": float(pitch),
            "pauses": pauses,
            "transcript": transcript
        })
    except Exception as e:
        print(f"Error processing {mp3_path}: {e}")

# Create DataFrame
df = pd.DataFrame(audio_data)

FEATURE_COLUMNS = ["speech_rate", "tempo", "pitch", "pauses"]
N_CLUSTERS = 2

# An empty or single-row frame cannot be clustered; fail with a clear message
# instead of a KeyError on the missing feature columns.
if len(df) < N_CLUSTERS:
    raise ValueError(
        f"Need at least {N_CLUSTERS} successfully processed recordings to cluster, got {len(df)}."
    )

# Cluster
features = df[FEATURE_COLUMNS]
# The features live on very different scales (Hz, BPM, words/s, counts);
# standardise them so no single feature dominates the Euclidean distance.
features_scaled = StandardScaler().fit_transform(features)
# n_init is set explicitly because its default differs between scikit-learn
# versions, which would otherwise change the result.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0).fit(features_scaled)
df["cluster"] = kmeans.labels_

# Show result
print(df[["file", "speech_rate", "tempo", "pitch", "pauses", "cluster"]])


In [None]:
from sklearn.metrics import classification_report

# Derive true label from folder name (basic example)
def extract_true_label(file_path):
    """Derive the diagnostic label from the folder/file names in a path.

    Each path component is lower-cased and split into alphabetic tokens; a
    label is recognised only when it equals a whole token. Plain substring
    matching would label every file under a root such as ``dementiabank`` as
    "dementia", and would find "hc" inside unrelated names.

    Components are examined from the file name upwards, so the folder closest
    to the file decides. Within one component the priority is
    dementia, then mci, then hc.

    Args:
        file_path: Path using ``/`` or ``\\`` separators.

    Returns:
        ``"dementia"``, ``"mci"``, ``"hc"``, or ``"unknown"`` when no
        component carries a label token.
    """
    known_labels = ("dementia", "mci", "hc")
    components = [part for part in re.split(r"[\\/]+", file_path.lower()) if part]
    for component in reversed(components):
        tokens = set(re.split(r"[^a-z]+", component))
        for label in known_labels:
            if label in tokens:
                return label
    return "unknown"

df["true_label"] = df["file"].apply(extract_true_label)

# Map true labels to integers for comparison. Sorting makes the ids
# independent of the order in which files were processed.
label_map = {label: idx for idx, label in enumerate(sorted(df["true_label"].unique()))}
df["true_label_id"] = df["true_label"].map(label_map)

# KMeans cluster ids are arbitrary, so they cannot be compared with label ids
# directly. Assign each cluster the true label that occurs most often in it.
# NOTE: this alignment uses the true labels, so the report is an optimistic,
# purity-style summary rather than a held-out evaluation.
cluster_to_label_id = pd.crosstab(df["cluster"], df["true_label_id"]).idxmax(axis=1)
df["predicted_label"] = df["cluster"].map(cluster_to_label_id)

# Print classification report. `labels` is passed explicitly so that
# `target_names` always lines up with it, even when some class is never
# predicted; zero_division=0 silences the resulting undefined-metric warnings.
print("\n📊 Classification Report (Unsupervised Cluster vs. True Labels):")
print(classification_report(
    df["true_label_id"],
    df["predicted_label"],
    labels=list(label_map.values()),
    target_names=list(label_map.keys()),
    zero_division=0,
))
