In [1]:
!pip install librosa pandas scikit-learn joblib tqdm



In [19]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

DATA_ROOT = "../dataset/Pitt"  # <-- change if needed


In [20]:
def get_label_from_folder(folder_name):
    folder = folder_name.lower()
    if "control" in folder:
        return 0
    if "dementia" in folder:
        return 2
    return None


In [21]:
def extract_patient_text(cha_path, tag="*PAR:"):
    text_lines = []
    with open(cha_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if line.startswith(tag):
                if "\t" in line:
                    content = line.split("\t", 1)[1]
                else:
                    content = line[len(tag):]
                text_lines.append(content)

    return " ".join(text_lines).lower()


In [22]:
def extract_text_features(text):
    words = text.split()
    length = len(words)
    avg_len = sum(len(w) for w in words) / length if length else 0
    unique_ratio = len(set(words)) / length if length else 0
    return np.array([length, avg_len, unique_ratio])


In [23]:
def extract_audio_features(audio_path, sr_target=16000):
    y, sr = librosa.load(audio_path, sr=sr_target)

    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

    return np.hstack([mfcc, chroma, zcr, centroid, tempo])


In [24]:
features = []
labels = []

for folder in ["Control", "Dementia"]:
    folder_path = os.path.join(DATA_ROOT, folder)
    label = get_label_from_folder(folder)

    for file in tqdm(os.listdir(folder_path), desc=f"Processing {folder}"):
        if file.endswith(".cha"):
            cha_path = os.path.join(folder_path, file)
            wav_path = os.path.join(folder_path, file.replace(".cha", ".wav"))

            if not os.path.exists(wav_path):
                continue

            # Text features
            text = extract_patient_text(cha_path)
            text_feat = extract_text_features(text)

            # Audio features
            audio_feat = extract_audio_features(wav_path)

            combined = np.hstack([audio_feat, text_feat])
            features.append(combined)
            labels.append(label)


Processing Control: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 49344.75it/s]
Processing Dementia: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51781.53it/s]


In [25]:
X = np.array(features)
y = np.array(labels)

X.shape, y.shape


((0,), (0,))

In [26]:
import os

print("CWD =", os.getcwd())
print("DATA_ROOT exists:", os.path.exists(DATA_ROOT))
print("Folders inside DATA_ROOT:", os.listdir(DATA_ROOT))


CWD = F:\Projects\github projects\dimentia ai\notebooks
DATA_ROOT exists: True
Folders inside DATA_ROOT: ['0metadata.cdc', 'Control', 'Dementia']


In [27]:
import os

control_cookie = os.path.join(DATA_ROOT, "Control", "cookie")
print("Path:", control_cookie)
print("Exists:", os.path.exists(control_cookie))
print("Files:", os.listdir(control_cookie)[:20])


Path: ../dataset/Pitt\Control\cookie
Exists: True
Files: ['002-0.cha', '002-1.cha', '002-2.cha', '002-3.cha', '006-2.cha', '006-3.cha', '006-4.cha', '013-0.cha', '013-2.cha', '013-3.cha', '013-4.cha', '015-0.cha', '015-1.cha', '015-2.cha', '015-3.cha', '015-4.cha', '017-4.cha', '021-0.cha', '021-1.cha', '021-2.cha']
