In [1]:
import os
import pandas as pd
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import joblib
import json


In [2]:
RAW_ROOT = "../dataset/adress2020_raw/train"
CLEAN_ROOT = "../dataset/adress2020"

AUDIO_RAW_CC = os.path.join(RAW_ROOT, "Full_wave_enhanced_audio", "cc")
AUDIO_RAW_CD = os.path.join(RAW_ROOT, "Full_wave_enhanced_audio", "cd")

TRANS_RAW_CC = os.path.join(RAW_ROOT, "transcription", "cc")
TRANS_RAW_CD = os.path.join(RAW_ROOT, "transcription", "cd")

AUDIO_CLEAN = os.path.join(CLEAN_ROOT, "audio")
TRANS_CLEAN = os.path.join(CLEAN_ROOT, "transcripts")

os.makedirs(AUDIO_CLEAN, exist_ok=True)
os.makedirs(TRANS_CLEAN, exist_ok=True)


In [3]:
import shutil

for src_dir in [AUDIO_RAW_CC, AUDIO_RAW_CD]:
    for f in os.listdir(src_dir):
        if f.endswith(".wav"):
            new_name = f.replace("_enhanced", "")
            shutil.copy(os.path.join(src_dir, f), os.path.join(AUDIO_CLEAN, new_name))

print("Audio files copied:", len(os.listdir(AUDIO_CLEAN)))


Audio files copied: 108


In [4]:
for src_dir in [TRANS_RAW_CC, TRANS_RAW_CD]:
    for f in os.listdir(src_dir):
        if f.endswith(".cha"):
            shutil.copy(os.path.join(src_dir, f), os.path.join(TRANS_CLEAN, f))

print("Transcript files copied:", len(os.listdir(TRANS_CLEAN)))


Transcript files copied: 108


In [5]:
cc_meta = os.path.join(RAW_ROOT, "cc_meta_data.txt")
cd_meta = os.path.join(RAW_ROOT, "cd_meta_data.txt")

cc = pd.read_csv(cc_meta, sep=";", engine="python",
                 names=["id", "age", "gender", "mmse"],
                 skiprows=1)
cc["id"] = cc["id"].str.strip()
cc["label"] = "Control"

cd = pd.read_csv(cd_meta, sep=";", engine="python",
                 names=["id", "age", "gender", "mmse"],
                 skiprows=1)
cd["id"] = cd["id"].str.strip()
cd["label"] = "AD"

df = pd.concat([cc, cd], ignore_index=True)
df["filename"] = df["id"] + ".wav"

labels = df[["filename", "label"]]
labels.to_csv(os.path.join(CLEAN_ROOT, "labels.csv"), index=False)

labels.head()


Unnamed: 0,filename,label
0,S001.wav,Control
1,S002.wav,Control
2,S003.wav,Control
3,S004.wav,Control
4,S005.wav,Control


In [6]:
def extract_patient_text(cha_path, speaker_tag="*PAR:"):
    text_lines = []
    with open(cha_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            if line.startswith(speaker_tag):
                content = line.split("\t", 1)[-1]
                text_lines.append(content)
    return " ".join(text_lines).lower()

def extract_text_features(text):
    words = text.split()
    length = len(words)
    avg_len = np.mean([len(w) for w in words]) if words else 0.0
    unique_ratio = len(set(words)) / length if length else 0.0
    return np.array([length, avg_len, unique_ratio])


In [7]:
import librosa
import numpy as np

def extract_audio_features(audio_path: str, sr_target: int = 16000) -> np.ndarray:
    """
    Rich acoustic feature set:
    - MFCC (mean + std)
    - Delta MFCC
    - Delta-Delta MFCC
    - Chroma
    - Spectral contrast
    - Tonnetz
    """
    y, sr = librosa.load(audio_path, sr=sr_target)

    # MFCCs and their deltas
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc)
    mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

    # Chroma, contrast, tonnetz
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

    # Take mean + std for stability
    feat_vec = np.hstack([
        mfcc.mean(axis=1),        # 13
        mfcc.std(axis=1),         # 13
        mfcc_delta.mean(axis=1),  # 13
        mfcc_delta2.mean(axis=1), # 13
        chroma.mean(axis=1),      # 12
        contrast.mean(axis=1),    # 7
        tonnetz.mean(axis=1),     # 6
    ])

    return feat_vec.astype(np.float32)



In [8]:
labels_df = pd.read_csv(os.path.join(CLEAN_ROOT, "labels.csv"))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Step 1 — read all texts first
all_texts = []

for _, row in tqdm(labels_df.iterrows(), total=len(labels_df)):
    fname = row["filename"]
    cha_path = os.path.join(TRANS_CLEAN, fname.replace(".wav", ".cha"))

    if os.path.exists(cha_path):
        text = extract_patient_text(cha_path)
    else:
        text = ""

    all_texts.append(text)

# Step 2 — TF-IDF (300 features)
vectorizer = TfidfVectorizer(max_features=300)
tfidf_matrix = vectorizer.fit_transform(all_texts).toarray()

# Step 3 — Reduce TF-IDF dimensions to 20
pca = PCA(n_components=20)
tfidf_reduced = pca.fit_transform(tfidf_matrix)

print("TF-IDF reduced shape:", tfidf_reduced.shape)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [00:00<00:00, 4883.67it/s]

TF-IDF reduced shape: (108, 20)





## Build combined features (audio + text)

In [9]:
features = []
labels_list = []

for i, row in tqdm(labels_df.iterrows(), total=len(labels_df)):
    fname = row["filename"]
    f_label = row["label"]

    audio_path = os.path.join(AUDIO_CLEAN, fname)
    cha_path = os.path.join(TRANS_CLEAN, fname.replace(".wav", ".cha"))

    if not os.path.exists(audio_path) or not os.path.exists(cha_path):
        continue

    # Extract audio features
    audio_feat = extract_audio_features(audio_path)

    # NEW: Use TF-IDF reduced text features
    text_feat = tfidf_reduced[i]

    # Combine both
    combined = np.hstack([audio_feat, text_feat])

    features.append(combined)
    labels_list.append(f_label)


    # STEP 6 — Convert to numpy arrays
# -------------------------------
X = np.array(features)
y_text = np.array(labels_list)

print("Final shape:", X.shape, y_text.shape)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [04:35<00:00,  2.55s/it]

Final shape: (108, 97) (108,)





In [10]:
encoder = LabelEncoder()
y = encoder.fit_transform(y_text)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## Train models

In [12]:
models = {
    "RandomForest": RandomForestClassifier(n_estimators=500, class_weight="balanced"),
    "LogisticRegression": LogisticRegression(max_iter=500),
    "XGBoost": XGBClassifier(eval_metric="logloss", use_label_encoder=False),
    "SVM": SVC(kernel='rbf', probability=True)
}

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)

    results.append([name, acc, prec, rec, f1])
    print(classification_report(y_test, preds, target_names=encoder.classes_))

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1"])
results_df



Training RandomForest...
              precision    recall  f1-score   support

          AD       0.64      0.82      0.72        11
     Control       0.75      0.55      0.63        11

    accuracy                           0.68        22
   macro avg       0.70      0.68      0.68        22
weighted avg       0.70      0.68      0.68        22


Training LogisticRegression...
              precision    recall  f1-score   support

          AD       0.60      0.55      0.57        11
     Control       0.58      0.64      0.61        11

    accuracy                           0.59        22
   macro avg       0.59      0.59      0.59        22
weighted avg       0.59      0.59      0.59        22


Training XGBoost...
              precision    recall  f1-score   support

          AD       0.71      0.91      0.80        11
     Control       0.88      0.64      0.74        11

    accuracy                           0.77        22
   macro avg       0.79      0.77      0.77      

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,RandomForest,0.681818,0.75,0.545455,0.631579
1,LogisticRegression,0.590909,0.583333,0.636364,0.608696
2,XGBoost,0.772727,0.875,0.636364,0.736842
3,SVM,0.590909,0.625,0.454545,0.526316
