In [35]:
# Install required libraries
!pip install librosa transformers   



In [36]:
# Define paths to data
cha_path = '/kaggle/input/pittcombined/PittCombined/cha'
mp3_path = '/kaggle/input/pittcombined/PittCombined/mp3'

In [37]:
# Import necessary libraries
import os
import librosa
import pandas as pd
from transformers import BertTokenizer, BertModel


In [38]:
# List files in directories
cha_files = [os.path.join(cha_path, file) for file in os.listdir(cha_path) if file.endswith('.cha')]
mp3_files = [os.path.join(mp3_path, file) for file in os.listdir(mp3_path) if file.endswith('.mp3')]

print(f"Found {len(cha_files)} CHA files and {len(mp3_files)} MP3 files.")

Found 1255 CHA files and 1253 MP3 files.


In [39]:
import os
import re
import random
from collections import defaultdict, Counter

def extract_diagnosis(cha_file):
    """Extracts the diagnosis from a CHAT transcript file."""
    with open(cha_file, 'r') as file:
        content = file.read()
    match = re.search(r'@ID:\s*[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|([^|]*)\|', content)
    return match.group(1) if match else None

# Paths to your CHA and MP3 files
cha_path = '/kaggle/input/pittcombined/PittCombined/cha'
mp3_path = '/kaggle/input/pittcombined/PittCombined/mp3'

# Load all CHA files and their diagnoses
cha_files = [f for f in os.listdir(cha_path) if f.endswith('.cha')]
diagnoses = [extract_diagnosis(os.path.join(cha_path, f)) for f in cha_files]

# Group files by diagnosis, ensuring corresponding MP3 exists
files_by_diagnosis = defaultdict(list)
for cha_file, diag in zip(cha_files, diagnoses):
    mp3_file = cha_file.replace('.cha', '.mp3')
    if os.path.exists(os.path.join(mp3_path, mp3_file)):
        files_by_diagnosis[diag].append(cha_file)

# Select up to 500 files, trying to balance across diagnoses
selected_files = []
for diag, files in files_by_diagnosis.items():
    select_count = min(len(files), max(500 // len(files_by_diagnosis), 1))
    selected_files.extend(random.sample(files, select_count))

# Ensure the selection does not exceed 10 if categories were unbalanced
selected_files = random.sample(selected_files, min(500, len(selected_files)))

# Count of selected diagnoses
selected_diagnoses = [extract_diagnosis(os.path.join(cha_path, f)) for f in selected_files]
diagnosis_count = Counter(selected_diagnoses)

# Find corresponding MP3 files
selected_mp3_files = [f.replace('.cha', '.mp3') for f in selected_files]

# Collect full paths for the selected files
cha_files = [os.path.join(cha_path, f) for f in selected_files]
mp3_files = [os.path.join(mp3_path, f) for f in selected_mp3_files]

# Output results
print("Selected CHA files:", cha_files)
print("Selected MP3 files:", mp3_files)
print("Diagnosis counts:", dict(diagnosis_count))


Selected CHA files: ['/kaggle/input/pittcombined/PittCombined/cha/172-1f.cha', '/kaggle/input/pittcombined/PittCombined/cha/260-1s.cha', '/kaggle/input/pittcombined/PittCombined/cha/118-1.cha', '/kaggle/input/pittcombined/PittCombined/cha/120-2.cha', '/kaggle/input/pittcombined/PittCombined/cha/120-4.cha', '/kaggle/input/pittcombined/PittCombined/cha/223-1s.cha', '/kaggle/input/pittcombined/PittCombined/cha/355-1r.cha', '/kaggle/input/pittcombined/PittCombined/cha/585-0r.cha', '/kaggle/input/pittcombined/PittCombined/cha/704-0.cha', '/kaggle/input/pittcombined/PittCombined/cha/057-2f.cha', '/kaggle/input/pittcombined/PittCombined/cha/338-0.cha', '/kaggle/input/pittcombined/PittCombined/cha/511-0s.cha', '/kaggle/input/pittcombined/PittCombined/cha/248-2.cha', '/kaggle/input/pittcombined/PittCombined/cha/024-2s.cha', '/kaggle/input/pittcombined/PittCombined/cha/322-1.cha', '/kaggle/input/pittcombined/PittCombined/cha/073-3.cha', '/kaggle/input/pittcombined/PittCombined/cha/468-0f.cha', '

In [40]:
cha_base_names = set([os.path.splitext(os.path.basename(f))[0] for f in cha_files])
mp3_base_names = set([os.path.splitext(os.path.basename(f))[0] for f in mp3_files])

unmatched_cha = cha_base_names - mp3_base_names
unmatched_mp3 = mp3_base_names - cha_base_names

print("Unmatched CHA files:", unmatched_cha)
print("Unmatched MP3 files:", unmatched_mp3)

Unmatched CHA files: set()
Unmatched MP3 files: set()


In [41]:
import librosa
from transformers import BertTokenizer, BertModel
import soundfile as sf

from transformers import DistilBertTokenizer, DistilBertModel

# Initialize DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [42]:
import os
import librosa
import soundfile as sf

def preprocess_audio(mp3_file_path, output_wav_path):
    # Define output path within the writable directory
    output_wav_path = os.path.join('/kaggle/working', os.path.basename(output_wav_path))
    
    try:
        # Check if the WAV file already exists
        if not os.path.exists(output_wav_path):
            # Convert MP3 to WAV
            y, sr = librosa.load(mp3_file_path, sr=None)
            sf.write(output_wav_path, y, sr)
            print(f"Converted {mp3_file_path} to WAV.")
        else:
            print(f"WAV file already exists: {output_wav_path}")

        # Extract MFCC features
        y, sr = librosa.load(output_wav_path, sr=None)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        return mfccs.mean(axis=1)
    except Exception as e:
        print(f"An error occurred while processing {mp3_file_path}: {str(e)}")
        return None

# Replace 'mp3_files' with the actual list of mp3 file paths
audio_features = [preprocess_audio(f, f.replace('.mp3', '.wav')) for f in mp3_files if f.endswith('.mp3')]


Converted /kaggle/input/pittcombined/PittCombined/mp3/172-1f.mp3 to WAV.
Converted /kaggle/input/pittcombined/PittCombined/mp3/260-1s.mp3 to WAV.
WAV file already exists: /kaggle/working/118-1.wav
WAV file already exists: /kaggle/working/120-2.wav
Converted /kaggle/input/pittcombined/PittCombined/mp3/120-4.mp3 to WAV.
WAV file already exists: /kaggle/working/223-1s.wav
Converted /kaggle/input/pittcombined/PittCombined/mp3/355-1r.mp3 to WAV.
WAV file already exists: /kaggle/working/585-0r.wav
WAV file already exists: /kaggle/working/704-0.wav
Converted /kaggle/input/pittcombined/PittCombined/mp3/057-2f.mp3 to WAV.
WAV file already exists: /kaggle/working/338-0.wav
WAV file already exists: /kaggle/working/511-0s.wav
WAV file already exists: /kaggle/working/248-2.wav
WAV file already exists: /kaggle/working/024-2s.wav
WAV file already exists: /kaggle/working/322-1.wav
Converted /kaggle/input/pittcombined/PittCombined/mp3/073-3.mp3 to WAV.
WAV file already exists: /kaggle/working/468-0f.wa

In [43]:
# Define a function to preprocess text data
def preprocess_text(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    # Insert text cleaning code here as necessary
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    outputs = model(**inputs)
    return outputs.last_hidden_state.squeeze().detach().numpy()

In [44]:
print(cha_files)

['/kaggle/input/pittcombined/PittCombined/cha/172-1f.cha', '/kaggle/input/pittcombined/PittCombined/cha/260-1s.cha', '/kaggle/input/pittcombined/PittCombined/cha/118-1.cha', '/kaggle/input/pittcombined/PittCombined/cha/120-2.cha', '/kaggle/input/pittcombined/PittCombined/cha/120-4.cha', '/kaggle/input/pittcombined/PittCombined/cha/223-1s.cha', '/kaggle/input/pittcombined/PittCombined/cha/355-1r.cha', '/kaggle/input/pittcombined/PittCombined/cha/585-0r.cha', '/kaggle/input/pittcombined/PittCombined/cha/704-0.cha', '/kaggle/input/pittcombined/PittCombined/cha/057-2f.cha', '/kaggle/input/pittcombined/PittCombined/cha/338-0.cha', '/kaggle/input/pittcombined/PittCombined/cha/511-0s.cha', '/kaggle/input/pittcombined/PittCombined/cha/248-2.cha', '/kaggle/input/pittcombined/PittCombined/cha/024-2s.cha', '/kaggle/input/pittcombined/PittCombined/cha/322-1.cha', '/kaggle/input/pittcombined/PittCombined/cha/073-3.cha', '/kaggle/input/pittcombined/PittCombined/cha/468-0f.cha', '/kaggle/input/pittco

In [45]:
# Process each file
text_embeddings = [preprocess_text(f) for f in cha_files]

In [46]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming text_embeddings and audio_features are lists of numpy arrays
# Convert lists to numpy arrays
text_features = np.array(text_embeddings)
audio_features = np.stack(audio_features)

In [47]:
print(text_features)

[[[-0.5396378  -0.1122188  -0.14136086 ... -0.07889522  0.3143726
    0.33446717]
  [ 0.21182205  0.04163441  0.43201783 ...  0.256563    0.19033904
    0.08237714]
  [ 0.02588163 -0.18625505  0.81828237 ... -0.09514139  0.17848589
    0.42008242]
  ...
  [-0.1534813   0.194721    0.6961406  ...  0.3218218  -0.23201407
   -1.3592888 ]
  [ 0.46738222 -0.33231646  0.26890546 ... -0.1453437  -0.34344774
   -0.370194  ]
  [ 0.6557946   0.08349684 -0.18222585 ...  0.39387137 -0.3084189
   -0.36823618]]

 [[-0.55939883 -0.10720944 -0.262997   ... -0.0638551   0.3407018
    0.3402853 ]
  [ 0.12690997  0.06115264  0.39825737 ...  0.13250828  0.17918362
    0.19027814]
  [ 0.00789125 -0.22654438  0.7108496  ... -0.25727662  0.1398017
    0.5196721 ]
  ...
  [-0.4581162  -0.26147276  0.5805859  ...  0.00213803  0.08770298
   -0.86314625]
  [-0.4289421  -0.10727982  0.54286706 ... -0.09939187 -0.43324986
   -0.6927323 ]
  [ 0.76034784  0.09480232 -0.29435822 ...  0.41920146 -0.35071903
   -0.3618

In [48]:
print(audio_features)

[[-5.3075952e+02  9.3225693e+01  1.3921964e+01 ... -6.2906194e+00
   7.3403950e+00 -2.2417779e+00]
 [-5.5964587e+02  8.7340088e+01 -1.0423868e+01 ... -8.5324764e+00
   8.8442526e+00 -4.2466202e+00]
 [-3.9039734e+02  1.1912372e+02 -2.6985764e+01 ... -4.3761964e+00
   4.8595433e+00  7.7608067e-01]
 ...
 [-4.9511880e+02  8.9522713e+01  5.1734385e+00 ... -3.2919807e+00
   7.9526526e-01 -2.4579440e-01]
 [-5.2650989e+02  1.0799457e+02  6.1760035e+00 ... -2.0162501e+00
   3.0548265e+00 -1.0188855e+00]
 [-4.3381122e+02  1.4063947e+02  3.0807066e+01 ... -2.7706909e+00
   1.3317888e+00 -6.2867141e+00]]


In [49]:
text_features = np.mean(text_features, axis=1)


In [50]:
print(text_features)

[[-0.26754478 -0.07133148  0.37088713 ... -0.02230411  0.01639818
  -0.2508931 ]
 [-0.2989361  -0.086805    0.36174476 ... -0.09444899 -0.05514356
  -0.16749185]
 [-0.2931763  -0.0275961   0.34265968 ... -0.05192115  0.00782341
  -0.13503647]
 ...
 [-0.32714897 -0.11728647  0.40946248 ... -0.0613702  -0.03461002
  -0.14553922]
 [-0.15658194 -0.06696546  0.33279938 ... -0.0986774   0.03073214
  -0.18162242]
 [-0.36398995 -0.08443119  0.36757478 ... -0.02360295  0.01838582
  -0.21575068]]


In [51]:
# Ensure audio_features is 2D (it should already be if you've extracted features correctly)
if audio_features.ndim > 2:
    audio_features = np.mean(audio_features, axis=1)

In [52]:
# Combine text and audio features
combined_features = np.concatenate([text_features, audio_features], axis=1)

In [53]:
print(combined_features)

[[-0.26754478 -0.07133148  0.37088713 ... -6.2906194   7.340395
  -2.241778  ]
 [-0.2989361  -0.086805    0.36174476 ... -8.532476    8.844253
  -4.24662   ]
 [-0.2931763  -0.0275961   0.34265968 ... -4.3761964   4.8595433
   0.77608067]
 ...
 [-0.32714897 -0.11728647  0.40946248 ... -3.2919807   0.79526526
  -0.2457944 ]
 [-0.15658194 -0.06696546  0.33279938 ... -2.0162501   3.0548265
  -1.0188855 ]
 [-0.36398995 -0.08443119  0.36757478 ... -2.770691    1.3317888
  -6.286714  ]]


In [54]:
# Normalize the combined features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(combined_features)

In [55]:
import re

def extract_diagnostic_code(cha_file):
    # Read the content of the .cha file
    with open(cha_file, 'r') as file:
        content = file.read()

    # Regex to find the diagnosis in the file content based on the updated structure
    match = re.search(r'@ID:\s*[^|]*\|[^|]*\|[^|]*\|[^|]*\|[^|]*\|([^|]*)\|\|', content)
    if match:
        return match.group(1).strip()
    return None

labels = [extract_diagnostic_code(f) for f in cha_files]

for label in labels[:10]:  # Print first 10 labels to verify
    print(label)


MCI
MCI
Control
MCI
MCI
Vascular
ProbableAD
PossibleAD
Other
ProbableAD


In [56]:
from sklearn.preprocessing import LabelEncoder

# Encode labels as integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)


In [57]:
print(encoded_labels)

[3 3 1 3 3 9 8 6 5 8 3 6 1 4 1 1 9 3 8 3 4 6 1 6 7 3 8 1 1 1 8 6 1 1 8 4 1
 3 8 3 1 8 3 6 8 9 6 8 6 3 6 3 1 1 8 1 3 1 6 3 3 6 8 3 3 6 7 4 6 8 6 1 1 8
 6 9 6 8 1 1 5 1 9 6 1 8 3 6 8 6 3 8 3 8 9 6 3 9 1 1 8 6 8 1 1 3 8 5 6 7 6
 9 6 1 3 8 9 6 6 3 4 8 8 1 6 9 3 3 8 1 6 3 1 6 1 1 3 8 6 3 1 8 8 5 9 8 8 1
 8 4 8 3 4 1 6 1 6 9 6 3 1 6 4 1 3 3 6 8 8 1 6 8 1 8 8 7 3 8 9 3 6 8 1 6 1
 8 9 6 1 6 6 8 6 6 1 3 6 8 9 9 1 6 1 4 6 3 6 4 8 3 8 0 2 3 4 3 3 8 4 3 8 8
 6 8 9 1 3 6 6 1 6 3 9 1 8 3 3 8 1 6 9 1 1 8 3 1 3 3 3 1 3 1 6 8 3 6 3 8 6
 3]


In [58]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Conv1D, GlobalAveragePooling1D, BatchNormalization
from tensorflow.keras.optimizers import Adam

In [59]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler

In [60]:
# Split data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(normalized_features, encoded_labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Finding the maximum label value across all datasets to ensure consistency
max_label = max(np.max(y_train), np.max(y_val), np.max(y_test)) + 1  # Plus one because classes are zero-indexed

# One-hot encode the labels with a consistent number of classes across datasets
y_train = to_categorical(y_train, num_classes=max_label)
y_val = to_categorical(y_val, num_classes=max_label)
y_test = to_categorical(y_test, num_classes=max_label)




In [61]:
print("y_test",y_test)

y_test [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0

In [62]:
from tensorflow.keras.regularizers import l2

# Model architecture
input_text = Input(shape=(768,))  # Text features from Clinical BERT
input_audio = Input(shape=(13,))  # Audio features, assuming MFCCs with 13 coefficients

# Text pathway
text_dense = Dense(128, activation='relu', kernel_regularizer=l2(0.1))(input_text)
text_bn = BatchNormalization()(text_dense)  # Add batch normalization
text_out = Dropout(0.5)(text_bn)  # Increase dropout rate


In [63]:
from tensorflow.keras.layers import Reshape

# Audio pathway
audio_reshape = Reshape((13, 1))(input_audio)
conv1 = Conv1D(64, kernel_size=3, activation='relu', kernel_regularizer=l2(0.1))(audio_reshape)
conv1_bn = BatchNormalization()(conv1)
conv1_pool = GlobalAveragePooling1D()(conv1_bn)
audio_out = Dropout(0.5)(conv1_pool)  # Increase dropout rate

In [64]:
# Fusion and output
concatenated = Concatenate()([text_out, audio_out])
dense_layer = Dense(64, activation='relu', kernel_regularizer=l2(0.1))(concatenated)  # Regularize dense layer

# Assuming y_train has already been one-hot encoded correctly
num_classes = y_train.shape[1]

# Adjust your output layer
output_layer = Dense(num_classes, activation='softmax')(dense_layer)


In [65]:
print(concatenated)

<KerasTensor shape=(None, 192), dtype=float32, sparse=False, name=keras_tensor_23>


In [66]:
from tensorflow.keras.metrics import Precision, Recall

# Compile model
model = Model(inputs=[input_text, input_audio], outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', 
              metrics=['accuracy', Precision(), Recall()])


In [67]:
# Fit model on training data
model.fit([X_train[:, :768], X_train[:, 768:]], y_train, validation_data=([X_val[:, :768], X_val[:, 768:]], y_val), epochs=100, batch_size=32)


Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 144ms/step - accuracy: 0.1921 - loss: 33.6753 - precision_1: 0.3106 - recall_1: 0.0553 - val_accuracy: 0.2692 - val_loss: 31.5733 - val_precision_1: 0.0000e+00 - val_recall_1: 0.0000e+00
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.3493 - loss: 30.6814 - precision_1: 0.6271 - recall_1: 0.1255 - val_accuracy: 0.2885 - val_loss: 29.0491 - val_precision_1: 0.4167 - val_recall_1: 0.0962
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.4291 - loss: 27.8958 - precision_1: 0.5951 - recall_1: 0.1681 - val_accuracy: 0.3077 - val_loss: 26.6693 - val_precision_1: 0.5263 - val_recall_1: 0.1923
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5672 - loss: 25.4068 - precision_1: 0.6887 - recall_1: 0.3009 - val_accuracy: 0.3269 - val_loss: 24.3744 - val_precision_1: 0.5500 - val_

<keras.src.callbacks.history.History at 0x7c64aaa7a800>

In [68]:
# Evaluate the model on the test set
evaluation = model.evaluate([X_test[:, :768], X_test[:, 768:]], y_test)
print(f'Accuracy: {evaluation[1]*100:.2f}%, Precision: {evaluation[2]*100:.2f}%, Recall: {evaluation[3]*100:.2f}%')

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4768 - loss: 2.3328 - precision_1: 0.4451 - recall_1: 0.3069 
Accuracy: 48.08%, Precision: 42.86%, Recall: 28.85%
