# Data Loading and Preprocessing

### Setting Path to Dataset Folder

In [None]:
# Set the path to the dataset folder
# IMPORTANT: Modify this path to point to your local copy of the Pitt Corpus database.
folder_path = '/path/to/PittCorpus'

# Note: Ensure that the dataset has the expected folder structure. The loading code below reads
# '<folder_path>/Transcripts/Control/cookie' and '<folder_path>/Transcripts/Dementia/cookie'.

### Preprocessing Functions

The following functions implement alternative text-cleaning pipelines (cleaning, extraction, and symbol replacement) tailored to the CHAT format of the Pitt Corpus. These preprocessing steps normalize and structure the transcript data before further analysis.


In [None]:
import re
import os
import pandas as pd

In [None]:
def clean_text(text):
    """Normalise raw CHAT transcript text into plain lowercase text.

    The input is converted with ``str()`` and lowercased. Timing markers
    (e.g. ``30_5640``), the ``%wor:``/``%mor:``/``%gra:``/``*par:`` labels,
    ``pos:sub|word`` morphological tags, the ``‡`` symbol, ``[+ exc]``
    annotations, ``+<`` markers and every ``&``-prefixed token are removed.
    ``[+ gram]`` annotations are left in place. Whitespace in front of ``.``
    and ``?`` is dropped, and all remaining whitespace runs (tabs included)
    are collapsed to a single space.

    Returns:
        The cleaned string without leading or trailing whitespace.
    """
    # Applied in this order; each pattern is deleted outright
    removals = (
        r'\d+_\d+',                    # timing markers
        r'(%wor|%mor|%gra|\*par):',    # tier / speaker labels
        r'\b\w+:\w+\|\w+',             # morphological tags such as det:art|the
        '‡',                           # special symbol
        r'\[\+ exc\]',                 # [+ exc] annotation
        r'\+<\s*',                     # +< marker and the blanks that follow it
        r'&\S+',                       # &-prefixed codes
    )
    cleaned = str(text).lower()
    for pattern in removals:
        cleaned = re.sub(pattern, '', cleaned)
    # Attach '.' and '?' to the preceding token
    cleaned = re.sub(r'\s+([.?])', r'\1', cleaned)
    # Collapse every whitespace run and trim the ends
    return re.sub(r'\s+', ' ', cleaned).strip()


In [None]:
def clean_words_only(text):
    """Reduce CHAT transcript text to lowercase words separated by single spaces.

    After ``str()`` conversion and lowercasing, the function deletes timing
    markers, ``%xxx:`` / ``*par:`` labels, ``pos:sub|word`` morphological tags,
    bracket and slash characters, ``+``-prefixed codes, ``&``-prefixed codes
    and punctuation. Any character that is not a letter in
    ``a-z`` / ``áéíóúüñ`` or whitespace is then dropped.

    Returns:
        The remaining words joined by single spaces, without leading or
        trailing whitespace.
    """
    # Deleted in this order; later patterns see the output of earlier ones
    deletions = (
        r'\d+_\d+',                # timing markers
        r'(%\w+:|\*par:)',         # tier / speaker labels
        r'\b\w+:\w+\|\w+',         # morphological tags
        r'[<>\[\]\(\)\{\}/\\]',    # bracket and slash characters (their content is kept)
        r'\+[^ ]*',                # + prefixed codes up to the next space
        r'&\S+',                   # & prefixed codes
        r'[.,!?]',                 # punctuation
        r'[^a-záéíóúüñ\s]',        # anything that is neither a letter nor whitespace
    )
    words = str(text).lower()
    for pattern in deletions:
        words = re.sub(pattern, '', words)
    return re.sub(r'\s+', ' ', words).strip()


In [None]:
def clean_without_special_symbols(text):
    """Lowercase CHAT transcript text and delete annotation spans with their content.

    Removed, in order: timing markers, ``%xxx:`` / ``*par:`` labels,
    ``pos:sub|word`` morphological tags, everything between ``<>``, ``()``
    and ``[]`` (delimiters included), ``+``-prefixed codes and ``&``-prefixed
    codes. Punctuation outside those spans is kept.

    Returns:
        The remaining text with whitespace runs collapsed to one space and
        no leading or trailing whitespace.
    """
    result = str(text).lower()
    for pattern in (
        r'\d+_\d+',            # timing markers
        r'(%\w+:|\*par:)',     # tier / speaker labels
        r'\b\w+:\w+\|\w+',     # morphological tags
        r'<[^>]+>',            # content between <>
        r'\([^)]*\)',          # content between ()
        r'\[[^\]]*\]',         # content between []
        r'\+[^ ]*',            # + prefixed codes
        r'&\S+',               # & codes
    ):
        result = re.sub(pattern, '', result)
    return re.sub(r'\s+', ' ', result).strip()



In [None]:
def replace_symbols(text):
    """Substitute selected CHAT markers in ``text`` with English words.

    The substitutions are applied in order: ``[/]`` -> ``pause``,
    ``[//]`` -> ``long pause``, any ``<...>`` span -> ``correction``,
    ``(...)`` -> ``mispronunciation`` and ``[+ gram]`` -> ``grammatical error``.
    The text is not lowercased or otherwise cleaned.

    NOTE(review): the word chosen for each marker is this notebook's own
    mapping -- confirm it against the CHAT manual before interpreting results.
    """
    substitutions = (
        (r'\[/\]', 'pause'),
        (r'\[//\]', 'long pause'),
        (r'<[^>]+>', 'correction'),
        (r'\(\.\.\.\)', 'mispronunciation'),
        (r'\[\+ gram\]', 'grammatical error'),
    )
    for pattern, word in substitutions:
        text = re.sub(pattern, word, text)
    return text


In [None]:
def extract_cha_info(folder_path):
    """Load the cookie-task ``.cha`` transcripts into a DataFrame.

    Reads every ``.cha`` file in ``<folder_path>/Control/cookie`` and
    ``<folder_path>/Dementia/cookie``.

    Args:
        folder_path: Directory that contains the ``Control`` and ``Dementia``
            transcript folders.

    Returns:
        A DataFrame with one row per file and the columns ``file``,
        ``diagnosis`` (sixth ``|``-separated field of the ``@ID`` header),
        ``raw_text`` (the ``*PAR:`` lines joined by spaces) and
        ``cleaned_text`` (``clean_text`` applied to ``raw_text``), plus one
        ``diagnosis_<value>`` indicator column per distinct diagnosis.

    Raises:
        IndexError: If a file has no ``@ID`` line, or the selected ``@ID``
            line has fewer than six ``|``-separated fields.
    """
    records = []

    # Process both Control and Dementia subfolders
    for subfolder in ['Control/cookie', 'Dementia/cookie']:
        full_path = os.path.join(folder_path, subfolder)
        # sorted() makes the row order independent of the directory listing order
        for filename in sorted(os.listdir(full_path)):
            if not filename.endswith('.cha'):
                continue
            file_path = os.path.join(full_path, filename)
            # Explicit encoding: do not depend on the platform's default locale
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.read().splitlines()

            id_lines = [line for line in lines if line.startswith('@ID')]
            if not id_lines:
                # Same exception type as the former bare `[0]` lookup, but naming the file
                raise IndexError(f'no @ID header line found in {file_path}')
            # Prefer the participant's @ID line; fall back to the first one as before
            id_line = next((line for line in id_lines if '|PAR|' in line), id_lines[0])
            diagnosis = id_line.split('|')[5]

            # Patient's speech: only lines that start with the *PAR: label
            patient_text = ' '.join(line for line in lines if line.startswith('*PAR:'))
            # To try another pipeline, swap clean_text for replace_symbols,
            # clean_words_only or clean_without_special_symbols here.
            cleaned_text = clean_text(patient_text)

            records.append((filename, diagnosis, patient_text, cleaned_text))

    df = pd.DataFrame(records, columns=['file', 'diagnosis', 'raw_text', 'cleaned_text'])

    # Append one-hot encoded diagnosis columns next to the original ones
    diagnosis_one_hot = pd.get_dummies(df['diagnosis'], prefix='diagnosis')
    return pd.concat([df, diagnosis_one_hot], axis=1)

### Dataset Preparation

This section loads the CHAT files, constructs the initial DataFrame, and applies basic preprocessing such as filtering classes and mapping diagnosis labels into binary categories for classification tasks.


In [None]:
# Build the path with os.path.join, the same way extract_cha_info joins its subfolders
transcripts_path = os.path.join(folder_path, 'Transcripts')
# One row per .cha transcript found under the Control and Dementia cookie folders
df_transcripts = extract_cha_info(transcripts_path)


In [None]:
# Sort transcripts by filename (rows are reordered; the existing index labels are kept as they are)
df_transcripts = df_transcripts.sort_values(by='file')


In [None]:
# Number of transcripts available for each diagnosis value
class_counts = df_transcripts['diagnosis'].value_counts()

# Diagnoses that occur at least twice; rows with any other diagnosis are dropped
frequent_diagnoses = class_counts[class_counts >= 2].index
has_frequent_diagnosis = df_transcripts['diagnosis'].isin(frequent_diagnoses)
df_transcripts_filtered = df_transcripts[has_frequent_diagnosis].copy()


In [None]:
def _binary_label(diagnosis):
    """Map a diagnosis string to 0 (Control), 1 (ProbableAD / PossibleAD) or -1 (anything else)."""
    if diagnosis == 'Control':
        return 0
    if diagnosis in ['ProbableAD', 'PossibleAD']:
        return 1
    return -1


# Add the numeric label column used by the classification experiments
df_transcripts_filtered['global_diagnosis'] = df_transcripts_filtered['diagnosis'].apply(_binary_label)

In [None]:
# Define label names for reporting. Keys are the `global_diagnosis` codes;
# the -1 ("other") code has no entry because those rows are filtered out before evaluation.
label_map = {0: "Control", 1: "Alzheimer's Indicator"}

# Preliminary Analysis of BERT Configurations

### Contextual Word Embeddings (BERT) Configurations

This section compares different strategies for generating sentence-level embeddings from BERT to evaluate their impact on classification performance. Three methods are explored:

- **Mean of token embeddings**: averaging all token embeddings to represent the sentence.
- **[CLS] token embedding**: using the special classification token provided by BERT as a summary of the sentence.
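- **Concatenation of the last four hidden layers**: a representation that concatenates the token embeddings of the last four hidden layers and averages them over tokens, combining information from multiple depths of the model.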

All configurations are evaluated on the same train–test split to ensure comparability.

In [None]:
# Import required libraries
import torch
from transformers import BertTokenizer, BertModel
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# Additional libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

# Global plot defaults: font size and figure size in one update, then the grid style
plt.rcParams.update({'font.size': 16, 'figure.figsize': [10, 6]})
plt.style.use('seaborn-v0_8-whitegrid')


In [None]:
# Checkpoint shared by tokenizer and model.
# Set it to 'bert-large-uncased' to load BERT-large instead of BERT-base.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)


In [None]:
from sklearn.model_selection import train_test_split

# Keep only the rows labelled Control (0) or Alzheimer's (1)
is_binary_case = df_transcripts_filtered['global_diagnosis'].isin([0, 1])
df_filtered = df_transcripts_filtered[is_binary_case]

# Inputs are the cleaned transcripts, targets the binary labels
X = df_filtered['cleaned_text']
y = df_filtered['global_diagnosis']

# 80/20 split that preserves the class proportions; the fixed seed makes it repeatable
train_text, test_text, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Explore training set and test set label class distribution
print("Training set label distribution:\n", y_train.value_counts())
print("Test set label distribution:\n", y_test.value_counts())


In [None]:
# Training set sentence embeddings using mean of token embeddings.
# NOTE: the [CLS] cell and the last-4-layers cell also assign `bert_sent_embeddings_train`;
# run only the variant whose pooling matches the one used for `bert_sent_embeddings_test`.
bert_sent_embeddings_train = []
for text in train_text:
    # Calling the tokenizer directly replaces the deprecated `batch_encode_plus`
    encoding = tokenizer(
        [text],                     # List of input texts (a batch of one)
        padding=True,               # Pad to the longest sequence in the batch
        truncation=True,            # Truncate if input exceeds the maximum length
        return_tensors='pt',        # Return PyTorch tensors
        add_special_tokens=True,    # Add special CLS and SEP tokens
    )
    input_ids = encoding['input_ids']               # Token identifiers
    attention_mask = encoding['attention_mask']     # Attention mask
    # Generate training set embeddings using the BERT model (no gradients needed)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state
    # Average over the token dimension to get one vector per transcript
    sentence_embedding = word_embeddings.mean(dim=1)
    bert_sent_embeddings_train.append(sentence_embedding[0].numpy())

# Explicit NumPy matrix (one row per transcript), as the [CLS] cell produces,
# instead of handing scikit-learn a list of torch tensors
bert_sent_embeddings_train = np.array(bert_sent_embeddings_train)

In [None]:
# Training set sentence embeddings using the [CLS] token (designed to summarize full sequence).
# NOTE: this overwrites `bert_sent_embeddings_train` from the mean-pooling cell; the test-set
# embeddings must be built with the same pooling before fitting and predicting.
bert_sent_embeddings_train = []
for text in train_text:
    # Calling the tokenizer directly replaces the deprecated `batch_encode_plus`
    encoding = tokenizer(
        [text],
        padding=True,
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True,
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is the [CLS] token
        cls_embedding = outputs.last_hidden_state[:, 0, :]
    bert_sent_embeddings_train.append(cls_embedding[0].numpy())

# One row per transcript
bert_sent_embeddings_train = np.array(bert_sent_embeddings_train)

In [None]:
# Training set sentence embeddings using concatenation of the last 4 hidden layers.
# NOTE: this overwrites `bert_sent_embeddings_train` from the previous cells; the test-set
# embeddings must be built with the same pooling before fitting and predicting.
bert_sent_embeddings_train = []
for text in train_text:
    # Calling the tokenizer directly replaces the deprecated `batch_encode_plus`
    encoding = tokenizer(
        [text],
        padding=True,
        truncation=True,
        return_tensors='pt',
        add_special_tokens=True,
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        hidden_states = outputs.hidden_states
        # Join the last four layers along the feature axis, then average over tokens
        concat_last_4 = torch.cat(hidden_states[-4:], dim=-1)
        sentence_embedding = concat_last_4.mean(dim=1)
    bert_sent_embeddings_train.append(sentence_embedding[0].numpy())

# Explicit NumPy matrix (one row per transcript), as the [CLS] cell produces,
# instead of handing scikit-learn a list of torch tensors
bert_sent_embeddings_train = np.array(bert_sent_embeddings_train)


In [None]:
# Test set sentence embeddings using the mean of token embeddings
# For CLS token or last-4-layer concatenation, replicate the approach used in the training set accordingly
# (the pooling here must match the cell that last assigned `bert_sent_embeddings_train`).
bert_sent_embeddings_test = []
for text in test_text:
    # Calling the tokenizer directly replaces the deprecated `batch_encode_plus`
    encoding = tokenizer(
        [text],                     # List of input texts (a batch of one)
        padding=True,               # Pad to the longest sequence in the batch
        truncation=True,            # Truncate if input exceeds the maximum length
        return_tensors='pt',        # Return PyTorch tensors
        add_special_tokens=True,    # Add special CLS and SEP tokens
    )
    input_ids = encoding['input_ids']                 # Token identifiers
    attention_mask = encoding['attention_mask']       # Attention mask
    # Generate test set embeddings using the BERT model (no gradients needed)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state
    sentence_embedding = word_embeddings.mean(dim=1)
    bert_sent_embeddings_test.append(sentence_embedding[0].numpy())

# Explicit NumPy matrix (one row per transcript) instead of a list of torch tensors
bert_sent_embeddings_test = np.array(bert_sent_embeddings_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression (balanced for class imbalance)
log_reg_model = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000)
log_reg_model.fit(bert_sent_embeddings_train, y_train)

In [None]:
# Make predictions on the test set
predicted = log_reg_model.predict(bert_sent_embeddings_test)

In [None]:
# Evaluate the logistic regression model trained with BERT embeddings
from sklearn import metrics
print(metrics.classification_report(y_test, predicted, target_names=["Control", "Alzheimer's Indicator"]))

In [None]:
# Confusion matrix of the BERT-embedding classifier on the hold-out test set
conf_matrix = metrics.confusion_matrix(y_test, predicted)
labels = [label_map[class_id] for class_id in np.unique(y_test)]

# Rows are the true classes, columns the predicted ones; the axis names become the plot labels
df_cm = pd.DataFrame(conf_matrix, index=labels, columns=labels).rename_axis(
    index='Ground Truth Labels', columns='Predicted Labels'
)

# Draw the annotated heatmap
plt.figure(figsize=(10, 7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 20})
plt.title('Confusion Matrix - BERT embeddings')
plt.show()


### Tf-Idf

As part of the preliminary comparison, a traditional Tf-Idf approach was also evaluated using the same train–test split applied in the BERT configuration experiments.


In [None]:
# Evaluate a logistic regression classifier using a Tf-Idf feature representation
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

# Token counts -> Tf-Idf weighting -> class-balanced logistic regression
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000)),
])


In [None]:
# Train the pipeline model (vocabulary, IDF weights and classifier are all fitted on the training texts only)
text_clf.fit(train_text, y_train)

# Predict labels for the held-out test texts; the metrics are computed in the following cells
y_pred_tf_idf = text_clf.predict(test_text)

In [None]:
# Compute model accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred_tf_idf)
print("Accuracy:", accuracy)


In [None]:
# Evaluate logistic regression model trained using TF-IDF
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred_tf_idf, target_names=["Control", "Alzheimer's Indicator"]))


In [None]:
# Confusion matrix of the Tf-Idf classifier on the hold-out test set
conf_matrix_tf_idf = metrics.confusion_matrix(y_test, y_pred_tf_idf)
labels = [label_map[class_id] for class_id in np.unique(y_test)]

# Rows are the true classes, columns the predicted ones; the axis names become the plot labels
df_cm = pd.DataFrame(conf_matrix_tf_idf, index=labels, columns=labels).rename_axis(
    index='Ground Truth Labels', columns='Predicted Labels'
)

# Draw the annotated heatmap
plt.figure(figsize=(10, 7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 20})
plt.title('Confusion Matrix - Tf-Idf')
plt.show()

### Non-Contextual Word Embeddings (GloVe)

To complement the preliminary comparison, a non-contextual embedding method based on pre-trained GloVe vectors (300-dimensional) was also evaluated using the same train–test split used in the BERT configuration experiments.

In [None]:
# Load GloVe word embeddings from file
import numpy as np
nltk.download('stopwords')  # Download stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Maps each word to its GloVe vector (float32 NumPy array)
embeddings_index = {}
# IMPORTANT: Modify this path to point to your local copy of the GloVe embeddings file (300-dimensional)
# The context manager closes the file even if parsing fails; the encoding is set explicitly
# so loading does not depend on the platform's default locale.
with open('/path/to/Glove/glove300.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Function to generate sentence embeddings from word embeddings
def sent2vec(s):
    """Return a unit-length sentence vector built from GloVe word vectors.

    The text is lowercased and tokenized with ``word_tokenize``; stopwords and
    words missing from ``embeddings_index`` are skipped. The remaining word
    vectors are summed and the sum is scaled to unit Euclidean length.

    Args:
        s: Text to embed (converted with ``str()``).

    Returns:
        A 1-D NumPy array. A 300-dimensional zero vector is returned when no
        word has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Only a missing word is skipped; other errors are no longer swallowed
    vectors = [
        embeddings_index[w]
        for w in tokens
        if w not in stop_words and w in embeddings_index
    ]
    if not vectors:
        # Explicit empty case (previously detected via the type of the empty sum)
        return np.zeros(300)
    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN, which would break the downstream classifier
        return np.zeros(300)
    return v / norm


In [None]:
# Generate sentence embeddings for the dataset using the function above
from tqdm import tqdm
# sent2vec calls word_tokenize, so this import has to run before the two lines below
from nltk.tokenize import word_tokenize

# One vector per transcript; tqdm only adds a progress bar around the iteration
xtrain_glove = [sent2vec(x) for x in tqdm(train_text)]
xtest_glove = [sent2vec(x) for x in tqdm(test_text)]


In [None]:
xtrain_glove = np.array(xtrain_glove)
xtest_glove = np.array(xtest_glove)

In [None]:
# Train logistic regression model on the GloVe sentence embeddings
from sklearn.linear_model import LogisticRegression

# NOTE: `log_reg_model` is rebound here, replacing the classifier fitted on the BERT embeddings earlier
log_reg_model = LogisticRegression(class_weight='balanced',solver='lbfgs',max_iter=1000)
log_reg_model.fit(xtrain_glove, y_train)

In [None]:
# Make predictions on the test set
predicted_glove = log_reg_model.predict(xtest_glove)


In [None]:
# Evaluate the logistic regression model trained with GloVe embeddings
from sklearn import metrics
print(metrics.classification_report(y_test, predicted_glove, target_names=["Control", "Alzheimer's Indicator"]))


In [None]:
# Confusion matrix of the GloVe-embedding classifier on the hold-out test set
conf_matrix_glove = metrics.confusion_matrix(y_test, predicted_glove)
labels = [label_map[class_id] for class_id in np.unique(y_test)]

# Rows are the true classes, columns the predicted ones; the axis names become the plot labels
df_cm = pd.DataFrame(conf_matrix_glove, index=labels, columns=labels).rename_axis(
    index='Ground Truth Labels', columns='Predicted Labels'
)

# Draw the annotated heatmap
plt.figure(figsize=(10, 7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 20})
plt.title('Confusion Matrix - GloVe embeddings')
plt.show()

### Gemma-2B

In this subsection, the performance of sentence embeddings obtained from the Gemma-2B language model is evaluated, using the same train–test split applied in the previous BERT configuration experiments. The Gemma-2B model, developed by Google, is available via Hugging Face upon request and can be used through the `transformers` library for embedding generation.

In [None]:
!pip install transformers torch

In [None]:
!huggingface-cli login

In [None]:
from transformers import AutoModel, AutoTokenizer

# NOTE: `tokenizer` and `model` are rebound here and no longer refer to the BERT objects loaded earlier
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
import torch

# Generate sentence embeddings using Gemma-2B for the training set
embeddings_gemma = []
for text in train_text:
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling for sentence embedding
    # Cast to float32 so the NumPy conversion also works if the model was loaded in reduced precision
    embeddings_gemma.append(embeddings[0].float().numpy())

# Explicit NumPy matrix (one row per transcript) instead of a list of torch tensors
embeddings_gemma = np.array(embeddings_gemma)


In [None]:
# Generate sentence embeddings using Gemma-2B for the test set
embeddings_gemma_test = []
for text in test_text:
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling for sentence embedding
    # Cast to float32 so the NumPy conversion also works if the model was loaded in reduced precision
    embeddings_gemma_test.append(embeddings[0].float().numpy())

# Explicit NumPy matrix (one row per transcript) instead of a list of torch tensors
embeddings_gemma_test = np.array(embeddings_gemma_test)


In [None]:
# Train logistic regression model on the Gemma-2B sentence embeddings
from sklearn.linear_model import LogisticRegression

log_reg_model_gemma = LogisticRegression(class_weight='balanced',solver='lbfgs',max_iter=1000)
log_reg_model_gemma.fit(embeddings_gemma, y_train)

In [None]:
# Make predictions on the test set
predicted_gemma = log_reg_model_gemma.predict(embeddings_gemma_test)


In [None]:
# Evaluate the logistic regression model trained with Gemma-2B embeddings
from sklearn import metrics
print(metrics.classification_report(y_test, predicted_gemma, target_names=["Control", "Alzheimer's Indicator"]))


In [None]:
# Confusion matrix of the Gemma-2B-embedding classifier on the hold-out test set
conf_matrix_gemma = metrics.confusion_matrix(y_test, predicted_gemma)
labels = [label_map[class_id] for class_id in np.unique(y_test)]

# Rows are the true classes, columns the predicted ones; the axis names become the plot labels
df_cm_gemma = pd.DataFrame(conf_matrix_gemma, index=labels, columns=labels).rename_axis(
    index='Ground Truth Labels', columns='Predicted Labels'
)

# Draw the annotated heatmap
plt.figure(figsize=(10, 7))
sn.set(font_scale=1.4)
sn.heatmap(df_cm_gemma, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 20})
plt.title('Confusion Matrix - Gemma-2B embeddings')
plt.show()

# Five-Fold Cross-Validation Analysis

### Common Setup for K-Fold Cross-Validation

This section prepares the data for the 5-fold stratified cross-validation procedure used consistently across all embedding methods in the subsequent experiments. The same splits are used to evaluate all methods to ensure fair and reproducible comparison.


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Restrict the data to the Control (0) and Alzheimer's Indicator (1) rows
is_binary_case = df_transcripts_filtered['global_diagnosis'].isin([0, 1])
df_kfold = df_transcripts_filtered[is_binary_case]

# Shared inputs and labels, in the row order of df_kfold
X_common = df_kfold['cleaned_text']
y_common = df_kfold['global_diagnosis']

# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the positional (train, test) index pairs once so every method reuses the same folds
sample_positions = np.arange(len(y_common))
folds_indices = list(skf.split(sample_positions, y_common))


### Contextual Word Embeddings (BERT) with K-Fold Cross-Validation

This subsection evaluates the predictive performance of the selected BERT configuration (BERT-base with average pooling of token embeddings) using a 5-fold stratified cross-validation.

In [None]:
# Import required libraries
import torch
from transformers import BertTokenizer, BertModel
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

# Additional libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

# Global plot defaults: font size and figure size in one update, then the grid style
plt.rcParams.update({'font.size': 16, 'figure.figsize': [10, 6]})
plt.style.use('seaborn-v0_8-whitegrid')


In [None]:
# Checkpoint shared by tokenizer and model.
# Set it to 'bert-large-uncased' to load BERT-large instead of BERT-base.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

In [None]:
# Generate BERT-based sentence embeddings (mean pooling) for all transcripts
bert_embeddings_all = []
for text in X_common:
    # Calling the tokenizer directly replaces the deprecated `batch_encode_plus`
    encoding = tokenizer(
        [text],                     # List of input texts (a batch of one)
        padding=True,               # Pad to the longest sequence in the batch
        truncation=True,            # Truncate if input exceeds the maximum length
        return_tensors='pt',        # Return PyTorch tensors
        add_special_tokens=True,    # Add special CLS and SEP tokens
    )
    input_ids = encoding['input_ids']                 # Token identifiers
    attention_mask = encoding['attention_mask']       # Attention mask
    # Generate sentence embeddings using the BERT model (no gradients needed)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        word_embeddings = outputs.last_hidden_state
    sentence_embedding = word_embeddings.mean(dim=1)
    # Convert each vector explicitly so the final array is built from NumPy rows, not torch tensors
    bert_embeddings_all.append(sentence_embedding[0].numpy())

# Convert to NumPy array (one row per transcript) so folds can be selected by index arrays
bert_embeddings_all = np.array(bert_embeddings_all)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Initialize lists to store metrics for each fold
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Perform 5-fold cross-validation using precomputed fold indices.
# Per-fold variables carry a `_fold` suffix (as in the Tf-Idf and GloVe loops) so the hold-out
# split variables `y_train` / `y_test` defined earlier are not overwritten by this loop.
for fold, (train_idx, test_idx) in enumerate(folds_indices):
    # Split embeddings and labels for the current fold
    X_train_fold, X_test_fold = bert_embeddings_all[train_idx], bert_embeddings_all[test_idx]
    y_train_fold, y_test_fold = y_common.iloc[train_idx], y_common.iloc[test_idx]

    # Train logistic regression model
    clf = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000)
    clf.fit(X_train_fold, y_train_fold)

    # Predict for the current fold
    y_pred_fold = clf.predict(X_test_fold)

    # Compute metrics (macro average: both classes weigh the same)
    acc = metrics.accuracy_score(y_test_fold, y_pred_fold)
    prec = metrics.precision_score(y_test_fold, y_pred_fold, average='macro')
    rec = metrics.recall_score(y_test_fold, y_pred_fold, average='macro')
    f1 = metrics.f1_score(y_test_fold, y_pred_fold, average='macro')

    # Store metrics
    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)

    # Print classification report
    print(f"Fold {fold + 1} (BERT embeddings):")
    print(metrics.classification_report(y_test_fold, y_pred_fold, target_names=["Control", "Alzheimer's Indicator"]))

    # Confusion matrix
    conf_matrix = metrics.confusion_matrix(y_test_fold, y_pred_fold)
    labels = [label_map[i] for i in np.unique(y_common)]
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Ground Truth Labels'
    df_cm.columns.name = 'Predicted Labels'
    plt.figure(figsize=(10, 7))
    sn.set(font_scale=1.4)
    sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 20})
    plt.title(f'Confusion Matrix - Fold {fold + 1} (BERT embeddings)')
    plt.show()

# Compute and print average metrics (np.std is the population standard deviation over the folds)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

print("Summary of Metrics for 5-Fold Cross-Validation (BERT embeddings):")
print(f"Accuracy:  {mean_accuracy:.4f} ± {std_accuracy:.4f}")
print(f"Precision: {mean_precision:.4f} ± {std_precision:.4f}")
print(f"Recall:    {mean_recall:.4f} ± {std_recall:.4f}")
print(f"F1-score:  {mean_f1:.4f} ± {std_f1:.4f}")


### Tf-Idf with K-Fold Cross-Validation

This subsection evaluates the performance of a logistic regression classifier trained on Tf-Idf feature vectors using a 5-fold stratified cross-validation. The same data splits used in previous experiments are reused here to ensure consistent and fair comparison across embedding methods.


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

# Global plot defaults: font size and figure size in one update, then the grid style
plt.rcParams.update({'font.size': 16, 'figure.figsize': [10, 6]})
plt.style.use('seaborn-v0_8-whitegrid')

# Define a pipeline for Tf-Idf + logistic regression
# (only its 'vect' and 'tfidf' steps are used below; the 'clf' step is not fitted in this cell)
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', LogisticRegression(class_weight='balanced',solver='lbfgs',max_iter=1000))])

# Generate Tf-Idf features for all transcripts
# NOTE(review): vocabulary and IDF weights are fitted on every transcript in X_common,
# i.e. before any train/test fold is separated.
tfidf_matrix = text_clf.named_steps['tfidf'].fit_transform(text_clf.named_steps['vect'].fit_transform(X_common))

# Convert Tf-Idf sparse matrix to dense NumPy array for easier K-Fold slicing
tfidf_features_all = tfidf_matrix.toarray()



In [None]:
# Initialize metric lists for Tf-Idf evaluation
accuracies = []
precisions = []
recalls = []
f1_scores = []

# Iterate through the folds using precomputed indices
for fold, (train_index, test_index) in enumerate(folds_indices):
    y_train_fold, y_test_fold = y_common.iloc[train_index], y_common.iloc[test_index]

    # Fit the vocabulary and IDF weights on the training texts of this fold only, then apply them
    # to the test texts. Fitting them on all transcripts would leak test-fold statistics into the
    # features; the hold-out Tf-Idf experiment likewise fits its pipeline on the training split only.
    vectorizer = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer())])
    X_train_fold = vectorizer.fit_transform(X_common.iloc[train_index])
    X_test_fold = vectorizer.transform(X_common.iloc[test_index])

    # Train logistic regression model
    clf = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000)
    clf.fit(X_train_fold, y_train_fold)

    # Predict for the current fold
    y_pred_fold = clf.predict(X_test_fold)

    # Compute metrics (macro average: both classes weigh the same)
    acc = metrics.accuracy_score(y_test_fold, y_pred_fold)
    prec = metrics.precision_score(y_test_fold, y_pred_fold, average='macro')
    rec = metrics.recall_score(y_test_fold, y_pred_fold, average='macro')
    f1 = metrics.f1_score(y_test_fold, y_pred_fold, average='macro')

    # Store results
    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)

    # Print classification report for current fold
    print(f"Fold {fold + 1} (TF-IDF):")
    print(metrics.classification_report(y_test_fold, y_pred_fold, target_names=["Control", "Alzheimer's Indicator"]))

    # Plot confusion matrix
    conf_matrix = metrics.confusion_matrix(y_test_fold, y_pred_fold)
    labels = [label_map[i] for i in np.unique(y_common)]
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Ground Truth Labels'
    df_cm.columns.name = 'Predicted Labels'
    plt.figure(figsize=(10, 7))
    sn.set(font_scale=1.4)
    sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 20})
    plt.title(f'Confusion Matrix - Fold {fold + 1} (TF-IDF)')
    plt.show()

# Compute and print metric summary (np.std is the population standard deviation over the folds)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

print("Summary of Metrics for 5-Fold Cross-Validation (TF-IDF):")
print(f"Accuracy:  {mean_accuracy:.4f} ± {std_accuracy:.4f}")
print(f"Precision: {mean_precision:.4f} ± {std_precision:.4f}")
print(f"Recall:    {mean_recall:.4f} ± {std_recall:.4f}")
print(f"F1-score:  {mean_f1:.4f} ± {std_f1:.4f}")



### Non-Contextual Word Embeddings (GloVe) with K-Fold Cross-Validation

This subsection evaluates the performance of sentence embeddings generated from GloVe word embeddings using a 5-fold stratified cross-validation. The same fold splits used for the previous methods are applied to ensure a fair comparison.

In [None]:
# Load GloVe word embeddings from file
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

nltk.download('stopwords')  # Download stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Maps each word to its GloVe vector (float32 NumPy array)
glove_embeddings_index = {}
# IMPORTANT: Modify this path to point to your local copy of the GloVe embeddings file (300-dimensional)
# The encoding is set explicitly so loading does not depend on the platform's default locale.
with open('/path/to/Glove/glove300.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_embeddings_index[word] = coefs

print(f'Found {len(glove_embeddings_index)} word vectors.')

In [None]:
# Function to compute sentence embeddings from GloVe word vectors
def sentence_to_vec(text):
    """Return a unit-length sentence vector built from GloVe word vectors.

    The text is lowercased and tokenized with ``word_tokenize``; stopwords and
    words missing from ``glove_embeddings_index`` are skipped. The remaining
    word vectors are summed and the sum is scaled to unit Euclidean length.

    Args:
        text: Text to embed (converted with ``str()``).

    Returns:
        A 1-D NumPy array. A 300-dimensional zero vector is returned when no
        word has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(text).lower())
    vectors = [
        glove_embeddings_index[w]
        for w in tokens
        if w not in stop_words and w in glove_embeddings_index
    ]
    if not vectors:
        # Explicit empty case (previously detected via the type of the empty sum)
        return np.zeros(300)
    summed = np.sum(vectors, axis=0)
    norm = np.sqrt((summed ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN, which would break the downstream classifier
        return np.zeros(300)
    return summed / norm


In [None]:
# Generate sentence embeddings for the full dataset
from tqdm import tqdm
from nltk.tokenize import word_tokenize

# Embed every transcript (with a progress bar) and collect the resulting
# vectors into a single NumPy array, one entry per transcript.
glove_embeddings_all = np.array(
    [sentence_to_vec(transcript) for transcript in tqdm(X_common)]
)

In [None]:
# Initialize metric lists for GloVe evaluation
accuracies = []
precisions = []
recalls = []
f1_scores = []

# The class ids and their display names are the same for every fold, so they
# are computed once here instead of on each iteration.
class_ids = np.unique(y_common)
labels = [label_map[i] for i in class_ids]

# Seaborn styling is global state; setting it once before plotting is enough.
sn.set(font_scale=1.4)

# Iterate through the folds using precomputed indices
for fold, (train_idx, test_idx) in enumerate(folds_indices):
    X_train_fold, X_test_fold = glove_embeddings_all[train_idx], glove_embeddings_all[test_idx]
    y_train_fold, y_test_fold = y_common.iloc[train_idx], y_common.iloc[test_idx]

    # Train logistic regression model
    clf = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000)
    clf.fit(X_train_fold, y_train_fold)

    # Predict for the current fold
    y_pred_fold = clf.predict(X_test_fold)

    # Compute metrics (precision, recall and F1 are macro-averaged over classes)
    acc = metrics.accuracy_score(y_test_fold, y_pred_fold)
    prec = metrics.precision_score(y_test_fold, y_pred_fold, average='macro')
    rec = metrics.recall_score(y_test_fold, y_pred_fold, average='macro')
    f1 = metrics.f1_score(y_test_fold, y_pred_fold, average='macro')

    # Store results
    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)

    # Classification report; `labels` is pinned so the rows always line up
    # with the two target names, even if a class is missing from this fold.
    print(f"Fold {fold+1} (GloVe embeddings):")
    print(metrics.classification_report(
        y_test_fold,
        y_pred_fold,
        labels=class_ids,
        target_names=["Control", "Alzheimer's Indicator"],
    ))

    # Confusion matrix; pinning `labels` keeps its shape aligned with the
    # row/column names used for the DataFrame below.
    conf_matrix = metrics.confusion_matrix(y_test_fold, y_pred_fold, labels=class_ids)
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Ground Truth Labels'
    df_cm.columns.name = 'Predicted Labels'
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 20})
    plt.title(f'Confusion Matrix - Fold {fold + 1} (GloVe embeddings)')
    plt.show()

# Compute and print metric summary (mean and population standard deviation
# of each metric across the folds)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

print("Summary of Metrics for 5-Fold Cross-Validation (GloVe embeddings):")
print(f"Accuracy:  {mean_accuracy:.4f} ± {std_accuracy:.4f}")
print(f"Precision: {mean_precision:.4f} ± {std_precision:.4f}")
print(f"Recall:    {mean_recall:.4f} ± {std_recall:.4f}")
print(f"F1-score:  {mean_f1:.4f} ± {std_f1:.4f}")


### Gemma-2B with K-Fold Cross-Validation

This subsection evaluates sentence embeddings generated with the Gemma-2B language model. The same 5-fold stratified cross-validation setup used for the other embedding methods is applied for consistency and a fair comparison.


In [None]:
!pip install transformers torch

In [None]:
!huggingface-cli login

In [None]:
from transformers import AutoModel, AutoTokenizer
# Identifier of the pretrained checkpoint used for the embeddings
model_name = "google/gemma-2b"
# Load the tokenizer and the model that belong to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import torch

# Generate Gemma-2B sentence embeddings for all transcripts in X_common
gemma_embeddings_all = []
for text in X_common:
    # Each transcript is tokenized and encoded on its own (batch of one)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # Inference only: no gradients needed
        outputs = model(**inputs)
        # Mean pooling over token embeddings to obtain a sentence embedding
        embeddings = outputs.last_hidden_state.mean(dim=1)
    # Convert explicitly to a float32 CPU NumPy vector; relying on np.array()
    # to convert a list of torch tensors is slow and breaks for tensors that
    # NumPy cannot read directly (other dtypes or devices).
    gemma_embeddings_all.append(embeddings[0].float().cpu().numpy())

# Convert to NumPy array (one row per transcript)
gemma_embeddings_all = np.array(gemma_embeddings_all)


In [None]:
# Import required modules for classification and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Initialize metric lists for Gemma-2B evaluation
accuracies = []
precisions = []
recalls = []
f1_scores = []

# The class ids and their display names are the same for every fold, so they
# are computed once here instead of on each iteration.
class_ids = np.unique(y_common)
labels = [label_map[i] for i in class_ids]

# Seaborn styling is global state; setting it once before plotting is enough.
sn.set(font_scale=1.4)

# Iterate through stored fold indices
for fold, (train_idx, test_idx) in enumerate(folds_indices):
    X_train_fold, X_test_fold = gemma_embeddings_all[train_idx], gemma_embeddings_all[test_idx]
    y_train_fold, y_test_fold = y_common.iloc[train_idx], y_common.iloc[test_idx]

    # Train logistic regression model
    clf = LogisticRegression(class_weight='balanced', solver='lbfgs', max_iter=1000)
    clf.fit(X_train_fold, y_train_fold)

    # Predict for the current fold
    y_pred_fold = clf.predict(X_test_fold)

    # Compute metrics (precision, recall and F1 are macro-averaged over classes)
    acc = metrics.accuracy_score(y_test_fold, y_pred_fold)
    prec = metrics.precision_score(y_test_fold, y_pred_fold, average='macro')
    rec = metrics.recall_score(y_test_fold, y_pred_fold, average='macro')
    f1 = metrics.f1_score(y_test_fold, y_pred_fold, average='macro')

    # Store results
    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)

    # Classification report; `labels` is pinned so the rows always line up
    # with the two target names, even if a class is missing from this fold.
    print(f"Fold {fold+1} (Gemma-2B embeddings):")
    print(metrics.classification_report(
        y_test_fold,
        y_pred_fold,
        labels=class_ids,
        target_names=["Control", "Alzheimer's Indicator"],
    ))

    # Confusion matrix; pinning `labels` keeps its shape aligned with the
    # row/column names used for the DataFrame below.
    conf_matrix = metrics.confusion_matrix(y_test_fold, y_pred_fold, labels=class_ids)
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Ground Truth Labels'
    df_cm.columns.name = 'Predicted Labels'
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 20})
    plt.title(f'Confusion Matrix - Fold {fold+1} (Gemma-2B embeddings)')
    plt.show()

# Compute and print metric summary (mean and population standard deviation
# of each metric across the folds)
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
mean_precision = np.mean(precisions)
std_precision = np.std(precisions)
mean_recall = np.mean(recalls)
std_recall = np.std(recalls)
mean_f1 = np.mean(f1_scores)
std_f1 = np.std(f1_scores)

print("Summary of Metrics for 5-Fold Cross-Validation (Gemma-2B embeddings):")
print(f"Accuracy:  {mean_accuracy:.4f} ± {std_accuracy:.4f}")
print(f"Precision: {mean_precision:.4f} ± {std_precision:.4f}")
print(f"Recall:    {mean_recall:.4f} ± {std_recall:.4f}")
print(f"F1-score:  {mean_f1:.4f} ± {std_f1:.4f}")