<h1><center>Fine Tuning </center></h1>

In [9]:

import pandas as pd
import torch
import numpy as np
import re
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint, uniform
from imblearn.over_sampling import SMOTE
import joblib


In [10]:
# Load dataset
df = pd.read_csv("dementia_dataset_6.csv")
df['Severity'] = df['Severity'].astype(int)

In [11]:
# Clean text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["Transcript_CTD"] = df["Transcript_CTD"].fillna("").apply(clean_text)
df["Transcript_PFT"] = df["Transcript_PFT"].fillna("").apply(clean_text)
df["Transcript_SFT"] = df["Transcript_SFT"].fillna("").apply(clean_text)
df['Text'] = df['Transcript_CTD'] + " " + df['Transcript_PFT'] + " " + df['Transcript_SFT']


In [12]:
# Define all feature columns to include
feature_columns = [
    'TTR', 'Brunet_Index', 'Avg_Word_Length',
    'NOUN_ratio', 'VERB_ratio', 'PRONOUN_ratio',
    'Subordinate_Clauses', 'Parse_Tree_Depth', 'Idea_Density',
    'Key_Elements_Described', 'Irrelevant_Details',
    'Pauses', 'Repair_Rate'
]

In [13]:
# Split dataset
train_idx, test_idx = train_test_split(df.index, test_size=0.3, random_state=42)
train_labels = df.loc[train_idx, 'Severity']
test_labels = df.loc[test_idx, 'Severity']
train_features = df.loc[train_idx, feature_columns].values
test_features = df.loc[test_idx, feature_columns].values

In [14]:
# Normalize numeric features
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)


In [15]:
joblib.dump(scaler, "speech_scaler.joblib")

['speech_scaler.joblib']

In [16]:

# Load BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")


In [17]:
# Function to extract BERT embeddings in batches
def extract_bert_embeddings(texts, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = list(texts[i:i+batch_size])
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=256)
        with torch.no_grad():
            outputs = bert_model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

In [18]:
# Extract embeddings for each transcript type
ct_emb_train = extract_bert_embeddings(df.loc[train_idx, 'Transcript_CTD'])
pf_emb_train = extract_bert_embeddings(df.loc[train_idx, 'Transcript_PFT'])
sf_emb_train = extract_bert_embeddings(df.loc[train_idx, 'Transcript_SFT'])

ct_emb_test = extract_bert_embeddings(df.loc[test_idx, 'Transcript_CTD'])
pf_emb_test = extract_bert_embeddings(df.loc[test_idx, 'Transcript_PFT'])
sf_emb_test = extract_bert_embeddings(df.loc[test_idx, 'Transcript_SFT'])


In [19]:
# Combine embeddings and features
train_combined = np.hstack((ct_emb_train, pf_emb_train, sf_emb_train, train_features_scaled))
test_combined = np.hstack((ct_emb_test, pf_emb_test, sf_emb_test, test_features_scaled))


In [20]:
# Define a function for hyperparameter tuning
def tune_hyperparameters(X_train, y_train):
    model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="mlogloss")
    param_distributions = {
        "n_estimators": randint(50, 600),
        "learning_rate": uniform(0.01, 0.3),
        "max_depth": randint(3, 15),
        "min_child_weight": randint(1, 10),
        "gamma": uniform(0, 2),
        "subsample": uniform(0.5, 1.0),
        "colsample_bytree": uniform(0.5, 1.0),
        "reg_alpha": uniform(0, 5),
        "reg_lambda": uniform(0, 5),
    }
    search = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        n_iter=100,
        scoring="accuracy",
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1,
    )
    print("Starting hyperparameter tuning with RandomizedSearchCV...")
    search.fit(X_train, y_train)
    print("Best parameters found from RandomizedSearchCV:", search.best_params_)
    return search.best_estimator_


In [21]:
# Perform hyperparameter tuning
best_model = tune_hyperparameters(train_combined, train_labels)


Starting hyperparameter tuning with RandomizedSearchCV...
Fitting 5 folds for each of 100 candidates, totalling 500 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters found from RandomizedSearchCV: {'colsample_bytree': np.float64(0.8500784076946757), 'gamma': np.float64(1.2902067240611297), 'learning_rate': np.float64(0.2106772178989299), 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 565, 'reg_alpha': np.float64(0.1215798321572692), 'reg_lambda': np.float64(3.227361479535839), 'subsample': np.float64(0.6771106794070489)}


In [22]:
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    pred_probabilities = model.predict_proba(X_test)

    # Convert to numpy arrays to ensure consistent handling
    true_labels = np.array(y_test)
    pred_labels = np.array(predictions)

    print("\nEvaluation Results:")
    print(f"Accuracy: {accuracy_score(true_labels, pred_labels):.2f}")
    print(f"Precision: {precision_score(true_labels, pred_labels, average='weighted'):.2f}")
    print(f"Recall: {recall_score(true_labels, pred_labels, average='weighted'):.2f}")
    print(f"F1-Score: {f1_score(true_labels, pred_labels, average='weighted'):.2f}")
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, pred_labels))
    
    # Probability analysis - with fixes
    print("\nProbability Distribution Analysis:")
    
    # Calculate average prediction probability for correct predictions
    correct_indices = pred_labels == true_labels
    if np.any(correct_indices):
        # Fixed: properly extract probabilities for the correct class
        correct_probs = [prob[label] for prob, label in zip(pred_probabilities[correct_indices], true_labels[correct_indices])]
        print(f"Average probability for correct predictions: {np.mean(correct_probs):.4f}")
    
    # Calculate average highest probability for incorrect predictions
    incorrect_indices = ~correct_indices
    if np.any(incorrect_indices):
        incorrect_max_probs = [np.max(prob) for prob in pred_probabilities[incorrect_indices]]
        print(f"Average highest probability for incorrect predictions: {np.mean(incorrect_max_probs):.4f}")
    
    # Find uncertain predictions (highest probability < threshold)
    threshold = 0.7
    uncertain_predictions = [i for i, probs in enumerate(pred_probabilities) 
                           if np.max(probs) < threshold]
    print(f"Number of uncertain predictions (max prob < {threshold}): {len(uncertain_predictions)}")
    
    return pred_labels, pred_probabilities

In [23]:
# Evaluate the tuned model
predictions, probabilities = evaluate_model(best_model, test_combined, test_labels)



Evaluation Results:
Accuracy: 0.85
Precision: 0.84
Recall: 0.85
F1-Score: 0.84

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87        92
           1       0.93      0.99      0.96        67
           2       0.44      0.24      0.31        17
           3       0.78      0.70      0.74        10

    accuracy                           0.85       186
   macro avg       0.75      0.70      0.72       186
weighted avg       0.84      0.85      0.84       186


Confusion Matrix:
[[82  4  4  2]
 [ 0 66  1  0]
 [13  0  4  0]
 [ 2  1  0  7]]

Probability Distribution Analysis:
Average probability for correct predictions: 0.8687
Average highest probability for incorrect predictions: 0.7371
Number of uncertain predictions (max prob < 0.7): 33


In [24]:
# Optional: Save the probabilities for further analysis
results_df = pd.DataFrame(probabilities)
results_df['true_class'] = test_labels.values
results_df['predicted_class'] = predictions
results_df.to_csv('prediction_probabilities_speech.csv', index=False)
print("Detailed probability data saved to 'prediction_probabilities_speech.csv'")


Detailed probability data saved to 'prediction_probabilities_speech.csv'


In [25]:
# Save using joblib (better for large NumPy arrays, slightly faster)
joblib.dump(best_model, 'best_model_speech.joblib')
print("Model saved to 'best_model_speech.joblib' using joblib")


Model saved to 'best_model_speech.joblib' using joblib
