In [None]:
# Ensure all necessary packages are installed
!pip install gTTS pydub tqdm streamlit joblib librosa numpy pandas scikit-learn pyngrok
!apt install -y ffmpeg # Required by pydub for mp3 to wav conversion

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.4.1-py3-none-any.whl.metadata (8.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.4.1-py3-none-any.whl (25 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# --- 1. IMPORT LIBRARIES ---
import os, random, csv, time

import joblib
import librosa
import numpy as np
import pandas as pd
from google.colab import drive
from gtts import gTTS
from pydub import AudioSegment
from pydub.generators import WhiteNoise
from pyngrok import ngrok
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# --- 2. CONNECT TO GOOGLE DRIVE ---
# Mount Google Drive so the dataset and trained artifacts persist across Colab sessions.
drive.mount('/content/drive')

# Root directory of the dataset; every other path in this notebook is derived from it.
BASE_DIR = '/content/drive/MyDrive/synthetic_AD_dataset_final'
os.makedirs(BASE_DIR, exist_ok=True)

# --- 3. NGROK AUTHENTICATION ---
# No ngrok token is configured in this cell.
# Per the original note, authentication is handled in the cell that launches the
# Streamlit app; ngrok will not work until a real auth token is supplied there.

# Locations of the persisted model, the fitted scaler and the label index.
MODEL_PATH = os.path.join(BASE_DIR, "best_rf_model.pkl")
SCALER_PATH = os.path.join(BASE_DIR, "scaler.pkl")
LABELS_CSV_PATH = os.path.join(BASE_DIR, "labels.csv")

print(f"Dataset root set to: {BASE_DIR}")

  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Mounted at /content/drive
Dataset root set to: /content/drive/MyDrive/synthetic_AD_dataset_final


In [None]:
# --- STEP 3: FOLDER STRUCTURE ---
# Each class (AD, CN) gets an audio/ and a transcripts/ directory under BASE_DIR.
folders = [
    f"{class_dir}/{content_kind}"
    for class_dir in ('AD', 'CN')
    for content_kind in ('audio', 'transcripts')
]
for relative_dir in folders:
    target_dir = os.path.join(BASE_DIR, relative_dir)
    os.makedirs(target_dir, exist_ok=True)

# --- STEP 4: SENTENCE TEMPLATE POOLS ---
# Templates contain `{place}`, `{thing}`, `{relative}`, `{object}` and `{topic}` placeholders.

# AD pool: every template embeds disfluencies in the text itself
# (filled pauses such as "uh"/"um", word repetitions, expressions of forgetting).
# NOTE(review): because the spoken words differ systematically between the two pools,
# the classes may be separable from the lexical content alone, independent of the
# speed/noise augmentation applied later — confirm this is intended.
ad_templates = [
    "Uh, I, um, I was going to tell you about my, uh, {thing}, but I forgot what it was.",
    "I was, um, talking about, uh, {topic}, I think, but I lost my train of thought.",
    "Sometimes I, uh, forget what I was saying, like, uh, right now.",
    "You know, I was, uh, trying to find that, uh, {object}, but I can’t remember where I put it.",
    "It’s, uh, hard to remember names lately, you know, uh, people I see often.",
    "I went to the, uh, {place}, I think, maybe yesterday, or, um, the day before?",
    "My, uh, daughter or maybe my niece, she, uh, she came over last week, I think.",
    "I, I, I think I left the, um, the {thing} on the table, I believe.",
    "We, we were going to the, the {place} for lunch, but I forget when."
]
# CN pool: fluent sentences without filled pauses or repetitions.
cn_templates = [
    "I went to the {place} yesterday to buy some {thing}.",
    "My {relative} and I talked about {topic} while having dinner.",
    "I enjoy going to the {place} every weekend.",
    "I love watching movies with my {relative} on Fridays.",
    "The weather this morning was perfect for going to the {place}.",
    "I need to buy a new {object} because the old one stopped working.",
    "I finished reading that {thing} I borrowed from my {relative}.",
    "We had a long discussion about {topic} at the meeting today."
]
# --- STEP 5: PLACEHOLDER VOCABULARY AND SENTENCE BUILDER ---
# Maps each placeholder name to the words that may be substituted for it.
fillers = {
    "place": ["park", "market", "store", "beach", "office", "library", "gym"],
    "thing": ["book", "phone", "bag", "laptop", "keys", "wallet", "tablet"],
    "relative": ["mother", "father", "friend", "brother", "sister", "aunt", "uncle"],
    "object": ["car", "computer", "fan", "bottle", "table", "chair"],
    "topic": ["weather", "sports", "technology", "food", "music", "history"]
}


def make_sentence(template):
    """Fill every `{placeholder}` in `template` with a random word.

    One word is drawn for each entry of `fillers`, in dictionary order, whether
    or not the template uses that placeholder. If a placeholder occurs several
    times, every occurrence receives the same word.

    Args:
        template: Sentence containing zero or more `{name}` placeholders.

    Returns:
        The sentence with all known placeholders substituted.
    """
    sentence = template
    for slot, candidates in fillers.items():
        placeholder = "{" + slot + "}"
        chosen_word = random.choice(candidates)
        sentence = sentence.replace(placeholder, chosen_word)
    return sentence


# --- STEP 6: GENERATE DATA WITH MAX AMBIGUITY AND APPEND ---
# Number of new clips to synthesize for EACH class (so 2x this many clips in total).
num_samples_to_add = 500  # per class (Total new count = 1000)
# Language codes passed to gTTS; one is picked at random for every clip.
# NOTE(review): confirm that 'en-uk' still yields a distinct accent with the installed
# gTTS version — newer releases select regional accents through the `tld` argument.
accents = ['en', 'en-uk']
# Collects [filename, label] rows for every clip that was written successfully.
new_labels = []

# --- Determine starting index for appending ---
def get_next_index(label, base_dir=None):
    """Return the first unused sample index for a class.

    Scans `<base_dir>/<label>/audio` for files named `<label.lower()>_<digits>.<ext>`
    and returns the highest index found plus one. The index is parsed from the
    full run of digits between the prefix and the extension, so names wider than
    four digits (e.g. `ad_10000.wav`) are handled correctly.

    Args:
        label: Class name, e.g. 'AD' or 'CN'; also the name of the class folder.
        base_dir: Dataset root. Defaults to the notebook-level `BASE_DIR`.

    Returns:
        1 when the folder is missing or holds no matching files, otherwise
        `max(existing index) + 1`.
    """
    if base_dir is None:
        base_dir = BASE_DIR
    folder_path = os.path.join(base_dir, label, 'audio')
    prefix = label.lower() + '_'

    try:
        candidates = os.listdir(folder_path)
    except FileNotFoundError:
        # Nothing generated yet for this class.
        return 1

    max_index = 0
    for name in candidates:
        if not name.startswith(prefix):
            continue
        stem, _extension = os.path.splitext(name)
        digits = stem[len(prefix):]
        # Skip files whose suffix is not a plain decimal number (e.g. 'ad_notes.txt').
        if digits.isascii() and digits.isdigit():
            max_index = max(max_index, int(digits))
    return max_index + 1

# --- Synthesize, augment and save the new samples for both classes ---
print(f"Generating {2 * num_samples_to_add} new samples with MAXIMUM ambiguity and appending to dataset...")

for label, templates, subdir in [('AD', ad_templates, 'AD'), ('CN', cn_templates, 'CN')]:

    # Continue numbering after the files that already exist for this class.
    start_index = get_next_index(label)

    # Base playback-speed range per class. The ranges overlap on purpose so that
    # speech rate alone does not separate the classes.
    if label == 'AD':
        base_min_speed, base_max_speed = 0.90, 1.05
    else:  # CN
        base_min_speed, base_max_speed = 0.95, 1.10

    failed_count = 0

    for i in tqdm(range(num_samples_to_add), desc=f"Appending {label}"):

        current_index = start_index + i
        text = make_sentence(random.choice(templates))
        filename = f"{label.lower()}_{current_index:04}.wav"
        temp_mp3_path = f"/content/{current_index}_temp.mp3"

        try:
            # gTTS produces mp3; it is written to a scratch file and re-read with pydub.
            accent = random.choice(accents)
            tts = gTTS(text=text, lang=accent, slow=False)
            tts.save(temp_mp3_path)

            sound = AudioSegment.from_file(temp_mp3_path, format="mp3")

            # --- 1. SPEECH RATE AMBIGUITY ---
            speed_factor = random.uniform(base_min_speed, base_max_speed)

            # 15% of the clips get a speed drawn from the other side of the range,
            # creating deliberate cross-class ambiguity.
            if random.random() < 0.15:
                if label == 'AD':
                    speed_factor = random.uniform(1.05, 1.15)
                else:
                    speed_factor = random.uniform(0.85, 0.95)

            # Reinterpret the raw samples at a scaled frame rate (changes speed and
            # pitch), then resample back to the original frame rate.
            augmented_sound = sound._spawn(sound.raw_data, overrides={
                "frame_rate": int(sound.frame_rate * speed_factor)
            }).set_frame_rate(sound.frame_rate)

            # --- 2. NOISE AUGMENTATION ---
            # The previous version applied a gain to a *silent* segment, which is
            # still silence, so no noise was ever mixed in. Generate real white
            # noise at a random level between -40 and -25 dBFS instead.
            noise = WhiteNoise(sample_rate=augmented_sound.frame_rate).to_audio_segment(
                duration=len(augmented_sound),
                volume=random.uniform(-40, -25),
            )
            final_sound = augmented_sound.overlay(noise)

            # --- 3. Export augmented sound to WAV ---
            wav_path = os.path.join(BASE_DIR, f"{subdir}/audio/{filename}")
            final_sound.export(wav_path, format="wav")

            # Only clips that were written successfully are recorded in the label list.
            new_labels.append([filename, label])

            # Short pause between requests to the TTS service.
            time.sleep(0.3)
        except Exception as e:
            # Best effort: skip the failed clip, but keep count so failures are
            # never silent. Only the first few messages are printed in full.
            failed_count += 1
            if failed_count <= 5:
                print(f"Error at index {i} for {filename}: {e}")
        finally:
            # Remove the scratch mp3 even when synthesis or export failed.
            if os.path.exists(temp_mp3_path):
                os.remove(temp_mp3_path)

    if failed_count:
        print(f"{failed_count} of {num_samples_to_add} {label} samples failed and were skipped.")

# --- APPEND LABELS TO CSV ---
# Rows for the clips generated in this run.
new_labels_df = pd.DataFrame(new_labels, columns=['filename', 'label'])

try:
    existing_labels_df = pd.read_csv(LABELS_CSV_PATH)
except FileNotFoundError:
    # First run: there is no label file yet, so the new rows are the whole dataset.
    combined_labels_df = new_labels_df
else:
    # Later runs: keep the previous rows and add the new ones at the end.
    combined_labels_df = pd.concat(
        [existing_labels_df, new_labels_df],
        ignore_index=True,
    )

# Rewrite labels.csv with the combined rows.
combined_labels_df.to_csv(LABELS_CSV_PATH, index=False)
print(f"\n ✅ Dataset generation complete. {len(new_labels)} new samples appended. Total samples: {len(combined_labels_df)}.")

Generating 1000 new samples with MAXIMUM ambiguity and appending to dataset...


Appending AD: 100%|██████████| 500/500 [06:22<00:00,  1.31it/s]
Appending CN: 100%|██████████| 500/500 [05:28<00:00,  1.52it/s]


 ✅ Dataset generation complete. 1000 new samples appended. Total samples: 2000.





In [None]:
import os
import pandas as pd

BASE_DIR = '/content/drive/MyDrive/synthetic_AD_dataset_final'

# Verify that the audio files on disk agree with labels.csv.
# Only .wav files are counted so stray files do not inflate the totals.
ad_files_count = sum(
    name.lower().endswith('.wav') for name in os.listdir(os.path.join(BASE_DIR, 'AD/audio'))
)
cn_files_count = sum(
    name.lower().endswith('.wav') for name in os.listdir(os.path.join(BASE_DIR, 'CN/audio'))
)
total_files = ad_files_count + cn_files_count

print(f"Files found in AD/audio: {ad_files_count}")
print(f"Files found in CN/audio: {cn_files_count}")
print(f"Total audio files found: {total_files}")

# Verify labels.csv size
labels_df = pd.read_csv(LABELS_CSV_PATH)
print(f"Rows in labels.csv: {len(labels_df)}")

# Every labelled clip must exist on disk. A fixed file-count threshold cannot
# detect problems once earlier runs have already populated the folders.
missing_files = [
    fname
    for fname, lab in zip(labels_df['filename'], labels_df['label'])
    if not os.path.exists(os.path.join(BASE_DIR, lab, 'audio', fname))
]

if total_files > 0 and total_files == len(labels_df) and not missing_files:
    print("\n🎉 Verification successful! You can now proceed to Cell 3 (Feature Extraction, Splitting, and Scaling).")
else:
    print(
        f"\n⚠️ WARNING: Dataset is inconsistent: {total_files} audio files on disk, "
        f"{len(labels_df)} rows in labels.csv, {len(missing_files)} labelled files missing. "
        "Rerun the previous cell (Cell 2) and consider increasing the 'time.sleep' delay further "
        "(e.g., to 0.5 seconds) to ensure all API calls succeed."
    )

Files found in AD/audio: 1000
Files found in CN/audio: 1000
Total audio files found: 2000
Rows in labels.csv: 2000

🎉 Verification successful! You can now proceed to Cell 3 (Feature Extraction, Splitting, and Scaling).


In [None]:
# --- STEP 7: FEATURE EXTRACTION, SPLITTING, AND SCALING ---

# Variables are inherited from previous cells: BASE_DIR, LABELS_CSV_PATH, SCALER_PATH
NUM_MFCC = 28
# Column order of the feature vector: MFCC means, chroma means, then three scalar summaries.
FEATURES_COLS = [f'mfcc_{i}' for i in range(NUM_MFCC)] + \
                [f'chroma_{i}' for i in range(12)] + \
                ['spec_centroid', 'spec_rolloff', 'zcr']
RANDOM_STATE = 42

def extract_features_from_file(file_path):
    """Compute the acoustic feature vector for one audio file.

    Loads up to 5 seconds of audio starting 0.5 seconds into the file, at the
    file's native sample rate, and averages each feature over time.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        A `pd.Series` indexed by `FEATURES_COLS`. When loading or extraction
        fails (including empty audio or non-finite feature values) a warning
        naming the file is emitted and an all-zero Series is returned; the
        all-zero row is the sentinel the caller uses to drop failed files.
    """
    import warnings

    try:
        y, sr = librosa.load(file_path, duration=5, offset=0.5, sr=None)
        if y.size == 0:
            raise ValueError("no audio samples after the 0.5 s offset")

        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=NUM_MFCC), axis=1)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
        spec_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        spec_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
        zcr = np.mean(librosa.feature.zero_crossing_rate(y))

        features = np.hstack([mfccs, chroma, spec_centroid, spec_rolloff, zcr])
        # NaN/inf values would slip past the zero-row filter and break scaling.
        if not np.all(np.isfinite(features)):
            raise ValueError("extracted features contain NaN or infinite values")
        return pd.Series(features, index=FEATURES_COLS)

    except Exception as e:
        # Best effort: report the failure instead of hiding it, then return the
        # all-zero sentinel row so one bad file does not abort the whole run.
        warnings.warn(f"Feature extraction failed for {file_path}: {e}")
        return pd.Series(np.zeros(len(FEATURES_COLS)), index=FEATURES_COLS)

# Load labels and build the path of every audio file
labels_df = pd.read_csv(LABELS_CSV_PATH)

# Vectorized string concatenation: <BASE_DIR>/<label>/audio/<filename>
labels_df['filepath'] = (
    BASE_DIR + '/' +
    labels_df['label'] + '/audio/' +
    labels_df['filename']
)

# Extract features (one row per file, same index as labels_df)
print("Extracting features for all samples... ")
features_df = labels_df['filepath'].apply(extract_features_from_file)
features_df['label'] = labels_df['label'].map({'CN': 0, 'AD': 1})

# Failed files come back as all-zero rows; drop them before splitting.
has_signal = features_df[FEATURES_COLS].sum(axis=1) != 0
dropped_count = int((~has_signal).sum())
features_df = features_df[has_signal]
if dropped_count:
    print(f"Dropped {dropped_count} files whose feature extraction failed.")

print(f"Total features extracted: {len(features_df)}")

# --- Stratified Split (80/20) for feature data ---
X = features_df.drop('label', axis=1)
y = features_df['label']

# X_train, X_test are needed for model training in the next cell
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# --- Fit the scaler on the training split only, then save it ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, SCALER_PATH)
print(f"Scaler saved to: {SCALER_PATH}")

# --- Record the split in labels.csv ---
X_train_indices = X_train.index.tolist()
X_test_indices = X_test.index.tolist()

# Label-based assignment instead of per-row list membership tests (which were
# quadratic in the number of samples). Rows dropped above stay 'unknown'.
split_assignment = pd.Series('unknown', index=labels_df.index, dtype=object)
split_assignment.loc[X_train.index] = 'train'
split_assignment.loc[X_test.index] = 'test'
labels_df['train_test_split'] = split_assignment

# Save the updated DataFrame back to labels.csv (the helper filepath column is not persisted)
labels_df.drop('filepath', axis=1).to_csv(LABELS_CSV_PATH, index=False)

print("\n ✅ Feature extraction, scaling, and labels.csv update complete.")

Extracting features for all samples... 
Total features extracted: 2000
Scaler saved to: /content/drive/MyDrive/synthetic_AD_dataset_final/scaler.pkl

 ✅ Feature extraction, scaling, and labels.csv update complete.


In [None]:
# STEP 8: MODEL TRAINING (Final Aggressive Regularization)
# Trains a single, strongly regularized Random Forest on the scaled training
# split from the previous cell, evaluates it on the held-out split and saves it.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import joblib

print("🚀 Starting Final Model Training with Aggressive Regularization...")

# Regularization settings: large leaves and shallow trees limit overfitting.
FINAL_MIN_SAMPLES_LEAF = 12
FINAL_MAX_DEPTH = 10
FINAL_N_ESTIMATORS = 300

best_rf_model = RandomForestClassifier(
    n_estimators=FINAL_N_ESTIMATORS,
    max_depth=FINAL_MAX_DEPTH,
    min_samples_leaf=FINAL_MIN_SAMPLES_LEAF,
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# Fit the model
print(f"Training model with min_samples_leaf={FINAL_MIN_SAMPLES_LEAF} and max_depth={FINAL_MAX_DEPTH}...")
best_rf_model.fit(X_train_scaled, y_train)

# Evaluation
y_pred = best_rf_model.predict(X_test_scaled)

# --- 1. Confusion Matrix ---
# labels=[0, 1] forces a 2x2 matrix even if one class is absent from the test
# split or the predictions, so the four-way unpacking below cannot fail.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()

print("\n--- Confusion Matrix (AD=1, CN=0) ---")
print("Structure: [ [True Negatives, False Positives] ]")
print("           [ [False Negatives, True Positives] ]")
print(conf_matrix)

# --- 2. Specificity ---
# Specificity = True Negatives / (True Negatives + False Positives)
# Undefined (NaN) when the test split holds no CN samples.
specificity = TN / (TN + FP) if (TN + FP) > 0 else float('nan')
print("\n--- Specificity (True Negative Rate) ---")
print(f"Correctly Identified CN (Healthy): {specificity:.4f}")

# --- 3. AUC-ROC Score ---
try:
    y_proba_ad = best_rf_model.predict_proba(X_test_scaled)[:, 1]
    auc_score = roc_auc_score(y_test, y_proba_ad)
    print("\n--- AUC-ROC Score ---")
    print(f"Ability to Distinguish Classes (AUC): {auc_score:.4f}")
except (ValueError, IndexError) as e:
    # ValueError: only one class present in y_test; IndexError: the model saw a single class.
    print(f"Could not calculate AUC-ROC score (ensure y_proba is available): {e}")

print("\n🎯 --- Final Evaluation of Regularized Model ---")
print("Metrics reflect training on complex/noisy data:")
print(f"Parameters: min_samples_leaf={FINAL_MIN_SAMPLES_LEAF}, max_depth={FINAL_MAX_DEPTH}")

print(f"✅ Test Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print("🔍 Classification Report (check AD recall):")
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['CN', 'AD']))

# --- Save the Best Model ---
# Derive the artifact paths from BASE_DIR so they stay in sync with the
# configuration cell instead of repeating the absolute Drive path.
MODEL_PATH = os.path.join(BASE_DIR, "best_rf_model.pkl")
SCALER_PATH = os.path.join(BASE_DIR, "scaler.pkl")

joblib.dump(best_rf_model, MODEL_PATH)
# Ensure the scaler is saved again for safety
joblib.dump(scaler, SCALER_PATH)

print(f"\n💾 Final regularized model saved successfully to: {MODEL_PATH}")

🚀 Starting Final Model Training with Aggressive Regularization...
Training model with min_samples_leaf=12 and max_depth=10...

--- Confusion Matrix (AD=1, CN=0) ---
Structure: [ [True Negatives, False Positives] ]
           [ [False Negatives, True Positives] ]
[[196   4]
 [  2 198]]

--- Specificity (True Negative Rate) ---
Correctly Identified CN (Healthy): 0.9800

--- AUC-ROC Score ---
Ability to Distinguish Classes (AUC): 0.9991

🎯 --- Final Evaluation of Regularized Model ---
Metrics reflect training on complex/noisy data:
Parameters: min_samples_leaf=12, max_depth=10
✅ Test Accuracy: 0.9850

🔍 Classification Report (check AD recall):
              precision    recall  f1-score   support

          CN       0.99      0.98      0.98       200
          AD       0.98      0.99      0.99       200

    accuracy                           0.98       400
   macro avg       0.99      0.98      0.98       400
weighted avg       0.99      0.98      0.98       400


💾 Final regularized mod

In [None]:
# --- STEP 9: COMPARISON MODEL TRAINING (KNN & LOGISTIC REGRESSION) ---
# Trains two baseline classifiers plus a fresh copy of the final Random Forest
# on the same scaled split, so every row of the comparison table is produced by
# the same code path rather than copied from an earlier cell's output.
print("🔬 Starting Comparison Model Training (KNN & Logistic Regression)...")

from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import time
import pandas as pd
import numpy as np

comparison_models = {
    'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'LogReg': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, solver='liblinear'),
    # Unfitted copy with the same hyper-parameters (and random_state) as the saved model;
    # the saved `best_rf_model` itself is left untouched.
    'Random Forest (Final)': clone(best_rf_model),
}

comparison_results = []

# --- Training and Evaluation Loop ---
for name, model in comparison_models.items():
    print(f"\n--- Training {name} ---")
    start_time = time.perf_counter()

    # Fit the model using the scaled data (X_train_scaled, y_train inherited from the feature cell)
    model.fit(X_train_scaled, y_train)

    train_time = time.perf_counter() - start_time

    # Predict and evaluate on the test set
    y_pred_comp = model.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, y_pred_comp)

    try:
        y_proba_comp = model.predict_proba(X_test_scaled)[:, 1]
        auc_score_comp = roc_auc_score(y_test, y_proba_comp)
    except (AttributeError, ValueError):
        # AttributeError: model has no predict_proba; ValueError: AUC undefined for this split.
        auc_score_comp = np.nan

    report = classification_report(
        y_test, y_pred_comp,
        labels=[0, 1], target_names=['CN', 'AD'],
        output_dict=True, zero_division=0,
    )
    # Specificity is the recall of the negative class ('CN')
    cn_specificity = report['CN']['recall']
    ad_recall = report['AD']['recall']

    # Keep the metrics numeric so the table can be sorted by value;
    # formatting is applied only when the table is printed.
    comparison_results.append({
        'Model': name,
        'Accuracy': test_accuracy,
        'AUC-ROC': auc_score_comp,
        'Specificity (CN)': cn_specificity,
        'Recall (AD)': ad_recall,
        'Training Time (s)': train_time,
    })

# --- Final Comparison Table ---
print("\n" + "="*70)
print("📊 Comparative Model Performance (Against Random Forest)")
print("="*70)

comparison_df = pd.DataFrame(comparison_results)
# Sort by AUC-ROC for clear performance ranking; models without an AUC go last.
comparison_df = comparison_df.sort_values(
    by='AUC-ROC', ascending=False, na_position='last', ignore_index=True
)

print(comparison_df.to_markdown(index=False, floatfmt=".4f"))

print("\n✅ Comparison training complete. No models were saved to disk.")

🔬 Starting Comparison Model Training (KNN & Logistic Regression)...

--- Training KNN ---

--- Training LogReg ---

📊 Comparative Model Performance (Against Random Forest)
| Model                 |   Accuracy |   AUC-ROC |   Specificity (CN) |   Recall (AD) |   Training Time (s) |
|:----------------------|-----------:|----------:|-------------------:|--------------:|--------------------:|
| KNN                   |     0.9975 |    1      |              0.995 |          1    |                0    |
| LogReg                |     1      |    1      |              1     |          1    |                0.02 |
| Random Forest (Final) |     0.985  |    0.9991 |              0.98  |          0.99 |                2.37 |

✅ Comparison training complete. No models were saved to disk.


In [None]:
%%writefile app.py
import streamlit as st
import joblib
import librosa
import numpy as np
import pandas as pd
import os

# --- Configuration (Streamlit Cloud deployment) ---
# The model and scaler files must sit in the same GitHub directory as app.py;
# the paths below are relative, so they resolve against the app's working directory.
MODEL_PATH = 'best_rf_model.pkl'
SCALER_PATH = 'scaler.pkl'
# Number of MFCC coefficients; must equal the value used when the model was trained.
NUM_MFCC = 28
AD_THRESHOLD = 0.40 # Decision threshold: a clip is flagged as AD risk when P(AD) >= 0.40

# --- Global Assets (Model and Scaler) ---
@st.cache_resource
def load_assets():
    """Load the trained classifier and the fitted scaler from disk.

    Both artifacts are read with joblib from the relative paths `MODEL_PATH`
    and `SCALER_PATH`. The result is cached by Streamlit.

    Returns:
        `(model, scaler)` on success, or `(None, None)` after showing an error
        in the UI when either file is missing.
    """
    # NOTE: joblib.load unpickles the files; only load artifacts you produced yourself.
    try:
        loaded = tuple(joblib.load(asset_path) for asset_path in (MODEL_PATH, SCALER_PATH))
    except FileNotFoundError:
        st.error(f"Asset not found: {MODEL_PATH} or {SCALER_PATH}. Ensure they are in your GitHub repo.")
        return None, None
    return loaded

model, scaler = load_assets()

# --- Feature Extraction Function ---
def extract_features_streamlit(file_path):
    """Compute the feature row the classifier expects for one audio file.

    Loads up to 5 seconds of audio starting 0.5 seconds into the file, at the
    file's native sample rate, and averages MFCC, chroma, spectral centroid,
    spectral roll-off and zero-crossing rate over time.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        A one-row `pd.DataFrame` whose columns are named like the training
        features (`mfcc_0..`, `chroma_0..11`, `spec_centroid`, `spec_rolloff`,
        `zcr`), or `None` after showing an error in the UI when the audio
        cannot be loaded or analysed.
    """
    # Same names and order as the training feature columns, so the scaler
    # (fitted on a named DataFrame) receives matching feature names.
    feature_names = (
        [f'mfcc_{i}' for i in range(NUM_MFCC)]
        + [f'chroma_{i}' for i in range(12)]
        + ['spec_centroid', 'spec_rolloff', 'zcr']
    )
    try:
        # sr=None keeps the file's native sample rate, as was done for training.
        y, sr = librosa.load(file_path, duration=5, offset=0.5, sr=None)
        if y.size == 0:
            raise ValueError("the recording is too short (no audio after the first 0.5 seconds)")

        # Calculate features (must match the features used for training)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=NUM_MFCC), axis=1)
        chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1)
        spec_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        spec_rolloff = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
        zcr = np.mean(librosa.feature.zero_crossing_rate(y))

        features = np.hstack([mfccs, chroma, spec_centroid, spec_rolloff, zcr])
        # Refuse to predict on NaN/inf features rather than return a meaningless score.
        if not np.all(np.isfinite(features)):
            raise ValueError("extracted features contain NaN or infinite values")
        return pd.DataFrame([features], columns=feature_names)

    except Exception as e:
        st.error(f"Error extracting features: {e}")
        return None

# --- Main Streamlit App ---
import tempfile

st.set_page_config(page_title="🧠 AD Speech Detector", layout="centered")
st.title("🗣️ Early AD Detection from Speech")
st.markdown(f"**Screening Threshold:** AD risk flagged if probability $\\geq {AD_THRESHOLD}$.")

# Scratch directory for the uploaded audio (the platform's temp dir is writable on Streamlit Cloud)
temp_dir = tempfile.gettempdir()

uploaded_file = st.file_uploader("Upload Audio File (.wav, .mp3)", type=['wav', 'mp3'])

if uploaded_file is not None and model is not None and scaler is not None:
    # Play the upload back with its real MIME type (wav or mp3).
    st.audio(uploaded_file, format=uploaded_file.type or 'audio/wav')

    if st.button('Analyze Speech'):
        with st.spinner('Extracting features and predicting...'):
            # Keep only the extension of the client-supplied name; the file name
            # itself is generated here so that an uploaded name can neither escape
            # the temp directory nor collide with another user's upload.
            suffix = os.path.splitext(uploaded_file.name)[1].lower()
            if suffix not in ('.wav', '.mp3'):
                suffix = '.wav'

            temp_audio_path = None
            try:
                with tempfile.NamedTemporaryFile(dir=temp_dir, suffix=suffix, delete=False) as tmp:
                    tmp.write(uploaded_file.getbuffer())
                    temp_audio_path = tmp.name
                feature_vector = extract_features_streamlit(temp_audio_path)
            finally:
                # Always remove the scratch file, even when extraction fails.
                if temp_audio_path is not None and os.path.exists(temp_audio_path):
                    os.remove(temp_audio_path)

            if feature_vector is not None:
                # Scale the input features using the loaded scaler
                scaled_features = scaler.transform(feature_vector)

                # Get probability for the AD class (Class 1)
                proba = model.predict_proba(scaled_features)[0]
                ad_probability = proba[1]

                st.divider()

                # Classification based on the sensitive 0.40 threshold
                if ad_probability >= AD_THRESHOLD:
                    st.error("### ⚠️ HIGH RISK: AD-Like Speech Patterns Detected")
                    st.metric(label="Probability (AD)", value=f"{ad_probability*100:.2f}%")
                    st.info("High risk classification based on sensitivity setting.")
                else:
                    st.success("### ✅ LOW RISK: Cognitively Normal Speech")
                    st.metric(label="Probability (CN)", value=f"{proba[0]*100:.2f}%")

Writing app.py


In [None]:
%%writefile requirements.txt
# Runtime dependencies of app.py for the Streamlit deployment.
streamlit==1.51.0
joblib
librosa==0.11.0 # Use the version installed in your Colab setup
numpy>=2.0.2
pandas>=2.2.2
# NOTE(review): best_rf_model.pkl and scaler.pkl are pickled scikit-learn objects.
# Consider pinning scikit-learn (and joblib) to the exact versions used for training —
# confirm the versions against the Colab environment that produced the files.
scikit-learn>=1.6.1

Overwriting requirements.txt
