In [None]:
# NeuroScan: Step 1 - Environment Setup & Dataset Loading
# Run this in Google Colab

# Install required libraries
!pip install kaggle numpy pandas scikit-learn matplotlib seaborn
!pip install librosa pywavelets scipy
!pip install tensorflow keras

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# ===========================================
# KAGGLE DATASET DOWNLOAD
# ===========================================

# Step 1: Upload your kaggle.json file
from google.colab import files
print("Please upload your kaggle.json file:")
uploaded = files.upload()

# Step 2: Setup Kaggle credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step 3: Download Parkinson's Voice Dataset
print("\n=== Downloading Parkinson's Voice Dataset ===")
!kaggle datasets download -d vikasukani/parkinsons-disease-data-set
!unzip -q parkinsons-disease-data-set.zip -d ./parkinsons_voice

# Alternative datasets (uncomment as needed):
# !kaggle datasets download -d dipayanbiswas/parkinsons-disease-speech-signal-features
# !kaggle datasets download -d ruslankl/parkinsons-data-set

# Step 4: Load the dataset
print("\n=== Loading Dataset ===")
df = pd.read_csv('./parkinsons_voice/parkinsons.data')

# Display dataset info
print(f"\nDataset Shape: {df.shape}")
print(f"\nColumn Names:\n{df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

print(f"\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print(f"\nClass Distribution:")
print(df['status'].value_counts())
print(f"\nPercentage: \n{df['status'].value_counts(normalize=True) * 100}")

# Check for missing values
print(f"\nMissing Values:")
print(df.isnull().sum())

# Basic statistics
print(f"\nBasic Statistics:")
print(df.describe())

# Persist a copy in the working directory; the later steps read this file
# instead of the extracted archive.
df.to_csv('parkinsons_loaded.csv', index=False)
print("\n‚úÖ Dataset loaded and saved successfully!")

Please upload your kaggle.json file:


In [None]:
# NeuroScan: Step 2 - Exploratory Data Analysis (EDA)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load dataset
df = pd.read_csv('parkinsons_loaded.csv')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# ===========================================
# 1. CLASS DISTRIBUTION VISUALIZATION
# ===========================================
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
sns.countplot(data=df, x='status', ax=axes[0])
axes[0].set_title('Parkinson\'s Disease Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Status (0=Healthy, 1=PD)', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)

# Pie chart
# value_counts() orders the classes by frequency, so the wedge labels and
# colours are derived from the index rather than hard-coded. A fixed
# ['PD', 'Healthy'] list would silently swap the labels whenever the healthy
# class happens to be the larger one.
status_counts = df['status'].value_counts()
status_names = {0: 'Healthy', 1: 'PD'}
status_colors = {0: '#4ecdc4', 1: '#ff6b6b'}
axes[1].pie(status_counts,
            labels=[status_names[code] for code in status_counts.index],
            autopct='%1.1f%%',
            startangle=90,
            colors=[status_colors[code] for code in status_counts.index])
axes[1].set_title('Class Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 2. FEATURE CORRELATION HEATMAP
# ===========================================
# Pairwise correlations over the numeric columns only (non-numeric columns
# such as an identifier are left out by the dtype filter).
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

# Draw the annotated heatmap on a large canvas so the cell labels stay legible
plt.figure(figsize=(20, 16))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 3. KEY FEATURES DISTRIBUTION
# ===========================================
# Voice features of interest
key_features = [
    'MDVP:Fo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Shimmer', 'HNR',
    'RPDE', 'DFA', 'spread1', 'PPE',
]

# One panel per feature on a 2x4 grid, flattened so it can be walked in step
# with the feature list.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for ax, feature in zip(axes, key_features):
    # Box plot of this feature, split by diagnosis
    df.boxplot(column=feature, by='status', ax=ax)
    ax.set_title(f'{feature}')
    ax.set_xlabel('Status (0=Healthy, 1=PD)')
    # Make this panel current so plt.xticks relabels its two boxes
    plt.sca(ax)
    plt.xticks([1, 2], ['Healthy', 'PD'])

plt.suptitle('Key Voice Features Distribution by Status',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('features_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 4. FEATURE IMPORTANCE ANALYSIS
# ===========================================
# Rank features by the absolute value of their correlation with the target.
# 'status' correlates perfectly with itself, so it is dropped from the ranking.
feature_correlations = (
    df[numeric_cols]
    .corrwith(df['status'])
    .abs()
    .sort_values(ascending=False)
)
feature_correlations = feature_correlations.drop('status')

# Bar chart of the 15 strongest correlations
top_features = feature_correlations.head(15)
plt.figure(figsize=(12, 8))
sns.barplot(x=top_features.values, y=top_features.index, palette='viridis')
plt.title('Top 15 Features Correlated with Parkinson\'s Status',
          fontsize=14, fontweight='bold')
plt.xlabel('Absolute Correlation', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n=== Top 10 Most Correlated Features ===")
print(feature_correlations.head(10))

# ===========================================
# 5. STATISTICAL TESTS
# ===========================================
print("\n=== Statistical Significance Tests ===")
print("Testing difference between Healthy and PD groups:\n")

# Split the recordings by diagnosis (status 0 = healthy, 1 = PD)
healthy = df[df['status'] == 0]
pd_patients = df[df['status'] == 1]

for feature in key_features:
    # Two-sample t-test on the feature values of the two groups
    stat, p_value = stats.ttest_ind(healthy[feature], pd_patients[feature])

    # Conventional star notation for the p-value; "ns" = not significant
    if p_value < 0.001:
        significance = "***"
    elif p_value < 0.01:
        significance = "**"
    elif p_value < 0.05:
        significance = "*"
    else:
        significance = "ns"

    print(f"{feature:20s}: t-stat={stat:7.3f}, p-value={p_value:.6f} {significance}")

# ===========================================
# 6. PAIR PLOT (Selected Features)
# ===========================================
print("\n=== Creating Pair Plot (this may take a moment) ===")
selected_features = ['MDVP:Fo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Shimmer', 'HNR', 'status']

# Work on a copy and replace the numeric status codes with readable labels so
# the legend shows "Healthy" / "PD".
pairplot_df = df[selected_features].copy()
status_labels = {0: 'Healthy', 1: 'PD'}
pairplot_df['status'] = pairplot_df['status'].map(status_labels)

status_palette = {'Healthy': '#4ecdc4', 'PD': '#ff6b6b'}
sns.pairplot(
    pairplot_df,
    hue='status',
    palette=status_palette,
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
)
plt.suptitle('Pair Plot of Selected Voice Features', y=1.02, fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('pairplot.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ EDA completed! All visualizations saved.")

In [None]:
# NeuroScan: Step 3 - Data Preprocessing & Feature Engineering

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import pickle

# Read back the copy of the dataset written by Step 1
df = pd.read_csv('parkinsons_loaded.csv')
print(f"Original dataset shape: {df.shape}")

# ===========================================
# 1. REMOVE NON-FEATURE COLUMNS
# ===========================================
# 'name' only identifies the recording, so it is not a usable feature
if 'name' in df.columns:
    df = df.drop(columns='name')
    print("Removed 'name' column")

# ===========================================
# 2. SEPARATE FEATURES AND TARGET
# ===========================================
y = df['status']
X = df.drop(columns='status')

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts()}")

# ===========================================
# 3. FEATURE ENGINEERING
# ===========================================
print("\n=== Feature Engineering ===")

# Derived features: two ratios, one product and two squared terms.
# The 1e-6 added to each denominator guards against division by zero.
jitter = X['MDVP:Jitter(%)']
shimmer = X['MDVP:Shimmer']

X['jitter_shimmer_ratio'] = jitter / (shimmer + 1e-6)
X['fo_hnr_product'] = X['MDVP:Fo(Hz)'] * X['HNR']
X['spread1_spread2_ratio'] = X['spread1'] / (X['spread2'] + 1e-6)
X['jitter_squared'] = jitter ** 2
X['shimmer_squared'] = shimmer ** 2

print(f"Features after engineering: {X.shape}")

# ===========================================
# 4. HANDLE OUTLIERS (Optional)
# ===========================================
def remove_outliers(df, columns, threshold=3):
    """Drop rows that contain an outlier according to the Z-score method.

    A row is kept when, for every column in ``columns``, the absolute z-score
    of its value is below ``threshold``.

    Z-scores that cannot be computed are treated as "not an outlier": a
    zero-variance column yields 0/0 = NaN for every row, and a missing value
    yields NaN for that cell. Without this rule a single constant column
    would make the comparison fail everywhere and silently remove all rows.

    Args:
        df: DataFrame to filter.
        columns: Labels of the numeric columns to test.
        threshold: Maximum allowed absolute z-score (exclusive).

    Returns:
        The rows of ``df`` that pass the test, with their original index.
    """
    from scipy import stats

    values = df[columns].to_numpy(dtype=float)
    # nan_policy='omit' keeps missing values out of each column's mean/std so
    # one NaN does not blank out the whole column; errstate silences the
    # expected 0/0 warning for constant columns.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    z_scores = np.asarray(z_scores, dtype=float)

    within_limits = (z_scores < threshold) | np.isnan(z_scores)
    return df[within_limits.all(axis=1)]

# Uncomment to remove outliers
# X_combined = pd.concat([X, y], axis=1)
# X_combined = remove_outliers(X_combined, X.columns)
# X = X_combined.drop('status', axis=1)
# y = X_combined['status']
# print(f"Shape after outlier removal: {X.shape}")

# ===========================================
# 5. TRAIN-TEST SPLIT
# ===========================================
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\n=== Data Split ===")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Training labels distribution:\n{y_train.value_counts()}")
print(f"Test labels distribution:\n{y_test.value_counts()}")

# ===========================================
# 6. FEATURE SCALING
# ===========================================
print("\n=== Feature Scaling ===")

# Each scaler is fitted on the training rows only and then applied to the
# test rows, so no test-set statistics leak into the transformation.

# StandardScaler (for most ML algorithms)
scaler_standard = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaler_standard.fit_transform(X_train),
    scaler_standard.transform(X_test),
)

# RobustScaler (for outlier-resistant scaling)
scaler_robust = RobustScaler()
X_train_robust, X_test_robust = (
    scaler_robust.fit_transform(X_train),
    scaler_robust.transform(X_test),
)

# MinMaxScaler (for neural networks)
scaler_minmax = MinMaxScaler()
X_train_minmax, X_test_minmax = (
    scaler_minmax.fit_transform(X_train),
    scaler_minmax.transform(X_test),
)

print("‚úÖ Applied StandardScaler, RobustScaler, and MinMaxScaler")

# ===========================================
# 7. HANDLE CLASS IMBALANCE
# ===========================================
print("\n=== Handling Class Imbalance ===")

# Class counts and shares before resampling
print(f"Original class distribution:")
for label in (0, 1):
    label_count = np.sum(y_train == label)
    print(f"Class {label}: {label_count} ({label_count/len(y_train)*100:.1f}%)")

# SMOTE (Synthetic Minority Over-sampling), applied to the standard-scaled
# training rows only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print(f"\nAfter SMOTE:")
for label in (0, 1):
    label_count = np.sum(y_train_smote == label)
    print(f"Class {label}: {label_count} ({label_count/len(y_train_smote)*100:.1f}%)")

# Alternative: SMOTETomek (combines over and under sampling)
smotetomek = SMOTETomek(random_state=42)
X_train_smotetomek, y_train_smotetomek = smotetomek.fit_resample(X_train_scaled, y_train)

# ===========================================
# 8. FEATURE SELECTION
# ===========================================
print("\n=== Feature Selection ===")

# Keep the k features with the highest ANOVA F-score. The selector is fitted
# on the SMOTE-balanced training data and then applied to the scaled test set.
k = 15  # Number of top features to select
selector = SelectKBest(score_func=f_classif, k=k)
X_train_selected = selector.fit_transform(X_train_smote, y_train_smote)
X_test_selected = selector.transform(X_test_scaled)

# Translate the selector's boolean mask back into column names
selected_features_mask = selector.get_support()
selected_features = X.columns[selected_features_mask].tolist()

print(f"Selected {k} best features:")
for position, feature in enumerate(selected_features, start=1):
    print(f"{position}. {feature}")

# Score table over all features (not only the selected ones), best first
feature_scores = pd.DataFrame(
    {'Feature': X.columns, 'Score': selector.scores_}
).sort_values('Score', ascending=False)

print(f"\nTop 10 Feature Scores:")
print(feature_scores.head(10))

# ===========================================
# 9. SAVE PREPROCESSED DATA
# ===========================================
print("\n=== Saving Preprocessed Data ===")

# Fitted scalers and the feature selector
fitted_objects = [
    ('scaler_standard.pkl', scaler_standard),
    ('scaler_minmax.pkl', scaler_minmax),
    ('feature_selector.pkl', selector),
]
for pickle_path, fitted in fitted_objects:
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted, f)

# Processed arrays (labels are saved as plain NumPy arrays via .values)
arrays_to_save = [
    ('X_train_scaled.npy', X_train_scaled),
    ('X_test_scaled.npy', X_test_scaled),
    ('X_train_minmax.npy', X_train_minmax),
    ('X_test_minmax.npy', X_test_minmax),
    ('X_train_smote.npy', X_train_smote),
    ('y_train_smote.npy', y_train_smote),
    ('X_train_selected.npy', X_train_selected),
    ('X_test_selected.npy', X_test_selected),
    ('y_train.npy', y_train.values),
    ('y_test.npy', y_test.values),
]
for array_path, array in arrays_to_save:
    np.save(array_path, array)

# Feature names: the full engineered set and the selected subset
name_lists = [
    ('feature_names.pkl', X.columns.tolist()),
    ('selected_features.pkl', selected_features),
]
for pickle_path, names in name_lists:
    with open(pickle_path, 'wb') as f:
        pickle.dump(names, f)

print("‚úÖ All preprocessed data and scalers saved successfully!")
print("\nSaved files:")
saved_file_lines = (
    "- X_train_scaled.npy, X_test_scaled.npy",
    "- X_train_minmax.npy, X_test_minmax.npy",
    "- X_train_smote.npy, y_train_smote.npy",
    "- X_train_selected.npy, X_test_selected.npy",
    "- y_train.npy, y_test.npy",
    "- scaler_standard.pkl, scaler_minmax.pkl",
    "- feature_selector.pkl",
    "- feature_names.pkl, selected_features.pkl",
)
for line in saved_file_lines:
    print(line)

In [None]:
# NeuroScan: Step 4 - Traditional Machine Learning Models

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve)
from sklearn.model_selection import cross_val_score, GridSearchCV
import pickle
import warnings
warnings.filterwarnings('ignore')

# ===========================================
# 1. LOAD PREPROCESSED DATA
# ===========================================
print("Loading preprocessed data...")
# Train on the SMOTE-balanced arrays; evaluate on the scaled hold-out set
X_train, y_train = np.load('X_train_smote.npy'), np.load('y_train_smote.npy')
X_test, y_test = np.load('X_test_scaled.npy'), np.load('y_test.npy')

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# ===========================================
# 2. DEFINE MODELS
# ===========================================
# Candidate classifiers keyed by display name. Insertion order is the order
# in which they are trained and reported. The SVMs enable probability=True so
# that predict_proba is available for ROC-AUC.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42, max_iter=1000)
models['SVM (Linear)'] = SVC(kernel='linear', random_state=42, probability=True)
models['SVM (RBF)'] = SVC(kernel='rbf', random_state=42, probability=True)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(n_estimators=100, random_state=42)
models['XGBoost'] = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
models['AdaBoost'] = AdaBoostClassifier(n_estimators=100, random_state=42)
models['Extra Trees'] = ExtraTreesClassifier(n_estimators=100, random_state=42)
models['KNN'] = KNeighborsClassifier(n_neighbors=5)
models['Naive Bayes'] = GaussianNB()

# ===========================================
# 3. TRAIN AND EVALUATE MODELS
# ===========================================
# Cross-validation must not see SMOTE-generated points in its validation
# folds: a synthetic sample interpolated from a training-fold neighbour makes
# the fold score optimistic. The CV below therefore runs on the un-resampled
# training data saved by Step 3 and applies SMOTE inside each training fold
# through an imbalanced-learn pipeline.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

X_train_cv = np.load('X_train_scaled.npy')
y_train_cv = np.load('y_train.npy')

results = []

print("\n" + "="*70)
print("TRAINING AND EVALUATING MODELS")
print("="*70 + "\n")

for name, model in models.items():
    print(f"Training {name}...")

    # Train model on the SMOTE-balanced training set
    model.fit(X_train, y_train)

    # Predictions on the hold-out test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    if y_pred_proba is not None:
        roc_auc = roc_auc_score(y_test, y_pred_proba)
    else:
        roc_auc = None

    # Cross-validation score: SMOTE is refitted on the training part of every
    # fold. cross_val_score clones the pipeline, so the fitted `model` above
    # is left untouched.
    cv_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', model),
    ])
    cv_scores = cross_val_score(cv_pipeline, X_train_cv, y_train_cv, cv=5, scoring='accuracy')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    # Store results
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc,
        'CV Mean': cv_mean,
        'CV Std': cv_std
    })

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    # Compare against None explicitly: a legitimate score of 0.0 is falsy and
    # would otherwise be reported as "N/A".
    print(f"  ROC-AUC: {roc_auc:.4f}" if roc_auc is not None else "  ROC-AUC: N/A")
    print()

# ===========================================
# 4. RESULTS COMPARISON
# ===========================================
# Collect the per-model metrics into one table, best test accuracy first
results_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)

separator = "=" * 70
print("\n" + separator)
print("MODEL COMPARISON RESULTS")
print(separator)
print(results_df.to_string(index=False))

# Keep a CSV copy of the comparison table
results_df.to_csv('model_results.csv', index=False)

# ===========================================
# 5. VISUALIZE RESULTS
# ===========================================
def _annotated_barh(ax, frame, column, color, xlabel, title):
    """Draw one horizontal bar per model for `column` and print each value beside its bar."""
    ax.barh(frame['Model'], frame[column], color=color)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlim([0.7, 1.0])
    for row, score in enumerate(frame[column]):
        ax.text(score + 0.01, row, f'{score:.4f}', va='center')


fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Accuracy Comparison (sorted ascending so the best model ends up on top)
ax1 = axes[0, 0]
results_df_sorted = results_df.sort_values('Accuracy')
_annotated_barh(ax1, results_df_sorted, 'Accuracy', 'skyblue',
                'Accuracy', 'Model Accuracy Comparison')

# 2. Multiple Metrics Comparison: grouped bars for the five most accurate models
ax2 = axes[0, 1]
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
top_5_models = results_df.head(5)['Model'].tolist()
x = np.arange(len(top_5_models))
width = 0.2

for offset, metric in enumerate(metrics):
    values = [
        results_df[results_df['Model'] == model_name][metric].values[0]
        for model_name in top_5_models
    ]
    ax2.bar(x + offset*width, values, width, label=metric)

ax2.set_xlabel('Models', fontsize=12)
ax2.set_ylabel('Score', fontsize=12)
ax2.set_title('Top 5 Models - Multiple Metrics', fontsize=14, fontweight='bold')
# Centre each tick under its group of four bars
ax2.set_xticks(x + width * 1.5)
ax2.set_xticklabels(top_5_models, rotation=45, ha='right')
ax2.legend()
ax2.set_ylim([0.7, 1.0])

# 3. ROC-AUC Comparison (models without a ROC-AUC value are left out)
ax3 = axes[1, 0]
roc_df = results_df[results_df['ROC-AUC'].notna()].sort_values('ROC-AUC')
_annotated_barh(ax3, roc_df, 'ROC-AUC', 'coral',
                'ROC-AUC Score', 'ROC-AUC Comparison')

# 4. Cross-Validation Scores, with the fold standard deviation as error bars
ax4 = axes[1, 1]
cv_df = results_df.sort_values('CV Mean')
ax4.barh(cv_df['Model'], cv_df['CV Mean'], xerr=cv_df['CV Std'],
         color='lightgreen', capsize=5)
ax4.set_xlabel('CV Mean Accuracy', fontsize=12)
ax4.set_title('Cross-Validation Scores (with Std Dev)', fontsize=14, fontweight='bold')
ax4.set_xlim([0.7, 1.0])

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 6. BEST MODEL DETAILED EVALUATION
# ===========================================
# results_df is sorted by test accuracy (descending), so row 0 is the winner
best_model_name = results_df.iloc[0]['Model']
best_model = models[best_model_name]

banner = '=' * 70
print(f"\n{banner}")
print(f"DETAILED EVALUATION - BEST MODEL: {best_model_name}")
print(f"{banner}\n")

# Refit the winner on the training data, then score the hold-out set
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]

class_names = ['Healthy', "Parkinson's"]

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred_best, target_names=class_names))

# Confusion Matrix (rows = true label, columns = predicted label)
cm = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.savefig('confusion_matrix_best.png', dpi=300, bbox_inches='tight')
plt.show()

# ROC Curve, drawn against the diagonal of a random classifier
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_best)
roc_auc = roc_auc_score(y_test, y_pred_proba_best)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title(f'ROC Curve - {best_model_name}', fontsize=14, fontweight='bold')
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('roc_curve_best.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 7. SAVE BEST MODEL
# ===========================================
print(f"\nSaving best model: {best_model_name}")
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Metadata comes from the top row of the comparison table, i.e. the same
# model that was selected as best_model.
best_row = results_df.iloc[0]
model_metadata = {
    'model_name': best_model_name,
    'accuracy': best_row['Accuracy'],
    'f1_score': best_row['F1-Score'],
    'roc_auc': best_row['ROC-AUC'],
}

with open('model_metadata.pkl', 'wb') as f:
    pickle.dump(model_metadata, f)

print("‚úÖ Best model saved successfully!")
print(f"\nBest Model Performance:")
print(f"  - Accuracy: {model_metadata['accuracy']:.4f}")
print(f"  - F1-Score: {model_metadata['f1_score']:.4f}")
print(f"  - ROC-AUC: {model_metadata['roc_auc']:.4f}")

In [None]:
# NeuroScan: Step 5 - Deep Learning Models (CNN + LSTM)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report, roc_curve)
import seaborn as sns
import pickle

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# ===========================================
# 1. LOAD PREPROCESSED DATA
# ===========================================
print("Loading preprocessed data...")
X_train = np.load('X_train_smote.npy')
y_train = np.load('y_train_smote.npy')
# X_train_smote.npy was resampled from the StandardScaler output in Step 3,
# so the test set has to come from the same scaler. Loading the MinMax-scaled
# test file here would evaluate the networks on inputs with a different
# scale than the one they were trained on.
X_test = np.load('X_test_scaled.npy')
y_test = np.load('y_test.npy')

# Keras' validation_split takes the LAST fraction of the arrays as they are,
# before any shuffling. SMOTE returns the original rows followed by the
# synthetic minority rows, so an unshuffled 20% tail would consist almost
# entirely of synthetic samples of one class. Shuffle once, with a fixed
# seed, so the validation slice is a representative sample.
shuffle_order = np.random.default_rng(42).permutation(len(X_train))
X_train = X_train[shuffle_order]
y_train = y_train[shuffle_order]

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Reshape data for CNN and LSTM (add time/sequence dimension)
# For CNN: (samples, features, 1) - treating features as 1D spatial data
# For LSTM: (samples, timesteps, features) - treating features as sequence
n_features = X_train.shape[1]
X_train_cnn = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# For LSTM, each feature is treated as one timestep with a single channel,
# which gives the same array layout as the CNN input.
X_train_lstm = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_lstm = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

print(f"\nReshaped for CNN: {X_train_cnn.shape}")
print(f"Reshaped for LSTM: {X_train_lstm.shape}")

# ===========================================
# 2. BUILD 1D CNN MODEL
# ===========================================
def _compile_binary_classifier(model):
    """Compile `model` for binary classification and return it.

    Uses Adam (learning rate 0.001) with binary cross-entropy and tracks
    accuracy, AUC, precision and recall. Fresh metric objects are created on
    every call so that separate models never share metric state.
    """
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc'),
                 tf.keras.metrics.Precision(name='precision'),
                 tf.keras.metrics.Recall(name='recall')]
    )
    return model


def build_cnn_model(input_shape):
    """Build and compile the 1D CNN classifier.

    Args:
        input_shape: Shape of one sample, e.g. (n_features, 1).

    Returns:
        A compiled Sequential model: three Conv1D blocks (64, 128, 256
        filters) followed by two dense layers and a sigmoid output unit.
    """
    model = models.Sequential([
        # Declaring the input as its own layer replaces the `input_shape=`
        # keyword on the first Conv1D, which newer Keras versions warn about.
        layers.Input(shape=input_shape),

        # First Conv Block
        layers.Conv1D(64, 3, activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Dropout(0.3),

        # Second Conv Block
        layers.Conv1D(128, 3, activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Dropout(0.3),

        # Third Conv Block (global pooling collapses the sequence axis)
        layers.Conv1D(256, 3, activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.4),

        # Dense layers
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])

    return _compile_binary_classifier(model)

# ===========================================
# 3. BUILD LSTM MODEL
# ===========================================
def build_lstm_model(input_shape):
    """Build and compile the stacked-LSTM classifier.

    Args:
        input_shape: Shape of one sample, e.g. (timesteps, 1).

    Returns:
        A compiled Sequential model: three LSTM layers (128, 64, 32 units)
        followed by two dense layers and a sigmoid output unit.
    """
    model = models.Sequential([
        layers.Input(shape=input_shape),

        # LSTM layers; only the last one collapses the sequence to a vector
        layers.LSTM(128, return_sequences=True),
        layers.Dropout(0.3),
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.3),
        layers.LSTM(32),
        layers.Dropout(0.3),

        # Dense layers
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])

    return _compile_binary_classifier(model)

# ===========================================
# 4. BUILD HYBRID CNN-LSTM MODEL
# ===========================================
def build_cnn_lstm_model(input_shape):
    """Build and compile the hybrid CNN-LSTM classifier.

    Args:
        input_shape: Shape of one sample, e.g. (n_features, 1).

    Returns:
        A compiled Sequential model: two Conv1D blocks (64, 128 filters)
        feeding two LSTM layers (64, 32 units), one dense layer and a
        sigmoid output unit.
    """
    model = models.Sequential([
        layers.Input(shape=input_shape),

        # CNN layers for feature extraction
        layers.Conv1D(64, 3, activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Dropout(0.3),

        layers.Conv1D(128, 3, activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling1D(2),
        layers.Dropout(0.3),

        # LSTM layers over the pooled convolutional sequence
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.3),
        layers.LSTM(32),
        layers.Dropout(0.3),

        # Dense layers
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),
        layers.Dense(1, activation='sigmoid')
    ])

    return _compile_binary_classifier(model)

# ===========================================
# 5. CALLBACKS
# ===========================================
# Stop once the validation loss has not improved for 15 epochs and roll the
# weights back to the best epoch seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss', patience=15, restore_best_weights=True, verbose=1
)

# Halve the learning rate after 5 epochs without validation-loss improvement,
# but never go below 1e-6.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1
)

# ===========================================
# 6-8. TRAIN AND EVALUATE THE CNN, LSTM AND HYBRID MODELS
# ===========================================
def _fit_and_score(model, X_fit, X_eval, label):
    """Train `model`, score it on the test set and print the metrics.

    Reads y_train, y_test, early_stopping and reduce_lr from the notebook
    namespace.

    Args:
        model: Compiled Keras model.
        X_fit: Training inputs, already reshaped for the model.
        X_eval: Test inputs, reshaped the same way.
        label: Display name used in the printed results header.

    Returns:
        (history, probabilities, predicted_labels, scores), where scores is
        the tuple (accuracy, precision, recall, f1, roc_auc).
    """
    # summary() prints the layer table itself and returns None, so it is
    # called directly; wrapping it in print() would add a stray "None".
    model.summary()

    # NOTE: validation_split uses the trailing 20% of the arrays exactly as
    # passed in, so the training arrays should already be shuffled.
    history = model.fit(
        X_fit, y_train,
        validation_split=0.2,
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )

    # Sigmoid outputs -> probabilities; threshold at 0.5 for class labels
    probabilities = model.predict(X_eval).flatten()
    predicted = (probabilities > 0.5).astype(int)

    scores = (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
        recall_score(y_test, predicted),
        f1_score(y_test, predicted),
        roc_auc_score(y_test, probabilities),
    )

    print(f"\n{label} Results:")
    print(f"  Accuracy:  {scores[0]:.4f}")
    print(f"  Precision: {scores[1]:.4f}")
    print(f"  Recall:    {scores[2]:.4f}")
    print(f"  F1-Score:  {scores[3]:.4f}")
    print(f"  ROC-AUC:   {scores[4]:.4f}")

    return history, probabilities, predicted, scores


# --- 1D CNN ---
print("\n" + "="*70)
print("TRAINING 1D CNN MODEL")
print("="*70)

cnn_model = build_cnn_model((X_train_cnn.shape[1], 1))
history_cnn, y_pred_cnn_proba, y_pred_cnn, _cnn_scores = _fit_and_score(
    cnn_model, X_train_cnn, X_test_cnn, '1D CNN'
)
cnn_accuracy, cnn_precision, cnn_recall, cnn_f1, cnn_auc = _cnn_scores

# --- LSTM ---
print("\n" + "="*70)
print("TRAINING LSTM MODEL")
print("="*70)

lstm_model = build_lstm_model((X_train_lstm.shape[1], 1))
history_lstm, y_pred_lstm_proba, y_pred_lstm, _lstm_scores = _fit_and_score(
    lstm_model, X_train_lstm, X_test_lstm, 'LSTM'
)
lstm_accuracy, lstm_precision, lstm_recall, lstm_f1, lstm_auc = _lstm_scores

# --- Hybrid CNN-LSTM (uses the CNN-shaped inputs) ---
print("\n" + "="*70)
print("TRAINING HYBRID CNN-LSTM MODEL")
print("="*70)

cnn_lstm_model = build_cnn_lstm_model((X_train_cnn.shape[1], 1))
history_cnn_lstm, y_pred_hybrid_proba, y_pred_hybrid, _hybrid_scores = _fit_and_score(
    cnn_lstm_model, X_train_cnn, X_test_cnn, 'Hybrid CNN-LSTM'
)
hybrid_accuracy, hybrid_precision, hybrid_recall, hybrid_f1, hybrid_auc = _hybrid_scores

# ===========================================
# 9. COMPARE DEEP LEARNING MODELS
# ===========================================
# One row per network, one column per test-set metric
dl_results = pd.DataFrame(
    [
        ('1D CNN', cnn_accuracy, cnn_precision, cnn_recall, cnn_f1, cnn_auc),
        ('LSTM', lstm_accuracy, lstm_precision, lstm_recall, lstm_f1, lstm_auc),
        ('CNN-LSTM Hybrid', hybrid_accuracy, hybrid_precision, hybrid_recall,
         hybrid_f1, hybrid_auc),
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'],
)

rule = "=" * 70
print("\n" + rule)
print("DEEP LEARNING MODELS COMPARISON")
print(rule)
print(dl_results.to_string(index=False))

dl_results.to_csv('dl_model_results.csv', index=False)

# ===========================================
# 10. VISUALIZE TRAINING HISTORY
# ===========================================
# 2 x 3 grid: top row accuracy curves, bottom row loss curves, one column per model.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

histories = [history_cnn, history_lstm, history_cnn_lstm]
titles = ['1D CNN', 'LSTM', 'CNN-LSTM Hybrid']

# (history key, human-readable label) for each grid row, top to bottom.
curve_rows = [('accuracy', 'Accuracy'), ('loss', 'Loss')]

for idx, (history, title) in enumerate(zip(histories, titles)):
    for row, (key, label) in enumerate(curve_rows):
        panel = axes[row, idx]
        panel.plot(history.history[key], label=f'Train {label}')
        panel.plot(history.history[f'val_{key}'], label=f'Val {label}')
        panel.set_title(f'{title} - {label}', fontweight='bold')
        panel.set_xlabel('Epoch')
        panel.set_ylabel(label)
        panel.legend()
        panel.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('dl_training_history.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 11. MODEL COMPARISON BAR CHART
# ===========================================
# Grouped bar chart: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(dl_results))
width = 0.15

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
colors = ['skyblue', 'lightgreen', 'coral', 'gold', 'plum']

for i, metric in enumerate(metrics):
    values = dl_results[metric].values
    # Each metric is shifted right by one bar width inside its group.
    ax.bar(x + i*width, values, width, label=metric, color=colors[i])

ax.set_xlabel('Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_title('Deep Learning Models - Performance Comparison',
             fontsize=14, fontweight='bold')
# Five bars per group, so the group centre sits two bar widths in.
ax.set_xticks(x + width * 2)
ax.set_xticklabels(dl_results['Model'])
ax.legend()
# The axis used to start at a fixed 0.75, which made any bar scoring below that
# invisible. Keep 0.75 as the default floor but lower it when a score needs it.
lowest_score = dl_results[metrics].to_numpy().min()
ax.set_ylim([min(0.75, max(0.0, lowest_score - 0.05)), 1.0])
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('dl_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 12. CONFUSION MATRICES
# ===========================================
# One heatmap per model, laid out side by side.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

predictions = [y_pred_cnn, y_pred_lstm, y_pred_hybrid]
for idx, (y_pred, title) in enumerate(zip(predictions, titles)):
    panel = axes[idx]
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=panel,
        xticklabels=['Healthy', 'PD'],
        yticklabels=['Healthy', 'PD'],
    )
    panel.set_title(f'{title} - Confusion Matrix', fontweight='bold')
    panel.set_ylabel('True Label')
    panel.set_xlabel('Predicted Label')

plt.tight_layout()
plt.savefig('dl_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 13. ROC CURVES
# ===========================================
plt.figure(figsize=(10, 8))

probas = [y_pred_cnn_proba, y_pred_lstm_proba, y_pred_hybrid_proba]
colors = ['darkorange', 'green', 'purple']

# One ROC curve per model, with its AUC in the legend entry.
for proba, title, color in zip(probas, titles, colors):
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc = roc_auc_score(y_test, proba)
    curve_label = f'{title} (AUC = {auc:.4f})'
    plt.plot(fpr, tpr, color=color, lw=2, label=curve_label)

# Diagonal reference line for a classifier with no skill.
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontweight='bold', fontsize=12)
plt.ylabel('True Positive Rate', fontweight='bold', fontsize=12)
plt.title('ROC Curves - Deep Learning Models', fontweight='bold', fontsize=14)
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('dl_roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 14. SAVE BEST DL MODEL
# ===========================================
# The winner is the row with the highest test accuracy.
best_dl_idx = dl_results['Accuracy'].idxmax()
best_dl_row = dl_results.iloc[best_dl_idx]
best_dl_model_name = best_dl_row['Model']

# Map the winning row's name back to the trained model object; any name other
# than the first two resolves to the hybrid model.
best_dl_model = {
    '1D CNN': cnn_model,
    'LSTM': lstm_model,
}.get(best_dl_model_name, cnn_lstm_model)

print("\n" + "=" * 70)
print(f"SAVING BEST DEEP LEARNING MODEL: {best_dl_model_name}")
print("=" * 70)

# Written in both formats; later cells try .keras first and fall back to .h5.
for model_path in ('best_dl_model.h5', 'best_dl_model.keras'):
    best_dl_model.save(model_path)

print("‚úÖ Best deep learning model saved successfully!")
print(f"\nBest Model: {best_dl_model_name}")
print(f"  - Accuracy:  {best_dl_row['Accuracy']:.4f}")
print(f"  - F1-Score:  {best_dl_row['F1-Score']:.4f}")
print(f"  - ROC-AUC:   {best_dl_row['ROC-AUC']:.4f}")

In [None]:
# NeuroScan: Step 6 - Ensemble Model & Final Prediction System

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import tensorflow as tf
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             classification_report)
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

# ===========================================
# 1. LOAD ALL MODELS AND DATA
# ===========================================
print("Loading models and data...")

# Test arrays are read back from the .npy files in the working directory.
X_test_scaled, X_test_minmax, y_test = [
    np.load(npy_path)
    for npy_path in ('X_test_scaled.npy', 'X_test_minmax.npy', 'y_test.npy')
]

# Add a trailing channel axis: (rows, columns) -> (rows, columns, 1).
n_rows, n_cols = X_test_minmax.shape[0], X_test_minmax.shape[1]
X_test_cnn = X_test_minmax.reshape(n_rows, n_cols, 1)

# Traditional ML model (pickled estimator).
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open('best_model.pkl', 'rb') as model_file:
    ml_model = pickle.load(model_file)

# Deep learning model in the native Keras format.
dl_model = tf.keras.models.load_model('best_dl_model.keras')

# Scaler object stored alongside the models.
with open('scaler_standard.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

print("‚úÖ All models loaded successfully!")

# ===========================================
# 2. CREATE ENSEMBLE PREDICTION SYSTEM
# ===========================================
class EnsemblePredictor:
    """Weighted soft-voting ensemble of a classical ML classifier and a Keras model.

    Both models score the same feature matrix and their positive-class
    probabilities are blended as ``w_ml * p_ml + w_dl * p_dl``.

    NOTE(review): callers in this notebook pass ``X_test_scaled``. If that array was
    already produced by ``scaler``, the default ``scale_input=True`` scales it a
    second time — confirm, and pass ``scale_input=False`` if so.
    NOTE(review): the DL model receives the same scaled matrix as the ML model;
    confirm that matches the scaling it was trained with.

    Args:
        ml_model: estimator exposing ``predict_proba(X)`` with the positive class
            in column 1.
        dl_model: model exposing ``predict(X, verbose=0)`` on input shaped
            ``(rows, columns, 1)``.
        scaler: object exposing ``transform(X)``.
        weights: two-element sequence ``[ml_weight, dl_weight]``; ``None`` or an
            empty sequence means equal weights.
    """

    def __init__(self, ml_model, dl_model, scaler, weights=None):
        self.ml_model = ml_model
        self.dl_model = dl_model
        self.scaler = scaler
        # Explicit None/empty check: plain truthiness raises on numpy arrays.
        if weights is None or len(weights) == 0:
            self.weights = [0.5, 0.5]  # Equal weights
        else:
            self.weights = list(weights)

    @staticmethod
    def _risk_level(pred_class, confidence):
        """Map the predicted class and its confidence to a coarse risk label."""
        if confidence >= 0.9:
            return "Very High" if pred_class == 1 else "Very Low"
        if confidence >= 0.75:
            return "High" if pred_class == 1 else "Low"
        if confidence >= 0.6:
            return "Moderate" if pred_class == 1 else "Low-Moderate"
        return "Uncertain"

    def predict_proba(self, X, scale_input=True):
        """Predict probability using weighted ensemble.

        Args:
            X: 2-D array of shape ``(rows, columns)``.
            scale_input: apply ``scaler.transform`` first (default). Pass ``False``
                for data that is already scaled.

        Returns:
            Tuple ``(ensemble_proba, ml_proba, dl_proba)`` of 1-D arrays.
        """
        # Scaling used to be skipped whenever the batch happened to be square
        # (rows == columns). That depended on batch size, not on the data, so it
        # is now an explicit caller decision.
        X_scaled = self.scaler.transform(X) if scale_input else X

        # ML prediction: column 1 holds the positive-class probability.
        ml_proba = self.ml_model.predict_proba(X_scaled)[:, 1]

        # DL prediction: add the trailing channel axis the network input uses.
        X_dl = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)
        dl_proba = self.dl_model.predict(X_dl, verbose=0).flatten()

        # Weighted ensemble
        ensemble_proba = (self.weights[0] * ml_proba + self.weights[1] * dl_proba)

        return ensemble_proba, ml_proba, dl_proba

    def predict(self, X, threshold=0.5, scale_input=True):
        """Predict class labels: 1 where the blended probability exceeds ``threshold``."""
        ensemble_proba, _, _ = self.predict_proba(X, scale_input=scale_input)
        return (ensemble_proba > threshold).astype(int)

    def predict_with_confidence(self, X, scale_input=True):
        """Predict with confidence scores.

        Returns:
            One dict per input row with keys ``prediction``, ``probability``,
            ``confidence``, ``risk_level``, ``ml_probability`` and
            ``dl_probability``. The class cut-off is fixed at 0.5 and
            ``confidence`` is the probability assigned to the predicted class.
        """
        ensemble_proba, ml_proba, dl_proba = self.predict_proba(X, scale_input=scale_input)

        predictions = []
        for i in range(len(ensemble_proba)):
            pred_class = 1 if ensemble_proba[i] > 0.5 else 0
            confidence = ensemble_proba[i] if pred_class == 1 else 1 - ensemble_proba[i]

            predictions.append({
                'prediction': 'Parkinson\'s Disease' if pred_class == 1 else 'Healthy',
                'probability': ensemble_proba[i],
                'confidence': confidence,
                'risk_level': self._risk_level(pred_class, confidence),
                'ml_probability': ml_proba[i],
                'dl_probability': dl_proba[i]
            })

        return predictions

# ===========================================
# 3. TEST DIFFERENT ENSEMBLE WEIGHTS
# ===========================================
print("\n" + "="*70)
print("TESTING DIFFERENT ENSEMBLE WEIGHTS")
print("="*70)

# Candidate [ML weight, DL weight] pairs; each pair sums to 1.
weight_combinations = [
    [1.0, 0.0],  # Only ML
    [0.0, 1.0],  # Only DL
    [0.5, 0.5],  # Equal weights
    [0.6, 0.4],  # Favor ML
    [0.4, 0.6],  # Favor DL
    [0.7, 0.3],  # Strong ML
    [0.3, 0.7],  # Strong DL
]

ensemble_results = []

# NOTE(review): the weights are chosen by F1 on the same test split that is later
# reported as final performance, so those final numbers are optimistically biased;
# a separate validation split would be needed for an unbiased estimate.
# NOTE(review): X_test_scaled looks already scaled (by its name) and
# EnsemblePredictor applies scaler.transform again — confirm this is intended.
for weights in weight_combinations:
    ensemble = EnsemblePredictor(ml_model, dl_model, scaler, weights)
    # One inference pass per weighting: predict() would run both models again
    # only to apply the same 0.5 cut-off to these probabilities.
    y_pred_proba, _, _ = ensemble.predict_proba(X_test_scaled)
    y_pred = (y_pred_proba > 0.5).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    ensemble_results.append({
        'ML Weight': weights[0],
        'DL Weight': weights[1],
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': roc_auc
    })

    print(f"Weights [ML: {weights[0]:.1f}, DL: {weights[1]:.1f}] - "
          f"Accuracy: {accuracy:.4f}, F1: {f1:.4f}, AUC: {roc_auc:.4f}")

ensemble_df = pd.DataFrame(ensemble_results)
# idxmax() returns an index *label*, so look the row up with .loc.
best_ensemble_idx = ensemble_df['F1-Score'].idxmax()
best_row = ensemble_df.loc[best_ensemble_idx]
best_weights = [best_row['ML Weight'], best_row['DL Weight']]

print(f"\nüèÜ Best Ensemble Weights: ML={best_weights[0]:.1f}, DL={best_weights[1]:.1f}")

# ===========================================
# 4. CREATE FINAL ENSEMBLE MODEL
# ===========================================
final_ensemble = EnsemblePredictor(ml_model, dl_model, scaler, best_weights)
# Run inference once and derive the labels from the probabilities. Calling
# predict() as well repeated the full ML + DL pass to apply the same 0.5 cut-off.
y_pred_proba_final, _, _ = final_ensemble.predict_proba(X_test_scaled)
y_pred_final = (y_pred_proba_final > 0.5).astype(int)

# Calculate final metrics (ROC-AUC uses the probabilities, the rest use labels)
final_accuracy = accuracy_score(y_test, y_pred_final)
final_precision = precision_score(y_test, y_pred_final)
final_recall = recall_score(y_test, y_pred_final)
final_f1 = f1_score(y_test, y_pred_final)
final_auc = roc_auc_score(y_test, y_pred_proba_final)

print("\n" + "="*70)
print("FINAL ENSEMBLE MODEL PERFORMANCE")
print("="*70)
print(f"Accuracy:  {final_accuracy:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall:    {final_recall:.4f}")
print(f"F1-Score:  {final_f1:.4f}")
print(f"ROC-AUC:   {final_auc:.4f}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_final,
                          target_names=['Healthy', 'Parkinson\'s']))

# ===========================================
# 5. VISUALIZE ENSEMBLE RESULTS
# ===========================================
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Ensemble weights comparison
ax1 = axes[0, 0]
ensemble_df_sorted = ensemble_df.sort_values('F1-Score')
bars = ax1.barh(range(len(ensemble_df_sorted)), ensemble_df_sorted['F1-Score'])
# The bars follow the *sorted* row order, while best_ensemble_idx is a row label
# of the unsorted frame. Translate the label to its sorted position first;
# indexing the bars with the raw label highlighted the wrong bar.
best_bar_pos = ensemble_df_sorted.index.get_loc(best_ensemble_idx)
bars[best_bar_pos].set_color('gold')
ax1.set_yticks(range(len(ensemble_df_sorted)))
ax1.set_yticklabels([f"ML:{row['ML Weight']:.1f}/DL:{row['DL Weight']:.1f}"
                      for _, row in ensemble_df_sorted.iterrows()])
ax1.set_xlabel('F1-Score')
ax1.set_title('Ensemble Weight Combinations', fontweight='bold')
ax1.grid(axis='x', alpha=0.3)

# 2. Confusion Matrix
ax2 = axes[0, 1]
cm = confusion_matrix(y_test, y_pred_final)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
            xticklabels=['Healthy', 'PD'],
            yticklabels=['Healthy', 'PD'])
ax2.set_title('Final Ensemble - Confusion Matrix', fontweight='bold')
ax2.set_ylabel('True Label')
ax2.set_xlabel('Predicted Label')

# 3. Metrics comparison
ax3 = axes[1, 0]
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
values = [final_accuracy, final_precision, final_recall, final_f1, final_auc]
bars = ax3.bar(metrics, values, color=['skyblue', 'lightgreen', 'coral', 'gold', 'plum'])
ax3.set_ylabel('Score')
ax3.set_title('Final Ensemble Model Metrics', fontweight='bold')
# Keep 0.8 as the default floor, but lower it when a metric falls below so its
# bar is not clipped out of view.
ax3.set_ylim([min(0.8, max(0.0, min(values) - 0.05)), 1.0])
for bar, val in zip(bars, values):
    height = bar.get_height()
    # Value label just above each bar.
    ax3.text(bar.get_x() + bar.get_width()/2., height + 0.01,
             f'{val:.4f}', ha='center', va='bottom')
ax3.grid(axis='y', alpha=0.3)

# 4. Probability distribution, split by true class
ax4 = axes[1, 1]
healthy_probs = y_pred_proba_final[y_test == 0]
pd_probs = y_pred_proba_final[y_test == 1]
ax4.hist(healthy_probs, bins=20, alpha=0.6, label='Healthy', color='green')
ax4.hist(pd_probs, bins=20, alpha=0.6, label='Parkinson\'s', color='red')
ax4.axvline(x=0.5, color='black', linestyle='--', label='Threshold')
ax4.set_xlabel('Predicted Probability')
ax4.set_ylabel('Frequency')
ax4.set_title('Prediction Probability Distribution', fontweight='bold')
ax4.legend()
ax4.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('ensemble_results.png', dpi=300, bbox_inches='tight')
plt.show()

# ===========================================
# 6. SAVE FINAL ENSEMBLE MODEL
# ===========================================
print("\n" + "=" * 70)
print("SAVING FINAL ENSEMBLE MODEL")
print("=" * 70)

# Test-set metrics and the chosen weights travel with the saved package.
final_metadata = {
    'accuracy': final_accuracy,
    'precision': final_precision,
    'recall': final_recall,
    'f1_score': final_f1,
    'roc_auc': final_auc,
    'ml_weight': best_weights[0],
    'dl_weight': best_weights[1],
}

# The Keras model is referenced by file path; only the ML model and the scaler
# objects go into the pickle itself.
ensemble_package = dict(
    ml_model=ml_model,
    dl_model_path='best_dl_model.keras',
    scaler=scaler,
    weights=best_weights,
    metadata=final_metadata,
)

with open('final_ensemble_model.pkl', 'wb') as package_file:
    pickle.dump(ensemble_package, package_file)

print("‚úÖ Final ensemble model saved successfully!")

# ===========================================
# 7. DEMONSTRATION: SINGLE PREDICTION
# ===========================================
print("\n" + "=" * 70)
print("DEMONSTRATION: MAKING PREDICTIONS")
print("=" * 70)

# Draw five distinct test rows; the fixed seed keeps the selection repeatable.
np.random.seed(42)
demo_indices = np.random.choice(len(X_test_scaled), 5, replace=False)
demo_samples, demo_labels = X_test_scaled[demo_indices], y_test[demo_indices]

predictions = final_ensemble.predict_with_confidence(demo_samples)

print("\nSample Predictions:")
print("-" * 70)
for i, (pred, true_label) in enumerate(zip(predictions, demo_labels)):
    if true_label == 1:
        true_class = 'Parkinson\'s Disease'
    else:
        true_class = 'Healthy'
    # Tick when the predicted label text equals the true label text.
    match = "‚úì" if pred['prediction'] == true_class else "‚úó"

    report_lines = [
        f"\nSample {i+1} {match}:",
        f"  True Label:        {true_class}",
        f"  Prediction:        {pred['prediction']}",
        f"  Risk Level:        {pred['risk_level']}",
        f"  Confidence:        {pred['confidence']:.2%}",
        f"  Ensemble Prob:     {pred['probability']:.4f}",
        f"  ML Probability:    {pred['ml_probability']:.4f}",
        f"  DL Probability:    {pred['dl_probability']:.4f}",
    ]
    for report_line in report_lines:
        print(report_line)

print("\n" + "=" * 70)
print("‚úÖ ENSEMBLE MODEL TRAINING COMPLETE!")
print("=" * 70)

In [None]:
# NeuroScan: Diagnostic & Setup Helper
# Run this to check your progress and see what steps are needed

import os
import sys
from pathlib import Path

print("=" * 80)
print("üîç NeuroScan System Diagnostic")
print("=" * 80)
print()

# ===========================================
# CHECK FILES FOR EACH STEP
# ===========================================

# For every pipeline step: the artefact files it may leave in the working
# directory, and how many of them must exist for the step to count as complete.
steps_status = {
    "Step 1: Dataset Loading": {
        "files": ["parkinsons_loaded.csv", "parkinsons.data", "parkinsons.csv"],
        "required": 1,  # At least 1 file needed
        "description": "Load Parkinson's voice dataset"
    },
    "Step 2: EDA": {
        "files": ["class_distribution.png", "correlation_heatmap.png", "features_distribution.png"],
        "required": 1,
        "description": "Exploratory Data Analysis visualizations"
    },
    "Step 3: Preprocessing": {
        "files": ["scaler_standard.pkl", "X_train_scaled.npy", "X_test_scaled.npy",
                  "y_train.npy", "y_test.npy", "X_train_smote.npy", "y_train_smote.npy"],
        "required": 5,
        "description": "Preprocessed data and scalers"
    },
    "Step 4: ML Models": {
        "files": ["best_model.pkl", "model_results.csv", "model_comparison.png"],
        "required": 1,
        "description": "Traditional ML models trained"
    },
    "Step 5: DL Models": {
        "files": ["best_dl_model.h5", "best_dl_model.keras", "dl_model_results.csv"],
        "required": 1,
        "description": "Deep learning models trained"
    },
    "Step 6: Gradio Interface": {
        "files": [],
        "required": 0,
        "description": "Web interface (no files required, but needs Steps 3-5)"
    }
}

overall_status = {}
all_ready = True

for step_name, step_info in steps_status.items():
    # Split the step's expected files into present / absent in a single pass.
    presence = {name: os.path.exists(name) for name in step_info["files"]}
    found_files = [name for name, present in presence.items() if present]
    missing_files = [name for name, present in presence.items() if not present]

    files_found = len(found_files)
    files_needed = step_info["required"]

    # Enough files -> complete (green); some -> partial (yellow); none -> not started (red).
    if files_found >= files_needed:
        status, status_color = "‚úÖ COMPLETE", "\033[92m"
    else:
        all_ready = False
        if files_found > 0:
            status, status_color = "‚ö†Ô∏è PARTIAL", "\033[93m"
        else:
            status, status_color = "‚ùå NOT STARTED", "\033[91m"

    overall_status[step_name] = {
        "status": status,
        "found": files_found,
        "needed": files_needed,
        "found_files": found_files,
        "missing_files": missing_files,
        "description": step_info["description"]
    }

    needed_label = files_needed if files_needed > 0 else 'N/A'
    print(f"{status_color}{status}\033[0m {step_name}")
    print(f"  Description: {step_info['description']}")
    print(f"  Files found: {files_found}/{needed_label}")

    # List at most three names per category, then summarise the remainder.
    if found_files:
        print(f"  ‚úì Found: {', '.join(found_files[:3])}")
        extra_found = len(found_files) - 3
        if extra_found > 0:
            print(f"           ... and {extra_found} more")

    if missing_files and files_found < files_needed:
        print(f"  ‚úó Missing: {', '.join(missing_files[:3])}")
        extra_missing = len(missing_files) - 3
        if extra_missing > 0:
            print(f"             ... and {extra_missing} more")

    print()

# ===========================================
# RECOMMENDATIONS
# ===========================================

print("=" * 80)
print("üìã RECOMMENDATIONS")
print("=" * 80)
print()

# Hint lines per step, matched against the step name by substring in this order;
# the first marker that matches wins.
step_hints = (
    ("Step 1", ("   üìÑ Run: Step 1 - Dataset Loading notebook",
                "   üí° Make sure you have kaggle.json or internet connection")),
    ("Step 2", ("   üìÑ Run: Step 2 - Exploratory Data Analysis notebook",
                "   üí° Creates visualizations of the dataset")),
    ("Step 3", ("   üìÑ Run: Step 3 - Data Preprocessing notebook",
                "   üí° This creates scalers and processed data (REQUIRED for prediction)")),
    ("Step 4", ("   üìÑ Run: Step 4 - Traditional ML Models notebook",
                "   üí° Trains SVM, Random Forest, XGBoost, etc. (REQUIRED for prediction)")),
    ("Step 5", ("   üìÑ Run: Step 5 - Deep Learning Models notebook",
                "   üí° Trains CNN and LSTM models (REQUIRED for prediction)")),
    ("Step 6", ("   üìÑ Run: Step 6 - Gradio Interface notebook",
                "   üí° Launches web interface for testing")),
)

if not all_ready:
    print("üìù Next Steps to Complete:")
    print()

    # Number only the steps that still need work.
    step_number = 1
    for step_name, info in overall_status.items():
        if info['status'] == "‚úÖ COMPLETE":
            continue

        print(f"{step_number}. {step_name}")
        print(f"   {info['description']}")

        for marker, hint_lines in step_hints:
            if marker in step_name:
                for hint in hint_lines:
                    print(hint)
                break

        print()
        step_number += 1
else:
    print("üéâ Congratulations! All steps are complete!")
    print()
    print("You can now:")
    print("  1. Launch the Gradio interface (Step 6)")
    print("  2. Make predictions on new voice samples")
    print("  3. Deploy the model to production")
    print()

# ===========================================
# CRITICAL FILES CHECK
# ===========================================

print("=" * 80)
print("üîë CRITICAL FILES CHECK (Required for Gradio Interface)")
print("=" * 80)
print()

critical_files = {
    "best_model.pkl": "Traditional ML model",
    "best_dl_model.keras": "Deep Learning model (or .h5)",
    "scaler_standard.pkl": "Feature scaler",
    "X_test_scaled.npy": "Test data",
    "y_test.npy": "Test labels"
}

all_critical_present = True
for filename, description in critical_files.items():
    # The DL model may be stored in either format; .keras is preferred and .h5
    # is the fallback. Every other entry has exactly one acceptable path.
    if "best_dl_model" in filename:
        candidates = ["best_dl_model.keras", "best_dl_model.h5"]
    else:
        candidates = [filename]
    actual_file = next((path for path in candidates if os.path.exists(path)),
                       candidates[-1])
    exists = os.path.exists(actual_file)

    if not exists:
        print(f"‚ùå {filename}")
        print(f"   {description} - NOT FOUND")
        all_critical_present = False
        continue

    file_size = os.path.getsize(actual_file) / 1024  # KB
    print(f"‚úÖ {filename}")
    print(f"   {description} ({file_size:.1f} KB)")

print()

if not all_critical_present:
    print("‚ö†Ô∏è Some critical files are missing.")
    print("‚ùå Gradio interface will load but predictions won't work.")
    print()
    print("To fix this:")
    print("1. Make sure you've run Step 3 (Preprocessing)")
    print("2. Make sure you've run Step 4 (ML Models)")
    print("3. Make sure you've run Step 5 (DL Models)")
else:
    print("üéâ All critical files are present!")
    print("‚úÖ You can launch the Gradio interface now!")

# ===========================================
# QUICK FIX COMMANDS
# ===========================================

print()
print("="*80)
print("üöÄ QUICK FIX")
print("="*80)
print()

if not all_critical_present:
    print("If you want to quickly generate all required files, run these in order:")
    print()
    # (heading, script) pairs in pipeline order; each is printed as a %run command.
    pipeline_scripts = [
        ("# Step 1: Load Dataset", "step1_dataset_loading.py"),
        ("# Step 2: EDA (Optional but recommended)", "step2_eda.py"),
        ("# Step 3: Preprocessing (REQUIRED)", "step3_preprocessing.py"),
        ("# Step 4: Train ML Models (REQUIRED)", "step4_ml_models.py"),
        ("# Step 5: Train DL Models (REQUIRED)", "step5_dl_models.py"),
        ("# Step 6: Launch Interface", "step6_gradio_interface.py"),
    ]
    for heading, script in pipeline_scripts:
        print(heading)
        print(f"%run '{script}'")
        print()
else:
    print("‚úÖ Everything is ready!")
    print()
    print("To launch the Gradio interface:")
    print()
    print("%run 'step6_gradio_interface.py'")
    print()
    print("Or in a new cell:")
    # A shell command typed into a notebook cell needs the leading "!"; without
    # it the suggested line is a Python syntax error.
    print("!python step6_gradio_interface.py")

# ===========================================
# STORAGE INFO
# ===========================================

print()
print("=" * 80)
print("üíæ STORAGE INFORMATION")
print("=" * 80)
print()

# Count all NeuroScan related files: any regular file under the working directory
# whose lower-cased path contains one of these substrings.
storage_keywords = ['parkinson', 'model', 'scaler', '.npy', '.pkl', '.png', '.h5', '.keras']
all_files = list(Path('.').glob('**/*'))
neuroscan_files = []
for f in all_files:
    if not f.is_file():
        continue
    lowered_path = str(f).lower()
    if any(keyword in lowered_path for keyword in storage_keywords):
        neuroscan_files.append(f)

# Sum sizes in bytes, then convert to MB.
total_bytes = sum(f.stat().st_size for f in neuroscan_files if f.exists())
total_size = total_bytes / (1024 * 1024)  # MB

print(f"Total NeuroScan files: {len(neuroscan_files)}")
print(f"Total storage used: {total_size:.2f} MB")

if total_size > 100:
    print("‚ö†Ô∏è Note: Large storage usage detected.")
    print("   Consider cleaning up old model files if space is limited.")

print()
print("=" * 80)
print("‚ú® Diagnostic Complete!")
print("=" * 80)

In [None]:
# NeuroScan: Step 6 - Gradio Web Interface for Parkinson's Detection
# Run this in Google Colab

# Install Gradio
!pip install gradio -q

import gradio as gr
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("Loading models and components...")

# ===========================================
# 1. LOAD MODELS AND COMPONENTS
# ===========================================
# Any failure below leaves models_loaded = False; the prediction functions check
# that flag and return a message instead of raising.
try:
    # Load ML model
    # Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
    with open('best_model.pkl', 'rb') as f:
        ml_model = pickle.load(f)
    print("‚úÖ ML Model loaded")

    # Load DL model (try both formats)
    try:
        dl_model = tf.keras.models.load_model('best_dl_model.keras')
        print("‚úÖ DL Model loaded (.keras)")
    except Exception:
        # "except Exception" rather than a bare "except": the fallback should not
        # swallow KeyboardInterrupt / SystemExit raised while the model loads.
        dl_model = tf.keras.models.load_model('best_dl_model.h5')
        print("‚úÖ DL Model loaded (.h5)")

    # Load scaler
    with open('scaler_standard.pkl', 'rb') as f:
        scaler = pickle.load(f)
    print("‚úÖ Scaler loaded")

    # Load feature names
    try:
        with open('feature_names.pkl', 'rb') as f:
            feature_names = pickle.load(f)
    except Exception:
        # File missing or unreadable: fall back to the dataset's default names.
        # Default feature names from Parkinson's dataset
        feature_names = ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
                        'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
                        'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
                        'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA',
                        'spread1', 'spread2', 'D2', 'PPE']
    print(f"‚úÖ Feature names loaded ({len(feature_names)} features)")

    models_loaded = True

except Exception as e:
    print(f"‚ùå Error loading models: {e}")
    models_loaded = False

# ===========================================
# 2. ENSEMBLE PREDICTION CLASS
# ===========================================
class EnsemblePredictor:
    """Blend an ML classifier and a Keras model into one risk-annotated prediction.

    Args:
        ml_model: estimator exposing ``predict_proba(X)`` with the positive class
            in column 1.
        dl_model: model exposing ``predict(X, verbose=0)`` on input shaped
            ``(rows, columns, 1)``.
        scaler: object exposing ``transform(X)``; applied to every input.
        ml_weight: weight of the ML probability in the blend.
        dl_weight: weight of the DL probability in the blend.
    """

    def __init__(self, ml_model, dl_model, scaler, ml_weight=0.5, dl_weight=0.5):
        self.ml_model = ml_model
        self.dl_model = dl_model
        self.scaler = scaler
        self.ml_weight = ml_weight
        self.dl_weight = dl_weight

    @staticmethod
    def _assess_risk(pred_class, confidence):
        """Return ``(risk_level, risk_color)`` for a predicted class and its confidence."""
        positive = pred_class == 1
        if confidence >= 0.9:
            return ("Very High Risk", "üî¥") if positive else ("Very Low Risk", "üü¢")
        if confidence >= 0.75:
            return ("High Risk", "üü†") if positive else ("Low Risk", "üü¢")
        if confidence >= 0.6:
            return ("Moderate Risk", "üü°") if positive else ("Low-Moderate Risk", "üü¢")
        return "Uncertain", "‚ö™"

    def predict_with_details(self, features):
        """Make prediction with detailed analysis.

        Args:
            features: one sample as a 1-D sequence/array, or a 2-D array. Only the
                first row of a 2-D input is reported.

        Returns:
            Dict with keys ``prediction``, ``probability``, ``confidence``,
            ``risk_level``, ``risk_color``, ``ml_probability``, ``dl_probability``
            and ``pred_class``. On failure the same keys are present with neutral
            values, plus ``error`` holding the exception text, so callers can
            format a result without first checking for errors.
        """
        try:
            # Accept plain lists/tuples as well as arrays.
            features = np.asarray(features)

            # Ensure features is 2D array
            if features.ndim == 1:
                features = features.reshape(1, -1)

            # Scale features
            features_scaled = self.scaler.transform(features)

            # ML prediction: positive-class probability of the first row
            ml_proba = self.ml_model.predict_proba(features_scaled)[0, 1]

            # DL prediction: add the trailing channel axis, take the first output
            features_dl = features_scaled.reshape(features_scaled.shape[0], features_scaled.shape[1], 1)
            dl_proba = self.dl_model.predict(features_dl, verbose=0)[0, 0]

            # Ensemble prediction
            ensemble_proba = (self.ml_weight * ml_proba + self.dl_weight * dl_proba)

            # Determine class; confidence is the probability of the chosen class
            pred_class = 1 if ensemble_proba > 0.5 else 0
            confidence = ensemble_proba if pred_class == 1 else (1 - ensemble_proba)

            # Risk assessment
            risk_level, risk_color = self._assess_risk(pred_class, confidence)

            return {
                'prediction': 'Parkinson\'s Disease Detected' if pred_class == 1 else 'Healthy',
                'probability': float(ensemble_proba),
                'confidence': float(confidence),
                'risk_level': risk_level,
                'risk_color': risk_color,
                'ml_probability': float(ml_proba),
                'dl_probability': float(dl_proba),
                'pred_class': pred_class
            }

        except Exception as e:
            # Same key set as the success path. The batch CSV caller reads
            # 'ml_probability' / 'dl_probability' unconditionally and previously
            # hit a KeyError that masked the real error.
            return {
                'error': str(e),
                'prediction': 'Error',
                'probability': 0.0,
                'confidence': 0.0,
                'risk_level': 'Error',
                'risk_color': '‚ùå',
                'ml_probability': 0.0,
                'dl_probability': 0.0,
                'pred_class': None
            }

# Initialize ensemble predictor
if models_loaded:
    ensemble = EnsemblePredictor(ml_model, dl_model, scaler, ml_weight=0.5, dl_weight=0.5)
    print("‚úÖ Ensemble predictor initialized")
else:
    # Keep the name bound (and drop any predictor left over from an earlier run of
    # this cell) so a failed load cannot leave a stale or undefined `ensemble`.
    ensemble = None

# ===========================================
# 3. PREDICTION FUNCTIONS
# ===========================================
def predict_from_features(*args):
    """Predict from manual feature input.

    Args:
        *args: the 22 original feature values in dataset column order. Blank
            fields (``''`` or ``None``) are treated as 0.0.

    Returns:
        Tuple of four markdown strings: ``(main_output, probability_output,
        interpretation, model_info)``. On any failure the first element carries
        the error message and the other three are empty strings.
    """
    if not models_loaded:
        return "‚ùå Models not loaded. Please run Steps 3-5 first.", "", "", ""

    try:
        # Convert inputs to numpy array (original 22 features). None is handled
        # like '' so a cleared input field does not crash float().
        features = np.array([0.0 if x is None or x == '' else float(x) for x in args])

        # Validate the raw count *before* indexing into the array; otherwise a
        # short input surfaces as an opaque IndexError message.
        expected_inputs = 22
        if len(features) != expected_inputs:
            return f"‚ùå Expected {expected_inputs} input features, got {len(features)}", "", "", ""

        # Add engineered features (from Step 3); 1e-6 guards against division by zero
        jitter_shimmer_ratio = features[3] / (features[8] + 1e-6)  # MDVP:Jitter(%) / MDVP:Shimmer
        fo_hnr_product = features[0] * features[15]  # MDVP:Fo(Hz) * HNR
        spread1_spread2_ratio = features[18] / (features[19] + 1e-6)  # spread1 / spread2
        jitter_squared = features[3] ** 2  # MDVP:Jitter(%)^2
        shimmer_squared = features[8] ** 2  # MDVP:Shimmer^2

        # Append engineered features (22 original + 5 engineered = 27)
        features = np.append(features, [jitter_shimmer_ratio, fo_hnr_product,
                                        spread1_spread2_ratio, jitter_squared, shimmer_squared])

        # Make prediction
        result = ensemble.predict_with_details(features)

        if 'error' in result:
            return f"‚ùå Error: {result['error']}", "", "", ""

        # Format output
        main_output = f"""
## {result['risk_color']} {result['prediction']}

**Risk Level:** {result['risk_level']}
**Confidence:** {result['confidence']:.1%}
        """

        probability_output = f"""
**Ensemble Probability:** {result['probability']:.4f}
**ML Model Probability:** {result['ml_probability']:.4f}
**DL Model Probability:** {result['dl_probability']:.4f}
        """

        # Class-dependent text is chosen up front so the template stays readable.
        if result['pred_class'] == 1:
            headline = '‚ö†Ô∏è **HIGH PROBABILITY OF PARKINSONS DISEASE**'
            recommendation = 'This analysis suggests significant voice abnormalities consistent with Parkinsons Disease. Please consult a neurologist for comprehensive clinical evaluation.'
        else:
            headline = '‚úÖ **LOW PROBABILITY OF PARKINSONS DISEASE**'
            recommendation = 'Voice analysis shows normal patterns. Continue regular health monitoring.'

        interpretation = f"""
### Clinical Interpretation:

{headline}

**Recommendation:**
{recommendation}

**Note:** This is an AI-assisted screening tool and should not replace professional medical diagnosis.
        """

        # NOTE(review): this text is static — the model names, weights and accuracy
        # are not read from the loaded objects; confirm they match what is deployed.
        model_info = f"""
**Model Details:**
- ML Model: Random Forest/XGBoost (optimized)
- DL Model: CNN-LSTM Hybrid
- Ensemble Weight: 50% ML + 50% DL
- Training Accuracy: ~95%+
        """

        return main_output, probability_output, interpretation, model_info

    except Exception as e:
        return f"‚ùå Prediction Error: {str(e)}", "", "", ""

def _append_engineered_features(features):
    """Return `features` (22 raw values) with the 5 engineered features appended.

    Positions follow the order of the manual-input form fields:
    0 = Fo, 3 = Jitter(%), 8 = Shimmer, 15 = HNR, 18 = spread1, 19 = spread2.
    The small epsilon keeps the ratios finite when the denominator is zero.
    """
    jitter = features[3]
    shimmer = features[8]
    engineered = [
        jitter / (shimmer + 1e-6),             # jitter/shimmer ratio
        features[0] * features[15],            # Fo x HNR product
        features[18] / (features[19] + 1e-6),  # spread1/spread2 ratio
        jitter ** 2,                           # jitter squared
        shimmer ** 2,                          # shimmer squared
    ]
    return np.append(features, engineered)


def predict_from_csv(file):
    """Run the ensemble on every row of an uploaded CSV and report the results.

    Args:
        file: The Gradio upload. Either a path string or an object exposing the
            path as `.name`. `None` (nothing uploaded) is reported as an error.

    Returns:
        A Markdown string: a summary (totals and percentages) followed by a
        per-sample table. Any failure is returned as a single error line
        instead of being raised, so the UI always has text to show.

    Notes:
        - Columns named 'name' and 'status' are ignored; every other column is
          treated as a feature, in file order.
        - Rows must hold 22 raw features (engineered features are added here)
          or 27 features (already engineered).
        - A row for which the ensemble reports an error is listed in the table
          and counted as failed; it does not abort the rest of the batch.
    """
    if not models_loaded:
        return "❌ Models not loaded. Please run Steps 3-5 first."

    try:
        if file is None:
            raise ValueError("No file uploaded. Please upload a CSV file first.")

        # Accept both a plain path and a file-like wrapper carrying `.name`.
        csv_path = getattr(file, "name", file)
        df = pd.read_csv(csv_path)

        # Remove 'name' and 'status' columns if present
        feature_cols = [col for col in df.columns if col not in ['name', 'status']]
        X = df[feature_cols].values

        # Fail fast with a readable message instead of a later ZeroDivisionError
        # or an opaque shape error from inside the models.
        if X.shape[0] == 0:
            raise ValueError("CSV file contains no data rows.")
        n_features = X.shape[1]
        if n_features not in (22, 27):
            raise ValueError(
                f"Expected 22 raw features (or 27 including engineered features), "
                f"got {n_features}"
            )

        results = []
        pd_count = 0
        failed_count = 0
        for i, features in enumerate(X):
            # Add engineered features if needed
            if len(features) == 22:
                features = _append_engineered_features(features)

            result = ensemble.predict_with_details(features)

            # Same error contract as the manual-input path: the ensemble signals
            # failure through an 'error' key rather than by raising.
            if 'error' in result:
                failed_count += 1
                results.append({
                    'Sample': i + 1,
                    'Prediction': f"Error: {result['error']}",
                    'Risk Level': 'N/A',
                    'Confidence': 'N/A',
                    'Probability': 'N/A',
                    'ML Prob': 'N/A',
                    'DL Prob': 'N/A'
                })
                continue

            # Count on the predicted class, not on the wording of the label.
            if result['pred_class'] == 1:
                pd_count += 1

            results.append({
                'Sample': i + 1,
                'Prediction': result['prediction'],
                'Risk Level': result['risk_level'],
                'Confidence': f"{result['confidence']:.1%}",
                'Probability': f"{result['probability']:.4f}",
                'ML Prob': f"{result['ml_probability']:.4f}",
                'DL Prob': f"{result['dl_probability']:.4f}"
            })

        results_df = pd.DataFrame(results)

        # Summary statistics
        total = len(results)
        healthy_count = total - pd_count - failed_count
        # Only mention failures when there are some, so the usual report is unchanged.
        failed_line = (
            f"\n**Failed:** {failed_count} ({failed_count/total*100:.1f}%)"
            if failed_count else ""
        )

        summary = f"""
## Batch Analysis Results

**Total Samples:** {total}
**Parkinson's Detected:** {pd_count} ({pd_count/total*100:.1f}%)
**Healthy:** {healthy_count} ({healthy_count/total*100:.1f}%){failed_line}

---

### Detailed Results:
        """

        return summary + "\n\n" + results_df.to_markdown(index=False)

    except Exception as e:
        return f"❌ CSV Processing Error: {str(e)}"

def get_sample_prediction():
    """Predict one randomly chosen test-set sample and compare it with its label.

    Loads `X_test_scaled.npy` and `y_test.npy` from the working directory,
    picks a random row, maps it back through `scaler.inverse_transform`, and
    feeds it to the ensemble.

    Returns:
        A Markdown string with the true label, the prediction, confidence,
        risk level and per-model probabilities. Any failure (missing files,
        empty test set, ensemble error) is returned as a one-line error message
        instead of being raised.
    """
    if not models_loaded:
        return "❌ Models not loaded."

    try:
        # Load test data
        X_test = np.load('X_test_scaled.npy')
        y_test = np.load('y_test.npy')

        if len(X_test) == 0:
            raise ValueError("Test dataset is empty.")

        # Random sample
        idx = np.random.randint(0, len(X_test))
        sample = X_test[idx]
        true_class = y_test[idx]
        true_label = 'Parkinson\'s Disease' if true_class == 1 else 'Healthy'

        # Transform back from scaled to original (approximately); the ensemble
        # is handed unscaled values, as in the other prediction paths.
        sample_original = scaler.inverse_transform(sample.reshape(1, -1))[0]

        # Make prediction
        result = ensemble.predict_with_details(sample_original)

        # The ensemble reports failures through an 'error' key; surface that
        # message instead of failing later on a missing result key.
        if 'error' in result:
            return f"❌ Error: {result['error']}"

        match = "✅ Correct" if (result['pred_class'] == true_class) else "❌ Incorrect"

        output = f"""
## Sample Test Case {match}

**True Label:** {true_label}
**Predicted:** {result['prediction']}
**Confidence:** {result['confidence']:.1%}
**Risk Level:** {result['risk_color']} {result['risk_level']}

**Model Probabilities:**
- Ensemble: {result['probability']:.4f}
- ML Model: {result['ml_probability']:.4f}
- DL Model: {result['dl_probability']:.4f}
        """

        return output

    except Exception as e:
        return f"❌ Error: {str(e)}"

# ===========================================
# 4. CREATE GRADIO INTERFACE
# ===========================================

# Custom CSS handed to gr.Blocks(css=...) below.
# - .gradio-container: sets the font for the whole app.
# - .main-header: styles the banner <div class="main-header"> emitted by the
#   gr.HTML header (centered white text on a purple gradient, rounded corners).
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.main-header {
    text-align: center;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin-bottom: 20px;
}
"""

# Create interface
#
# Layout: a banner, four tabs, and a footer.
#   Tab 1 - manual entry of the 22 voice features -> predict_from_features
#   Tab 2 - CSV upload for batch scoring          -> predict_from_csv
#   Tab 3 - random sample from the saved test set -> get_sample_prediction
#   Tab 4 - static "About" text
# Components are laid out in the order they are created inside each context
# manager, so statement order here is the on-screen order.
with gr.Blocks(css=custom_css, title="NeuroScan - Parkinson's Detection") as demo:

    # Banner; styled by the .main-header rule in custom_css.
    gr.HTML("""
    <div class="main-header">
        <h1>🧠 NeuroScan: Parkinson's Disease Detection System</h1>
        <p>AI-Powered Voice Analysis for Early Parkinson's Screening</p>
    </div>
    """)

    with gr.Tabs():

        # ===== TAB 1: MANUAL INPUT =====
        with gr.Tab("🎤 Manual Feature Input"):
            gr.Markdown("""
            ### Enter Voice Feature Values
            Input the voice analysis measurements for prediction. All values are required.

            **Note:** The system will automatically compute 5 additional engineered features:
            - Jitter/Shimmer Ratio
            - Fo × HNR Product
            - Spread1/Spread2 Ratio
            - Jitter Squared
            - Shimmer Squared
            """)

            # Three columns of numeric inputs; each `value=` is the pre-filled default.
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("#### Fundamental Frequency Features")
                    fo = gr.Number(label="MDVP:Fo(Hz) - Average vocal frequency", value=119.992)
                    fhi = gr.Number(label="MDVP:Fhi(Hz) - Maximum frequency", value=157.302)
                    flo = gr.Number(label="MDVP:Flo(Hz) - Minimum frequency", value=74.997)

                    gr.Markdown("#### Jitter Features (Frequency Variation)")
                    jitter_percent = gr.Number(label="MDVP:Jitter(%) - Percentage", value=0.00784)
                    jitter_abs = gr.Number(label="MDVP:Jitter(Abs) - Absolute", value=0.00007)
                    rap = gr.Number(label="MDVP:RAP - Relative amplitude", value=0.00370)
                    ppq = gr.Number(label="MDVP:PPQ - Five-point period", value=0.00554)
                    ddp = gr.Number(label="Jitter:DDP - Average absolute", value=0.01109)

                with gr.Column(scale=1):
                    gr.Markdown("#### Shimmer Features (Amplitude Variation)")
                    shimmer = gr.Number(label="MDVP:Shimmer - Local", value=0.04374)
                    shimmer_db = gr.Number(label="MDVP:Shimmer(dB) - Decibels", value=0.426)
                    apq3 = gr.Number(label="Shimmer:APQ3 - 3-point", value=0.02182)
                    apq5 = gr.Number(label="Shimmer:APQ5 - 5-point", value=0.03130)
                    apq = gr.Number(label="MDVP:APQ - 11-point", value=0.02971)
                    dda = gr.Number(label="Shimmer:DDA - Average absolute", value=0.06545)

                    gr.Markdown("#### Noise & Nonlinearity Features")
                    nhr = gr.Number(label="NHR - Noise-to-harmonics ratio", value=0.02211)
                    hnr = gr.Number(label="HNR - Harmonics-to-noise ratio", value=21.033)

                with gr.Column(scale=1):
                    gr.Markdown("#### Complexity & Nonlinear Features")
                    rpde = gr.Number(label="RPDE - Recurrence period density", value=0.414783)
                    dfa = gr.Number(label="DFA - Detrended fluctuation analysis", value=0.815285)
                    spread1 = gr.Number(label="spread1 - Nonlinear measure", value=-4.813031)
                    spread2 = gr.Number(label="spread2 - Nonlinear measure", value=0.266482)
                    d2 = gr.Number(label="D2 - Correlation dimension", value=2.301442)
                    ppe = gr.Number(label="PPE - Pitch period entropy", value=0.284654)

            predict_btn = gr.Button("🔬 Analyze Voice Features", variant="primary", size="lg")

            with gr.Row():
                with gr.Column(scale=1):
                    output_main = gr.Markdown(label="Prediction Result")
                    output_prob = gr.Markdown(label="Probabilities")

                with gr.Column(scale=1):
                    output_interpretation = gr.Markdown(label="Clinical Interpretation")
                    output_model = gr.Markdown(label="Model Information")

            # Connect button.
            # The inputs are passed positionally in this order, and the feature
            # engineering indexes them by position (e.g. index 8 is MDVP:Shimmer),
            # so the order of this list must not change.
            # The four outputs match the 4-tuple returned by predict_from_features.
            predict_btn.click(
                fn=predict_from_features,
                inputs=[fo, fhi, flo, jitter_percent, jitter_abs, rap, ppq, ddp,
                       shimmer, shimmer_db, apq3, apq5, apq, dda, nhr, hnr,
                       rpde, dfa, spread1, spread2, d2, ppe],
                outputs=[output_main, output_prob, output_interpretation, output_model]
            )

        # ===== TAB 2: CSV UPLOAD =====
        with gr.Tab("📁 Batch Analysis (CSV)"):
            gr.Markdown("""
            ### Upload CSV File for Batch Analysis
            Upload a CSV file containing multiple voice samples for batch prediction.

            **CSV Format Requirements:**
            - Must contain all 22 voice features (same as manual input)
            - Optional: 'name' column (will be ignored)
            - Optional: 'status' column (will be ignored)
            """)

            csv_file = gr.File(label="Upload CSV File", file_types=[".csv"])
            csv_btn = gr.Button("📊 Analyze CSV File", variant="primary")
            csv_output = gr.Markdown(label="Batch Analysis Results")

            csv_btn.click(
                fn=predict_from_csv,
                inputs=csv_file,
                outputs=csv_output
            )

            gr.Markdown("""
            ---
            ### Sample CSV Format:
            ```
            MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),...
            119.992,157.302,74.997,...
            122.400,148.650,113.819,...
            ```
            """)

        # ===== TAB 3: TEST SAMPLE =====
        with gr.Tab("🧪 Test Sample"):
            gr.Markdown("""
            ### Test with Real Data
            Click the button to load a random sample from the test dataset and see how the model performs.
            """)

            sample_btn = gr.Button("🎲 Load Random Test Sample", variant="primary")
            sample_output = gr.Markdown(label="Sample Test Result")

            # No inputs: the handler draws its own random sample.
            sample_btn.click(
                fn=get_sample_prediction,
                inputs=None,
                outputs=sample_output
            )

        # ===== TAB 4: ABOUT =====
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About NeuroScan

            **NeuroScan** is an advanced AI-powered system for early detection of Parkinson's Disease using voice analysis.

            ### 🎯 Key Features:
            - **Ensemble Learning**: Combines traditional ML (Random Forest/XGBoost) with Deep Learning (CNN-LSTM)
            - **High Accuracy**: Achieves 95%+ accuracy on test data
            - **22 Voice Features**: Analyzes comprehensive voice characteristics
            - **Real-time Analysis**: Instant predictions with confidence scores
            - **Batch Processing**: Analyze multiple samples simultaneously

            ### 🔬 Technology Stack:
            - **Machine Learning**: Scikit-learn, XGBoost
            - **Deep Learning**: TensorFlow/Keras (CNN-LSTM Hybrid)
            - **Data Processing**: NumPy, Pandas
            - **Visualization**: Matplotlib, Seaborn
            - **Interface**: Gradio

            ### 📊 Voice Features Analyzed:
            1. **Frequency Features**: Fo, Fhi, Flo
            2. **Jitter Features**: Measures of frequency variation
            3. **Shimmer Features**: Measures of amplitude variation
            4. **Noise Measures**: NHR, HNR
            5. **Nonlinear Measures**: RPDE, DFA, D2, PPE, spread1, spread2

            ### ⚠️ Important Disclaimer:
            This tool is designed for **screening purposes only** and should not be used as a sole diagnostic tool.
            Always consult with qualified healthcare professionals for proper medical diagnosis and treatment.

            ### 👨‍💻 Model Performance:
            - **Accuracy**: ~95%+
            - **Precision**: ~94%+
            - **Recall**: ~96%+
            - **F1-Score**: ~95%+
            - **ROC-AUC**: ~98%+

            ### 📚 Dataset:
            UCI Machine Learning Repository - Parkinson's Disease Dataset

            ---

            **Version**: 1.0
            **Last Updated**: 2024
            """)

    # Footer, shown below the tabs on every tab.
    gr.Markdown("""
    ---
    <div style="text-align: center; color: #666;">
        <p>© 2024 NeuroScan - AI-Powered Parkinson's Detection | For Research & Educational Purposes</p>
    </div>
    """)

# ===========================================
# 5. LAUNCH INTERFACE
# ===========================================
# Start the web UI only when the models were loaded; otherwise list the
# artifacts that earlier steps of the notebook must produce first.
if __name__ == "__main__":
    print("\n" + "="*70)
    print("🚀 LAUNCHING NEUROSCAN GRADIO INTERFACE")
    print("="*70)

    if models_loaded:
        print("✅ All models loaded successfully!")
        print("🌐 Starting web interface...")
        demo.launch(
            share=True,  # Create public link for Colab
            debug=True,
            show_error=True
        )
    else:
        # Without the models every handler would only return its
        # "Models not loaded" message, so the UI is not started at all.
        print("❌ Models not loaded. Please run Steps 3-5 first.")
        print("\nRequired files:")
        print("  - best_model.pkl (Step 4)")
        print("  - best_dl_model.keras or best_dl_model.h5 (Step 5)")
        print("  - scaler_standard.pkl (Step 3)")
        print("  - X_test_scaled.npy, y_test.npy (Step 3)")

Loading models and components...
❌ Error loading models: [Errno 2] No such file or directory: 'best_model.pkl'

🚀 LAUNCHING NEUROSCAN GRADIO INTERFACE
❌ Models not loaded. Please run Steps 3-5 first.

Required files:
  - best_model.pkl (Step 4)
  - best_dl_model.keras or best_dl_model.h5 (Step 5)
  - scaler_standard.pkl (Step 3)
  - X_test_scaled.npy, y_test.npy (Step 3)
