In [None]:
!pip install -r requirments.txt

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GroupKFold
from xgboost import XGBClassifier
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb
import seaborn as sns
import joblib
import matplotlib.pyplot as plt


In [None]:
# Input file locations read by the data-loading cells of this notebook.
# All three are relative paths, so they resolve against the current working directory.
featuresPath = "Features/train_audio_features.csv" # Path to extracted features from training audio files
trainLabelsPath = 'train_labels.csv' # Path to training labels
metadataPath = 'metadata.csv' # Path to the metadata file provided for the competition

In [None]:
# Load the extracted audio features and derive each row's `uid` from its file name.
DLData = pd.read_csv(featuresPath)

# The uid is the file name minus its last four characters
# (presumably a three-letter extension such as ".wav" — TODO confirm).
DLData['uid'] = DLData['file_name'].map(lambda file_name: file_name[:-4])
DLData = DLData.drop(columns='file_name')

DLData

In [None]:
# Remove the augmentation marker so it does not end up among the model features.
# The non-inplace drop with errors='ignore' keeps this cell re-runnable: running it
# a second time no longer raises KeyError because the column is already gone.
DLData = DLData.drop(columns='augmentation_type', errors='ignore')

In [None]:
# Load the diagnosis labels and the demographic metadata (uid, age, gender).
data = pd.read_csv(trainLabelsPath)
meta = pd.read_csv(metadataPath)[['uid', 'age', 'gender']]

# Binary-encode gender: 1 for 'male', 0 for every other value (missing values included).
meta['gender'] = meta['gender'].eq('male').astype(np.int64)

# Collapse the three diagnosis indicator columns into a single integer label.
# For one-hot rows this yields control -> 0, mci -> 1, adrd -> 2.
diagnosis_columns = ['diagnosis_control', 'diagnosis_mci', 'diagnosis_adrd']
label_codes = (
    data['diagnosis_control']
    + 2 * data['diagnosis_mci']
    + 3 * data['diagnosis_adrd']
    - 1
)
data = data.assign(label=label_codes.astype(np.uint8)).drop(columns=diagnosis_columns)

# Inner-join features, metadata and labels on `uid` to build the training table.
training_data = DLData.merge(meta, on='uid').merge(data, on='uid')

# Also attach the metadata to the label table (joined on the columns both frames share).
data = data.merge(meta)

In [None]:
# Keep the uids aside: they are not a feature, but they serve as the grouping key
# for the cross-validation split.
uid_series = training_data['uid']
df = training_data.drop(columns='uid').copy()

# Separate the target column from the feature matrix.
y = df["label"]
X = df.drop(columns=["label"])

# Encode the targets as consecutive integers and remember the original class values.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)
class_labels = label_encoder.classes_


# Hyper-parameters of the multi-class XGBoost classifier, collected in one place.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' request GPU execution, and together with
# `use_label_encoder` they may be deprecated in newer XGBoost releases — confirm
# against the version pinned for this project.
xgb_params = dict(
    objective='multi:softprob',
    num_class=len(np.unique(y_encoded)),  # one output per encoded class
    eval_metric='mlogloss',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=20,
    seed=42,
    use_label_encoder=False,
    n_jobs=-1,
)
xgb_model = XGBClassifier(**xgb_params)

# Two-step pipeline: univariate feature selection followed by the classifier.
# With k='all' the selection step keeps every feature.
best_pipeline = Pipeline(steps=[
    ('select', SelectKBest(score_func=f_classif, k='all')),
    ('xgb', xgb_model),
])


# Five-fold cross-validation splitter that takes a `groups` argument at split time.
gkf = GroupKFold(n_splits=5)

# One list per overall metric; each fold appends a single value.
overall_metrics = {
    metric_name: []
    for metric_name in ('Fold', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Log Loss')
}

# Per class: a sensitivity list followed by a specificity list, in class order.
per_class_metrics = {
    f'Class_{class_label}_{measure}': []
    for class_label in class_labels
    for measure in ('Sensitivity', 'Specificity')
}

# Single container used during the cross-validation loop (overall metrics first).
metrics = {**overall_metrics, **per_class_metrics}

# Encoded class ids (0 .. n_classes-1, as produced by the label encoder). They are
# passed explicitly to the metrics below so that a validation fold which happens to
# miss a class is still scored against the full set of classes; without `labels`,
# log_loss infers the classes from y_val and raises when a class is absent.
all_class_ids = np.arange(len(class_labels))

# Grouped cross-validation: rows that share a uid never appear on both sides of a split.
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y_encoded, groups=uid_series), start=1):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # Fit an unfitted copy of the pipeline so no state leaks between folds.
    fold_pipeline = clone(best_pipeline)
    fold_pipeline.fit(X_train, y_train)
    y_val_proba = fold_pipeline.predict_proba(X_val)
    y_val_pred = fold_pipeline.predict(X_val)

    # Overall metrics; precision/recall/F1 are weighted by class support.
    acc = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, average='weighted', zero_division=0)
    recall = recall_score(y_val, y_val_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val, y_val_pred, average='weighted', zero_division=0)
    loss = log_loss(y_val, y_val_proba, labels=all_class_ids)

    metrics['Fold'].append(fold)
    metrics['Accuracy'].append(acc)
    metrics['Precision'].append(precision)
    metrics['Recall'].append(recall)
    metrics['F1-Score'].append(f1)
    metrics['Log Loss'].append(loss)

    # Rows are true classes, columns are predicted classes.
    cm = confusion_matrix(y_val, y_val_pred, labels=all_class_ids)

    # One-vs-rest sensitivity and specificity for every class.
    for class_idx, class_label in enumerate(class_labels):
        TP = cm[class_idx, class_idx]
        FN = cm[class_idx, :].sum() - TP
        FP = cm[:, class_idx].sum() - TP
        TN = cm.sum() - (TP + FP + FN)

        # Report 0.0 instead of dividing by zero when a class has no samples.
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0.0
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0

        metrics[f'Class_{class_label}_Sensitivity'].append(sensitivity)
        metrics[f'Class_{class_label}_Specificity'].append(specificity)

    print(f"Fold {fold} Metrics:")
    print(f"  Accuracy: {acc:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")
    print(f"  Log Loss: {loss:.4f}")
    for class_label in class_labels:
        sens = metrics[f'Class_{class_label}_Sensitivity'][-1]
        spec = metrics[f'Class_{class_label}_Specificity'][-1]
        print(f"  Class {class_label} Sensitivity: {sens:.4f}")
        print(f"  Class {class_label} Specificity: {spec:.4f}")
    print("\n")

# One row per fold, one column per metric.
metrics_df = pd.DataFrame(metrics)

# After dropping 'Fold', the first five columns are the overall metrics and the
# remaining ones are the per-class sensitivity/specificity columns.
fold_scores = metrics_df.drop(columns='Fold')
stat_labels = {'mean': 'Mean', 'std': 'Std'}

overall_summary = fold_scores.iloc[:, :5].agg(['mean', 'std']).T.rename(columns=stat_labels)
per_class_summary = fold_scores.iloc[:, 5:].agg(['mean', 'std']).T.rename(columns=stat_labels)

# Stack both parts into one table with the metric name as a regular column.
metrics_summary = (
    pd.concat([overall_summary, per_class_summary])
    .reset_index()
    .rename(columns={'index': 'Metric'})
)

print("Cross-Validation Metrics Summary:")
print(metrics_summary)

# Locate the fold with the lowest validation log loss; it is only reported below.
best_fold_idx = np.argmin(metrics['Log Loss'])
best_log_loss = metrics['Log Loss'][best_fold_idx]

# The final model is an unfitted copy of the pipeline trained on the full data set
# (it is not the model that was fitted inside the best fold).
best_pipeline = clone(best_pipeline).fit(X, y_encoded)

print(f"\nBest fold: {metrics['Fold'][best_fold_idx]} with Log Loss: {best_log_loss:.4f}")



In [None]:
# Show the cross-validation summary table (mean and std per metric) as the cell output.
metrics_summary

In [None]:
# Persist the cross-validation summary next to the notebook.
# No `index` argument is given, so the row index is written as the first CSV column.
metrics_summary.to_csv("train_validation_results.csv")

In [None]:
# Pull the fitted classifier out of the final pipeline and read its importances.
xgb_model_trained = best_pipeline.named_steps['xgb']
feature_importances = xgb_model_trained.feature_importances_

# Feature names come straight from the training matrix; this lines up with the
# importances because the selection step keeps all columns (k='all').
feature_names = list(X.columns)

importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)

# Keep the 100 most important features, highest first.
ranked_importances = importance_df.sort_values(by='Importance', ascending=False)
top_features_df = ranked_importances.head(100).reset_index(drop=True)

def categorize_feature(feature_name):
    """Map a feature column name to the feature family used for colouring the plots.

    Args:
        feature_name: Name of a column in the feature matrix.

    Returns:
        'Wav2Vec2' for names starting with 'Embedding1_', 'Whisper' for names
        starting with 'Embedding2_', 'age' / 'Gender' for the demographic
        columns, and 'OpenSmile' for everything else.
    """
    if feature_name.startswith('Embedding1_'):
        return 'Wav2Vec2'
    if feature_name.startswith('Embedding2_'):
        return 'Whisper'

    # The metadata column is named 'gender' (lower case), so the demographic
    # names are compared case-insensitively; an exact match on 'Gender' would
    # send that column to the 'OpenSmile' fallback.
    demographic_categories = {'age': 'age', 'gender': 'Gender'}
    return demographic_categories.get(feature_name.lower(), 'OpenSmile')

# Tag each of the top features with the feature family it belongs to.
top_features_df['Category'] = top_features_df['Feature'].map(categorize_feature)

# Fixed colour per category so every chart uses the same colours.
category_palette = {
    'Wav2Vec2': '#1f77b4',
    'Whisper': '#ff7f0e',
    'age': '#2ca02c',
    'Gender': '#d62728',
    'OpenSmile': '#9467bd',
}

# The bar chart is drawn from the features ordered by ascending importance.
top_features_df_sorted = top_features_df.sort_values(by='Importance', ascending=True)

plt.figure(figsize=(20, 15))
sns.set_style("whitegrid")

top_features_ax = sns.barplot(
    data=top_features_df_sorted,
    x='Importance',
    y='Feature',
    hue='Category',
    palette=category_palette,
)
top_features_ax.set_title('Top 100 Feature Importances Categorized', fontsize=20)
top_features_ax.set_xlabel('Importance', fontsize=16)
top_features_ax.set_ylabel('Feature', fontsize=16)
top_features_ax.legend(title='Category', fontsize=14, title_fontsize=16, loc='lower right')

plt.tight_layout()
plt.show()

# Aggregate the importance of the top features per category: total and average.
importance_by_category = top_features_df.groupby('Category')['Importance']
category_importance = importance_by_category.sum().reset_index()
category_importance_avg = importance_by_category.mean().reset_index()

# Both charts share the same layout; only the data, title and y-label differ.
category_charts = [
    (category_importance, 'Total Feature Importance by Category', 'Total Importance'),
    (category_importance_avg, 'Average Feature Importance by Category', 'Average Importance'),
]

for chart_data, chart_title, chart_ylabel in category_charts:
    plt.figure(figsize=(10, 6))
    sns.set_style("whitegrid")

    sns.barplot(
        data=chart_data,
        x='Category',
        y='Importance',
        palette=category_palette,
    )

    plt.title(chart_title, fontsize=20)
    plt.xlabel('Category', fontsize=16)
    plt.ylabel(chart_ylabel, fontsize=16)
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()

# Draw the three charts again, write each to a PNG file, and close the figure
# without showing it.

# Chart 1: top-100 feature importances, coloured by category.
plt.figure(figsize=(20, 15))
saved_top_ax = sns.barplot(
    data=top_features_df_sorted,
    x='Importance',
    y='Feature',
    hue='Category',
    palette=category_palette,
)
saved_top_ax.set_title('Top 100 Feature Importances Categorized', fontsize=20)
saved_top_ax.set_xlabel('Importance', fontsize=16)
saved_top_ax.set_ylabel('Feature', fontsize=16)
saved_top_ax.legend(title='Category', fontsize=14, title_fontsize=16, loc='lower right')
plt.tight_layout()
plt.savefig('top_100_feature_importances.png', dpi=300)
plt.close()

# Charts 2 and 3: total and average importance per category.
saved_category_charts = [
    (
        category_importance,
        'Total Feature Importance by Category',
        'Total Importance',
        'category_feature_importances_total.png',
    ),
    (
        category_importance_avg,
        'Average Feature Importance by Category',
        'Average Importance',
        'category_feature_importances_average.png',
    ),
]

for chart_data, chart_title, chart_ylabel, output_file in saved_category_charts:
    plt.figure(figsize=(10, 6))
    sns.barplot(
        data=chart_data,
        x='Category',
        y='Importance',
        palette=category_palette,
    )
    plt.title(chart_title, fontsize=20)
    plt.xlabel('Category', fontsize=16)
    plt.ylabel(chart_ylabel, fontsize=16)
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.savefig(output_file, dpi=300)
    plt.close()


In [None]:
# Persist the pipeline that was fitted on the full training set.
# The output directory is created first so the dump does not fail with
# FileNotFoundError on a fresh checkout where 'Model/' is missing.
import os

os.makedirs('Model', exist_ok=True)
joblib.dump(best_pipeline, 'Model/model.joblib')