In [19]:
# Imports
import pandas as pd
import glob
import os
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.utils import compute_class_weight
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
import joblib

In [28]:
# Load CSVs and build label dicts

train_file = pd.read_csv('00_dataset_daicwoz/train_split_Depression_AVEC2017.csv')
val_file = pd.read_csv('00_dataset_daicwoz/dev_split_Depression_AVEC2017.csv')
test_file = pd.read_csv('00_dataset_daicwoz/full_test_split.csv')

train_label_dict = train_file.set_index('Participant_ID')['PHQ8_Binary'].to_dict()
val_label_dict = val_file.set_index('Participant_ID')['PHQ8_Binary'].to_dict()
test_label_dict = test_file.set_index('Participant_ID')['PHQ_Binary'].to_dict()

In [29]:
# Load files

train_files = []
val_files = []
test_files = []

all_files = glob.glob('03_extracted_features/features_join_right/*')

for file in all_files:
    basename = file.split('/')[-1]
    participant_id_str = basename.split('_')[0]
    try:
        participant_id = int(participant_id_str)
    except ValueError:
        print(f"Skipping file {file}, can't parse participant ID")
        continue
    if participant_id in train_label_dict:
        train_files.append(file)
    elif participant_id in val_label_dict:
        val_files.append(file)
    elif participant_id in test_label_dict:
        test_files.append(file)

print(f"Train files: {len(train_files)}, Val files: {len(val_files)}, Test files: {len(test_files)}")

train_items = [(file, train_label_dict[int(file.split('/')[-1].split('_')[0])]) for file in train_files]
val_items = [(file, val_label_dict[int(file.split('/')[-1].split('_')[0])]) for file in val_files]
test_items = [(file, test_label_dict[int(file.split('/')[-1].split('_')[0])]) for file in test_files]

Train files: 106, Val files: 35, Test files: 46


In [30]:
# Training DataFrame
train_df = pd.concat([
    pd.read_csv(file).assign(label=label)
    for file, label in train_items
], ignore_index=True)

# Validation DataFrame
val_df = pd.concat([
    pd.read_csv(file).assign(label=label)
    for file, label in val_items
], ignore_index=True)

# Test DataFrame
test_df = pd.concat([
    pd.read_csv(file).assign(label=label)
    for file, label in test_items
], ignore_index=True)

In [31]:
# Seperate label from features
X_train = train_df.drop(columns=["label"])  # or + ["id_col"] if necessary
y_train = train_df["label"]

X_val = val_df.drop(columns=["label"])  # or + ["id_col"]
y_val = val_df["label"]

X_test = test_df.drop(columns=["label"])
y_test = test_df["label"]

In [32]:
# Clean Sets

# Replace inf/-inf with NaN
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_val = X_val.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Fit SimpleImputer on full X_train
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)

# Transform full datasets
X_train_transformed = imputer.transform(X_train)
X_val_transformed = imputer.transform(X_val)
X_test_transformed = imputer.transform(X_test)

# Get back only columns that were successfully imputed
# (all-NaN columns were skipped)
valid_cols_mask = ~np.isnan(imputer.statistics_)
valid_columns = X_train.columns[valid_cols_mask]
invalid_columns = X_train.columns[~valid_cols_mask]

# Convert transformed arrays back to DataFrames with valid columns
X_train_imputed = pd.DataFrame(X_train_transformed, columns=valid_columns, index=X_train.index)
X_val_imputed = pd.DataFrame(X_val_transformed, columns=valid_columns, index=X_val.index)
X_test_imputed = pd.DataFrame(X_test_transformed, columns=valid_columns, index=X_test.index)

# Add missing (all-NaN) columns back with fill value (e.g. 0)
for col in invalid_columns:
    X_train_imputed[col] = 0
    X_val_imputed[col] = 0
    X_test_imputed[col] = 0

# Restore original column order
X_train_imputed = X_train_imputed[X_train.columns]
X_val_imputed = X_val_imputed[X_train.columns]
X_test_imputed = X_test_imputed[X_train.columns]

# Save imputer
joblib.dump(imputer, "mean_imputer.pkl");

 'HMPDM_1_kurt' 'HMPDM_2_skew' 'HMPDM_2_kurt' 'HMPDM_3_skew'
 'HMPDM_3_kurt']. At least one non-missing value is needed for imputation with strategy='mean'.
 'HMPDM_1_kurt' 'HMPDM_2_skew' 'HMPDM_2_kurt' 'HMPDM_3_skew'
 'HMPDM_3_kurt']. At least one non-missing value is needed for imputation with strategy='mean'.
 'HMPDM_1_kurt' 'HMPDM_2_skew' 'HMPDM_2_kurt' 'HMPDM_3_skew'
 'HMPDM_3_kurt']. At least one non-missing value is needed for imputation with strategy='mean'.


In [35]:
# Should be all False
print("Train contains inf:", np.isinf(X_train).values.any())
print("Train contains NaN:", np.isnan(X_train).values.any())

print("Val contains inf:", np.isinf(X_val).values.any())
print("Val contains NaN:", np.isnan(X_val).values.any())

print("Test contains inf:", np.isinf(X_test).values.any())
print("Test contains NaN:", np.isnan(X_test).values.any())

Train contains inf: False
Train contains NaN: True
Val contains inf: False
Val contains NaN: True
Test contains inf: False
Test contains NaN: True


In [36]:
print("Number of training samples:", X_train.shape[0])
print("Number of training features:", X_train.shape[1])

print("Number of validation samples:", X_val.shape[0])
print("Number of validation features:", X_val.shape[1])

print("Number of test samples:", X_test.shape[0])
print("Number of test features:", X_test.shape[1])

print("Training label distribution:\n", y_train.value_counts())
print("Validation label distribution:\n", y_val.value_counts())
print("Test label distribution:\n", y_test.value_counts())

Number of training samples: 106
Number of training features: 718
Number of validation samples: 35
Number of validation features: 718
Number of test samples: 46
Number of test features: 718
Training label distribution:
 label
0    76
1    30
Name: count, dtype: int64
Validation label distribution:
 label
0    23
1    12
Name: count, dtype: int64
Test label distribution:
 label
0    32
1    14
Name: count, dtype: int64


In [37]:
# Define scoring metrics
f1 = make_scorer(f1_score, average='binary')
auroc = make_scorer(roc_auc_score, response_method="predict_proba")

# Parameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [100, 200, 300, 500],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.5, 1, 2],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0.5, 1, 1.5, 2]
}


# weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# scale_pos_weight = weights[1] / weights[0]
# scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
scale_pos_weight = 3.0
print(scale_pos_weight)

xgb = XGBClassifier(
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
)

stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=20,
    scoring={'f1': f1, 'roc_auc': auroc},
    refit='f1',
    cv=stratified_cv,
    verbose=1,
    n_jobs=-1,
    error_score='raise',
    random_state=42
)

fit_params = {
    "verbose": False
}

random_search.fit(X_train, y_train, **fit_params)

3.0
Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [39]:
# Evaluation
y_val_pred = random_search.best_estimator_.predict(X_val)
y_val_proba = random_search.best_estimator_.predict_proba(X_val)[:, 1]

print("Best parameters found:")
print(random_search.best_score_)

print("\nBest F1 score (CV):", random_search.best_score_)
print("Validation AUROC:", roc_auc_score(y_val, y_val_proba))

print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred))

Best parameters found:
0.3922043969102793

Best F1 score (CV): 0.3922043969102793
Validation AUROC: 0.5072463768115942

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.61      0.67        23
           1       0.44      0.58      0.50        12

    accuracy                           0.60        35
   macro avg       0.59      0.60      0.58        35
weighted avg       0.63      0.60      0.61        35



In [None]:
# Best parameters found:
# 0.3922043969102793

# Best F1 score (CV): 0.3922043969102793
# Validation AUROC: 0.5181159420289855

# Validation Classification Report:
#               precision    recall  f1-score   support

#            0       0.74      0.61      0.67        23
#            1       0.44      0.58      0.50        12

#     accuracy                           0.60        35
#    macro avg       0.59      0.60      0.58        35
# weighted avg       0.63      0.60      0.61        35

In [38]:
# Test Evaluation
y_test_pred = random_search.best_estimator_.predict(X_test_imputed)
y_test_proba = random_search.best_estimator_.predict_proba(X_test_imputed)[:, 1]

print("Test Set Evaluation:")
print("Test AUROC:", roc_auc_score(y_test, y_test_proba))
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))


Test Set Evaluation:
Test AUROC: 0.5736607142857143
Test Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.62      0.69        32
           1       0.40      0.57      0.47        14

    accuracy                           0.61        46
   macro avg       0.58      0.60      0.58        46
weighted avg       0.66      0.61      0.62        46



In [None]:
# Test Set Evaluation:
# Test AUROC: 0.5736607142857143
# Test Classification Report:
#               precision    recall  f1-score   support

#            0       0.77      0.62      0.69        32
#            1       0.40      0.57      0.47        14

#     accuracy                           0.61        46
#    macro avg       0.58      0.60      0.58        46
# weighted avg       0.66      0.61      0.62        46

In [16]:
# to save the best model for late fusion
import pickle

with open("03_best_tabular_model.pkl", "wb") as f:
    pickle.dump(random_search.best_estimator_, f)