In [8]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.preprocessing import StandardScaler
from statsmodels.regression.mixed_linear_model import MixedLM
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical
# warnings raised later in the notebook; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Visible confirmation that the import cell ran to completion.
print("Libraries loaded successfully!")

Libraries loaded successfully!


## 1. Load Data

In [9]:
# Locations of the two per-trial feature tables compared in this notebook.
INPUT_CONCAT = "../data/processed/eeg_features_concat.csv"
INPUT_REACT = "../data/processed/eeg_features_reactivity.csv"

# One DataFrame per feature-engineering approach.
df_concat = pd.read_csv(INPUT_CONCAT)
df_react = pd.read_csv(INPUT_REACT)

# Dataset overview: table shapes plus subject / video / trial counts taken
# from the concatenation table.
overview_lines = [
    f"Concatenation features: {df_concat.shape}",
    f"Reactivity features: {df_react.shape}",
    f"Number of subjects: {df_concat['subject_id'].nunique()}",
    f"Number of videos: {df_concat['video_id'].nunique()}",
    f"Trials per subject: {len(df_concat) / df_concat['subject_id'].nunique():.1f}",
]
print("\n".join(overview_lines))

Concatenation features: (414, 118)
Reactivity features: (414, 63)
Number of subjects: 23
Number of videos: 18
Trials per subject: 18.0


In [None]:
def evaluate_subject_identification_DDM(df, approach_name, n_features=20, random_state=42):
    """
    Distance Discrimination Method (DDM) for subject identification.

    METHOD:
    The feature columns are reduced to the `n_features` columns with the
    highest variance, z-scored, and compared with Euclidean distance.
    Two analyses are run on the scaled features:

    1. Leave-one-out 1-nearest-neighbour classification of the subject.
    2. A distance analysis: for every trial whose subject has at least one
       other trial, collect within-subject and between-subject distances and
       check whether the nearest other trial shares the subject and/or the
       video.

    CONTROL: the same nearest neighbour is also checked against the video
    label, to see whether proximity is driven by content rather than identity.

    Parameters:
    -----------
    df : DataFrame
        Feature dataframe. Must contain 'subject_id' and 'video_id'; the
        columns 'subject_id', 'video_id', 'valence' and 'arousal' are treated
        as metadata and dropped from the feature matrix when present.
    approach_name : str
        Name of the approach (e.g., 'Concatenation', 'Reactivity'); used in
        the printed header and returned under 'approach'.
    n_features : int
        Number of highest-variance features to keep (capped at the number of
        available feature columns). Must be >= 1.
    random_state : int or None
        Seed for the subsample of between-subject distances used in the
        t-test, so 'p_value_distance' is reproducible. None draws a fresh,
        unpredictable seed.

    Returns:
    --------
    dict
        Accuracies ('loo_accuracy', 'ddm_accuracy_subject',
        'ddm_accuracy_video'), chance levels, distance statistics
        ('within_dist_mean', 'within_dist_std', 'between_dist_mean',
        'between_dist_std', 'separation_ratio', 'cohen_d'), the selected
        feature names and count, the three p-values, and the four
        nearest-neighbour category counts.

    Raises:
    -------
    ValueError
        If `n_features` < 1, if no subject has two or more trials, or if all
        trials belong to a single subject.
    """
    # Kept at function scope so this cell stays runnable on its own.
    from scipy import stats
    from sklearn.metrics import pairwise_distances
    from sklearn.model_selection import LeaveOneOut
    from sklearn.neighbors import KNeighborsClassifier

    # A slice of [-0:] would silently select *all* features, so reject it.
    if n_features < 1:
        raise ValueError(f"n_features must be >= 1, got {n_features}")

    print(f"\n{'='*60}")
    print(f"   {approach_name.upper()} APPROACH - DDM")
    print(f"{'='*60}")

    # Prepare data
    metadata_cols = ['subject_id', 'video_id', 'valence', 'arousal']
    X = df.drop(columns=metadata_cols, errors='ignore')
    y_subject = df['subject_id'].to_numpy()
    y_video = df['video_id'].to_numpy()
    n_total = len(y_subject)

    print(f"Features: {X.shape[1]}")
    print(f"Subjects: {len(np.unique(y_subject))}")
    print(f"Videos: {len(np.unique(y_video))}")
    print(f"Total trials: {len(df)}")

    # Feature selection: keep the columns with the largest population variance.
    # Indices come out in ascending-variance order.
    print(f"\nSelecting top {n_features} features by variance...")
    feature_variances = X.var(axis=0, ddof=0).to_numpy()
    n_selected = min(n_features, X.shape[1])
    top_k_indices = np.argsort(feature_variances)[-n_selected:]
    X_selected = X.iloc[:, top_k_indices].values
    selected_features = X.columns[top_k_indices].tolist()
    print(f"Selected features (by variance): {selected_features[:5]}...")

    # Scale features so every selected column contributes equally to distances.
    X_scaled = StandardScaler().fit_transform(X_selected)

    print("\nRunning Distance Discrimination Method...")
    print("For each trial: Is the nearest neighbor from the same subject/video?")

    # Method 1: Leave-One-Out k-NN (k=1) for SUBJECT identification
    print("\n[Method 1] Leave-One-Out Nearest Neighbor Classification (SUBJECT)")
    knn = KNeighborsClassifier(n_neighbors=1)
    loo_hits = 0
    for train_idx, test_idx in LeaveOneOut().split(X_scaled):
        knn.fit(X_scaled[train_idx], y_subject[train_idx])
        if knn.predict(X_scaled[test_idx])[0] == y_subject[test_idx][0]:
            loo_hits += 1

    loo_accuracy = loo_hits / n_total
    chance_level_subject = 1.0 / len(np.unique(y_subject))
    chance_level_video = 1.0 / len(np.unique(y_video))

    # Method 2: Detailed DDM with distance analysis
    print("\n[Method 2] Detailed Distance Discrimination Analysis")

    distance_matrix = pairwise_distances(X_scaled, metric='euclidean')

    # Exclude each trial from its own neighbour search by *position*.
    # Filtering on "distance > 0" would also discard genuine neighbours that
    # happen to have identical features (distance exactly 0).
    neighbour_search = distance_matrix.copy()
    np.fill_diagonal(neighbour_search, np.inf)

    within_subject_dists = []
    between_subject_dists = []
    correct_subject_identifications = []
    correct_video_identifications = []
    same_subject_same_video = 0
    same_subject_diff_video = 0
    diff_subject_same_video = 0
    diff_subject_diff_video = 0

    for i in range(n_total):
        # Split the other trials into same-subject and different-subject.
        same_subject_mask = (y_subject == y_subject[i])
        same_subject_mask[i] = False
        diff_subject_mask = (y_subject != y_subject[i])

        same_subject_dists = distance_matrix[i, same_subject_mask]
        if same_subject_dists.size == 0:
            # A subject with a single trial has no within-subject reference.
            continue

        # Store distances for statistics. Each pair is recorded from both
        # endpoints; this leaves means and standard deviations unchanged.
        within_subject_dists.extend(same_subject_dists)
        between_subject_dists.extend(distance_matrix[i, diff_subject_mask])

        # Nearest other trial (self was masked out above).
        nearest_idx = int(np.argmin(neighbour_search[i]))
        same_subj = bool(y_subject[nearest_idx] == y_subject[i])
        same_vid = bool(y_video[nearest_idx] == y_video[i])

        correct_subject_identifications.append(int(same_subj))
        correct_video_identifications.append(int(same_vid))

        # Categorize the nearest neighbor
        if same_subj and same_vid:
            same_subject_same_video += 1
        elif same_subj:
            same_subject_diff_video += 1
        elif same_vid:
            diff_subject_same_video += 1
        else:
            diff_subject_diff_video += 1

    if not correct_subject_identifications:
        raise ValueError("DDM needs at least one subject with two or more trials")
    if not between_subject_dists:
        raise ValueError("DDM needs trials from at least two subjects")

    ddm_accuracy_subject = np.mean(correct_subject_identifications)
    ddm_accuracy_video = np.mean(correct_video_identifications)

    # Calculate distance statistics
    within_arr = np.asarray(within_subject_dists)
    between_arr = np.asarray(between_subject_dists)
    avg_within = np.mean(within_arr)
    std_within = np.std(within_arr)
    avg_between = np.mean(between_arr)
    std_between = np.std(between_arr)

    # Separation metrics (both fall back to 0 when the denominator is 0)
    separation_ratio = avg_between / avg_within if avg_within > 0 else 0
    pooled_std = np.sqrt((std_within**2 + std_between**2) / 2)
    cohen_d = (avg_between - avg_within) / pooled_std if pooled_std > 0 else 0

    # Test 1: Are within and between distances significantly different?
    # The between-subject pool is subsampled down to the within-subject size
    # (or kept whole if it is smaller) with a seeded generator, so the
    # p-value is reproducible.
    # NOTE(review): pairwise distances share trials and are not independent
    # observations, so treat this p-value as descriptive.
    rng = np.random.default_rng(random_state)
    between_sample = rng.choice(
        between_arr,
        size=min(len(within_arr), len(between_arr)),
        replace=False,
    )
    t_stat, p_value_dist = stats.ttest_ind(within_arr, between_sample)

    # Test 2: Is subject accuracy significantly above chance?
    n_trials = len(correct_subject_identifications)
    n_successes_subject = int(sum(correct_subject_identifications))
    p_value_acc_subject = stats.binomtest(
        n_successes_subject, n_trials, chance_level_subject, alternative='greater'
    ).pvalue

    # Test 3: Is video accuracy significantly above chance?
    n_successes_video = int(sum(correct_video_identifications))
    p_value_acc_video = stats.binomtest(
        n_successes_video, n_trials, chance_level_video, alternative='greater'
    ).pvalue

    # Print results
    print(f"\n{'='*60}")
    print(f"RESULTS:")
    print(f"{'='*60}")

    print(f"\n📊 SUBJECT IDENTIFICATION (Primary Goal):")
    print(f"  LOO k-NN Accuracy:    {loo_accuracy*100:.2f}%")
    print(f"  DDM Accuracy:         {ddm_accuracy_subject*100:.2f}%")
    print(f"  Chance Level:         {chance_level_subject*100:.2f}%")
    print(f"  Above Chance:         {(ddm_accuracy_subject - chance_level_subject)*100:.2f} percentage points")
    print(f"  Improvement Factor:   {ddm_accuracy_subject / chance_level_subject:.2f}x")

    print(f"\n🎬 VIDEO IDENTIFICATION (Control Check):")
    print(f"  DDM Accuracy:         {ddm_accuracy_video*100:.2f}%")
    print(f"  Chance Level:         {chance_level_video*100:.2f}%")
    print(f"  Above Chance:         {(ddm_accuracy_video - chance_level_video)*100:.2f} percentage points")
    print(f"  Improvement Factor:   {ddm_accuracy_video / chance_level_video:.2f}x")

    # Interpretation
    print(f"\n🔍 NEAREST NEIGHBOR BREAKDOWN:")
    print(f"  Same Subject, Same Video:   {same_subject_same_video:3d} ({same_subject_same_video/n_trials*100:.1f}%)")
    print(f"  Same Subject, Diff Video:   {same_subject_diff_video:3d} ({same_subject_diff_video/n_trials*100:.1f}%) ✅ IDEAL")
    print(f"  Diff Subject, Same Video:   {diff_subject_same_video:3d} ({diff_subject_same_video/n_trials*100:.1f}%) ⚠️  CONFOUND")
    print(f"  Diff Subject, Diff Video:   {diff_subject_diff_video:3d} ({diff_subject_diff_video/n_trials*100:.1f}%)")

    print(f"\n💡 INTERPRETATION:")
    if ddm_accuracy_subject > ddm_accuracy_video:
        print(f"  ✅ PERSON-DRIVEN: Features capture individual identity > video content")
    elif ddm_accuracy_video > ddm_accuracy_subject:
        print(f"  ⚠️  CONTENT-DRIVEN: Features capture video content > individual identity")
    else:
        print(f"  ⚖️  MIXED: Similar influence from person and video")

    print(f"\n📏 DISTANCE ANALYSIS:")
    print(f"  Within-subject:   {avg_within:.4f} ± {std_within:.4f}")
    print(f"  Between-subject:  {avg_between:.4f} ± {std_between:.4f}")
    print(f"  Separation Ratio: {separation_ratio:.4f}")
    print(f"  Cohen's d:        {cohen_d:.4f}")

    if separation_ratio > 1.2:
        print(f"  ✅ STRONG separability (ratio > 1.2)")
    elif separation_ratio > 1.1:
        print(f"  ✅ Good separability (ratio > 1.1)")
    elif separation_ratio > 1.0:
        print(f"  ⚠️  Weak separability (ratio > 1.0)")
    else:
        print(f"  ❌ NO separability (ratio ≤ 1.0)")

    print(f"\n📈 STATISTICAL SIGNIFICANCE:")
    print(f"  Distance test (t-test):")
    print(f"    t = {t_stat:.3f}, p = {p_value_dist:.4f}")
    if p_value_dist < 0.001:
        print(f"    ✅ Highly significant (p < 0.001)")
    elif p_value_dist < 0.05:
        print(f"    ✅ Significant (p < 0.05)")
    else:
        print(f"    ❌ Not significant (p ≥ 0.05)")

    print(f"\n  Subject accuracy test (binomial):")
    print(f"    {n_successes_subject}/{n_trials} correct, p = {p_value_acc_subject:.4f}")
    if p_value_acc_subject < 0.001:
        print(f"    ✅ Highly significant (p < 0.001)")
    elif p_value_acc_subject < 0.05:
        print(f"    ✅ Significant (p < 0.05)")
    else:
        print(f"    ❌ Not significant (p ≥ 0.05)")

    # For the video control, significance is the *bad* outcome.
    print(f"\n  Video accuracy test (binomial):")
    print(f"    {n_successes_video}/{n_trials} correct, p = {p_value_acc_video:.4f}")
    if p_value_acc_video < 0.001:
        print(f"    ⚠️  Highly significant (p < 0.001) - Video confound!")
    elif p_value_acc_video < 0.05:
        print(f"    ⚠️  Significant (p < 0.05) - Video confound!")
    else:
        print(f"    ✅ Not significant - Good! No video confound")

    return {
        'approach': approach_name,
        'loo_accuracy': loo_accuracy,
        'ddm_accuracy_subject': ddm_accuracy_subject,
        'ddm_accuracy_video': ddm_accuracy_video,
        'chance_level_subject': chance_level_subject,
        'chance_level_video': chance_level_video,
        'within_dist_mean': avg_within,
        'within_dist_std': std_within,
        'between_dist_mean': avg_between,
        'between_dist_std': std_between,
        'separation_ratio': separation_ratio,
        'cohen_d': cohen_d,
        'n_features': X_selected.shape[1],
        'selected_features': selected_features,
        'p_value_distance': p_value_dist,
        'p_value_acc_subject': p_value_acc_subject,
        'p_value_acc_video': p_value_acc_video,
        'same_subject_same_video': same_subject_same_video,
        'same_subject_diff_video': same_subject_diff_video,
        'diff_subject_same_video': diff_subject_same_video,
        'diff_subject_diff_video': diff_subject_diff_video
    }


In [15]:
# Run the DDM evaluation on the concatenation feature table, keeping the
# 20 highest-variance features.
results_concat = evaluate_subject_identification_DDM(
    df=df_concat,
    approach_name="Concatenation",
    n_features=20,
)


   CONCATENATION APPROACH - DDM
Features: 114
Subjects: 23
Videos: 18
Total trials: 414

Selecting top 20 features...
Selected features: ['Stim_RMS_AF3', 'Stim_RMS_F7', 'Stim_RMS_F3', 'Stim_RMS_FC5', 'Base_Beta_AF3']...

Running Distance Discrimination Method...
For each trial: Is the nearest neighbor from the same subject/video?

[Method 1] Leave-One-Out Nearest Neighbor Classification (SUBJECT)

[Method 2] Detailed Distance Discrimination Analysis

RESULTS:

📊 SUBJECT IDENTIFICATION (Primary Goal):
  LOO k-NN Accuracy:    51.69%
  DDM Accuracy:         51.69%
  Chance Level:         4.35%
  Above Chance:         47.34 percentage points
  Improvement Factor:   11.89x

🎬 VIDEO IDENTIFICATION (Control Check):
  DDM Accuracy:         2.66%
  Chance Level:         5.56%
  Above Chance:         -2.90 percentage points
  Improvement Factor:   0.48x

🔍 NEAREST NEIGHBOR BREAKDOWN:
  Same Subject, Same Video:     0 (0.0%)
  Same Subject, Diff Video:   214 (51.7%) ✅ IDEAL
  Diff Sub

In [16]:
# Run the DDM evaluation on the reactivity feature table with the same
# feature budget, so both approaches are compared on equal terms.
results_react = evaluate_subject_identification_DDM(
    df=df_react,
    approach_name="Reactivity",
    n_features=20,
)


   REACTIVITY APPROACH - DDM
Features: 59
Subjects: 23
Videos: 18
Total trials: 414

Selecting top 20 features...
Selected features: ['ΔFAA', 'ΔCorr_F3F4', 'ΔCorr_F7F8', 'ΔRMS_AF3', 'ΔRMS_F7']...

Running Distance Discrimination Method...
For each trial: Is the nearest neighbor from the same subject/video?

[Method 1] Leave-One-Out Nearest Neighbor Classification (SUBJECT)

[Method 2] Detailed Distance Discrimination Analysis

RESULTS:

📊 SUBJECT IDENTIFICATION (Primary Goal):
  LOO k-NN Accuracy:    20.53%
  DDM Accuracy:         20.53%
  Chance Level:         4.35%
  Above Chance:         16.18 percentage points
  Improvement Factor:   4.72x

🎬 VIDEO IDENTIFICATION (Control Check):
  DDM Accuracy:         5.07%
  Chance Level:         5.56%
  Above Chance:         -0.48 percentage points
  Improvement Factor:   0.91x

🔍 NEAREST NEIGHBOR BREAKDOWN:
  Same Subject, Same Video:     0 (0.0%)
  Same Subject, Diff Video:    85 (20.5%) ✅ IDEAL
  Diff Subject, Same Video:   

## 3. Save Results & Compare Approaches

In [17]:
# Save ML results
# Destination of the comparison table. The parent directory is created on
# demand so this cell also runs when the outputs folder does not exist yet.
OUTPUT_CSV = Path('../outputs/models/ml_comparison_DDM.csv')

# Significance threshold; matches the 0.05 cut-off used inside
# evaluate_subject_identification_DDM.
ALPHA = 0.05

# CSV column name -> key in the dict returned by
# evaluate_subject_identification_DDM. Insertion order fixes the column order.
RESULT_COLUMNS = {
    'subject_accuracy': 'ddm_accuracy_subject',
    'video_accuracy': 'ddm_accuracy_video',
    'loo_accuracy': 'loo_accuracy',
    'chance_level_subject': 'chance_level_subject',
    'chance_level_video': 'chance_level_video',
    'separation_ratio': 'separation_ratio',
    'cohen_d': 'cohen_d',
    'n_features': 'n_features',
    'p_value_subject': 'p_value_acc_subject',
    'p_value_video': 'p_value_acc_video',
    'p_value_distance': 'p_value_distance',
    'same_subj_same_vid': 'same_subject_same_video',
    'same_subj_diff_vid': 'same_subject_diff_video',
    'diff_subj_same_vid': 'diff_subject_same_video',
    'diff_subj_diff_vid': 'diff_subject_diff_video',
}

# One row per approach, built from the same column map so the two rows
# cannot drift apart.
ml_results_df = pd.DataFrame([
    {
        'model': f"DDM - {result['approach']}",
        **{column: result[key] for column, key in RESULT_COLUMNS.items()},
    }
    for result in (results_concat, results_react)
])

OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
ml_results_df.to_csv(OUTPUT_CSV, index=False)
print(f"\n✅ Saved DDM results to: {OUTPUT_CSV.as_posix()}")

# A video confound is reported when either approach identifies the video
# significantly above chance. Using >= keeps p == ALPHA on the
# "not significant" side, the same as the per-approach report.
no_video_confound = all(
    result['p_value_acc_video'] >= ALPHA
    for result in (results_concat, results_react)
)

print("\n" + "="*60)
print("DDM ANALYSIS COMPLETE")
print("="*60)
print("\n📋 SUMMARY:")
print(f"  Concatenation: {results_concat['ddm_accuracy_subject']*100:.2f}% subject accuracy")
print(f"  Reactivity:    {results_react['ddm_accuracy_subject']*100:.2f}% subject accuracy")
print(f"  Difference:    {(results_concat['ddm_accuracy_subject'] - results_react['ddm_accuracy_subject'])*100:.2f} percentage points")
print(f"\n  Video confound: {'✅ NONE (both approaches)' if no_video_confound else '⚠️  DETECTED'}")



✅ Saved DDM results to: ../outputs/models/ml_comparison_DDM.csv

DDM ANALYSIS COMPLETE

📋 SUMMARY:
  Concatenation: 51.69% subject accuracy
  Reactivity:    20.53% subject accuracy
  Difference:    31.16 percentage points

  Video confound: ✅ NONE (both approaches)
