In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import pingouin as pg


# --- Configuration -------------------------------------------------------
# INPUT_FILE is the feature table read below; the two OUTPUT_* paths are the
# CSV summaries written later in this cell (classifier results, ICC table).
INPUT_FILE = "../data/processed/eeg_features_concat.csv"
OUTPUT_ML_FILE = "../outputs/models/ml_comparison_concat.csv"
OUTPUT_VAR_FILE = "../outputs/models/variance_components_concat.csv"


print(f"Loading data from {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE)
print(f"Data loaded. Shape: {df.shape}")

# Inspect columns to ensure we have Base_ and Stim_.
# Printing the first five columns is only a quick peek: the leading columns
# are mostly metadata, so the presence of both feature families is checked
# explicitly by counting the prefixed column names.
print("Columns found:", df.columns[:5].tolist(), "...")

n_base_cols = sum(str(col).startswith("Base_") for col in df.columns)
n_stim_cols = sum(str(col).startswith("Stim_") for col in df.columns)
print(f"Base_ columns: {n_base_cols} | Stim_ columns: {n_stim_cols}")

# Fail fast with a readable message instead of letting the later sections
# run on a table that lacks one of the two feature families.
if n_base_cols == 0 or n_stim_cols == 0:
    raise ValueError(
        f"{INPUT_FILE} must contain both Base_* and Stim_* feature columns "
        f"(found {n_base_cols} Base_ and {n_stim_cols} Stim_)."
    )


print("\n" + "="*40)
print("   RUNNING SUBJECT IDENTIFICATION (RF)")
print("="*40)

# Block-local imports so this section is runnable on its own; they can be
# hoisted into the notebook's import cell.
import os
from sklearn.pipeline import make_pipeline

# Every non-metadata column is used as a feature; the label to predict is the
# subject identifier. errors='ignore' tolerates metadata columns that are absent.
metadata_cols = ['subject_id', 'video_id', 'valence', 'arousal']
X = df.drop(columns=metadata_cols, errors='ignore')
y = df['subject_id']


# Scale features INSIDE the cross-validation loop.
# Fitting the scaler on the full table before splitting lets statistics of
# the held-out rows leak into preprocessing; wrapping scaler + classifier in
# a pipeline makes cross_val_score refit the scaler on each training fold.
# NOTE: `X_scaled` is therefore no longer created by this section.
scaler = StandardScaler()

# Initialize Random Forest (fixed seed for reproducible folds/trees)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model = make_pipeline(scaler, rf)

# Cross-Validation (5-Fold) - StratifiedKFold for subject identification
# Rows are split while preserving the per-subject proportions, so each subject
# is represented in both the training and the test part of every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

# Calculate Chance Level: uniform guessing over the distinct subject labels
n_classes = len(np.unique(y))
chance_level = 1.0 / n_classes

print(f"Number of Subjects: {n_classes}")
print(f"Chance Level:       {chance_level*100:.2f}%")
print(f"Model Accuracy:     {scores.mean()*100:.2f}% (+/- {scores.std()*100:.2f}%)")


# Persist the headline numbers, including the fold-to-fold spread that is
# printed above, so the saved file carries the same information as the log.
ml_results = pd.DataFrame({
    'model': ['Random Forest'],
    'accuracy': [scores.mean()],
    'accuracy_std': [scores.std()],
    'chance_level': [chance_level]
})

# to_csv does not create missing directories; make sure the target exists.
ml_output_dir = os.path.dirname(OUTPUT_ML_FILE)
if ml_output_dir:
    os.makedirs(ml_output_dir, exist_ok=True)
ml_results.to_csv(OUTPUT_ML_FILE, index=False)
print(f"Saved ML results to {OUTPUT_ML_FILE}")


print("\n" + "="*40)
print("   WHY SUBTRACTION FAILED (ICC CHECK)")
print("="*40)
print("Checking Subject Stability (ICC) for Baseline vs Stimulus...")

# Block-local import so this section is runnable on its own; it can be
# hoisted into the notebook's import cell.
import os

# Baseline/stimulus pairs of the same feature, so the two ICCs can be
# compared side by side in the resulting table.
features_to_check = [
    'Base_Alpha_AF3', 'Stim_Alpha_AF3',
    'Base_Alpha_F3', 'Stim_Alpha_F3',
    'Base_Alpha_F4', 'Stim_Alpha_F4',
    'Base_Beta_AF3',  'Stim_Beta_AF3',
    'Base_RMS_AF3', 'Stim_RMS_AF3',
    'Base_FAA',       'Stim_FAA'
]

# Fixed column layout: keeps the table well-formed (and the column selection
# below valid) even when no ICC could be computed.
result_columns = ['Feature', 'ICC (Trait Stability)', 'ICC_CI_Lower', 'ICC_CI_Upper']
results = []

# Report requested features that are absent instead of dropping them silently;
# a typo in the list above would otherwise go unnoticed.
missing_features = [feat for feat in features_to_check if feat not in df.columns]
if missing_features:
    print(f"Skipping features not present in the data: {missing_features}")

for feat in features_to_check:
    if feat not in df.columns:
        continue

    try:
        # pingouin ICC with subjects as targets and videos in the role of raters.
        # NOTE(review): pingouin is expected to raise when the subject x video
        # table has missing cells unless nan_policy='omit' is passed - confirm
        # against the installed version.
        icc_df = pg.intraclass_corr(
            data=df,
            targets='subject_id',
            raters='video_id',
            ratings=feat
        )

        # Get ICC2 (two-way random effects, absolute agreement)
        icc2_row = icc_df[icc_df['Type'] == 'ICC2']
        icc_value = icc2_row['ICC'].values[0]
        ci_lower = icc2_row['CI95%'].values[0][0]
        ci_upper = icc2_row['CI95%'].values[0][1]
    except Exception as e:
        # One problematic feature must not abort the remaining ones; report it
        # and move on.
        print(f"Could not calculate ICC for {feat}: {e}")
        continue

    results.append({
        'Feature': feat,
        'ICC (Trait Stability)': icc_value,
        'ICC_CI_Lower': ci_lower,
        'ICC_CI_Upper': ci_upper
    })

var_df = pd.DataFrame(results, columns=result_columns)
print("\n", var_df[result_columns])

# to_csv does not create missing directories; make sure the target exists.
var_output_dir = os.path.dirname(OUTPUT_VAR_FILE)
if var_output_dir:
    os.makedirs(var_output_dir, exist_ok=True)
var_df.to_csv(OUTPUT_VAR_FILE, index=False)
print(f"\nSaved variance analysis to {OUTPUT_VAR_FILE}")

Loading data from ../data/processed/eeg_features_concat.csv...
Data loaded. Shape: (414, 118)
Columns found: ['subject_id', 'video_id', 'valence', 'arousal', 'Base_RMS_AF3'] ...

   RUNNING SUBJECT IDENTIFICATION (RF)
Number of Subjects: 23
Chance Level:       4.35%
Model Accuracy:     0.00% (+/- 0.00%)
Saved ML results to ../outputs/models/ml_comparison_concat.csv

   WHY SUBTRACTION FAILED (ICC CHECK)
Checking Subject Stability (ICC) for Baseline vs Stimulus...

            Feature  ICC (Trait Stability)  ICC_CI_Lower  ICC_CI_Upper
0   Base_Alpha_AF3               0.019635         -0.01          0.09
1   Stim_Alpha_AF3               0.312425          0.20          0.49
2    Base_Alpha_F3               0.016432         -0.01          0.08
3    Stim_Alpha_F3               0.200939          0.11          0.36
4    Base_Alpha_F4               0.020921         -0.01          0.09
5    Stim_Alpha_F4               0.172193          0.09          0.32
6    Base_Beta_AF3               0.40908

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import pingouin as pg


# Locations used by this cell: the reactivity feature table that is read
# here, and the two CSV summaries written further down (classifier results
# and ICC table).
INPUT_FILE, OUTPUT_ML_FILE, OUTPUT_VAR_FILE = (
    "../data/processed/eeg_features_reactivity.csv",
    "../outputs/models/ml_results_reactivity.csv",
    "../outputs/models/variance_components_reactivity.csv",
)

# Announce the source path before reading so a failed load is easy to trace.
print(f"Loading data from {INPUT_FILE}...")
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)


print("Running Random Forest Classifier...")

# Block-local imports so this section is runnable on its own; they can be
# hoisted into the notebook's import cell.
import os
from sklearn.pipeline import make_pipeline

# Every non-metadata column is used as a feature; the label to predict is the
# subject identifier. errors='ignore' tolerates metadata columns that are absent.
metadata_cols = ['subject_id', 'video_id', 'valence', 'arousal']
X = df.drop(columns=metadata_cols, errors='ignore')
y = df['subject_id']

# Scale features INSIDE the cross-validation loop.
# Fitting the scaler on the full table before splitting lets statistics of
# the held-out rows leak into preprocessing; wrapping scaler + classifier in
# a pipeline makes cross_val_score refit the scaler on each training fold.
# NOTE: `X_scaled` is therefore no longer created by this section.
scaler = StandardScaler()

# Initialize Random Forest (fixed seed for reproducible folds/trees)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model = make_pipeline(scaler, rf)

# Cross-Validation (5-Fold) - StratifiedKFold for subject identification
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

# Calculate Metrics: mean/spread over folds, and the uniform-guessing baseline
accuracy = scores.mean()
accuracy_std = scores.std()
chance_level = 1.0 / len(np.unique(y))

print(f"Model Accuracy:     {accuracy*100:.2f}%")
print(f"Chance Level:       {chance_level*100:.2f}%")

# Save ML Results
# to_csv does not create missing directories; make sure the target exists.
ml_output_dir = os.path.dirname(OUTPUT_ML_FILE)
if ml_output_dir:
    os.makedirs(ml_output_dir, exist_ok=True)
pd.DataFrame({
    'model': ['Random Forest (Reactivity)'],
    'accuracy': [accuracy],
    'accuracy_std': [accuracy_std],
    'chance_level': [chance_level]
}).to_csv(OUTPUT_ML_FILE, index=False)

print("Calculating Intraclass Correlation Coefficients (ICC)...")

# Block-local import so this section is runnable on its own; it can be
# hoisted into the notebook's import cell.
import os

# Check reactivity features to show why they fail
features_to_check = [
    'ΔAlpha_AF3', 'ΔAlpha_F3', 'ΔAlpha_F4',
    'ΔBeta_AF3', 'ΔBeta_F3',
    'ΔRMS_AF3', 'ΔRMS_F3',
    'ΔFAA', 'ΔCorr_F3F4', 'ΔCorr_F7F8'
]

# Fixed column layout: keeps the table well-formed (and the column selection
# below valid) even when no ICC could be computed.
icc_columns = ['Feature', 'ICC', 'ICC_CI_Lower', 'ICC_CI_Upper']
icc_results = []

# Report requested features that are absent instead of dropping them silently;
# a typo or an encoding mismatch in the names above would otherwise go unnoticed.
missing_features = [feat for feat in features_to_check if feat not in df.columns]
if missing_features:
    print(f"Skipping features not present in the data: {missing_features}")

for feat in features_to_check:
    if feat in df.columns:
        try:
            # pingouin ICC with subjects as targets and videos in the role of raters
            icc_df = pg.intraclass_corr(
                data=df,
                targets='subject_id',
                raters='video_id',
                ratings=feat
            )

            # Get ICC2 (two-way random effects, absolute agreement)
            icc2_row = icc_df[icc_df['Type'] == 'ICC2']
            icc_value = icc2_row['ICC'].values[0]
            ci_lower = icc2_row['CI95%'].values[0][0]
            ci_upper = icc2_row['CI95%'].values[0][1]

            icc_results.append({
                'Feature': feat,
                'ICC': icc_value,
                'ICC_CI_Lower': ci_lower,
                'ICC_CI_Upper': ci_upper
            })
        except Exception as e:
            # Best effort: one problematic feature must not abort the rest.
            print(f"Could not calculate ICC for {feat}: {e}")


var_df = pd.DataFrame(icc_results, columns=icc_columns)
print(var_df[['Feature', 'ICC']])

# to_csv does not create missing directories; make sure the target exists.
var_output_dir = os.path.dirname(OUTPUT_VAR_FILE)
if var_output_dir:
    os.makedirs(var_output_dir, exist_ok=True)
var_df.to_csv(OUTPUT_VAR_FILE, index=False)

print("Analysis complete.")

Loading data from ../data/processed/eeg_features_reactivity.csv...
Running Random Forest Classifier...
Model Accuracy:     49.50%
Chance Level:       4.35%
Calculating Intraclass Correlation Coefficients (ICC)...
      Feature       ICC
0  ΔAlpha_AF3  0.002569
1   ΔAlpha_F3  0.001050
2   ΔAlpha_F4  0.009240
3   ΔBeta_AF3 -0.011487
4    ΔBeta_F3 -0.037253
5    ΔRMS_AF3  0.050876
6     ΔRMS_F3  0.031724
7        ΔFAA  0.069474
8  ΔCorr_F3F4  0.080634
9  ΔCorr_F7F8  0.152099
Analysis complete.
