In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf


# Input feature table (baseline and stimulus features side by side) and the
# two result files written further down in this cell.
INPUT_FILE = "../data/processed/eeg_features_concat.csv"
OUTPUT_ML_FILE = "../outputs/models/ml_comparison_concat.csv"
OUTPUT_VAR_FILE = "../outputs/models/variance_components_concat.csv"


print(f"Loading data from {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE)
print(f"Data loaded. Shape: {df.shape}")

# Every later step uses subject_id as the label / grouping variable, so fail
# here with an actionable message instead of a bare KeyError further down.
if 'subject_id' not in df.columns:
    raise ValueError(f"'subject_id' column not found in {INPUT_FILE}")

# Quick preview of the leading columns.
print("Columns found:", df.columns[:5].tolist(), "...")

# The preview only shows five columns, which is not enough to confirm that
# both feature families are present, so count them explicitly.
n_base_cols = sum(str(col).startswith('Base_') for col in df.columns)
n_stim_cols = sum(str(col).startswith('Stim_') for col in df.columns)
print(f"Base_ feature columns: {n_base_cols} | Stim_ feature columns: {n_stim_cols}")
if n_base_cols == 0 or n_stim_cols == 0:
    print("WARNING: expected both Base_ and Stim_ feature columns in the input file.")


# Imported here so this section runs on its own; the pipeline is what keeps
# feature scaling inside each cross-validation fold.
from sklearn.pipeline import make_pipeline

print("\n" + "="*40)
print("   RUNNING SUBJECT IDENTIFICATION (RF)")
print("="*40)


# Every non-metadata column is used as a feature; the target is the subject.
metadata_cols = ['subject_id', 'video_id', 'valence', 'arousal']
X = df.drop(columns=metadata_cols, errors='ignore')
y = df['subject_id']


# Initialize Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Scale features inside the pipeline: the scaler is then fitted on the
# training part of each fold only, so no statistics of the held-out trials
# leak into preprocessing. cross_val_score works on clones, which leaves
# `scaler` and `rf` themselves unfitted after this cell.
scaler = StandardScaler()
model = make_pipeline(scaler, rf)

# Cross-Validation (5-Fold), stratified so each fold contains every subject
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

# Calculate Chance Level (uniform guessing over the observed subjects)
n_classes = len(np.unique(y))
chance_level = 1.0 / n_classes

print(f"Number of Subjects: {n_classes}")
print(f"Chance Level:       {chance_level*100:.2f}%")
print(f"Model Accuracy:     {scores.mean()*100:.2f}% (+/- {scores.std()*100:.2f}%)")


# Persist the headline numbers as a one-row table.
ml_results = pd.DataFrame({
    'model': ['Random Forest'],
    'accuracy': [scores.mean()],
    'chance_level': [chance_level]
})
ml_results.to_csv(OUTPUT_ML_FILE, index=False)
print(f"Saved ML results to {OUTPUT_ML_FILE}")


print("\n" + "="*40)
print("   WHY SUBTRACTION FAILED (ICC CHECK)")
print("="*40)
print("Checking Subject Stability (ICC) for Baseline vs Stimulus...")


# Baseline / stimulus pairs whose between-subject stability is compared.
features_to_check = [
    'Base_Alpha_AF3', 'Stim_Alpha_AF3',
    'Base_Beta_AF3',  'Stim_Beta_AF3',
    'Base_FAA',       'Stim_FAA'
]

# Fixed column layout so the summary table is well-formed even when no
# feature could be analysed.
icc_columns = ['Feature', 'ICC (Trait Stability)', 'Subject Variance', 'Noise Variance']

# Report skipped features instead of dropping them silently; a misspelled
# name would otherwise just disappear from the results.
missing_features = [feat for feat in features_to_check if feat not in df.columns]
if missing_features:
    print(f"Skipping features not present in the data: {missing_features}")

results = []

for feat in features_to_check:
    if feat not in df.columns:
        continue

    # Intercept-only mixed model with subject_id as the grouping variable.
    md = smf.mixedlm(f"{feat} ~ 1", df, groups=df["subject_id"])
    mdf = md.fit()

    # ICC = group-level variance / (group-level variance + residual variance)
    var_subj = mdf.cov_re.iloc[0, 0]
    var_resid = mdf.scale
    icc = var_subj / (var_subj + var_resid)

    results.append({
        'Feature': feat,
        'ICC (Trait Stability)': icc,
        'Subject Variance': var_subj,
        'Noise Variance': var_resid
    })

var_df = pd.DataFrame(results, columns=icc_columns)
print("\n", var_df[['Feature', 'ICC (Trait Stability)']])

if var_df.empty:
    # Do not overwrite an existing results file with an empty table.
    print(f"\nWARNING: no ICC values were computed; {OUTPUT_VAR_FILE} was not written.")
else:
    var_df.to_csv(OUTPUT_VAR_FILE, index=False)
    print(f"\nSaved variance analysis to {OUTPUT_VAR_FILE}")


Loading data from ../data/processed/eeg_features_concat.csv...
Data loaded. Shape: (414, 118)
Columns found: ['subject_id', 'video_id', 'valence', 'arousal', 'Base_RMS_AF3'] ...

   RUNNING SUBJECT IDENTIFICATION (RF)
Number of Subjects: 23
Chance Level:       4.35%
Model Accuracy:     95.89% (+/- 1.96%)
Saved ML results to ../outputs/models/ml_comparison_concat.csv

   WHY SUBTRACTION FAILED (ICC CHECK)
Checking Subject Stability (ICC) for Baseline vs Stimulus...

           Feature  ICC (Trait Stability)
0  Base_Alpha_AF3               0.019562
1  Stim_Alpha_AF3               0.312834
2   Base_Beta_AF3               0.409372
3   Stim_Beta_AF3               0.424405
4        Base_FAA               0.577797
5        Stim_FAA               0.696113

Saved variance analysis to ../outputs/models/variance_components_concat.csv

DONE. Ready for plotting.


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf


# Input feature table and result files for the reactivity analysis.
INPUT_FILE = "../data/processed/eeg_features_reactivity.csv"
OUTPUT_ML_FILE = "../outputs/models/ml_results_reactivity.csv"
# Written next to the other model outputs; this was previously a bare
# filename, which put the file in the current working directory.
# NOTE(review): update any consumer that still reads the old location.
OUTPUT_VAR_FILE = "../outputs/models/variance_components_reactivity.csv"

print(f"Loading data from {INPUT_FILE}...")
df = pd.read_csv(INPUT_FILE)


# Imported here so this section runs on its own; the pipeline is what keeps
# feature scaling inside each cross-validation fold.
from sklearn.pipeline import make_pipeline

print("Running Random Forest Classifier...")


# Every non-metadata column is used as a feature; the target is the subject.
metadata_cols = ['subject_id', 'video_id', 'valence', 'arousal']
X = df.drop(columns=metadata_cols, errors='ignore')
y = df['subject_id']

# Initialize Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Scale features inside the pipeline: the scaler is then fitted on the
# training part of each fold only, so no statistics of the held-out trials
# leak into preprocessing. cross_val_score works on clones, which leaves
# `scaler` and `rf` themselves unfitted after this cell.
scaler = StandardScaler()
model = make_pipeline(scaler, rf)

# Cross-Validation (5-fold, stratified by subject)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')

# Calculate Metrics (chance = uniform guessing over the observed subjects)
accuracy = scores.mean()
chance_level = 1.0 / len(np.unique(y))

print(f"Model Accuracy:     {accuracy*100:.2f}%")
print(f"Chance Level:       {chance_level*100:.2f}%")

# Save ML Results as a one-row table
pd.DataFrame({
    'model': ['Random Forest (Reactivity)'],
    'accuracy': [accuracy],
    'chance_level': [chance_level]
}).to_csv(OUTPUT_ML_FILE, index=False)

print("Calculating Intraclass Correlation Coefficients (ICC)...")

# Reactivity (delta) features whose between-subject stability is estimated.
features_to_check = ['ΔAlpha_AF3', 'ΔBeta_AF3', 'ΔFAA']

# Fixed column layout so the summary table is well-formed even when no
# feature could be analysed.
icc_columns = ['Feature', 'ICC', 'Subject Variance', 'Noise Variance']

# Report skipped features instead of dropping them silently; a misspelled
# name would otherwise just disappear from the results.
missing_features = [feat for feat in features_to_check if feat not in df.columns]
if missing_features:
    print(f"Skipping features not present in the data: {missing_features}")

icc_results = []

for feat in features_to_check:
    if feat not in df.columns:
        continue
    try:
        # Intercept-only mixed model with subject_id as the grouping
        # variable. Q('...') quotes the column name for the formula parser.
        md = smf.mixedlm(f"Q('{feat}') ~ 1", df, groups=df["subject_id"])
        mdf = md.fit()

        # ICC = group-level variance / (group-level variance + residual variance)
        var_subj = mdf.cov_re.iloc[0, 0]
        var_resid = mdf.scale
        icc = var_subj / (var_subj + var_resid)

        icc_results.append({
            'Feature': feat,
            'ICC': icc,
            'Subject Variance': var_subj,
            'Noise Variance': var_resid
        })
    except Exception as e:
        # Best effort: a failing fit for one feature is reported and the
        # remaining features are still processed.
        print(f"Could not calculate ICC for {feat}: {e}")


var_df = pd.DataFrame(icc_results, columns=icc_columns)
print(var_df[['Feature', 'ICC']])

if var_df.empty:
    # Do not overwrite an existing results file with an empty table.
    print(f"WARNING: no ICC values were computed; {OUTPUT_VAR_FILE} was not written.")
else:
    var_df.to_csv(OUTPUT_VAR_FILE, index=False)

print("Analysis complete.")

Loading data from ../data/processed/eeg_features_reactivity.csv...
Running Random Forest Classifier...
Model Accuracy:     47.34%
Chance Level:       4.35%
Calculating Intraclass Correlation Coefficients (ICC)...
      Feature       ICC
0  ΔAlpha_AF3  0.002165
1   ΔBeta_AF3  0.084773
2        ΔFAA  0.100570
Analysis complete.


