In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pickle
from dataclasses import dataclass
from typing import Optional
warnings.filterwarnings('ignore')





final_cols = ['anger_intensity',
'arousal_change_magnitude',
'arousal_deviation_from_neutral',
'arousal_onset_detected',
'arousal_stability',
'attention_focus_distracted',
'attention_focus_focused',
'attention_focus_moderate',
'attention_stability_index',
'avg_blink_duration_sec',
'avg_fixation_duration_sec',
'avg_saccade_amplitude',
'behavioral_complexity',
'behavioral_state_normal',
'blink_completeness_score',
'blink_rate_per_minute',
'blink_rhythm_score',
'cognitive_load_index',
'disengagement_indicator',
'disgust_intensity',
'emotion_quadrant_negative_high_arousal',
'emotion_quadrant_negative_low_arousal',
'emotion_quadrant_neutral',
'emotion_quadrant_positive_high_arousal',
'emotion_transition_frequency',
'engagement_proxy_score',
'engagement_score',
'engagement_state_low',
'expression_arousal_sync',
'expression_change_rate',
'eye_openness_score',
'eyebrow_furrow_intensity',
'eyebrow_raise_intensity',
'facial_asymmetry',
'fixation_count_per_window',
'frown_intensity',
'gaze_consistency_score',
'gaze_direction_center',
'gaze_direction_down',
'gaze_direction_down_left',
'gaze_direction_down_right',
'gaze_direction_left',
'gaze_direction_right',
'gaze_direction_up',
'gaze_direction_up_left',
'gaze_direction_up_right',
'gaze_head_coordination',
'head_Tx_velocity',
'head_Ty_velocity',
'head_Tz_velocity',
'head_gaze_alignment_score',
'head_movement_jerk',
'head_movement_stability',
'head_pitch',
'head_pitch_acceleration',
'head_pitch_velocity',
'head_roll',
'head_roll_acceleration',
'head_roll_velocity',
'head_tilt_direction_center',
'head_tilt_direction_left',
'head_yaw',
'head_yaw_acceleration',
'head_yaw_velocity',
'left_eye_aperture',
'micro_expression_frequency_per_min',
'mouth_openness',
'multimodal_consistency',
'nostril_flare_intensity',
'pupil_size_mean',
'pupil_size_std',
'right_eye_aperture',
'saccade_frequency_per_sec',
'sadness_intensity',
'smile_intensity',
'surprise_intensity',
'temporal_alignment_score',
'valence_deviation_from_neutral',
'valence_stability']


def apply_one_hot_encoding(df: pd.DataFrame, columns_to_encode: list) -> pd.DataFrame:
    """
    Applies one-hot encoding to specified categorical columns in a DataFrame.

    Args:
        df: The input pandas DataFrame containing the features.
        columns_to_encode: A list of column names to be one-hot encoded.

    Returns:
        A new DataFrame with the specified columns one-hot encoded.
        The original categorical columns will be dropped.
    """
    
    # Filter for columns that actually exist in the DataFrame
    existing_columns_to_encode = [col for col in columns_to_encode if col in df.columns]
    
    if not existing_columns_to_encode:
        print("No specified categorical columns found in the DataFrame to encode.")
        return df # Return original df if no columns exist
        

    # Use pd.get_dummies for one-hot encoding
    # prefix ensures unique column names (e.g., 'head_tilt_direction_left')
    # dtype=int ensures the new columns are integers (0 or 1)
    df_encoded = pd.get_dummies(
        df,
        columns=existing_columns_to_encode,
        prefix=existing_columns_to_encode,
        dtype=int
    )
    
    return df_encoded

In [2]:
class Config:
    # Data Configuration - 30 second windows for production
    SEQUENCE_LENGTH = 750  # 30 seconds at 25 FPS
    FEATURE_DIM = 456  # Your total features
    FRAME_RATE = 25
    
    # Model Architecture - Optimized for 2x16GB
    D_MODEL = 768
    N_HEADS = 12
    N_ENCODER_LAYERS = 8
    DROPOUT = 0.1

    # SSL Configuration
    SSL_TASKS = ['temporal_prediction', 'behavioral_consistency', 'attention_flow']
    PREDICTION_HORIZON = 50  # Predict next 2 seconds
    CONSISTENCY_WINDOW = 125  # 5 seconds for consistency check
    
    # Training - Full utilization of 2x16GB setup
    BATCH_SIZE = 16  # Maximize GPU utilization
    SSL_EPOCHS = 250  # Testing epochs
    SSL_LR = 2e-4
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    


class BehavioralDataset(Dataset):
    """Dataset for temporal behavioral data with SSL objectives"""
    
    def __init__(self, data_folder, sequence_length=750, is_train=True, max_files=None):
        self.data_folder = Path(data_folder)
        self.sequence_length = sequence_length
        self.is_train = is_train
        
        # Collect all CSV files
        self.csv_files = list(self.data_folder.glob("*.csv"))
        
        # Limit files for testing if specified
        if max_files:
            self.csv_files = self.csv_files[:max_files]
        
        print(f"Found {len(self.csv_files)} videos in {data_folder}")
        
        # Get feature dimension from first file
        if len(self.csv_files) > 0:
            sample_df = pd.read_csv(self.csv_files[0])
            self.feature_columns = final_cols
            self.actual_feature_dim = len(final_cols)
        else:
            raise ValueError("No CSV files found in the directory")
    

    
    def __len__(self):
        return len(self.csv_files)
    
    def __getitem__(self, idx):
        try:
            csv_path = self.csv_files[idx]
            df = pd.read_csv(csv_path)
            categorical_features_for_encoding = [
                    "head_tilt_direction",
                    "emotion_quadrant",
                    "engagement_state",
                    "attention_focus",
                    "behavioral_state"
                ]

            df = apply_one_hot_encoding(df, categorical_features_for_encoding)
            
            if len(df) == 0:
                print(f"Warning: Empty dataframe in {csv_path}")
                # Return dummy data
                features = np.zeros((self.sequence_length, self.actual_feature_dim), dtype=np.float32)
            else:
                # Use the pre-determined feature columns for consistency
                for col in self.feature_columns:
                    if col not in df.columns:
                        df[col] = 0

                feature_data = df[self.feature_columns].copy()
                
                # Convert all columns to numeric, forcing errors to NaN
                for col in feature_data.columns:
                    feature_data[col] = pd.to_numeric(feature_data[col], errors='coerce')

                feature_data = feature_data.fillna(feature_data.mean()).fillna(0)
                
                # Handle variable length videos
                if len(feature_data) < self.sequence_length:
                    # Pad shorter videos by repeating last frame
                    padding_needed = self.sequence_length - len(feature_data)
                    if len(feature_data) > 0:
                        last_values = feature_data.iloc[-1:].values
                        padding = np.repeat(last_values, padding_needed, axis=0)
                        features = np.vstack([feature_data.values, padding])
                    else:
                        features = np.zeros((self.sequence_length, len(feature_data.columns)), dtype=np.float32)
                        
                elif len(feature_data) > self.sequence_length:
                    # For longer videos, sample random window during training
                    if self.is_train:
                        start_idx = np.random.randint(0, len(feature_data) - self.sequence_length + 1)
                        features = feature_data.iloc[start_idx:start_idx + self.sequence_length].values
                    else:
                        # Use first window for validation
                        features = feature_data.iloc[:self.sequence_length].values
                else:
                    features = feature_data.values
                
                # Ensure we have the right shape and type
                features = features.astype(np.float32)
                
                # Handle any remaining NaN or inf values
                features = np.nan_to_num(features, nan=0.0, posinf=1.0, neginf=-1.0)
                
                # Normalize features to prevent extreme values
                features = np.clip(features, -10, 10)
            
            # Create SSL targets
            ssl_targets = self._create_ssl_targets(features)
            
            return {
                'features': torch.tensor(features),
                'ssl_targets': ssl_targets,
                'video_name': csv_path.stem
            }
            
        except Exception as e:
            print(f"Error processing {csv_path}: {str(e)}")
            dummy_features = np.zeros((self.sequence_length, self.actual_feature_dim), dtype=np.float32)
            ssl_targets = self._create_ssl_targets(dummy_features)
            return {
                'features': torch.tensor(dummy_features),
                'ssl_targets': ssl_targets,
                'video_name': f'error_{idx}'
            }
    
    def _create_ssl_targets(self, features):
        """Create multiple SSL objectives with fixed tensor dimensions"""
        targets = {}
        seq_len, feat_dim = features.shape
        
        # 1. Temporal Prediction: Fixed to always produce same sequence length
        horizon = Config.PREDICTION_HORIZON
        
        if seq_len > horizon:
            # Take input context from beginning, predict future frames
            context_frames = features[:-horizon]  # Remove last 'horizon' frames
            future_frames = features[horizon:]    # Remove first 'horizon' frames
            
            # Ensure both have same length (should be seq_len - horizon)
            min_len = min(len(context_frames), len(future_frames))
            targets['temporal_context'] = torch.tensor(context_frames[:min_len], dtype=torch.float32)
            targets['temporal_future'] = torch.tensor(future_frames[:min_len], dtype=torch.float32)
        else:
            # For short sequences, predict next frame
            targets['temporal_context'] = torch.tensor(features[:-1] if seq_len > 1 else features, dtype=torch.float32)
            targets['temporal_future'] = torch.tensor(features[1:] if seq_len > 1 else features, dtype=torch.float32)
        
        # 2. Behavioral Consistency: Use different feature subsets
        # Split features into different behavioral modalities
        third = feat_dim // 3
        attention_features = features[:, :third] if third > 0 else features
        engagement_features = features[:, third:2*third] if third > 0 else features
        emotion_features = features[:, 2*third:] if third > 0 else features
        
        targets['attention_trajectory'] = torch.tensor(attention_features, dtype=torch.float32)
        targets['engagement_trajectory'] = torch.tensor(engagement_features, dtype=torch.float32)
        targets['emotion_trajectory'] = torch.tensor(emotion_features, dtype=torch.float32)
        
        # 3. Cross-modal alignment
        targets['cross_modal_pairs'] = (
            torch.tensor(attention_features, dtype=torch.float32),
            torch.tensor(engagement_features, dtype=torch.float32)
        )
        
        return targets

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):  # Increased for longer sequences
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                           (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

class BehavioralTransformer(nn.Module):
    """Transformer designed for behavioral temporal sequences - Full scale"""
    
    def __init__(self, config, actual_feature_dim):
        super().__init__()
        self.config = config
        self.actual_feature_dim = actual_feature_dim
        
        # Input projection with layer norm
        self.input_projection = nn.Sequential(
            nn.Linear(actual_feature_dim, config.D_MODEL),
            nn.LayerNorm(config.D_MODEL),
            nn.Dropout(config.DROPOUT)
        )
        
        self.pos_encoding = PositionalEncoding(config.D_MODEL, max_len=config.SEQUENCE_LENGTH)
        
        # Transformer encoder - Full scale
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.D_MODEL,
            nhead=config.N_HEADS,
            dim_feedforward=config.D_MODEL * 4,
            dropout=config.DROPOUT,
            batch_first=True,
            activation='gelu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, config.N_ENCODER_LAYERS)
        
        # SSL heads with better architectures
        self.temporal_predictor = nn.Sequential(
            nn.Linear(config.D_MODEL, config.D_MODEL),
            nn.GELU(),
            nn.LayerNorm(config.D_MODEL),
            nn.Dropout(config.DROPOUT),
            nn.Linear(config.D_MODEL, config.D_MODEL // 2),
            nn.GELU(),
            nn.Linear(config.D_MODEL // 2, actual_feature_dim)
        )
        
        self.consistency_projector = nn.Sequential(
            nn.Linear(config.D_MODEL, config.D_MODEL // 2),
            nn.GELU(),
            nn.LayerNorm(config.D_MODEL // 2),
            nn.Dropout(config.DROPOUT),
            nn.Linear(config.D_MODEL // 2, 256)  # Consistency embedding
        )
        
        self.attention_flow_predictor = nn.Sequential(
            nn.Linear(config.D_MODEL, config.D_MODEL // 2),
            nn.GELU(),
            nn.LayerNorm(config.D_MODEL // 2),
            nn.Dropout(config.DROPOUT),
            nn.Linear(config.D_MODEL // 2, max(actual_feature_dim // 3, 64))  # Attention flow
        )
        
        # Cross-modal predictors
        self.cross_modal_predictor = nn.Sequential(
            nn.Linear(config.D_MODEL, config.D_MODEL // 2),
            nn.GELU(),
            nn.Dropout(config.DROPOUT),
            nn.Linear(config.D_MODEL // 2, max(actual_feature_dim // 3, 64))
        )
    
    def forward(self, x, return_embeddings=False):
        # x shape: (batch, sequence, features)
        batch_size, seq_len, _ = x.shape
        
        # Project to model dimension
        x = self.input_projection(x)  # (batch, seq, d_model)
        
        # Add positional encoding
        x = x.transpose(0, 1)  # (seq, batch, d_model)
        x = self.pos_encoding(x)
        x = x.transpose(0, 1)  # (batch, seq, d_model)
        
        # Transformer encoding
        embeddings = self.transformer(x)  # (batch, seq, d_model)
        
        if return_embeddings:
            return embeddings
        
        # SSL predictions
        ssl_outputs = {}
        
        # Temporal prediction - predict for reduced sequence length
        prediction_length = seq_len - Config.PREDICTION_HORIZON
        if prediction_length > 0:
            temporal_embeddings = embeddings[:, :prediction_length]  # Match context length
            ssl_outputs['temporal'] = self.temporal_predictor(temporal_embeddings)
        else:
            # Fallback for short sequences
            ssl_outputs['temporal'] = self.temporal_predictor(embeddings[:, :-1] if seq_len > 1 else embeddings)
        
        # Behavioral consistency (use mean pooling)
        ssl_outputs['consistency'] = self.consistency_projector(embeddings.mean(dim=1))
        
        # Attention flow
        ssl_outputs['attention_flow'] = self.attention_flow_predictor(embeddings)
        
        # Cross-modal prediction
        ssl_outputs['cross_modal'] = self.cross_modal_predictor(embeddings)
        
        return ssl_outputs

class BehavioralSSLLoss(nn.Module):
    """Multi-objective SSL loss for behavioral data with fixed tensor handling"""
    
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.mse_loss = nn.MSELoss()
        self.cosine_loss = nn.CosineEmbeddingLoss()
        self.huber_loss = nn.HuberLoss(delta=1.0)
        
    def forward(self, predictions, targets):
        total_loss = 0
        loss_components = {}
        batch_size = None
        device = None
        
        # Get batch size and device from any available tensor
        for key, value in predictions.items():
            if isinstance(value, torch.Tensor):
                batch_size = value.size(0)
                device = value.device
                break
        
        if batch_size is None or device is None:
            # Create a default tensor on CPU if no predictions available
            device = torch.device('cpu')
            return torch.tensor(0.0, device=device), {}
        
        # 1. Temporal Prediction Loss (main objective)
        if 'temporal_future' in targets and 'temporal' in predictions:
            future_frames = targets['temporal_future']
            pred_frames = predictions['temporal']
            
            # Ensure both tensors are on the same device
            if future_frames.device != device:
                future_frames = future_frames.to(device)
            
            # Ensure batch dimension matches
            if future_frames.size(0) == batch_size and pred_frames.size(0) == batch_size:
                # Match sequence lengths
                min_seq_len = min(future_frames.size(1), pred_frames.size(1))
                min_feat_dim = min(future_frames.size(2), pred_frames.size(2))
                
                if min_seq_len > 0 and min_feat_dim > 0:
                    temporal_loss = self.huber_loss(
                        pred_frames[:, :min_seq_len, :min_feat_dim], 
                        future_frames[:, :min_seq_len, :min_feat_dim]
                    )
                    loss_components['temporal'] = temporal_loss
                    total_loss += 2.0 * temporal_loss  # Higher weight for main task
        
        # 2. Behavioral Consistency Loss
        if 'attention_trajectory' in targets and 'consistency' in predictions:
            try:
                consistency_loss = self._compute_consistency_loss(
                    predictions['consistency'], 
                    targets['attention_trajectory'],
                    device
                )
                loss_components['consistency'] = consistency_loss
                total_loss += 0.5 * consistency_loss
            except Exception as e:
                print(f"Consistency loss error: {e}")
        
        # 3. Attention Flow Smoothness Loss
        if 'attention_flow' in predictions:
            try:
                flow_loss = self._compute_flow_smoothness_loss(
                    predictions['attention_flow']
                )
                loss_components['flow'] = flow_loss
                total_loss += 0.3 * flow_loss
            except Exception as e:
                print(f"Flow loss error: {e}")
        
        # 4. Cross-modal alignment loss
        if 'cross_modal_pairs' in targets and 'cross_modal' in predictions:
            try:
                cross_modal_loss = self._compute_cross_modal_loss(
                    predictions['cross_modal'],
                    targets['cross_modal_pairs'],
                    device
                )
                loss_components['cross_modal'] = cross_modal_loss
                total_loss += 0.4 * cross_modal_loss
            except Exception as e:
                print(f"Cross-modal loss error: {e}")
        
        # Ensure we have at least some loss
        if total_loss == 0:
            total_loss = torch.tensor(0.001, device=device, requires_grad=True)
        
        return total_loss, loss_components
    
    def _compute_consistency_loss(self, embeddings, attention_trajectories, device):
        """Encourage similar attention patterns to have similar embeddings"""
        batch_size = embeddings.size(0)
        
        if batch_size < 2:
            return torch.tensor(0.0, device=device)
        
        # Ensure attention_trajectories is on the correct device
        if attention_trajectories.device != device:
            attention_trajectories = attention_trajectories.to(device)
        
        # Compute attention similarity matrix
        attention_flat = attention_trajectories.view(batch_size, -1)
        attention_sim = F.cosine_similarity(
            attention_flat.unsqueeze(1), 
            attention_flat.unsqueeze(0), 
            dim=2
        )
        
        # Compute embedding similarity matrix
        embedding_sim = F.cosine_similarity(
            embeddings.unsqueeze(1), 
            embeddings.unsqueeze(0), 
            dim=2
        )
        
        # Loss: embedding similarity should match attention similarity
        consistency_loss = self.mse_loss(embedding_sim, attention_sim)
        return consistency_loss
    
    def _compute_flow_smoothness_loss(self, attention_flow):
        """Encourage smooth attention transitions"""
        if attention_flow.size(1) <= 1:
            return torch.tensor(0.0, device=attention_flow.device)
            
        # Compute temporal differences
        flow_diff = attention_flow[:, 1:] - attention_flow[:, :-1]
        
        # Penalize large jumps in attention flow
        smoothness_loss = torch.mean(torch.abs(flow_diff))
        return smoothness_loss
    
    def _compute_cross_modal_loss(self, predictions, target_pairs, device):
        """Cross-modal alignment loss"""
        attention_targets, engagement_targets = target_pairs
        
        # Ensure all tensors are on the correct device
        if attention_targets.device != device:
            attention_targets = attention_targets.to(device)
        if engagement_targets.device != device:
            engagement_targets = engagement_targets.to(device)
        
        # Predict engagement from attention features
        pred_engagement = predictions
        target_engagement = engagement_targets.mean(dim=1)  # Pool over sequence: [batch, features]
        
        # Pool predictions over sequence to match target dimensions
        pred_engagement_pooled = pred_engagement.mean(dim=1)  # [batch, seq, features] -> [batch, features]
        
        # Match dimensions
        min_dim = min(pred_engagement_pooled.size(-1), target_engagement.size(-1))
        
        if min_dim > 0:
            cross_modal_loss = self.mse_loss(
                pred_engagement_pooled[:, :min_dim], 
                target_engagement[:, :min_dim]
            )
            return cross_modal_loss
        else:
            # Return zero loss tensor on the correct device
            return torch.tensor(0.0, device=device, requires_grad=True)

class BehavioralSSLTrainer:
    """SSL Trainer for behavioral data - Full GPU utilization"""
    
    def __init__(self, config, actual_feature_dim):
        self.config = config
        self.device = config.DEVICE
        
        # Model
        self.model = BehavioralTransformer(config, actual_feature_dim).to(self.device)
        
        # Data parallel if multiple GPUs - maximize utilization
        if torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
            self.model = nn.DataParallel(self.model)
        
        # Loss
        self.criterion = BehavioralSSLLoss(config)
        
        # Optimizer - higher learning rate for larger model
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(), 
            lr=config.SSL_LR,
            weight_decay=1e-4,
            betas=(0.9, 0.95),  # Better for transformers
            eps=1e-8
        )
        
        # Scheduler with warmup
        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
            self.optimizer,
            max_lr=config.SSL_LR,
            epochs=config.SSL_EPOCHS,
            steps_per_epoch=100,  # Approximate
            pct_start=0.1,
            anneal_strategy='cos'
        )
        
        # Gradient scaler for mixed precision
        self.scaler = torch.cuda.amp.GradScaler() if self.device == 'cuda' else None
    
    def train_epoch(self, dataloader):
        self.model.train()
        total_loss = 0
        loss_components_sum = {}
        num_batches = 0
        
        pbar = tqdm(dataloader, desc='Training')
        for batch_idx, batch in enumerate(pbar):
            try:
                features = batch['features'].to(self.device, non_blocking=True)
                ssl_targets = batch['ssl_targets']
                
                # Move targets to device - Fixed to handle all tensor types properly
                for key, value in ssl_targets.items():
                    if isinstance(value, torch.Tensor):
                        ssl_targets[key] = value.to(self.device, non_blocking=True)
                    elif isinstance(value, tuple):
                        ssl_targets[key] = tuple(
                            v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v 
                            for v in value
                        )
                
                # Forward pass with mixed precision
                if self.scaler is not None:
                    with torch.cuda.amp.autocast():
                        predictions = self.model(features)
                        loss, loss_components = self.criterion(predictions, ssl_targets)
                    
                    # Backward pass
                    self.optimizer.zero_grad()
                    self.scaler.scale(loss).backward()
                    self.scaler.unscale_(self.optimizer)
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    self.scaler.step(self.optimizer)
                    self.scaler.update()
                else:
                    predictions = self.model(features)
                    loss, loss_components = self.criterion(predictions, ssl_targets)
                    
                    # Backward pass
                    self.optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    self.optimizer.step()
                
                self.scheduler.step()
                
                total_loss += loss.item()
                num_batches += 1
                
                # Accumulate loss components
                for key, value in loss_components.items():
                    if key not in loss_components_sum:
                        loss_components_sum[key] = 0
                    loss_components_sum[key] += value.item()
                
                pbar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'lr': f'{self.optimizer.param_groups[0]["lr"]:.2e}',
                    'gpu_mem': f'{torch.cuda.memory_allocated()/1e9:.1f}GB' if torch.cuda.is_available() else 'N/A'
                })
                
            except Exception as e:
                print(f"Error in batch {batch_idx}: {str(e)}")
                continue
        
        return total_loss / max(num_batches, 1), loss_components_sum
    
    def validate(self, dataloader):
        self.model.eval()
        total_loss = 0
        num_batches = 0
        
        with torch.no_grad():
            pbar = tqdm(dataloader, desc='Validation')
            for batch in pbar:
                try:
                    features = batch['features'].to(self.device, non_blocking=True)
                    ssl_targets = batch['ssl_targets']
                    
                    # Move targets to device - Fixed to handle all tensor types properly
                    for key, value in ssl_targets.items():
                        if isinstance(value, torch.Tensor):
                            ssl_targets[key] = value.to(self.device, non_blocking=True)
                        elif isinstance(value, tuple):
                            ssl_targets[key] = tuple(
                                v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v 
                                for v in value
                            )
                    
                    if self.scaler is not None:
                        with torch.cuda.amp.autocast():
                            predictions = self.model(features)
                            loss, _ = self.criterion(predictions, ssl_targets)
                    else:
                        predictions = self.model(features)
                        loss, _ = self.criterion(predictions, ssl_targets)
                    
                    total_loss += loss.item()
                    num_batches += 1
                    
                    pbar.set_postfix({
                        'val_loss': f'{loss.item():.4f}',
                        'gpu_mem': f'{torch.cuda.memory_allocated()/1e9:.1f}GB' if torch.cuda.is_available() else 'N/A'
                    })
                    
                except Exception as e:
                    print(f"Error in validation batch: {str(e)}")
                    continue
        
        return total_loss / max(num_batches, 1)
    
    def save_model(self, path):
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        torch.save({
            'model_state_dict': model_to_save.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'config': self.config,
            'actual_feature_dim': model_to_save.actual_feature_dim
        }, path)
    
    def load_model(self, path):
        checkpoint = torch.load(path, map_location=self.device)
        if hasattr(self.model, 'module'):
            self.model.module.load_state_dict(checkpoint['model_state_dict'])
        else:
            self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if 'scheduler_state_dict' in checkpoint:
            self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])



def test_dataset_loading(data_path, max_files=5):
    """Test dataset loading with detailed diagnostics"""
    print(f"Testing dataset loading from: {data_path}")
    
    if not os.path.exists(data_path):
        print(f"Path does not exist: {data_path}")
        return False
    
    try:
        test_dataset = BehavioralDataset(data_path, sequence_length=750, is_train=True, max_files=max_files)
        
        if len(test_dataset) == 0:
            print("No data found in dataset")
            return False
        
        # Test loading multiple samples
        print(f"\nTesting {min(3, len(test_dataset))} samples:")
        for i in range(min(3, len(test_dataset))):
            sample = test_dataset[i]
            print(f"Sample {i}:")
            print(f"  Features shape: {sample['features'].shape}")
            print(f"  Video name: {sample['video_name']}")
            print(f"  SSL targets keys: {list(sample['ssl_targets'].keys())}")
            
            # Check temporal target shapes
            if 'temporal_future' in sample['ssl_targets']:
                print(f"  Temporal future shape: {sample['ssl_targets']['temporal_future'].shape}")
            if 'temporal_context' in sample['ssl_targets']:
                print(f"  Temporal context shape: {sample['ssl_targets']['temporal_context'].shape}")
            
            # Check for actual data (not all zeros)
            if torch.sum(torch.abs(sample['features'])) > 0:
                print(f"  ✓ Contains non-zero data")
            else:
                print(f"  ⚠ Warning: All zeros detected")
        
        return True
        
    except Exception as e:
        print(f"Error testing dataset: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

In [3]:
import torch
import math
def load_trainer_from_checkpoint(path):
    """Load BehavioralSSLTrainer from saved .pth checkpoint"""
    print(f"Loading model from: {path}")
    
    # Load checkpoint
    checkpoint = torch.load(path, map_location=Config.DEVICE, weights_only = False)
    
    # Extract feature dim
    actual_feature_dim = checkpoint['actual_feature_dim']
    
    # Instantiate trainer (this builds model, optimizer, scheduler)
    trainer = BehavioralSSLTrainer(Config, actual_feature_dim)
    
    # Load model state dict
    if hasattr(trainer.model, 'module'):
        trainer.model.module.load_state_dict(checkpoint['model_state_dict'])
    else:
        trainer.model.load_state_dict(checkpoint['model_state_dict'])
    
    # Load optimizer and scheduler
    trainer.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if 'scheduler_state_dict' in checkpoint:
        trainer.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    
    print("Model loaded successfully")
    return trainer

model_path = "/kaggle/input/ssl-base-77-features-pre-attention-finetuning/pytorch/default/1/best_behavioral_ssl_model_imni_86_feat.pth"
trainer = load_trainer_from_checkpoint(model_path)
pretrained_ssl_model = trainer.model


Loading model from: /kaggle/input/ssl-base-77-features-pre-attention-finetuning/pytorch/default/1/best_behavioral_ssl_model_imni_86_feat.pth
Using 2 GPUs with DataParallel
Model loaded successfully


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import os
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

# Use the existing Config and model definitions from the base SSL code
class MacroAttentionConfig(Config):
    """Extended config for macro-attention training"""
    
    # Macro-attention specific settings
    MACRO_ATTENTION_WINDOW = 30  # seconds - psychologically justified
    MACRO_SEQUENCE_LENGTH = 750  # 30 seconds at 25 FPS
    
    # Attention head architecture
    ATTENTION_HEAD_HIDDEN = 512
    ATTENTION_HEAD_DROPOUT = 0.2
    
    # Training for attention head
    ATTENTION_EPOCHS = 250
    ATTENTION_LR = 1e-4
    ATTENTION_BATCH_SIZE = 32
    
    # Paths
    TRAIN_PATH = '/kaggle/input/daisee-feature-processed/dataset_final/Train-final'
    TEST_PATH = '/kaggle/input/daisee-feature-processed/dataset_final/Test-final'
    LABELS_PATH = '/kaggle/input/daisee-feature-processed/all_labels.csv'  # DAiSEE labels file
    WEIGHTS_PATH = '/kaggle/input/ssl-base-77-features-pre-attention-finetuning/pytorch/default/1/best_behavioral_ssl_model_imni_86_feat.pth' # Pre-trained SSL weights

class MacroAttentionDataset(Dataset):
    """Dataset for macro-attention training using 30-second windows"""
    
    def __init__(self, data_folder, labels_df, sequence_length=750, is_train=True):
        self.data_folder = Path(data_folder)
        self.labels_df = labels_df
        self.sequence_length = sequence_length
        self.is_train = is_train
        
        # Load all CSV files
        self.csv_files = list(self.data_folder.glob("*.csv"))

     
        
        # Create mapping from CSV filename to DAiSEE labels
        self.label_mapping = self._create_label_mapping()
        
        # Filter files that have corresponding labels
        self.valid_files = [f for f in self.csv_files if self._get_clip_id(f) in self.label_mapping]
        
        print(f"Found {len(self.csv_files)} CSV files, {len(self.valid_files)} with labels")
        
        # Get feature columns from first valid file
        if len(self.valid_files) > 0:
            sample_df = pd.read_csv(self.valid_files[0])
            # Use the same final_cols as in the base model
            self.feature_columns = final_cols
            self.actual_feature_dim = len(final_cols)
        else:
            raise ValueError("No valid CSV files found with corresponding labels")
    
    def _create_label_mapping(self):
        """Create mapping from ClipID to DAiSEE labels"""
        mapping = {}
        for _, row in self.labels_df.iterrows():
            clip_id = row['ClipID']
            # Convert .avi to .csv for matching
            csv_name = str(clip_id+".csv")
            mapping[csv_name] = {
                'E': row['Engagement'],
                'B': row['Boredom'], 
                'C': row['Confusion'],
                'F': row['Frustration ']
            }
        return mapping
    
    def _get_clip_id(self, csv_path):
        """Extract clip ID from CSV filename"""
        return csv_path.name
    
    def _calculate_attention_proxy_score(self, e, b, c, f):
        """
        Calculate attention proxy score using psychological formula
        Attention_Proxy_Score = (E - 0.29 * B - 0.17 * C - 0.54 * F) / 3
        """
        return (e - 0.29 * b - 0.17 * c - 0.54 * f) / 3
    
    def __len__(self):
        return len(self.valid_files)
    
    def __getitem__(self, idx):
        try:
            csv_path = self.valid_files[idx]
            df = pd.read_csv(csv_path)
            
            # Get corresponding DAiSEE labels
            clip_id = self._get_clip_id(csv_path)
            labels = self.label_mapping[clip_id]
            
            # Calculate attention proxy score
            attention_score = self._calculate_attention_proxy_score(
                labels['E'], labels['B'], labels['C'], labels['F']
            )
            
            # Apply one-hot encoding for categorical features
            categorical_features_for_encoding = [
                "head_tilt_direction",
                "emotion_quadrant", 
                "engagement_state",
                "attention_focus",
                "behavioral_state"
            ]
            
            df = apply_one_hot_encoding(df, categorical_features_for_encoding)
            
            if len(df) == 0:
                print(f"Warning: Empty dataframe in {csv_path}")
                features = np.zeros((self.sequence_length, self.actual_feature_dim), dtype=np.float32)
            else:
                # Ensure all required columns exist
                for col in self.feature_columns:
                    if col not in df.columns:
                        df[col] = 0
                
                feature_data = df[self.feature_columns].copy()
                
                # Convert to numeric
                for col in feature_data.columns:
                    feature_data[col] = pd.to_numeric(feature_data[col], errors='coerce')
                
                feature_data = feature_data.fillna(feature_data.mean()).fillna(0)
                
                # Handle variable length videos - 30-second macro-attention window
                if len(feature_data) < self.sequence_length:
                    # Pad shorter videos (< 30 seconds)
                    padding_needed = self.sequence_length - len(feature_data)
                    if len(feature_data) > 0:
                        last_values = feature_data.iloc[-1:].values
                        padding = np.repeat(last_values, padding_needed, axis=0)
                        features = np.vstack([feature_data.values, padding])
                    else:
                        features = np.zeros((self.sequence_length, len(feature_data.columns)), dtype=np.float32)
                        
                elif len(feature_data) > self.sequence_length:
                    # Clip longer videos (> 30 seconds) - take first 30 seconds
                    features = feature_data.iloc[:self.sequence_length].values
                else:
                    features = feature_data.values
                
                # Data preprocessing
                features = features.astype(np.float32)
                features = np.nan_to_num(features, nan=0.0, posinf=1.0, neginf=-1.0)
                features = np.clip(features, -10, 10)
            
            return {
                'features': torch.tensor(features),
                'attention_score': torch.tensor(attention_score, dtype=torch.float32),
                'video_name': csv_path.stem,
                'daisee_labels': labels
            }
            
        except Exception as e:
            print(f"Error processing {csv_path}: {str(e)}")
            dummy_features = np.zeros((self.sequence_length, self.actual_feature_dim), dtype=np.float32)
            return {
                'features': torch.tensor(dummy_features),
                'attention_score': torch.tensor(0.0, dtype=torch.float32),
                'video_name': f'error_{idx}',
                'daisee_labels': {'E': 0, 'B': 0, 'C': 0, 'F': 0}
            }

class AttentionHead(nn.Module):
    """Shallow attention head for macro-attention regression"""
    
    def __init__(self, input_dim, hidden_dim=512, dropout=0.2):
        super().__init__()
        
        self.attention_head = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1)
        )
        
    def forward(self, x):
        # x shape: (batch_size, sequence_length, d_model)
        # Global average pooling to get fixed-size representation
        pooled = x.mean(dim=1)  # (batch_size, d_model)
        
        # Predict attention score
        attention_score = self.attention_head(pooled)  # (batch_size, 1)
        
        return attention_score.squeeze(-1)  # (batch_size,)

class MacroAttentionModel(nn.Module):
    """Complete model: SSL Transformer + Attention Head"""
    
    def __init__(self, config, actual_feature_dim, freeze_encoder=True, pretrained_ssl_model=None):
        super().__init__()
        
        # Use the passed model if provided
        if pretrained_ssl_model is not None:
            if isinstance(pretrained_ssl_model, nn.DataParallel):
                self.ssl_transformer = pretrained_ssl_model.module
            else:
                self.ssl_transformer = pretrained_ssl_model
        else:
            self.ssl_transformer = BehavioralTransformer(config, actual_feature_dim)
        
        # Freeze encoder if needed
        if freeze_encoder:
            for param in self.ssl_transformer.parameters():
                param.requires_grad = False
        
        self.attention_head = AttentionHead(
            input_dim=config.D_MODEL,
            hidden_dim=config.ATTENTION_HEAD_HIDDEN,
            dropout=config.ATTENTION_HEAD_DROPOUT
        )
    
    def forward(self, features):
        # Get embeddings from SSL transformer
        embeddings = self.ssl_transformer(features, return_embeddings=True)
        
        # Predict attention score
        attention_score = self.attention_head(embeddings)
        
        return attention_score

class MacroAttentionTrainer:
    """Trainer for macro-attention head"""
    
    def __init__(self, config, actual_feature_dim, pretrained_ssl_model = pretrained_ssl_model):
        self.config = config
        self.device = config.DEVICE
        
        # Model
        self.model = MacroAttentionModel(config, actual_feature_dim, freeze_encoder=True)
        
        # Load pre-trained SSL weights
        self._load_ssl_weights()
        
        # Move to device and setup DataParallel
        self.model = self.model.to(self.device)
        if torch.cuda.device_count() > 1:
            print(f"Using {torch.cuda.device_count()} GPUs with DataParallel")
            self.model = nn.DataParallel(self.model)
        
        # Loss function for regression
        self.criterion = nn.MSELoss()
        
        # Optimizer - only train attention head
        attention_params = []
        if hasattr(self.model, 'module'):
            attention_params = list(self.model.module.attention_head.parameters())
        else:
            attention_params = list(self.model.attention_head.parameters())
            
        self.optimizer = torch.optim.AdamW(
            attention_params,
            lr=config.ATTENTION_LR,
            weight_decay=1e-4
        )
        
        # Scheduler
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode='min',
            factor=0.5,
            patience=10,
            verbose=True
        )
        
        # Training history
        self.train_losses = []
        self.val_losses = []
        self.best_val_loss = float('inf')


    
    def _load_ssl_weights(self):

        try:
            with open(self.config.WEIGHTS_PATH, 'rb') as f:
                buffer = io.BytesIO(f.read())
    
            # Manually extract only the state dict
            with zipfile.ZipFile(buffer) as archive:
                with archive.open('archive/data.pkl') as pkl_file:
                    state = torch.load(pkl_file, map_location=self.device, weights_only=True)
            
            ssl_state_dict = state['model_state_dict']
    
            # Filter relevant weights
            ssl_weights = {
                k: v for k, v in ssl_state_dict.items()
                if not k.startswith('ssl_') and not k.startswith('temporal_') and not k.startswith('consistency_')
            }
    
            self.model.ssl_transformer.load_state_dict(ssl_weights, strict=False)
            print("Successfully loaded pre-trained SSL weights")
        except:
            print("Continuing with the pretrained one...")
    
    def train_epoch(self, train_loader):
        """Train one epoch"""
        self.model.train()
        
        # Only attention head should be trainable
        if isinstance(self.model, nn.DataParallel):
            self.model.module.ssl_transformer.eval()
        else:
            self.model.ssl_transformer.eval()

        
        total_loss = 0
        num_batches = 0
        
        pbar = tqdm(train_loader, desc='Training Attention Head')
        for batch in pbar:
            features = batch['features'].to(self.device, non_blocking=True)
            attention_scores = batch['attention_score'].to(self.device, non_blocking=True)
            
            # Forward pass
            predicted_scores = self.model(features)
            
            # Calculate loss
            loss = self.criterion(predicted_scores, attention_scores)
            
            # Backward pass
            self.optimizer.zero_grad()
            loss.backward()
            attention_head = self.model.module.attention_head if hasattr(self.model, 'module') else self.model.attention_head

            torch.nn.utils.clip_grad_norm_(attention_head.parameters(), 1.0)

            self.optimizer.step()
            
            total_loss += loss.item()
            num_batches += 1
            
            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'lr': f'{self.optimizer.param_groups[0]["lr"]:.2e}',
                'gpu_mem': f'{torch.cuda.memory_allocated()/1e9:.1f}GB' if torch.cuda.is_available() else 'N/A'
            })
        
        avg_loss = total_loss / max(num_batches, 1)
        self.train_losses.append(avg_loss)
        return avg_loss
    
    def validate(self, val_loader):
        """Validate model"""
        self.model.eval()
        
        total_loss = 0
        num_batches = 0
        predictions = []
        targets = []
        
        with torch.no_grad():
            pbar = tqdm(val_loader, desc='Validation')
            for batch in pbar:
                features = batch['features'].to(self.device, non_blocking=True)
                attention_scores = batch['attention_score'].to(self.device, non_blocking=True)
                
                # Forward pass
                predicted_scores = self.model(features)
                
                # Calculate loss
                loss = self.criterion(predicted_scores, attention_scores)
                
                total_loss += loss.item()
                num_batches += 1
                
                # Store predictions and targets for metrics
                predictions.extend(predicted_scores.cpu().numpy())
                targets.extend(attention_scores.cpu().numpy())
                
                pbar.set_postfix({
                    'val_loss': f'{loss.item():.4f}',
                    'gpu_mem': f'{torch.cuda.memory_allocated()/1e9:.1f}GB' if torch.cuda.is_available() else 'N/A'
                })
        
        avg_loss = total_loss / max(num_batches, 1)
        self.val_losses.append(avg_loss)
        
        return avg_loss, predictions, targets
    
    def train(self, train_loader, val_loader):
        """Complete training loop"""
        print("Starting Macro-Attention Head Training...")
        print(f"Training for {self.config.ATTENTION_EPOCHS} epochs")
        
        for epoch in range(self.config.ATTENTION_EPOCHS):
            print(f"\nEpoch {epoch+1}/{self.config.ATTENTION_EPOCHS}")
            
            # Train
            train_loss = self.train_epoch(train_loader)
            
            # Validate
            val_loss, predictions, targets = self.validate(val_loader)
            
            # Update scheduler
            self.scheduler.step(val_loss)
            
            # Calculate validation metrics
            mse = mean_squared_error(targets, predictions)
            mae = mean_absolute_error(targets, predictions)
            r2 = r2_score(targets, predictions)
            correlation, p_value = pearsonr(targets, predictions)
            
            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val Loss: {val_loss:.4f}")
            print(f"Val MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
            print(f"Correlation: {correlation:.4f} (p={p_value:.4f})")
            
            # Save best model
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.save_model(f'best_macro_attention_model.pth')
                print(f"New best model saved (Val Loss: {val_loss:.4f})")
            
            # Early stopping
            if len(self.val_losses) > 20:
                if all(self.val_losses[-i] >= self.val_losses[-20] for i in range(1, 11)):
                    print("Early stopping triggered")
                    break
        
        print("Training completed!")
        return self.train_losses, self.val_losses
    
    def save_model(self, path):
        """Save model"""
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        torch.save({
            'model_state_dict': model_to_save.state_dict(),
            'attention_head_state_dict': model_to_save.attention_head.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'config': self.config,
            'train_losses': self.train_losses,
            'val_losses': self.val_losses,
            'best_val_loss': self.best_val_loss
        }, path)
    
    def load_model(self, path):
        """Load model"""
        checkpoint = torch.load(path, map_location=self.device)
        
        if hasattr(self.model, 'module'):
            self.model.module.load_state_dict(checkpoint['model_state_dict'])
        else:
            self.model.load_state_dict(checkpoint['model_state_dict'])
        
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.train_losses = checkpoint.get('train_losses', [])
        self.val_losses = checkpoint.get('val_losses', [])
        self.best_val_loss = checkpoint.get('best_val_loss', float('inf'))






class MacroAttentionValidator:
    """Comprehensive validation for macro-attention model"""
    
    def __init__(self, model, device):
        self.model = model
        self.device = device
        self.model.eval()
    
    def comprehensive_evaluation(self, val_loader):
        """Comprehensive evaluation with multiple metrics"""
        predictions = []
        targets = []
        video_names = []
        daisee_labels = []
        
        print("Running comprehensive evaluation...")
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Evaluating'):
                features = batch['features'].to(self.device, non_blocking=True)
                attention_scores = batch['attention_score'].to(self.device, non_blocking=True)
                
                # Forward pass
                predicted_scores = self.model(features)
                
                # Store results
                predictions.extend(predicted_scores.cpu().numpy())
                targets.extend(attention_scores.cpu().numpy())
                video_names.extend(batch['video_name'])
                daisee_labels.extend(batch['daisee_labels'])
        
        # Convert to numpy arrays
        predictions = np.array(predictions)
        targets = np.array(targets)
        
        # Calculate metrics
        metrics = self._calculate_metrics(predictions, targets)
        
        # Generate report
        self._generate_report(predictions, targets, video_names, daisee_labels, metrics)
        
        return video_names, targets, predictions, metrics

    
    def _calculate_metrics(self, predictions, targets):
        """Calculate comprehensive metrics"""
        metrics = {}
        
        # Regression metrics
        metrics['mse'] = mean_squared_error(targets, predictions)
        metrics['rmse'] = np.sqrt(metrics['mse'])
        metrics['mae'] = mean_absolute_error(targets, predictions)
        metrics['r2'] = r2_score(targets, predictions)
        
        # Correlation
        correlation, p_value = pearsonr(targets, predictions)
        metrics['correlation'] = correlation
        metrics['correlation_p_value'] = p_value
        
        # Distribution statistics
        metrics['pred_mean'] = np.mean(predictions)
        metrics['pred_std'] = np.std(predictions)
        metrics['target_mean'] = np.mean(targets)
        metrics['target_std'] = np.std(targets)
        
        return metrics
    
    def _generate_report(self, predictions, targets, video_names, daisee_labels, metrics):
        """Generate comprehensive evaluation report"""
        print("\n" + "="*60)
        print("MACRO-ATTENTION MODEL EVALUATION REPORT")
        print("="*60)
        
        print(f"\nDATASET STATISTICS:")
        print(f"Number of videos: {len(predictions)}")
        print(f"Prediction range: [{np.min(predictions):.4f}, {np.max(predictions):.4f}]")
        print(f"Target range: [{np.min(targets):.4f}, {np.max(targets):.4f}]")
        
        print(f"\nREGRESSION METRICS:")
        print(f"Mean Squared Error (MSE): {metrics['mse']:.4f}")
        print(f"Root Mean Squared Error (RMSE): {metrics['rmse']:.4f}")
        print(f"Mean Absolute Error (MAE): {metrics['mae']:.4f}")
        print(f"R-squared (R²): {metrics['r2']:.4f}")
        
        print(f"\nCORRELATION ANALYSIS:")
        print(f"Pearson Correlation: {metrics['correlation']:.4f}")
        print(f"P-value: {metrics['correlation_p_value']:.4f}")
        significance = "Significant" if metrics['correlation_p_value'] < 0.05 else "Not significant"
        print(f"Statistical significance: {significance}")
        
        print(f"\nDISTRIBUTION ANALYSIS:")
        print(f"Predictions - Mean: {metrics['pred_mean']:.4f}, Std: {metrics['pred_std']:.4f}")
        print(f"Targets - Mean: {metrics['target_mean']:.4f}, Std: {metrics['target_std']:.4f}")
        
        # Performance interpretation
        print(f"\nPERFORMANCE INTERPRETATION:")
        if metrics['r2'] > 0.7:
            print("✓ Excellent predictive performance (R² > 0.7)")
        elif metrics['r2'] > 0.5:
            print("✓ Good predictive performance (R² > 0.5)")
        elif metrics['r2'] > 0.3:
            print("⚠ Moderate predictive performance (R² > 0.3)")
        else:
            print("✗ Poor predictive performance (R² < 0.3)")
        
        if abs(metrics['correlation']) > 0.7:
            print("✓ Strong correlation with ground truth")
        elif abs(metrics['correlation']) > 0.5:
            print("✓ Moderate correlation with ground truth")
        else:
            print("⚠ Weak correlation with ground truth")
        
        print("="*60)
    
    def visualize_results(self, predictions, targets, video_names, save_path="validation_plots.png"):
        """Create visualization plots"""
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        
        # Scatter plot
        axes[0, 0].scatter(targets, predictions, alpha=0.6)
        axes[0, 0].plot([min(targets), max(targets)], [min(targets), max(targets)], 'r--', lw=2)
        axes[0, 0].set_xlabel('Ground Truth Attention Score')
        axes[0, 0].set_ylabel('Predicted Attention Score')
        axes[0, 0].set_title('Predicted vs Ground Truth')
        axes[0, 0].grid(True, alpha=0.3)
        
        # Residual plot
        residuals = predictions - targets
        axes[0, 1].scatter(targets, residuals, alpha=0.6)
        axes[0, 1].axhline(y=0, color='r', linestyle='--')
        axes[0, 1].set_xlabel('Ground Truth Attention Score')
        axes[0, 1].set_ylabel('Residuals')
        axes[0, 1].set_title('Residual Plot')
        axes[0, 1].grid(True, alpha=0.3)
        
        # Distribution comparison
        axes[1, 0].hist(targets, bins=30, alpha=0.7, label='Ground Truth', density=True)
        axes[1, 0].hist(predictions, bins=30, alpha=0.7, label='Predictions', density=True)
        axes[1, 0].set_xlabel('Attention Score')
        axes[1, 0].set_ylabel('Density')
        axes[1, 0].set_title('Distribution Comparison')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        # Error distribution
        axes[1, 1].hist(residuals, bins=30, alpha=0.7, color='red')
        axes[1, 1].set_xlabel('Residuals')
        axes[1, 1].set_ylabel('Frequency')
        axes[1, 1].set_title('Error Distribution')
        axes[1, 1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"Validation plots saved to {save_path}")
        plt.show()

In [5]:
"""Main training and validation pipeline"""
import io
import zipfile
# Configuration
config = MacroAttentionConfig()



# Load labels
print("Loading labels...")
labels_df = pd.read_csv(config.LABELS_PATH)
print(f"Loaded {len(labels_df)} labels")
print(f"Label columns: {labels_df.columns.tolist()}")

# Ensure E, B, C, F are in range [0, 1] (normalize from [0, 3] if needed)
for col in ['Engagement', 'Boredom', 'Confusion', 'Frustration ']:
    if col in labels_df.columns:
        labels_df[col] = labels_df[col] / 3.0  # Normalize from [0,3] to [0,1]

print(labels_df.head())

# Create datasets
print("Creating datasets...")
train_dataset = MacroAttentionDataset(
    config.TRAIN_PATH, 
    labels_df,
    config.SEQUENCE_LENGTH
)

val_dataset = MacroAttentionDataset(
    config.TEST_PATH,
    labels_df,
    config.SEQUENCE_LENGTH,
    is_train=False
)

# Data loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=config.BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=config.BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)


sample_batch = next(iter(train_loader))
actual_feature_dim = sample_batch['features'].shape[-1]
print(f"Actual feature dimension: {actual_feature_dim}")

attention_trainer = MacroAttentionTrainer(
    config=config,
    actual_feature_dim=actual_feature_dim,
    pretrained_ssl_model=pretrained_ssl_model
)


# Train model
print("Starting training...")
attention_trainer.train(train_loader, val_loader)

# Save trained model
attention_trainer.save_model('visiofocus_attention_model_final.pth')

# Comprehensive validation
print("\nStarting comprehensive validation...")
validation_framework = MacroAttentionValidator(attention_trainer.model, config.DEVICE)


video_names, targets, predictions, metrics = validation_framework.comprehensive_evaluation(val_loader)

results_df = pd.DataFrame({
    'video_name': video_names,
    'true_attention': targets,
    'predicted_attention': predictions,
})


results_df.to_csv('validation_results.csv', index=False)

print("\nValidation complete!")
print(f"Results saved to 'validation_results.csv'")
print(f"Final correlation: {metrics['correlation']:.4f}")
print(f"Final R² score: {metrics['r2']:.4f}")

Loading labels...
Loaded 8925 labels
Label columns: ['ClipID', 'Boredom', 'Engagement', 'Confusion', 'Frustration ']
       ClipID  Boredom  Engagement  Confusion  Frustration 
0  1100011002      0.0    0.666667        0.0           0.0
1  1100011003      0.0    0.666667        0.0           0.0
2  1100011004      0.0    1.000000        0.0           0.0
3  1100011005      0.0    1.000000        0.0           0.0
4  1100011006      0.0    1.000000        0.0           0.0
Creating datasets...
Found 6511 CSV files, 6511 with labels
Found 1720 CSV files, 1577 with labels
Actual feature dimension: 79
Continuing with the pretrained one...
Using 2 GPUs with DataParallel
Starting training...
Starting Macro-Attention Head Training...
Training for 250 epochs

Epoch 1/250


Training Attention Head: 100%|██████████| 407/407 [03:08<00:00,  2.16it/s, loss=0.0153, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:45<00:00,  2.17it/s, val_loss=0.0068, gpu_mem=1.0GB]


Train Loss: 0.0154
Val Loss: 0.0105
Val MSE: 0.0105, MAE: 0.0828, R²: -0.0630
Correlation: -0.0022 (p=0.9308)
New best model saved (Val Loss: 0.0105)

Epoch 2/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.47it/s, loss=0.0135, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s, val_loss=0.0062, gpu_mem=1.0GB]


Train Loss: 0.0119
Val Loss: 0.0104
Val MSE: 0.0104, MAE: 0.0824, R²: -0.0536
Correlation: 0.0934 (p=0.0002)
New best model saved (Val Loss: 0.0104)

Epoch 3/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0209, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s, val_loss=0.0069, gpu_mem=1.0GB]


Train Loss: 0.0118
Val Loss: 0.0113
Val MSE: 0.0113, MAE: 0.0865, R²: -0.1434
Correlation: 0.1149 (p=0.0000)

Epoch 4/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.48it/s, loss=0.0087, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0116
Val Loss: 0.0099
Val MSE: 0.0100, MAE: 0.0803, R²: -0.0047
Correlation: 0.1536 (p=0.0000)
New best model saved (Val Loss: 0.0099)

Epoch 5/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.48it/s, loss=0.0201, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.49it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0116
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0794, R²: 0.0117
Correlation: 0.1535 (p=0.0000)
New best model saved (Val Loss: 0.0098)

Epoch 6/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0058, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.53it/s, val_loss=0.0062, gpu_mem=1.0GB]


Train Loss: 0.0115
Val Loss: 0.0103
Val MSE: 0.0103, MAE: 0.0818, R²: -0.0385
Correlation: 0.1711 (p=0.0000)

Epoch 7/250


Training Attention Head: 100%|██████████| 407/407 [02:42<00:00,  2.50it/s, loss=0.0078, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.50it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0114
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0797, R²: 0.0099
Correlation: 0.1667 (p=0.0000)

Epoch 8/250


Training Attention Head: 100%|██████████| 407/407 [02:43<00:00,  2.50it/s, loss=0.0090, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.53it/s, val_loss=0.0060, gpu_mem=1.0GB]


Train Loss: 0.0113
Val Loss: 0.0099
Val MSE: 0.0099, MAE: 0.0800, R²: 0.0027
Correlation: 0.1769 (p=0.0000)

Epoch 9/250


Training Attention Head: 100%|██████████| 407/407 [02:42<00:00,  2.50it/s, loss=0.0125, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.53it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0113
Val Loss: 0.0098
Val MSE: 0.0099, MAE: 0.0800, R²: 0.0038
Correlation: 0.1793 (p=0.0000)

Epoch 10/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.47it/s, loss=0.0246, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:38<00:00,  2.56it/s, val_loss=0.0060, gpu_mem=1.0GB]


Train Loss: 0.0112
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0798, R²: 0.0085
Correlation: 0.1867 (p=0.0000)

Epoch 11/250


Training Attention Head: 100%|██████████| 407/407 [02:43<00:00,  2.49it/s, loss=0.0089, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.46it/s, val_loss=0.0057, gpu_mem=1.0GB]


Train Loss: 0.0112
Val Loss: 0.0099
Val MSE: 0.0099, MAE: 0.0801, R²: 0.0034
Correlation: 0.1390 (p=0.0000)

Epoch 12/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0092, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.50it/s, val_loss=0.0056, gpu_mem=1.0GB]


Train Loss: 0.0113
Val Loss: 0.0099
Val MSE: 0.0099, MAE: 0.0801, R²: 0.0027
Correlation: 0.1601 (p=0.0000)

Epoch 13/250


Training Attention Head: 100%|██████████| 407/407 [02:42<00:00,  2.50it/s, loss=0.0074, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.46it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0113
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0797, R²: 0.0119
Correlation: 0.1959 (p=0.0000)
New best model saved (Val Loss: 0.0098)

Epoch 14/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.46it/s, loss=0.0143, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.46it/s, val_loss=0.0062, gpu_mem=1.0GB]


Train Loss: 0.0112
Val Loss: 0.0102
Val MSE: 0.0102, MAE: 0.0814, R²: -0.0267
Correlation: 0.1999 (p=0.0000)

Epoch 15/250


Training Attention Head: 100%|██████████| 407/407 [02:46<00:00,  2.45it/s, loss=0.0167, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.50it/s, val_loss=0.0056, gpu_mem=1.0GB]


Train Loss: 0.0110
Val Loss: 0.0096
Val MSE: 0.0096, MAE: 0.0790, R²: 0.0260
Correlation: 0.1933 (p=0.0000)
New best model saved (Val Loss: 0.0096)

Epoch 16/250


Training Attention Head: 100%|██████████| 407/407 [02:46<00:00,  2.44it/s, loss=0.0091, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s, val_loss=0.0056, gpu_mem=1.0GB]


Train Loss: 0.0111
Val Loss: 0.0097
Val MSE: 0.0097, MAE: 0.0791, R²: 0.0239
Correlation: 0.1924 (p=0.0000)

Epoch 17/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.47it/s, loss=0.0071, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:38<00:00,  2.55it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0112
Val Loss: 0.0099
Val MSE: 0.0099, MAE: 0.0804, R²: -0.0013
Correlation: 0.1756 (p=0.0000)

Epoch 18/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0061, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.53it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0111
Val Loss: 0.0100
Val MSE: 0.0100, MAE: 0.0809, R²: -0.0139
Correlation: 0.1972 (p=0.0000)

Epoch 19/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.48it/s, loss=0.0058, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.50it/s, val_loss=0.0057, gpu_mem=1.0GB]


Train Loss: 0.0111
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0798, R²: 0.0130
Correlation: 0.1765 (p=0.0000)

Epoch 20/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.46it/s, loss=0.0165, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.51it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0112
Val Loss: 0.0095
Val MSE: 0.0095, MAE: 0.0785, R²: 0.0364
Correlation: 0.2028 (p=0.0000)
New best model saved (Val Loss: 0.0095)

Epoch 21/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.48it/s, loss=0.0092, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.53it/s, val_loss=0.0066, gpu_mem=1.0GB]


Train Loss: 0.0111
Val Loss: 0.0104
Val MSE: 0.0105, MAE: 0.0828, R²: -0.0559
Correlation: 0.2116 (p=0.0000)

Epoch 22/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.45it/s, loss=0.0151, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.48it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0110
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0800, R²: 0.0095
Correlation: 0.2126 (p=0.0000)

Epoch 23/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.48it/s, loss=0.0140, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.52it/s, val_loss=0.0057, gpu_mem=1.0GB]


Train Loss: 0.0111
Val Loss: 0.0102
Val MSE: 0.0102, MAE: 0.0818, R²: -0.0305
Correlation: 0.1360 (p=0.0000)

Epoch 24/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.46it/s, loss=0.0162, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:41<00:00,  2.41it/s, val_loss=0.0056, gpu_mem=1.0GB]


Train Loss: 0.0110
Val Loss: 0.0095
Val MSE: 0.0095, MAE: 0.0786, R²: 0.0360
Correlation: 0.1956 (p=0.0000)

Epoch 25/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0120, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0110
Val Loss: 0.0095
Val MSE: 0.0095, MAE: 0.0786, R²: 0.0388
Correlation: 0.2115 (p=0.0000)
New best model saved (Val Loss: 0.0095)

Epoch 26/250


Training Attention Head: 100%|██████████| 407/407 [02:42<00:00,  2.50it/s, loss=0.0119, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:38<00:00,  2.56it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0110
Val Loss: 0.0097
Val MSE: 0.0097, MAE: 0.0794, R²: 0.0207
Correlation: 0.2089 (p=0.0000)

Epoch 27/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.46it/s, loss=0.0123, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0110
Val Loss: 0.0096
Val MSE: 0.0096, MAE: 0.0790, R²: 0.0304
Correlation: 0.2083 (p=0.0000)

Epoch 28/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.45it/s, loss=0.0111, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.45it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0109
Val Loss: 0.0095
Val MSE: 0.0095, MAE: 0.0785, R²: 0.0408
Correlation: 0.2144 (p=0.0000)
New best model saved (Val Loss: 0.0095)

Epoch 29/250


Training Attention Head: 100%|██████████| 407/407 [02:47<00:00,  2.44it/s, loss=0.0091, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.53it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0109
Val Loss: 0.0097
Val MSE: 0.0097, MAE: 0.0794, R²: 0.0233
Correlation: 0.2080 (p=0.0000)

Epoch 30/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.45it/s, loss=0.0131, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.51it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0109
Val Loss: 0.0096
Val MSE: 0.0096, MAE: 0.0789, R²: 0.0330
Correlation: 0.1983 (p=0.0000)

Epoch 31/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.46it/s, loss=0.0163, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:41<00:00,  2.41it/s, val_loss=0.0060, gpu_mem=1.0GB]


Train Loss: 0.0109
Val Loss: 0.0101
Val MSE: 0.0101, MAE: 0.0813, R²: -0.0220
Correlation: 0.2072 (p=0.0000)

Epoch 32/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0115, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.50it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0109
Val Loss: 0.0095
Val MSE: 0.0095, MAE: 0.0788, R²: 0.0360
Correlation: 0.2208 (p=0.0000)

Epoch 33/250


Training Attention Head: 100%|██████████| 407/407 [02:43<00:00,  2.49it/s, loss=0.0086, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.49it/s, val_loss=0.0062, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0099
Val MSE: 0.0099, MAE: 0.0805, R²: -0.0031
Correlation: 0.2219 (p=0.0000)

Epoch 34/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.48it/s, loss=0.0108, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.49it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0109
Val Loss: 0.0095
Val MSE: 0.0095, MAE: 0.0786, R²: 0.0409
Correlation: 0.2038 (p=0.0000)
New best model saved (Val Loss: 0.0095)

Epoch 35/250


Training Attention Head: 100%|██████████| 407/407 [02:43<00:00,  2.49it/s, loss=0.0069, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.52it/s, val_loss=0.0056, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0095
Val MSE: 0.0095, MAE: 0.0784, R²: 0.0431
Correlation: 0.2160 (p=0.0000)
New best model saved (Val Loss: 0.0095)

Epoch 36/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.48it/s, loss=0.0069, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.52it/s, val_loss=0.0061, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0096
Val MSE: 0.0097, MAE: 0.0793, R²: 0.0247
Correlation: 0.1958 (p=0.0000)

Epoch 37/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0166, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.53it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0096
Val MSE: 0.0096, MAE: 0.0791, R²: 0.0304
Correlation: 0.2102 (p=0.0000)

Epoch 38/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0135, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:38<00:00,  2.54it/s, val_loss=0.0057, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0096
Val MSE: 0.0096, MAE: 0.0789, R²: 0.0332
Correlation: 0.1895 (p=0.0000)

Epoch 39/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.46it/s, loss=0.0083, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.46it/s, val_loss=0.0062, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0097
Val MSE: 0.0097, MAE: 0.0796, R²: 0.0178
Correlation: 0.2121 (p=0.0000)

Epoch 40/250


Training Attention Head: 100%|██████████| 407/407 [02:44<00:00,  2.47it/s, loss=0.0065, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.48it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0095
Val MSE: 0.0095, MAE: 0.0788, R²: 0.0375
Correlation: 0.2079 (p=0.0000)

Epoch 41/250


Training Attention Head: 100%|██████████| 407/407 [02:47<00:00,  2.43it/s, loss=0.0050, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.45it/s, val_loss=0.0063, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0800, R²: 0.0089
Correlation: 0.1965 (p=0.0000)

Epoch 42/250


Training Attention Head: 100%|██████████| 407/407 [02:48<00:00,  2.42it/s, loss=0.0200, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.48it/s, val_loss=0.0061, gpu_mem=1.0GB]


Train Loss: 0.0108
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0800, R²: 0.0113
Correlation: 0.2049 (p=0.0000)

Epoch 43/250


Training Attention Head: 100%|██████████| 407/407 [02:46<00:00,  2.45it/s, loss=0.0050, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.51it/s, val_loss=0.0058, gpu_mem=1.0GB]


Train Loss: 0.0107
Val Loss: 0.0098
Val MSE: 0.0098, MAE: 0.0800, R²: 0.0076
Correlation: 0.1604 (p=0.0000)

Epoch 44/250


Training Attention Head: 100%|██████████| 407/407 [02:47<00:00,  2.43it/s, loss=0.0104, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.49it/s, val_loss=0.0057, gpu_mem=1.0GB]


Train Loss: 0.0107
Val Loss: 0.0097
Val MSE: 0.0097, MAE: 0.0796, R²: 0.0168
Correlation: 0.1926 (p=0.0000)

Epoch 45/250


Training Attention Head: 100%|██████████| 407/407 [02:46<00:00,  2.45it/s, loss=0.0120, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:39<00:00,  2.49it/s, val_loss=0.0061, gpu_mem=1.0GB]


Train Loss: 0.0107
Val Loss: 0.0096
Val MSE: 0.0096, MAE: 0.0792, R²: 0.0288
Correlation: 0.2117 (p=0.0000)

Epoch 46/250


Training Attention Head: 100%|██████████| 407/407 [02:45<00:00,  2.45it/s, loss=0.0124, lr=1.00e-04, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:41<00:00,  2.40it/s, val_loss=0.0059, gpu_mem=1.0GB]


Train Loss: 0.0107
Val Loss: 0.0097
Val MSE: 0.0098, MAE: 0.0797, R²: 0.0154
Correlation: 0.1914 (p=0.0000)

Epoch 47/250


Training Attention Head: 100%|██████████| 407/407 [02:46<00:00,  2.44it/s, loss=0.0089, lr=5.00e-05, gpu_mem=1.0GB]
Validation: 100%|██████████| 99/99 [00:40<00:00,  2.47it/s, val_loss=0.0061, gpu_mem=1.0GB]


Train Loss: 0.0107
Val Loss: 0.0096
Val MSE: 0.0096, MAE: 0.0791, R²: 0.0294
Correlation: 0.2154 (p=0.0000)
Early stopping triggered
Training completed!

Starting comprehensive validation...
Running comprehensive evaluation...


Evaluating: 100%|██████████| 99/99 [00:39<00:00,  2.51it/s]


MACRO-ATTENTION MODEL EVALUATION REPORT

DATASET STATISTICS:
Number of videos: 1577
Prediction range: [0.1297, 0.2551]
Target range: [-0.3144, 0.3333]

REGRESSION METRICS:
Mean Squared Error (MSE): 0.0096
Root Mean Squared Error (RMSE): 0.0981
Mean Absolute Error (MAE): 0.0791
R-squared (R²): 0.0294

CORRELATION ANALYSIS:
Pearson Correlation: 0.2154
P-value: 0.0000
Statistical significance: Significant

DISTRIBUTION ANALYSIS:
Predictions - Mean: 0.2003, Std: 0.0204
Targets - Mean: 0.2132, Std: 0.0995

PERFORMANCE INTERPRETATION:
✗ Poor predictive performance (R² < 0.3)
⚠ Weak correlation with ground truth

Validation complete!
Results saved to 'validation_results.csv'
Final correlation: 0.2154
Final R² score: 0.0294



