# In [None]:
"""
MABe Challenge 2025 - Starter Code with CV and Competition Metric
Multi-Agent Behavior Recognition in Mice

Key features:
1. Proper cross-validation strategy
2. Competition F-beta metric implementation
3. Local CV score tracking for LB correlation
4. Stratified sampling to ensure behavior diversity
"""

import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
import warnings
import gc
import json
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass, field
from tqdm import tqdm
import pickle
import random
from scipy import signal, stats
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import f1_score
import lightgbm as lgb
import xgboost as xgb

warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================

def _default_behavior_classes() -> List[str]:
    """Build a fresh list of the behavior labels known to the pipeline."""
    return [
        'grooming', 'sniff', 'chase', 'attack',
        'mount', 'investigate', 'escape', 'approach',
    ]


def _default_confidence_thresholds() -> Dict[str, float]:
    """Build a fresh table of per-behavior probability cut-offs."""
    return {
        'grooming': 0.3,
        'sniff': 0.25,
        'chase': 0.35,
        'attack': 0.4,
        'mount': 0.35,
        'investigate': 0.25,
        'escape': 0.35,
        'approach': 0.3,
    }


@dataclass
class Config:
    """Settings bundle for the pipeline: paths, windowing, CV and model knobs."""

    # --- Filesystem locations ---
    data_path: Path = Path('/kaggle/input/MABe-mouse-behavior-detection')
    output_path: Path = Path('/kaggle/working')

    # --- How much data to process ---
    max_train_videos: int = 100  # cap on training videos (accuracy vs speed)
    max_test_videos: Optional[int] = None
    sample_rate: float = 0.5  # fraction of candidate windows kept per video

    # --- Sliding-window feature extraction ---
    window_size: int = 45
    window_stride: int = 15  # trade-off between coverage and speed
    min_window_fill: float = 0.4
    use_advanced_features: bool = True

    # --- Cross-validation ---
    cv_strategy: str = 'group'  # 'group' = split by video, 'stratified' = split by label
    n_folds: int = 5
    validation_metric: str = 'f_beta'  # name of the competition metric
    beta: float = 1.0  # beta = 1 gives the F1 score

    # --- Model training ---
    use_ensemble: bool = True
    n_models: int = 2  # LightGBM + XGBoost
    random_state: int = 42
    early_stopping_rounds: int = 50

    # --- Behavior vocabulary ---
    behavior_classes: List[str] = field(default_factory=_default_behavior_classes)

    # --- Decision thresholds (intended to be tuned during CV) ---
    confidence_thresholds: Dict[str, float] = field(
        default_factory=_default_confidence_thresholds
    )

    # --- Batch processing ---
    batch_size: int = 100
    max_features: int = 100  # kept small to limit overfitting


# Module-wide settings instance read by the rest of the pipeline.
config = Config()

# ============================================================================
# COMPETITION METRIC IMPLEMENTATION
# ============================================================================

class MABeMetric:
    """Competition F-beta metric implementation"""
    
    @staticmethod
    def prepare_solution_data(annotations: pd.DataFrame, video_id: str, lab_id: str = 'lab1') -> pd.DataFrame:
        """Convert annotations to solution format"""
        if annotations.empty:
            return pd.DataFrame()
        
        solution_rows = []
        for _, ann in annotations.iterrows():
            # Create solution row
            row = {
                'video_id': video_id,
                'agent_id': ann.get('agent_id', 'mouse1'),
                'target_id': ann.get('target_id', 'mouse2'),
                'action': ann.get('action', 'unknown'),
                'start_frame': int(ann['start_frame']),
                'stop_frame': int(ann.get('end_frame', ann.get('stop_frame', ann['start_frame'] + 30))),
                'lab_id': lab_id
            }
            solution_rows.append(row)
        
        if solution_rows:
            solution_df = pd.DataFrame(solution_rows)
            
            # Add behaviors_labeled column (required by metric)
            unique_behaviors = solution_df.apply(
                lambda x: f"{x['agent_id']},{x['target_id']},{x['action']}", axis=1
            ).unique().tolist()
            solution_df['behaviors_labeled'] = json.dumps(unique_behaviors)
            
            return solution_df
        
        return pd.DataFrame()
    
    @staticmethod
    def single_lab_f1(lab_solution: pl.DataFrame, lab_submission: pl.DataFrame, beta: float = 1) -> float:
        """Calculate F1 for a single lab"""
        if lab_solution.is_empty() or lab_submission.is_empty():
            return 0.0
        
        label_frames = defaultdict(set)
        prediction_frames = defaultdict(set)
        
        # Build label frames
        for row in lab_solution.to_dicts():
            key = f"{row['video_id']}_{row['agent_id']}_{row['target_id']}_{row['action']}"
            label_frames[key].update(range(row['start_frame'], row['stop_frame']))
        
        # Build prediction frames
        for row in lab_submission.to_dicts():
            key = f"{row['video_id']}_{row['agent_id']}_{row['target_id']}_{row['action']}"
            prediction_frames[key].update(range(row['start_frame'], row['stop_frame']))
        
        # Calculate metrics per action
        tps = defaultdict(int)
        fns = defaultdict(int)
        fps = defaultdict(int)
        
        all_keys = set(label_frames.keys()) | set(prediction_frames.keys())
        actions = set()
        
        for key in all_keys:
            action = key.split('_')[-1]
            actions.add(action)
            
            label_set = label_frames.get(key, set())
            pred_set = prediction_frames.get(key, set())
            
            tps[action] += len(label_set & pred_set)
            fns[action] += len(label_set - pred_set)
            fps[action] += len(pred_set - label_set)
        
        # Calculate F-beta per action
        action_scores = []
        for action in actions:
            tp = tps[action]
            fn = fns[action]
            fp = fps[action]
            
            if tp + fn + fp == 0:
                continue
            
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            
            if precision + recall > 0:
                f_score = (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)
            else:
                f_score = 0
            
            action_scores.append(f_score)
        
        return np.mean(action_scores) if action_scores else 0.0
    
    @staticmethod
    def calculate_metric(solution: pd.DataFrame, submission: pd.DataFrame, beta: float = 1) -> float:
        """Calculate competition F-beta metric"""
        if solution.empty or submission.empty:
            return 0.0
        
        # Convert to polars for efficiency
        solution_pl = pl.DataFrame(solution)
        submission_pl = pl.DataFrame(submission)
        
        # Calculate per lab
        lab_scores = []
        for lab_id in solution['lab_id'].unique():
            lab_solution = solution_pl.filter(pl.col('lab_id') == lab_id)
            lab_videos = lab_solution['video_id'].unique().to_list()
            lab_submission = submission_pl.filter(pl.col('video_id').is_in(lab_videos))
            
            score = MABeMetric.single_lab_f1(lab_solution, lab_submission, beta)
            lab_scores.append(score)
        
        return np.mean(lab_scores) if lab_scores else 0.0

# ============================================================================
# FEATURE EXTRACTION
# ============================================================================

class FeatureExtractor:
    """Feature extraction with focus on reducing overfitting"""
    
    def __init__(self, window_size: int = 45, stride: int = 15):
        self.window_size = window_size
        self.stride = stride
    
    def extract_movement_features(self, trajectory: pd.DataFrame) -> Dict[str, float]:
        """Extract core movement features"""
        features = {}
        
        if 'x' not in trajectory.columns or len(trajectory) < 3:
            return features
        
        x = trajectory['x'].values
        y = trajectory['y'].values
        
        # Velocity
        dx = np.diff(x)
        dy = np.diff(y)
        velocity = np.sqrt(dx**2 + dy**2)
        
        # Core statistics
        features['vel_mean'] = np.mean(velocity)
        features['vel_std'] = np.std(velocity)
        features['vel_max'] = np.max(velocity)
        features['vel_median'] = np.median(velocity)
        features['vel_q25'] = np.percentile(velocity, 25)
        features['vel_q75'] = np.percentile(velocity, 75)
        
        # Acceleration
        if len(velocity) > 1:
            acceleration = np.diff(velocity)
            features['acc_mean'] = np.mean(np.abs(acceleration))
            features['acc_std'] = np.std(acceleration)
        
        # Path metrics
        features['total_dist'] = np.sum(velocity)
        features['net_disp'] = np.sqrt((x[-1] - x[0])**2 + (y[-1] - y[0])**2)
        features['path_efficiency'] = features['net_disp'] / (features['total_dist'] + 1e-6)
        
        # Spatial
        features['x_range'] = np.max(x) - np.min(x)
        features['y_range'] = np.max(y) - np.min(y)
        features['area'] = features['x_range'] * features['y_range']
        
        # Angular features
        if len(dx) > 1:
            angles = np.arctan2(dy, dx)
            angle_changes = np.diff(angles)
            angle_changes = np.arctan2(np.sin(angle_changes), np.cos(angle_changes))
            
            features['turn_mean'] = np.mean(np.abs(angle_changes))
            features['turn_std'] = np.std(angle_changes)
            features['turn_max'] = np.max(np.abs(angle_changes))
        
        # Motion patterns
        features['stationary_ratio'] = np.mean(velocity < 2.0)
        features['slow_ratio'] = np.mean((velocity >= 2.0) & (velocity < 10.0))
        features['fast_ratio'] = np.mean(velocity >= 10.0)
        
        return features
    
    def extract_temporal_features(self, trajectory: pd.DataFrame) -> Dict[str, float]:
        """Extract temporal dynamics"""
        features = {}
        
        if 'x' not in trajectory.columns or len(trajectory) < 10:
            return features
        
        x = trajectory['x'].values
        y = trajectory['y'].values
        
        # Autocorrelation
        if len(x) > 15:
            x_centered = x - np.mean(x)
            y_centered = y - np.mean(y)
            
            # Lag-1 autocorrelation
            features['x_autocorr'] = np.corrcoef(x_centered[:-1], x_centered[1:])[0, 1]
            features['y_autocorr'] = np.corrcoef(y_centered[:-1], y_centered[1:])[0, 1]
        
        # Trend
        time = np.arange(len(x))
        x_trend = np.polyfit(time, x, 1)[0]
        y_trend = np.polyfit(time, y, 1)[0]
        features['x_trend'] = x_trend
        features['y_trend'] = y_trend
        
        return features
    
    def extract_windows(self, data: pd.DataFrame, video_id: str, 
                       sample_rate: float = 1.0) -> pd.DataFrame:
        """Extract feature windows from tracking data"""
        if data.empty:
            return pd.DataFrame()
        
        # Get trajectory
        if 'frame' not in data.columns:
            data['frame'] = data.index
        
        trajectory = data.groupby('frame').agg({
            'x': 'mean',
            'y': 'mean'
        }).reset_index()
        
        if len(trajectory) < self.window_size:
            return pd.DataFrame()
        
        all_features = []
        min_frame = trajectory['frame'].min()
        max_frame = trajectory['frame'].max()
        
        # Generate windows
        window_starts = list(range(int(min_frame), 
                                  int(max_frame) - self.window_size + 1, 
                                  self.stride))
        
        # Sample if needed
        if sample_rate < 1.0:
            n_samples = max(1, int(len(window_starts) * sample_rate))
            window_starts = random.sample(window_starts, min(n_samples, len(window_starts)))
        
        for start_frame in window_starts:
            end_frame = start_frame + self.window_size
            
            window = trajectory[(trajectory['frame'] >= start_frame) & 
                               (trajectory['frame'] < end_frame)]
            
            if len(window) < self.window_size * config.min_window_fill:
                continue
            
            # Extract features
            features = {
                'video_id': video_id,
                'start_frame': start_frame,
                'end_frame': end_frame,
            }
            
            # Movement features
            move_feats = self.extract_movement_features(window)
            features.update(move_feats)
            
            # Temporal features
            if config.use_advanced_features:
                temp_feats = self.extract_temporal_features(window)
                features.update(temp_feats)
            
            all_features.append(features)
        
        return pd.DataFrame(all_features)

# ============================================================================
# CROSS-VALIDATION STRATEGY
# ============================================================================

class CVStrategy:
    """Cross-validation with competition metric"""
    
    def __init__(self, n_folds: int = 5, strategy: str = 'group'):
        self.n_folds = n_folds
        self.strategy = strategy
        self.cv_scores = []
        self.best_thresholds = {}
    
    def create_folds(self, features_df: pd.DataFrame, labels: np.ndarray) -> List[Tuple]:
        """Create CV folds based on strategy"""
        if self.strategy == 'group':
            # Group by video to prevent leakage
            groups = features_df['video_id'].values
            gkf = GroupKFold(n_splits=self.n_folds)
            folds = list(gkf.split(features_df, labels, groups))
        else:
            # Stratified by behavior presence
            skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=42)
            # Create binary labels for stratification
            binary_labels = (labels > 0).astype(int) if labels.ndim > 1 else labels
            folds = list(skf.split(features_df, binary_labels))
        
        return folds
    
    def optimize_thresholds(self, y_true: np.ndarray, y_pred: np.ndarray, 
                           features_df: pd.DataFrame) -> Dict[str, float]:
        """Optimize thresholds using validation data"""
        best_thresholds = {}
        
        # Simple threshold optimization
        for threshold in np.arange(0.1, 0.6, 0.05):
            # Convert predictions to submission format
            predictions = []
            for idx, prob in enumerate(y_pred):
                if prob > threshold:
                    predictions.append({
                        'video_id': features_df.iloc[idx]['video_id'],
                        'agent_id': 'mouse1',
                        'target_id': 'mouse2',
                        'action': 'unknown',
                        'start_frame': int(features_df.iloc[idx]['start_frame']),
                        'stop_frame': int(features_df.iloc[idx]['end_frame'])
                    })
            
            if predictions:
                # Calculate score (simplified)
                pred_binary = (y_pred > threshold).astype(int)
                score = f1_score(y_true, pred_binary)
                
                if 'best_score' not in locals() or score > best_score:
                    best_score = score
                    best_threshold = threshold
        
        return {'global': best_threshold if 'best_threshold' in locals() else 0.3}

# ============================================================================
# MODEL WITH CV
# ============================================================================

class BehaviorDetectorCV:
    """Model with cross-validation"""
    
    def __init__(self):
        """Start untrained: no models or scalers yet, folds set up from ``config``."""
        # One scaler per trained fold; one or more models per fold
        self.models, self.scalers = [], []
        # Names of the columns making up the feature matrix (set by prepare_features)
        self.feature_columns = None
        self.cv_strategy = CVStrategy(strategy=config.cv_strategy, n_folds=config.n_folds)
        # Per-fold competition scores and averaged importances, filled during training
        self.cv_scores = []
        self.feature_importance = None
    
    def prepare_features(self, features_df: pd.DataFrame) -> np.ndarray:
        """Prepare features"""
        numeric_cols = features_df.select_dtypes(include=[np.number]).columns
        exclude = ['start_frame', 'end_frame']
        self.feature_columns = [c for c in numeric_cols if c not in exclude]
        
        if not self.feature_columns:
            return np.zeros((len(features_df), 1))
        
        X = features_df[self.feature_columns].fillna(0).values
        
        # Remove constant features
        std = np.std(X, axis=0)
        valid_features = std > 1e-10
        X = X[:, valid_features]
        self.feature_columns = [col for col, valid in zip(self.feature_columns, valid_features) if valid]
        
        return X
    
    def train_with_cv(self, features_df: pd.DataFrame, labels: np.ndarray, 
                     annotations_df: pd.DataFrame = None):
        """Train with cross-validation"""
        print("\n" + "="*60)
        print("TRAINING WITH CROSS-VALIDATION")
        print("="*60)
        
        X = self.prepare_features(features_df)
        
        if X.shape[0] < 100 or X.shape[1] == 0:
            print("Insufficient data for training")
            return
        
        # Binary classification for simplicity
        y = (labels > 0).astype(int) if labels.ndim > 1 else labels
        
        print(f"Data shape: {X.shape}")
        print(f"Positive rate: {100*np.mean(y):.1f}%")
        
        # Create CV folds
        folds = self.cv_strategy.create_folds(features_df, y)
        print(f"Created {len(folds)} CV folds")
        
        # Track feature importance
        feature_importance_sum = np.zeros(X.shape[1])
        
        # Train on each fold
        for fold_idx, (train_idx, val_idx) in enumerate(folds):
            print(f"\nFold {fold_idx + 1}/{len(folds)}")
            
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            # Check for minimum positive samples
            if np.sum(y_train) < 10 or np.sum(y_val) < 5:
                print(f"  Skipping fold - insufficient positive samples")
                continue
            
            # Scale
            scaler = RobustScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)
            
            # Train LightGBM
            lgb_model = lgb.LGBMClassifier(
                n_estimators=200,
                max_depth=6,
                num_leaves=40,
                learning_rate=0.05,
                min_child_samples=30,
                subsample=0.7,
                colsample_bytree=0.7,
                reg_alpha=0.3,
                reg_lambda=0.3,
                class_weight='balanced' if np.mean(y_train) < 0.3 else None,
                random_state=config.random_state + fold_idx,
                verbosity=-1
            )
            
            lgb_model.fit(
                X_train_scaled, y_train,
                eval_set=[(X_val_scaled, y_val)],
                callbacks=[lgb.early_stopping(config.early_stopping_rounds), 
                          lgb.log_evaluation(0)]
            )
            
            # Track feature importance
            if hasattr(lgb_model, 'feature_importances_'):
                feature_importance_sum += lgb_model.feature_importances_
            
            # Validate
            y_pred_proba = lgb_model.predict_proba(X_val_scaled)[:, 1]
            y_pred = (y_pred_proba > 0.3).astype(int)
            
            # Calculate metrics
            val_f1 = f1_score(y_val, y_pred)
            print(f"  Validation F1: {val_f1:.3f}")
            
            # If we have annotations, calculate competition metric
            if annotations_df is not None:
                val_features = features_df.iloc[val_idx]
                val_predictions = []
                
                for idx, prob in enumerate(y_pred_proba):
                    if prob > 0.3:
                        val_predictions.append({
                            'video_id': val_features.iloc[idx]['video_id'],
                            'agent_id': 'mouse1',
                            'target_id': 'mouse2',
                            'action': 'unknown',
                            'start_frame': int(val_features.iloc[idx]['start_frame']),
                            'stop_frame': int(val_features.iloc[idx]['end_frame'])
                        })
                
                if val_predictions:
                    submission_df = pd.DataFrame(val_predictions)
                    
                    # Get corresponding annotations
                    val_videos = val_features['video_id'].unique()
                    val_annotations = annotations_df[annotations_df['video_id'].isin(val_videos)]
                    
                    if not val_annotations.empty:
                        # Convert to solution format
                        solution_df = MABeMetric.prepare_solution_data(
                            val_annotations, 
                            val_videos[0] if len(val_videos) > 0 else 'unknown'
                        )
                        
                        if not solution_df.empty:
                            competition_score = MABeMetric.calculate_metric(
                                solution_df, submission_df, beta=config.beta
                            )
                            print(f"  Competition F-beta: {competition_score:.3f}")
                            self.cv_scores.append(competition_score)
            
            # Store model
            self.models.append(lgb_model)
            self.scalers.append(scaler)
            
            # Train XGBoost if ensemble enabled
            if config.use_ensemble and config.n_models > 1:
                xgb_model = xgb.XGBClassifier(
                    n_estimators=150,
                    max_depth=5,
                    learning_rate=0.05,
                    subsample=0.7,
                    colsample_bytree=0.7,
                    reg_alpha=0.3,
                    reg_lambda=0.3,
                    random_state=config.random_state + fold_idx + 100,
                    use_label_encoder=False,
                    eval_metric='logloss'
                )
                
                xgb_model.fit(
                    X_train_scaled, y_train,
                    eval_set=[(X_val_scaled, y_val)],
                    early_stopping_rounds=config.early_stopping_rounds,
                    verbose=False
                )
                
                self.models.append(xgb_model)
        
        # Store average feature importance
        if feature_importance_sum.any():
            self.feature_importance = feature_importance_sum / len(folds)
            
            # Print top features
            top_features_idx = np.argsort(self.feature_importance)[-10:][::-1]
            print("\nTop 10 important features:")
            for idx in top_features_idx:
                if idx < len(self.feature_columns):
                    print(f"  {self.feature_columns[idx]}: {self.feature_importance[idx]:.2f}")
        
        # Print CV summary
        if self.cv_scores:
            print(f"\nCV Summary:")
            print(f"  Mean F-beta: {np.mean(self.cv_scores):.3f} (+/- {np.std(self.cv_scores):.3f})")
            print(f"  Min: {np.min(self.cv_scores):.3f}, Max: {np.max(self.cv_scores):.3f}")
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict using ensemble"""
        if not self.models:
            return np.zeros(X.shape[0])
        
        predictions = []
        
        for i, model in enumerate(self.models):
            scaler_idx = min(i // (config.n_models if config.use_ensemble else 1), len(self.scalers) - 1)
            X_scaled = self.scalers[scaler_idx].transform(X)
            
            if hasattr(model, 'predict_proba'):
                pred = model.predict_proba(X_scaled)[:, 1]
            else:
                pred = model.predict(X_scaled)
            
            predictions.append(pred)
        
        # Average predictions
        return np.mean(predictions, axis=0)

# ============================================================================
# MAIN PIPELINE WITH CV
# ============================================================================

class MABePipelineCV:
    """Main pipeline with proper CV"""
    
    def __init__(self):
        """Create the window feature extractor and the CV detector from ``config``."""
        extractor = FeatureExtractor(stride=config.window_stride,
                                     window_size=config.window_size)
        self.feature_extractor = extractor
        self.detector = BehaviorDetectorCV()
        # Holder for annotation frames; starts empty
        self.all_annotations = list()
    
    def load_and_process_file(self, tracking_path: Path, annotation_path: Path = None) -> Tuple[pd.DataFrame, np.ndarray, pd.DataFrame]:
        """Load and process a single file"""
        try:
            # Load tracking
            data = pd.read_parquet(tracking_path)
            video_id = tracking_path.stem
            
            # Extract features
            features = self.feature_extractor.extract_windows(
                data, video_id, sample_rate=config.sample_rate
            )
            
            if features.empty:
                return pd.DataFrame(), np.array([]), pd.DataFrame()
            
            # Load annotations if available
            annotations = pd.DataFrame()
            labels = np.zeros(len(features))
            
            if annotation_path and annotation_path.exists():
                annotations = pd.read_parquet(annotation_path)
                
                # Map column names
                col_mapping = {
                    'start': 'start_frame',
                    'end': 'end_frame',
                    'stop': 'end_frame',
                    'stop_frame': 'end_frame'
                }
                
                for old_col, new_col in col_mapping.items():
                    if old_col in annotations.columns and new_col not in annotations.columns:
                        annotations[new_col] = annotations[old_col]
                
                # Create labels
                if 'start_frame' in annotations.columns and 'end_frame' in annotations.columns:
                    for idx, window in features.iterrows():
                        overlaps = ((annotations['start_frame'] <= window['end_frame']) & 
                                  (annotations['end_frame'] >= window['start_frame']))
                        labels[idx] = 1 if overlaps.any() else 0
                    
                    # Add video_id to annotations
                    annotations['video_id'] = video_id
            
            return features, labels, annotations
            
        except Exception as e:
            print(f"Error processing {tracking_path.name}: {e}")
            return pd.DataFrame(), np.array([]), pd.DataFrame()
    
    def process_training_data(self):
        """Collect features, labels and annotations for the training videos.

        Tracking files are discovered under ``<data_path>/train_tracking/<lab>/``
        and, when there are more than ``config.max_train_videos``, subsampled
        so that labs are represented as evenly as possible. The matching
        annotation file is looked up under ``train_annotation`` with the same
        lab directory and file stem.

        Returns:
            ``(features_df, labels, annotations_df)``, or
            ``(None, None, None)`` when no file produced any window.
        """
        print("\n" + "="*60)
        print("PROCESSING TRAINING DATA FOR CV")
        print("="*60)

        # Find files
        tracking_files = []
        train_path = config.data_path / 'train_tracking'

        if train_path.exists():
            for lab_dir in train_path.iterdir():
                if lab_dir.is_dir():
                    tracking_files.extend(lab_dir.glob('*.parquet'))

        # Directory listing order is filesystem-dependent; sort for stable runs.
        tracking_files.sort()

        # Sample files
        if config.max_train_videos and len(tracking_files) > config.max_train_videos:
            tracking_files = self._sample_files_by_lab(tracking_files, config.max_train_videos)

        print(f"Processing {len(tracking_files)} files")

        all_features = []
        all_labels = []
        all_annotations = []

        # Process files
        for i in range(0, len(tracking_files), config.batch_size):
            batch = tracking_files[i:i+config.batch_size]
            print(f"\nBatch {i//config.batch_size + 1}/{(len(tracking_files)-1)//config.batch_size + 1}")

            for file_path in tqdm(batch, desc="Processing"):
                # Get annotation path
                ann_path = config.data_path / 'train_annotation' / file_path.parent.name / f"{file_path.stem}.parquet"

                # Process file
                features, labels, annotations = self.load_and_process_file(file_path, ann_path)

                if not features.empty and len(labels) > 0:
                    all_features.append(features)
                    all_labels.append(labels)
                    if not annotations.empty:
                        all_annotations.append(annotations)

            gc.collect()

        if not all_features:
            return None, None, None

        # Combine
        features_df = pd.concat(all_features, ignore_index=True)
        labels = np.concatenate(all_labels)
        annotations_df = pd.concat(all_annotations, ignore_index=True) if all_annotations else pd.DataFrame()

        print(f"\nTotal samples: {len(features_df)}")
        print(f"Positive rate: {100*np.mean(labels):.1f}%")
        print(f"Number of videos: {features_df['video_id'].nunique()}")

        return features_df, labels, annotations_df

    @staticmethod
    def _sample_files_by_lab(tracking_files: List[Path], max_files: int) -> List[Path]:
        """Pick at most ``max_files`` files, spread as evenly as possible over labs.

        Each lab (the parent directory name) contributes up to an equal share,
        but at least one file. Quota left unused by small labs is filled from
        the remaining files of the other labs.
        """
        lab_files = defaultdict(list)
        for file_path in tracking_files:
            lab_files[file_path.parent.name].append(file_path)

        # Integer division would give 0 when there are more labs than slots.
        files_per_lab = max(1, max_files // len(lab_files))

        sampled = []
        leftovers = []
        for files in lab_files.values():
            shuffled = random.sample(files, len(files))
            sampled.extend(shuffled[:files_per_lab])
            leftovers.extend(shuffled[files_per_lab:])

        if len(sampled) > max_files:
            # More labs than slots: keep a random subset of the one-per-lab picks.
            sampled = random.sample(sampled, max_files)
        elif len(sampled) < max_files and leftovers:
            random.shuffle(leftovers)
            sampled.extend(leftovers[:max_files - len(sampled)])

        return sampled
    
    def train_model(self, features_df: pd.DataFrame, labels: np.ndarray, annotations_df: pd.DataFrame = None):
        """Train model with CV"""
        self.detector.train_with_cv(features_df, labels, annotations_df)
    
    def generate_predictions(self) -> pd.DataFrame:
        """Score every test video and collect the windows above the threshold.

        Test files are looked up under ``<data_path>/test_tracking/<lab>/``.
        Each window whose averaged model score exceeds the ``'global'`` entry
        of ``config.confidence_thresholds`` (0.3 when that entry is absent)
        becomes one prediction row with fixed agent/target ids and the action
        ``'sniff'``.

        Returns:
            The prediction rows, or a single placeholder row when no window
            passed the threshold.
        """
        print("\n" + "="*60)
        print("GENERATING PREDICTIONS")
        print("="*60)

        test_root = config.data_path / 'test_tracking'
        test_files = []

        if test_root.exists():
            lab_dirs = [entry for entry in test_root.iterdir() if entry.is_dir()]
            for lab_dir in lab_dirs:
                test_files += list(lab_dir.glob('*.parquet'))

        print(f"Found {len(test_files)} test files")

        predictions = []

        for test_file in tqdm(test_files, desc="Predicting"):
            features, _, _ = self.load_and_process_file(test_file)
            if features.empty:
                continue

            # Feature matrix -> averaged ensemble score per window
            proba = self.detector.predict(self.detector.prepare_features(features))

            # Use optimized threshold or default
            threshold = config.confidence_thresholds.get('global', 0.3)

            scored_windows = zip(
                features['video_id'], features['start_frame'], features['end_frame'], proba
            )
            for video_id, start_frame, end_frame, prob in scored_windows:
                if not prob > threshold:
                    continue
                predictions.append({
                    'video_id': video_id,
                    'agent_id': 'mouse1',
                    'target_id': 'mouse2',
                    'action': 'sniff',  # Default action
                    'start_frame': int(start_frame),
                    'stop_frame': int(end_frame)
                })

        if not predictions:
            # Minimal valid submission
            return pd.DataFrame({
                'video_id': ['test_video'],
                'agent_id': ['mouse1'],
                'target_id': ['mouse2'],
                'action': ['grooming'],
                'start_frame': [0],
                'stop_frame': [30]
            })

        return pd.DataFrame(predictions)
    
    def create_submission(self, predictions: pd.DataFrame) -> pd.DataFrame:
        """Create submission file"""
        predictions['row_id'] = range(len(predictions))
        submission = predictions[['row_id', 'video_id', 'agent_id', 'target_id', 
                                 'action', 'start_frame', 'stop_frame']]
        
        submission.to_csv('submission.csv', index=False)
        print(f"\nCreated submission.csv with {len(submission)} predictions")
        return submission

# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Run the pipeline end to end: load data, train with CV, predict, submit.

    The random number generators of ``random`` and ``numpy`` are seeded from
    ``config.random_state`` first, so that file and window sampling are
    repeatable between runs.

    Returns:
        The submission DataFrame, or None when 500 or fewer training windows
        were produced.
    """
    import time
    start_time = time.time()

    # File and window sampling use these generators.
    random.seed(config.random_state)
    np.random.seed(config.random_state)

    print("\n" + "="*80)
    print("MABe CHALLENGE 2025 - WITH PROPER CV")
    print("="*80)

    print(f"\nConfiguration:")
    print(f"  CV Strategy: {config.cv_strategy}")
    print(f"  CV Folds: {config.n_folds}")
    print(f"  Max training videos: {config.max_train_videos}")
    print(f"  Ensemble models: {config.n_models}")

    # Initialize pipeline
    pipeline = MABePipelineCV()

    # Process training data
    features_df, labels, annotations_df = pipeline.process_training_data()

    if features_df is None or len(features_df) <= 500:
        print("\nInsufficient training data!")
        return None

    # Train with CV
    pipeline.train_model(features_df, labels, annotations_df)

    # Print CV results
    cv_scores = pipeline.detector.cv_scores
    if cv_scores:
        cv_mean = np.mean(cv_scores)
        cv_std = np.std(cv_scores)
        print("\n" + "="*60)
        print("CROSS-VALIDATION RESULTS")
        print("="*60)
        print(f"Mean CV F-beta: {cv_mean:.4f}")
        print(f"Std CV F-beta: {cv_std:.4f}")
        print(f"Individual fold scores: {cv_scores}")
        print("\nExpected LB correlation:")
        print(f"  Optimistic (mean + std): {cv_mean + cv_std:.4f}")
        print(f"  Expected (mean): {cv_mean:.4f}")
        print(f"  Conservative (mean - std): {cv_mean - cv_std:.4f}")

    # Save model
    with open('cv_model.pkl', 'wb') as f:
        pickle.dump(pipeline.detector, f)
    print("\nModel saved to cv_model.pkl")

    # Generate predictions and write the submission file
    predictions = pipeline.generate_predictions()
    submission = pipeline.create_submission(predictions)

    # Report runtime
    elapsed = time.time() - start_time
    print(f"\nTotal runtime: {elapsed/3600:.2f} hours")

    print("\n" + "="*80)
    print("PIPELINE COMPLETE!")
    print("="*80)

    return submission

# Script entry point: run the pipeline and report the size of the submission.
if __name__ == "__main__":
    submission = main()
    if submission is not None:
        # The check mark was stored as mis-decoded UTF-8 ("âœ“"); restored here.
        print("\n✓ Execution completed successfully!")
        print(f"Submission shape: {submission.shape}")