In [7]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb




def inspect_data_structure(train_path, demo_path):
    """
    Inspect the structure of both sensor and demographic data
    """
    print("=== DATA STRUCTURE INSPECTION ===")
    
    # Load and inspect sensor data
    print("\n1. SENSOR DATA STRUCTURE:")
    try:
        train_df = pd.read_csv(train_path)
        print(f"Shape: {train_df.shape}")
        print(f"Columns: {list(train_df.columns)}")
        print(f"First few rows:")
        print(train_df.head())
        
        # Check for subject-related columns
        subject_cols = [col for col in train_df.columns if 'subject' in col.lower()]
        print(f"Subject-related columns: {subject_cols}")
        
    except Exception as e:
        print(f"Error loading sensor data: {e}")
    
    # Load and inspect demographic data
    print("\n2. DEMOGRAPHIC DATA STRUCTURE:")
    try:
        demo_df = pd.read_csv(demo_path)
        print(f"Shape: {demo_df.shape}")
        print(f"Columns: {list(demo_df.columns)}")
        print(f"First few rows:")
        print(demo_df.head())
        
    except Exception as e:
        print(f"Error loading demographic data: {e}")

def fix_subject_column_mapping(df, demographic_data):
    """
    Fix subject column mapping issues
    """
    print("\n=== FIXING SUBJECT COLUMN MAPPING ===")
    
    # Find potential subject columns
    potential_subject_cols = []
    for col in df.columns:
        if any(keyword in col.lower() for keyword in ['subject', 'user', 'participant', 'id']):
            potential_subject_cols.append(col)
    
    print(f"Potential subject columns: {potential_subject_cols}")
    
    if not potential_subject_cols:
        print("No obvious subject column found. Checking data patterns...")
        
        # Look for columns with values that match demographic subjects
        demo_subjects = set(demographic_data.index)
        print(f"Demographic subjects sample: {list(demo_subjects)[:10]}")
        
        for col in df.columns:
            if df[col].dtype in ['int64', 'object']:
                col_values = set(df[col].unique())
                overlap = len(col_values.intersection(demo_subjects))
                if overlap > 0:
                    print(f"Column '{col}' has {overlap} matching values with demographics")
    
    return potential_subject_cols

class EnhancedBFRBDetectionPipeline:
    """
    Enhanced version with better subject column and label handling
    """
    
    def __init__(self, config=None):
        self.config = config or self._get_default_config()
        self.cnn_model = None
        self.binary_classifier = None
        self.multiclass_classifier = None
        self.scalers = {}
        self.label_encoders = {}
        self.feature_cache = {}
        self.demographic_data = None
        self.subject_column = None
        
    def _get_default_config(self):
        """Optimized configuration with demographic support"""
        return {
            'sequence_length': 256,
            'cnn_filters': [32, 64, 128],
            'cnn_kernel_size': 5,
            'cnn_feature_dim': 64,
            'xgb_params': {
                'n_estimators': 150,
                'max_depth': 5,
                'learning_rate': 0.15,
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'random_state': 42,
                'n_jobs': -1,
                'tree_method': 'hist'
            },
            'cv_folds': 5,
            'batch_size': 64,
            'epochs': 30,
            'early_stopping_patience': 5,
            'use_demographics': True
        }
    
    def load_demographic_data(self, train_path, test_path=None):
        """
        Enhanced demographic data loading with better error handling
        """
        print("Loading demographic data...")
        
        try:
            # Load train demographics
            train_demo = pd.read_csv(train_path)
            print(f"Train demographics shape: {train_demo.shape}")
            print(f"Train demographics columns: {list(train_demo.columns)}")
            
            # Load test demographics if provided
            if test_path:
                test_demo = pd.read_csv(test_path)
                print(f"Test demographics shape: {test_demo.shape}")
                demo_data = pd.concat([train_demo, test_demo], axis=0)
            else:
                demo_data = train_demo.copy()
            
            # Identify subject column in demographics
            if 'subject' in demo_data.columns:
                subject_col = 'subject'
            elif 'subject_id' in demo_data.columns:
                subject_col = 'subject_id'
            else:
                # Try to find subject column
                potential_cols = [col for col in demo_data.columns if 'subject' in col.lower()]
                if potential_cols:
                    subject_col = potential_cols[0]
                    print(f"Using '{subject_col}' as subject column")
                else:
                    raise ValueError("No subject column found in demographic data")
            
            # Convert categorical variables
            categorical_cols = ['adult_child', 'sex', 'handedness']
            for col in categorical_cols:
                if col in demo_data.columns:
                    demo_data[col] = demo_data[col].astype('category')
            
            # Normalize continuous variables
            continuous_cols = ['age', 'height_cm', 'shoulder_to_wrist_cm', 'elbow_to_wrist_cm']
            existing_continuous = [col for col in continuous_cols if col in demo_data.columns]
            
            if existing_continuous:
                demo_data[existing_continuous] = demo_data[existing_continuous].apply(
                    lambda x: (x - x.mean()) / x.std()
                )
            
            # Set subject as index
            self.demographic_data = demo_data.set_index(subject_col)
            print(f"Loaded demographic data for {len(self.demographic_data)} subjects")
            
        except Exception as e:
            print(f"Error loading demographic data: {e}")
            self.demographic_data = None
    
    def auto_detect_subject_column(self, df):
        """
        Automatically detect the subject column in the sensor data
        """
        if self.demographic_data is None:
            return None
        
        demo_subjects = set(self.demographic_data.index)
        
        # Check common subject column names first
        common_names = ['subject_id', 'subject', 'user_id', 'participant_id', 'id']
        
        for col_name in common_names:
            if col_name in df.columns:
                col_values = set(df[col_name].unique())
                overlap = len(col_values.intersection(demo_subjects))
                if overlap > 0:
                    print(f"Found subject column: '{col_name}' (matches {overlap} demographic subjects)")
                    return col_name
        
        # If no common names work, check all columns
        for col in df.columns:
            if df[col].dtype in ['int64', 'object', 'string']:
                try:
                    col_values = set(df[col].unique())
                    overlap = len(col_values.intersection(demo_subjects))
                    if overlap > len(demo_subjects) * 0.5:  # At least 50% overlap
                        print(f"Detected subject column: '{col}' (matches {overlap} demographic subjects)")
                        return col
                except:
                    continue
        
        print("Could not automatically detect subject column")
        return None
    
    def _add_demographic_features(self, df, subject_col=None):
        """
        Enhanced demographic feature addition with auto-detection
        """
        if not self.config['use_demographics'] or self.demographic_data is None:
            return df
        
        # Auto-detect subject column if not provided
        if subject_col is None:
            if self.subject_column is None:
                self.subject_column = self.auto_detect_subject_column(df)
            subject_col = self.subject_column
        
        if subject_col is None or subject_col not in df.columns:
            print(f"Warning: Subject column not found. Available columns: {list(df.columns)}")
            return df
        
        print(f"Merging demographic data using column: '{subject_col}'")
        
        try:
            # Merge demographic data
            merged = df.merge(
                self.demographic_data,
                left_on=subject_col,
                right_index=True,
                how='left'
            )
            
            # Check merge success
            demo_cols = self.demographic_data.columns
            merged_demo_cols = [col for col in demo_cols if col in merged.columns]
            
            if merged_demo_cols:
                print(f"Successfully merged {len(merged_demo_cols)} demographic features")
                
                # Fill missing demographics with mode/median
                for col in merged_demo_cols:
                    missing_count = merged[col].isnull().sum()
                    if missing_count > 0:
                        print(f"Filling {missing_count} missing values in '{col}'")
                        if merged[col].dtype == 'object' or merged[col].dtype.name == 'category':
                            mode_val = merged[col].mode()
                            if len(mode_val) > 0:
                                merged[col] = merged[col].fillna(mode_val[0])
                        else:
                            merged[col] = merged[col].fillna(merged[col].median())
            else:
                print("Warning: No demographic features were successfully merged")
            
            return merged
        except Exception as e:
            print(f"Error merging demographic data: {e}")
            return df
    
    def preprocess_data(self, df, is_training=True):
        """
        Enhanced preprocessing with better subject column handling
        Returns the processed dataframe
        """
        print("Stage 1: Data Preprocessing...")
        print(f"Input data shape: {df.shape}")
        print(f"Input columns: {list(df.columns)}")
        
        # Add demographic features if available
        df = self._add_demographic_features(df)
        
        # Identify sensor columns
        imu_cols = [col for col in df.columns if 'imu' in col.lower() or 'acc_' in col.lower() or 'rot_' in col.lower()]
        temp_cols = [col for col in df.columns if 'temp' in col.lower() or 'thermopile' in col.lower() or 'thm_' in col.lower()]
        tof_cols = [col for col in df.columns if 'tof' in col.lower() or 'distance' in col.lower()]
        
        print(f"Found {len(imu_cols)} IMU columns, {len(temp_cols)} temperature columns, {len(tof_cols)} ToF columns")
        
        processed_df = df.copy()
        
        # IMU preprocessing
        for col in imu_cols:
            if col in processed_df.columns:
                processed_df[col] = processed_df[col].fillna(method='ffill').fillna(method='bfill')
        
        # Temperature preprocessing
        for col in temp_cols:
            if col in processed_df.columns:
                median_val = processed_df[col].median()
                processed_df[col] = processed_df[col].fillna(median_val)
        
        # Time-of-Flight preprocessing
        for col in tof_cols:
            if col in processed_df.columns:
                processed_df[col] = processed_df[col].replace(-1, np.nan)
                median_val = processed_df[col].median()
                processed_df[col] = processed_df[col].fillna(median_val)
        
        # Remove outliers (only for numeric columns, excluding ID columns)
        numeric_cols = processed_df.select_dtypes(include=[np.number]).columns
        id_cols = [col for col in numeric_cols if any(keyword in col.lower() 
                  for keyword in ['subject', 'session', 'gesture', 'id'])]
        
        for col in numeric_cols:
            if col not in id_cols:
                Q1 = processed_df[col].quantile(0.25)
                Q3 = processed_df[col].quantile(0.75)
                IQR = Q3 - Q1
                if IQR > 0:  # Avoid division by zero
                    lower_bound = Q1 - 1.5 * IQR
                    upper_bound = Q3 + 1.5 * IQR
                    processed_df[col] = processed_df[col].clip(lower=lower_bound, upper=upper_bound)
        
        return processed_df
    
    def _extract_sequence_features(self, sequence):
        """
        Extract comprehensive features from a sequence
        """
        features = []
        
        # Statistical features
        features.extend([
            np.mean(sequence, axis=0).flatten(),
            np.std(sequence, axis=0).flatten(),
            np.min(sequence, axis=0).flatten(),
            np.max(sequence, axis=0).flatten(),
            np.median(sequence, axis=0).flatten()
        ])
        
        # Flatten all features
        features = np.concatenate([f.flatten() if hasattr(f, 'flatten') else [f] for f in features])
        
        return features.tolist()
    
    def extract_features(self, df, sequence_col=None, label_col=None):
        """
        Enhanced feature extraction that handles both sequence and non-sequence data
        """
        print("Stage 2: Feature Engineering...")
        
        features = []
        labels = []
        subjects = []
        
        # If no sequence column specified, use all numeric features
        if sequence_col is None or sequence_col not in df.columns:
            print("No sequence column found - using all numeric features")
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            exclude_cols = ['id', 'subject_id', 'session_id', 'gesture_id']
            feature_cols = [col for col in numeric_cols if col not in exclude_cols]
            
            for idx, row in df.iterrows():
                try:
                    feature_vector = [row[col] for col in feature_cols]
                    
                    # Add demographic features if available
                    if self.subject_column and self.subject_column in row and self.demographic_data is not None:
                        subject = row[self.subject_column]
                        if subject in self.demographic_data.index:
                            demo_features = self.demographic_data.loc[subject].values
                            feature_vector.extend(demo_features)
                    
                    features.append(feature_vector)
                    
                    if label_col and label_col in row:
                        labels.append(row[label_col])
                    
                    if self.subject_column and self.subject_column in row:
                        subjects.append(row[self.subject_column])
                        
                except Exception as e:
                    print(f"Error processing row {idx}: {e}")
                    continue
        else:
            # Original sequence handling code
            for idx, row in df.iterrows():
                try:
                    sequence = np.array(row[sequence_col])
                    if len(sequence.shape) == 1:
                        sequence = sequence.reshape(-1, 1)
                    
                    # Pad/truncate sequence
                    seq_len = self.config['sequence_length']
                    if sequence.shape[0] > seq_len:
                        sequence = sequence[:seq_len]
                    elif sequence.shape[0] < seq_len:
                        padding = np.zeros((seq_len - sequence.shape[0], sequence.shape[1]))
                        sequence = np.vstack([sequence, padding])
                    
                    feature_vector = self._extract_sequence_features(sequence)
                    
                    # Add demographic features
                    if self.subject_column and self.subject_column in row and self.demographic_data is not None:
                        subject = row[self.subject_column]
                        if subject in self.demographic_data.index:
                            demo_features = self.demographic_data.loc[subject].values
                            feature_vector.extend(demo_features)
                    
                    features.append(feature_vector)
                    
                    if label_col and label_col in row:
                        labels.append(row[label_col])
                    
                    if self.subject_column and self.subject_column in row:
                        subjects.append(row[self.subject_column])
                        
                except Exception as e:
                    print(f"Error processing row {idx}: {e}")
                    continue
        
        features_df = pd.DataFrame(features)
        result = {'features': features_df}
        
        if labels:
            result['labels'] = np.array(labels)
        if subjects:
            result['subjects'] = np.array(subjects)
            
        print(f"Feature extraction complete. Features shape: {features_df.shape}")
        return result
    
    def train(self, X, y, subjects=None):
        """
        Train the complete pipeline with proper label encoding
        """
        print("Stage 3: Model Training...")
        
        # Prepare data
        X = np.array(X)
        y = np.array(y)
        
        # Scale features
        self.scalers['features'] = StandardScaler()
        X_scaled = self.scalers['features'].fit_transform(X)
        
        # First encode all labels (convert strings to numeric)
        self.label_encoders['main'] = LabelEncoder()
        y_encoded = self.label_encoders['main'].fit_transform(y)
        
        # Create binary labels (1 for any BFRB, 0 for no BFRB)
        # Assuming class 0 is "no BFRB" - adjust if needed
        self.label_encoders['binary'] = LabelEncoder()
        y_binary = (y_encoded > 0).astype(int)  # Binary: BFRB vs no BFRB
        
        # Train binary classifier
        print("Training binary classifier...")
        self.binary_classifier = xgb.XGBClassifier(**self.config['xgb_params'])
        self.binary_classifier.fit(X_scaled, y_binary)
        
        # Train multiclass classifier if needed
        unique_classes = len(np.unique(y_encoded))
        if unique_classes > 2:
            print(f"Training multiclass classifier for {unique_classes} classes...")
            self.multiclass_classifier = xgb.XGBClassifier(**self.config['xgb_params'])
            self.multiclass_classifier.fit(X_scaled, y_encoded)
        
        # Evaluate
        binary_pred = self.binary_classifier.predict(X_scaled)
        binary_f1 = f1_score(y_binary, binary_pred, average='weighted')
        
        results = {
            'binary_f1': binary_f1,
            'feature_importance': self.binary_classifier.feature_importances_
        }
        
        if hasattr(self, 'multiclass_classifier'):
            multi_pred = self.multiclass_classifier.predict(X_scaled)
            multi_f1 = f1_score(y_encoded, multi_pred, average='weighted')
            results['multiclass_f1'] = multi_f1
        
        print(f"Training complete. Binary F1: {binary_f1:.4f}")
        return results
    
    def predict(self, X):
        """
        Make predictions on new data
        """
        print("Making predictions...")
        
        X = np.array(X)
        X_scaled = self.scalers['features'].transform(X)
        
        # Get binary predictions first
        binary_pred = self.binary_classifier.predict(X_scaled)
        
        if hasattr(self, 'multiclass_classifier'):
            # Get multiclass predictions
            multi_pred = self.multiclass_classifier.predict(X_scaled)
            # Convert back to original labels
            multi_pred = self.label_encoders['main'].inverse_transform(multi_pred)
            # Use multiclass predictions where binary predicts BFRB
            final_pred = np.where(binary_pred == 1, multi_pred, self.label_encoders['main'].classes_[0])
        else:
            final_pred = self.label_encoders['main'].inverse_transform(binary_pred)
        
        return final_pred


def debug_and_run_pipeline():
    """
    Debug version of the pipeline execution with both train and test data
    """
    print("=== BFRB PIPELINE DEBUGGING ===")
    
    # File paths (update these to your actual paths)
    train_sensor_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv'
    test_sensor_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv'
    train_demo_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/train_demographics.csv'
    test_demo_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/test_demographics.csv'
    
    # Step 1: Inspect data structure for both train and test
    try:
        print("Inspecting TRAIN sensor data:")
        inspect_data_structure(train_sensor_path, train_demo_path)
        
        print("\nInspecting TEST sensor data:")
        inspect_data_structure(test_sensor_path, test_demo_path)
        
    except Exception as e:
        print(f"Could not inspect data structure: {e}")
    
    # Step 2: Initialize enhanced pipeline
    pipeline = EnhancedBFRBDetectionPipeline()
    
    # Step 3: Load demographic data (both train and test)
    pipeline.load_demographic_data(train_demo_path, test_demo_path)
    
    # Step 4: Load and process sensor data
    try:
        print("\n=== LOADING TRAIN SENSOR DATA ===")
        train_df = pd.read_csv(train_sensor_path)
        print(f"Train sensor data shape: {train_df.shape}")
        
        print("\n=== LOADING TEST SENSOR DATA ===")
        test_df = pd.read_csv(test_sensor_path)
        print(f"Test sensor data shape: {test_df.shape}")
        
        # Step 5: Fix subject column mapping (check both datasets)
        if pipeline.demographic_data is not None:
            print("\nChecking subject column mapping for TRAIN data:")
            train_potential_cols = fix_subject_column_mapping(train_df, pipeline.demographic_data)
            
            print("\nChecking subject column mapping for TEST data:")
            test_potential_cols = fix_subject_column_mapping(test_df, pipeline.demographic_data)
        
        # Step 6: Preprocess both datasets
        print("\n=== PREPROCESSING TRAIN DATA ===")
        train_processed = pipeline.preprocess_data(train_df, is_training=True)
        
        print("\n=== PREPROCESSING TEST DATA ===")
        test_processed = pipeline.preprocess_data(test_df, is_training=False)
        
        print(f"\nFinal train processed data shape: {train_processed.shape}")
        print(f"Final test processed data shape: {test_processed.shape}")
        print("Pipeline setup complete!")
        
        return pipeline, train_processed, test_processed
        
    except Exception as e:
        print(f"Error in pipeline execution: {e}")
        return pipeline, None, None

def complete_end_to_end_pipeline():
    """
    Complete end-to-end pipeline from data loading to Kaggle submission
    """
    print("🚀 STARTING COMPLETE BFRB DETECTION PIPELINE")
    print("=" * 60)
    
    # File paths 
    train_sensor_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv'
    test_sensor_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv'
    train_demo_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/train_demographics.csv'
    test_demo_path = '/kaggle/input/cmi-detect-behavior-with-sensor-data/test_demographics.csv'
    
    try:
        # STEP 1: Initialize pipeline
        print("\n📋 STEP 1: Initializing Pipeline...")
        pipeline = EnhancedBFRBDetectionPipeline()
        
        # STEP 2: Load demographic data
        print("\n👥 STEP 2: Loading Demographic Data...")
        pipeline.load_demographic_data(train_demo_path, test_demo_path)
        
        # STEP 3: Load and preprocess sensor data
        print("\n📊 STEP 3: Loading and Preprocessing Sensor Data...")
        train_df = pd.read_csv(train_sensor_path)
        test_df = pd.read_csv(test_sensor_path)
        
        print(f"Original train data: {train_df.shape}")
        print(f"Original test data: {test_df.shape}")
        
        train_processed = pipeline.preprocess_data(train_df, is_training=True)
        test_processed = pipeline.preprocess_data(test_df, is_training=False)
        
        # STEP 4: Extract features
        print("\n🔧 STEP 4: Feature Engineering...")
        
        # Determine the correct label column - use 'behavior' as primary
        label_col = 'behavior' if 'behavior' in train_processed.columns else None
        if label_col is None:
            for col in ['label', 'target', 'y', 'class']:
                if col in train_processed.columns:
                    label_col = col
                    break

        if label_col is None:
            print("❌ No label column found in training data!")
            return None
        
        # No sequence column - using all numeric features
        sequence_col = None
        
        print(f"Using label column: {label_col}")
        print(f"Available columns: {list(train_processed.columns)}")
        
        # Extract features from training data
        train_data = pipeline.extract_features(train_processed, sequence_col, label_col)
        
        # Extract features from test data
        test_data = pipeline.extract_features(test_processed, sequence_col)
        
        # STEP 5: Train the model
        print("\n🎯 STEP 5: Training Model...")
        if 'labels' in train_data:
            results = pipeline.train(
                train_data['features'],
                train_data['labels'],
                train_data.get('subjects', None)
            )
            
            print(f"✅ Training Results:")
            for key, value in results.items():
                if key != 'feature_importance':
                    print(f"   {key}: {value}")
        else:
            print("❌ No labels found in training data!")
            return None
        
        # STEP 6: Make predictions
        print("\n🔮 STEP 6: Making Predictions...")
        predictions = pipeline.predict(test_data['features'])
        
        # STEP 7: Create submission
        print("\n📤 STEP 7: Creating Submission File...")
        
        # Find ID column in test data - use 'row_id' as primary
        id_col = 'row_id' if 'row_id' in test_df.columns else None
        if id_col is None:
            for col in ['id', 'ID', 'test_id', 'sample_id']:
                if col in test_df.columns:
                    id_col = col
                    break
        
        if id_col:
            submission = pd.DataFrame({
                id_col: test_df[id_col],
                'prediction': predictions
            })
            submission.to_csv('bfrb_submission.csv', index=False)
            print(f"✅ Submission saved as 'bfrb_submission.csv'")
            print(f"   Shape: {submission.shape}")
            if isinstance(predictions[0], str):
                unique_preds, counts = np.unique(predictions, return_counts=True)
                print(f"Prediction distribution:\n{dict(zip(unique_preds, counts))}")
            else:
                print(f"Prediction distribution: {np.bincount(predictions)}")
            
            # Save models and scaler
            print("\n💾 Saving trained models and scaler...")
            pipeline.binary_classifier.save_model('binary_model.json')
            pipeline.multiclass_classifier.save_model('multiclass_model.json')
            joblib.dump(pipeline.scalers['features'], 'scaler.pkl')
            print("✅ Models and scaler saved successfully")
            
        else:
            print("❌ No ID column found for submission")
            
        print("\n🎉 PIPELINE COMPLETED SUCCESSFULLY!")
        print("=" * 60)
        
        return pipeline, results, predictions, submission if id_col else None
        
    except Exception as e:
        print(f"\n❌ ERROR in pipeline: {e}")
        import traceback
        traceback.print_exc()
        return None        




def quick_data_exploration():
    """
    Quick exploration of the data structure to understand what we're working with
    """
    print("🔍 QUICK DATA EXPLORATION")
    print("=" * 40)
    
    try:
        # Load data
        train_df = pd.read_csv('/kaggle/input/cmi-detect-behavior-with-sensor-data/train.csv')
        test_df = pd.read_csv('/kaggle/input/cmi-detect-behavior-with-sensor-data/test.csv')
        train_demo = pd.read_csv('/kaggle/input/cmi-detect-behavior-with-sensor-data/train_demographics.csv')
        
        print(f"\n📊 TRAIN DATA:")
        print(f"   Shape: {train_df.shape}")
        print(f"   Columns: {list(train_df.columns)}")
        print(f"   Sample data:")
        print(train_df.head(2))
        
        print(f"\n📊 TEST DATA:")
        print(f"   Shape: {test_df.shape}")
        print(f"   Columns: {list(test_df.columns)}")
        
        print(f"\n👥 DEMOGRAPHIC DATA:")
        print(f"   Shape: {train_demo.shape}")
        print(f"   Columns: {list(train_demo.columns)}")
        print(f"   Sample data:")
        print(train_demo.head(2))
        
        return train_df, test_df, train_demo
        
    except Exception as e:
        print(f"Error in data exploration: {e}")
        return None, None, None


if __name__ == "__main__":
    # OPTION 1: Quick data exploration first (recommended)
    print("🔍 Starting with data exploration...")
    train_df, test_df, demo_df = quick_data_exploration()
    
    if train_df is not None:
        print("\n" + "="*60)
        # OPTION 2: Run the complete end-to-end pipeline
        result = complete_end_to_end_pipeline()
        
        if result:
            pipeline, training_results, predictions, submission = result
            print(f"\n✅ All done! Check your 'bfrb_submission.csv' file")
        else:
            print("\n❌ Pipeline failed. Check the errors above.")
    else:
        print("\n❌ Could not load data. Check your file paths.")
    
    # OPTION 3: If you want to debug step by step, uncomment this:
    # pipeline, train_processed, test_processed = debug_and_run_pipeline()

🔍 Starting with data exploration...
🔍 QUICK DATA EXPLORATION

📊 TRAIN DATA:
   Shape: (574945, 341)
   Columns: ['row_id', 'sequence_type', 'sequence_id', 'sequence_counter', 'subject', 'orientation', 'behavior', 'phase', 'gesture', 'acc_x', 'acc_y', 'acc_z', 'rot_w', 'rot_x', 'rot_y', 'rot_z', 'thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5', 'tof_1_v0', 'tof_1_v1', 'tof_1_v2', 'tof_1_v3', 'tof_1_v4', 'tof_1_v5', 'tof_1_v6', 'tof_1_v7', 'tof_1_v8', 'tof_1_v9', 'tof_1_v10', 'tof_1_v11', 'tof_1_v12', 'tof_1_v13', 'tof_1_v14', 'tof_1_v15', 'tof_1_v16', 'tof_1_v17', 'tof_1_v18', 'tof_1_v19', 'tof_1_v20', 'tof_1_v21', 'tof_1_v22', 'tof_1_v23', 'tof_1_v24', 'tof_1_v25', 'tof_1_v26', 'tof_1_v27', 'tof_1_v28', 'tof_1_v29', 'tof_1_v30', 'tof_1_v31', 'tof_1_v32', 'tof_1_v33', 'tof_1_v34', 'tof_1_v35', 'tof_1_v36', 'tof_1_v37', 'tof_1_v38', 'tof_1_v39', 'tof_1_v40', 'tof_1_v41', 'tof_1_v42', 'tof_1_v43', 'tof_1_v44', 'tof_1_v45', 'tof_1_v46', 'tof_1_v47', 'tof_1_v48', 'tof_1_v49', 'tof_1_v50', 'tof_1

  processed_df[col] = processed_df[col].fillna(method='ffill').fillna(method='bfill')


Stage 1: Data Preprocessing...
Input data shape: (107, 336)
Input columns: ['row_id', 'sequence_id', 'sequence_counter', 'subject', 'acc_x', 'acc_y', 'acc_z', 'rot_w', 'rot_x', 'rot_y', 'rot_z', 'thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5', 'tof_1_v0', 'tof_1_v1', 'tof_1_v2', 'tof_1_v3', 'tof_1_v4', 'tof_1_v5', 'tof_1_v6', 'tof_1_v7', 'tof_1_v8', 'tof_1_v9', 'tof_1_v10', 'tof_1_v11', 'tof_1_v12', 'tof_1_v13', 'tof_1_v14', 'tof_1_v15', 'tof_1_v16', 'tof_1_v17', 'tof_1_v18', 'tof_1_v19', 'tof_1_v20', 'tof_1_v21', 'tof_1_v22', 'tof_1_v23', 'tof_1_v24', 'tof_1_v25', 'tof_1_v26', 'tof_1_v27', 'tof_1_v28', 'tof_1_v29', 'tof_1_v30', 'tof_1_v31', 'tof_1_v32', 'tof_1_v33', 'tof_1_v34', 'tof_1_v35', 'tof_1_v36', 'tof_1_v37', 'tof_1_v38', 'tof_1_v39', 'tof_1_v40', 'tof_1_v41', 'tof_1_v42', 'tof_1_v43', 'tof_1_v44', 'tof_1_v45', 'tof_1_v46', 'tof_1_v47', 'tof_1_v48', 'tof_1_v49', 'tof_1_v50', 'tof_1_v51', 'tof_1_v52', 'tof_1_v53', 'tof_1_v54', 'tof_1_v55', 'tof_1_v56', 'tof_1_v57', 'tof_1_v58', 'to

  processed_df[col] = processed_df[col].fillna(method='ffill').fillna(method='bfill')



🔧 STEP 4: Feature Engineering...
Using label column: behavior
Available columns: ['row_id', 'sequence_type', 'sequence_id', 'sequence_counter', 'subject', 'orientation', 'behavior', 'phase', 'gesture', 'acc_x', 'acc_y', 'acc_z', 'rot_w', 'rot_x', 'rot_y', 'rot_z', 'thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5', 'tof_1_v0', 'tof_1_v1', 'tof_1_v2', 'tof_1_v3', 'tof_1_v4', 'tof_1_v5', 'tof_1_v6', 'tof_1_v7', 'tof_1_v8', 'tof_1_v9', 'tof_1_v10', 'tof_1_v11', 'tof_1_v12', 'tof_1_v13', 'tof_1_v14', 'tof_1_v15', 'tof_1_v16', 'tof_1_v17', 'tof_1_v18', 'tof_1_v19', 'tof_1_v20', 'tof_1_v21', 'tof_1_v22', 'tof_1_v23', 'tof_1_v24', 'tof_1_v25', 'tof_1_v26', 'tof_1_v27', 'tof_1_v28', 'tof_1_v29', 'tof_1_v30', 'tof_1_v31', 'tof_1_v32', 'tof_1_v33', 'tof_1_v34', 'tof_1_v35', 'tof_1_v36', 'tof_1_v37', 'tof_1_v38', 'tof_1_v39', 'tof_1_v40', 'tof_1_v41', 'tof_1_v42', 'tof_1_v43', 'tof_1_v44', 'tof_1_v45', 'tof_1_v46', 'tof_1_v47', 'tof_1_v48', 'tof_1_v49', 'tof_1_v50', 'tof_1_v51', 'tof_1_v52', 'tof_1_v53