# MABe Mouse Behavior Detection - Submission Notebook

**Strategy:**
- ✅ Use pre-trained Transformer model (old version)
- ⚠️  No motion features (coordinates only)
- ✅ Inference only (no training)
- ⚡ Expected runtime: < 30 minutes

**Setup Instructions:**
1. Model already uploaded to: `mabe-submit` dataset
2. Enable GPU (T4 or P100)
3. Add dataset: `mabe-submit`
4. Run all cells
5. Submit `submission.csv`

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pathlib import Path
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Start-of-run banner: a title framed by two 60-character rules.
banner_rule = "=" * 60
for banner_line in (banner_rule, "MABe Kaggle Submission - Inference Only", banner_rule):
    print(banner_line)

## 1. Define Model Architecture

In [None]:
import math

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for sequence-first transformer inputs.

    A fixed table of shape ``[max_len, 1, d_model]`` is precomputed and stored
    as the non-trainable buffer ``pe``. Even feature indices hold sines and odd
    feature indices hold cosines of the position scaled by geometrically
    decreasing frequencies.

    Args:
        d_model: Embedding dimension of the inputs. May be even or odd.
        max_len: Number of positions to precompute; inputs passed to
            ``forward`` must not be longer than this.
        dropout: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd index than even index, so the
        # cosine half must drop the last frequency; for even d_model this slice
        # is a no-op and the table is identical to the classic formulation.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add positional encodings to ``x`` and apply dropout.

        Args:
            x: Tensor of shape [seq_len, batch_size, embedding_dim], with
                ``seq_len`` no larger than the ``max_len`` given at
                construction.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The singleton middle axis of ``pe`` broadcasts over the batch axis.
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


class TransformerBehaviorModel(nn.Module):
    """Frame-wise behavior classifier built around a Transformer encoder.

    Data flow: a linear layer lifts each frame's features to ``hidden_dim``,
    positional encodings are added, a stack of encoder layers mixes
    information across the sequence, and a two-layer MLP head emits one
    logit vector per frame.
    """

    def __init__(self, input_dim, hidden_dim=256, num_layers=4,
                 num_heads=8, num_classes=10, dropout=0.1, max_seq_len=256):
        """Build the network.

        Args:
            input_dim: Size of each frame's feature vector
                (e.g., number of keypoints * 2).
            hidden_dim: Width of the transformer and of the head's first layer.
            num_layers: How many encoder layers to stack.
            num_heads: Attention heads per encoder layer.
            num_classes: Number of output logits per frame.
            dropout: Dropout rate shared by the positional encoding, the
                encoder layers, and the classification head.
            max_seq_len: Number of positions the positional encoding table
                is built for.
        """
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Frame features -> model width.
        self.input_projection = nn.Linear(input_dim, hidden_dim)

        # Position information (expects sequence-first tensors).
        self.pos_encoder = PositionalEncoding(hidden_dim, max_seq_len, dropout)

        # Encoder stack; the feed-forward sublayer is 4x the model width.
        layer_template = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=4 * hidden_dim,
            dropout=dropout,
            batch_first=False,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            layer_template, num_layers=num_layers
        )

        # Per-frame classification head.
        head_width = hidden_dim // 2
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, head_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, num_classes),
        )

    def forward(self, x, mask=None):
        """Compute per-frame class logits.

        Args:
            x: Input tensor of shape [batch_size, seq_len, input_dim].
            mask: Optional tensor forwarded to the encoder as
                ``src_key_padding_mask``.

        Returns:
            Tensor of shape [batch_size, seq_len, num_classes].
        """
        projected = self.input_projection(x)       # [batch, seq_len, hidden_dim]
        seq_first = projected.transpose(0, 1)      # [seq_len, batch, hidden_dim]
        encoded = self.transformer_encoder(
            self.pos_encoder(seq_first),
            src_key_padding_mask=mask,
        )
        batch_first = encoded.transpose(0, 1)      # [batch, seq_len, hidden_dim]
        return self.fc(batch_first)

print("✓ Model architecture defined")

## 2. Load Model Checkpoint

In [None]:
# Device: prefer the GPU when one is visible, otherwise run on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
print()

# Load checkpoint from mabe-submit dataset
MODEL_PATH = Path('/kaggle/input/mabe-submit/best_model.pth')
if not MODEL_PATH.exists():
    # Fail early with an actionable message instead of a bare path error.
    raise FileNotFoundError(
        f"Checkpoint not found at {MODEL_PATH}. "
        "Add the `mabe-submit` dataset to this notebook before running."
    )

# NOTE: torch.load unpickles the file, so only load checkpoints from a
# source you trust.
checkpoint = torch.load(MODEL_PATH, map_location=device)

# Training metadata is optional: report whatever the checkpoint carries.
if 'epoch' in checkpoint:
    print(f"✓ Loaded checkpoint from Epoch {checkpoint['epoch'] + 1}")
else:
    print("✓ Loaded checkpoint")
if 'best_val_f1' in checkpoint:
    print(f"  Best Val F1: {checkpoint['best_val_f1']:.4f}")
print()

# Build model - Old transformer (NO motion features, just coordinates)
print("⚠️  Using old model: coordinates only, no speed/acceleration")
model = TransformerBehaviorModel(
    input_dim=144,  # 72 keypoints * 2 coords (NO motion features)
    hidden_dim=256,
    num_layers=4,
    num_heads=8,
    num_classes=4,
    dropout=0.1,
    max_seq_len=256
)

# Accept both a full training checkpoint (weights stored under
# 'model_state_dict') and a file that is just the state dict itself.
state_dict = checkpoint.get('model_state_dict', checkpoint)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()  # inference only: disables dropout

print(f"✓ Model loaded")
print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")

## 3. Load Test Data

In [None]:
DATA_DIR = Path('/kaggle/input/mabe-mouse-behavior-detection')

# Test metadata comes from test.csv when it is present; otherwise the rows of
# train.csv belonging to the designated test labs are used instead.
test_csv_path = DATA_DIR / 'test.csv'
if not test_csv_path.exists():
    # Fallback: use test labs from train.csv
    train_csv = pd.read_csv(DATA_DIR / 'train.csv')
    test_labs = ['MABe22_keypoints', 'MABe22_movies']
    from_test_lab = train_csv['lab_id'].isin(test_labs)
    test_csv = train_csv[from_test_lab].copy()
    print(f"Using test labs: {test_labs}")
else:
    test_csv = pd.read_csv(test_csv_path)
    print(f"Loaded test.csv with {len(test_csv)} videos")

print(f"Total test videos: {len(test_csv)}")

## 4. Reload Test Metadata

In [None]:
# NOTE(review): this cell performs the same test-metadata loading as the
# "Load Test Data" section; it looks redundant - confirm before removing.
DATA_DIR = Path('/kaggle/input/mabe-mouse-behavior-detection')

metadata_file = DATA_DIR / 'test.csv'
has_test_metadata = metadata_file.exists()

if has_test_metadata:
    # Dedicated test metadata is available.
    test_csv = pd.read_csv(metadata_file)
    print(f"Loaded test.csv with {len(test_csv)} videos")
else:
    # Fallback: keep only the rows of train.csv whose lab is a test lab.
    train_csv = pd.read_csv(DATA_DIR / 'train.csv')
    test_labs = ['MABe22_keypoints', 'MABe22_movies']
    test_csv = train_csv.loc[train_csv['lab_id'].isin(test_labs)].copy()
    print(f"Using test labs: {test_labs}")

print(f"Total test videos: {len(test_csv)}")

## 5. Generate Predictions

In [None]:
def _find_tracking_file(data_dir, lab_id, video_id,
                        split_dirs=('test_tracking', 'train_tracking')):
    """Return the first existing tracking parquet for a video, or None.

    ``test_tracking`` is tried first. ``train_tracking`` is a fallback for
    metadata taken from train.csv.
    NOTE(review): assumes the training split stores tracking under
    ``train_tracking/<lab_id>/<video_id>.parquet`` - confirm against the
    dataset layout.
    """
    for split_dir in split_dirs:
        candidate = data_dir / split_dir / lab_id / f'{video_id}.parquet'
        if candidate.exists():
            return candidate
    return None


def _window_starts(num_frames, sequence_length, stride):
    """Return window start offsets that together cover every frame.

    Starts advance by ``stride``; one extra window flush with the end of the
    video is appended when the strided starts would leave trailing frames
    without any prediction.
    """
    last_start = max(0, num_frames - sequence_length)
    starts = list(range(0, last_start + 1, stride))
    if starts[-1] != last_start:
        starts.append(last_start)
    return starts


def _predict_video(model, keypoints, sequence_length, stride, num_classes, device):
    """Return per-frame class probabilities averaged over sliding windows.

    Args:
        model: Network mapping [1, sequence_length, features] to
            [1, sequence_length, num_classes] logits.
        keypoints: float32 array of shape [num_frames, features].
        sequence_length: Number of frames fed to the model per window.
        stride: Step between consecutive window starts.
        num_classes: Number of classes the model outputs per frame.
        device: Device the window tensors are moved to.

    Returns:
        float32 array of shape [num_frames, num_classes].
    """
    num_frames = len(keypoints)
    prob_sums = np.zeros((num_frames, num_classes), dtype=np.float32)
    hit_counts = np.zeros(num_frames, dtype=np.int32)

    for start in _window_starts(num_frames, sequence_length, stride):
        stop = min(start + sequence_length, num_frames)
        valid = stop - start
        window = keypoints[start:stop]

        # Videos shorter than one window are zero-padded at the end; only
        # the predictions for the real frames are kept below.
        if valid < sequence_length:
            padding = np.zeros((sequence_length - valid, window.shape[1]),
                               dtype=np.float32)
            window = np.concatenate([window, padding], axis=0)

        window_tensor = torch.as_tensor(window, dtype=torch.float32).unsqueeze(0).to(device)
        probs = torch.softmax(model(window_tensor), dim=-1).squeeze(0).cpu().numpy()

        prob_sums[start:stop] += probs[:valid]
        hit_counts[start:stop] += 1

    # Average overlapping predictions (guard against division by zero).
    hit_counts = np.maximum(hit_counts, 1)
    return prob_sums / hit_counts[:, np.newaxis]


print("Generating predictions...")
print()

all_predictions = []
sequence_length = 100
stride = 25  # 75% overlap
# Read the class count from the model head instead of hard-coding it.
num_classes = model.fc[-1].out_features
missing_videos = []

with torch.no_grad():
    for _, row in tqdm(test_csv.iterrows(), total=len(test_csv), desc="Processing"):
        video_id = row['video_id']
        lab_id = row['lab_id']

        # Load tracking
        tracking_file = _find_tracking_file(DATA_DIR, lab_id, video_id)
        if tracking_file is None:
            missing_videos.append(video_id)
            continue

        try:
            tracking_df = pd.read_parquet(tracking_file)

            # Convert to wide format: one row per frame, one column per
            # (coordinate, mouse, bodypart) combination.
            tracking_pivot = tracking_df.pivot_table(
                index='video_frame',
                columns=['mouse_id', 'bodypart'],
                values=['x', 'y'],
                aggfunc='first'
            )
            tracking_pivot = tracking_pivot.sort_index()

            # NO motion features for old model.
            # NOTE: the model's input layer has a fixed width; a video whose
            # pivot yields a different number of columns fails in the forward
            # pass and is reported by the handler below.
            keypoints = np.nan_to_num(
                tracking_pivot.values.astype(np.float32), nan=0.0
            )

            video_probs = _predict_video(
                model, keypoints, sequence_length, stride, num_classes, device
            )
            final_preds = np.argmax(video_probs, axis=1)

            # Label rows with the real frame numbers from the tracking data
            # so gaps or non-zero starts do not shift the predictions.
            frame_ids = tracking_pivot.index.to_numpy()
            video_rows = [
                {
                    'video_id': video_id,
                    'frame': int(frame_id),
                    'prediction': int(pred),
                }
                for frame_id, pred in zip(frame_ids, final_preds)
            ]
            # Extend only after the whole video succeeded, so a failure never
            # leaves a partial video in the output.
            all_predictions.extend(video_rows)

        except Exception as e:
            # Best effort: report the failing video and keep going.
            print(f"Error: {video_id} - {e}")
            continue

print(f"\n✓ Generated {len(all_predictions):,} predictions")
if missing_videos:
    print(f"⚠️  Skipped {len(missing_videos)} videos with no tracking file")

## 6. Create Submission

In [None]:
# Fix the column set explicitly so the frame is well-formed (and sortable)
# even when no predictions were generated.
submission_df = pd.DataFrame(
    all_predictions, columns=['video_id', 'frame', 'prediction']
)
submission_df = submission_df.sort_values(['video_id', 'frame']).reset_index(drop=True)

print("="*60)
print("Prediction Distribution")
print("="*60)
class_names = {0: 'Background', 1: 'Social', 2: 'Mating', 3: 'Aggressive'}
total_rows = len(submission_df)
for class_id, count in submission_df['prediction'].value_counts().sort_index().items():
    pct = count / total_rows * 100
    # Fall back to a generic label rather than failing on an unmapped id.
    label = class_names.get(class_id, f'Class {class_id}')
    print(f"{label:12s}: {count:8,} ({pct:5.2f}%)")
print("="*60)

# Save submission
submission_df.to_csv('submission.csv', index=False)

print(f"\n✓ Saved submission.csv")
print(f"  Total predictions: {total_rows:,}")
print(f"  Unique videos: {submission_df['video_id'].nunique()}")
if total_rows:
    print("\nReady to submit!")
else:
    # An empty file is still written, but flag it so it is not submitted blindly.
    print("\n⚠️  submission.csv is empty - no predictions were generated")

In [None]:
# Preview submission: the first 20 rows of the submission table.
submission_df.head(20)