<div style="display: flex; justify-content: space-between; align-items: flex-start;">
    <div style="text-align: left;">
        <p style="color:#FFD700; font-size: 15px; font-weight: bold; margin-bottom: 1px; text-align: left;">Published on September 20, 2025</p>
        <h4 style="color:#4B0082; font-weight: bold; text-align: left; margin-top: 6px;">Author: Jocelyn C. Dumlao</h4>
        <p style="font-size: 17px; line-height: 1.7; color: #333; text-align: center; margin-top: 20px;"></p>
        <a href="https://www.linkedin.com/in/jocelyn-dumlao-168921a8/" target="_blank" style="display: inline-block; background-color: #003f88; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">LinkedIn</a>
        <a href="https://github.com/jcdumlao14" target="_blank" style="display: inline-block; background-color: transparent; color: #059c99; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px; border: 2px solid #007bff;">GitHub</a>
        <a href="https://www.youtube.com/@CogniCraftedMinds" target="_blank" style="display: inline-block; background-color: #ff0054; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">YouTube</a>
        <a href="https://www.kaggle.com/jocelyndumlao" target="_blank" style="display: inline-block; background-color: #3a86ff; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">Kaggle</a>
    </div>
</div>

<center>
  <img src="https://www.kaggle.com/competitions/59156/images/header" alt="MABe mouse behavior detection competition header image" width="50%">
</center>

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#00b7ff ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ff3300;"><b> </b>Import Libraries</p></div>


In [None]:
import ast

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm  # For progress bars

In [None]:
def score(solution, submission, labels):
    """
    Compute an exact-match F1 score between ground truth and predictions.

    Every row of both dataframes is reduced to the tuple
    (video_id, start_frame, stop_frame, agent_id, target_id, action).
    Duplicate tuples collapse into one, and a prediction only counts as a
    true positive when an identical tuple exists in the solution.

    Args:
        solution (pd.DataFrame): Ground truth events.
        submission (pd.DataFrame): Predicted events.
        labels: Accepted for interface compatibility; not used by the
            computation.

    Returns:
        The F1 score (0 when there is nothing to match). The score is also
        printed with four decimals.
    """
    event_columns = ['video_id', 'start_frame', 'stop_frame', 'agent_id', 'target_id', 'action']

    # One hashable tuple per event, so set algebra does the matching
    truth_events, predicted_events = (
        set(frame[event_columns].itertuples(index=False, name=None))
        for frame in (solution, submission)
    )

    matched = len(truth_events & predicted_events)
    spurious = len(predicted_events - truth_events)
    missed = len(truth_events - predicted_events)

    # Each ratio stays at 0 when its denominator would be zero
    precision = 0
    if matched + spurious > 0:
        precision = matched / (matched + spurious)

    recall = 0
    if matched + missed > 0:
        recall = matched / (matched + missed)

    f1 = 0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)

    print(f"F1 Score: {f1:.4f}")
    return f1


# <div style="color:white;display:inline-block;border-radius:5px;background-color:#00b7ff ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ff3300;"><b> </b>Data Loading and Filtering</p></div>


In [None]:
# Load the training metadata, then drop every video whose lab_id starts with 'MABe22_'
train_df = pd.read_csv('/kaggle/input/MABe-mouse-behavior-detection/train.csv')
train_subset = train_df[~train_df['lab_id'].str.startswith('MABe22_')]

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#00b7ff ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ff3300;"><b> </b>Ground Truth Generation</p></div>


In [None]:
# Ground Truth Generation (Validation Baseline)

def generate_ground_truth(dataset):
    """
    Generates the ground truth dataframe from the annotation files.

    For every row of ``dataset`` the matching annotation parquet file is
    loaded, tagged with the row's ``video_id``, ``lab_id`` and
    ``behaviors_labeled``, and its ``agent_id`` / ``target_id`` values are
    rewritten as "mouse<id>" strings. Videos whose annotation file is missing
    are reported and skipped.

    Args:
        dataset (pd.DataFrame): The training data dataframe containing the
            ``lab_id``, ``video_id`` and ``behaviors_labeled`` columns.

    Returns:
        pd.DataFrame: The ground truth dataframe in the required format. When
        no annotation file could be loaded (empty ``dataset`` or every file
        missing) an empty dataframe is returned instead of raising.
    """
    ground_truth_list = []

    for _, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Processing Annotations"):
        lab_id = row['lab_id']
        video_id = row['video_id']

        # Construct the annotation file path
        annotation_file_path = f"/kaggle/input/MABe-mouse-behavior-detection/train_annotation/{lab_id}/{video_id}.parquet"

        # Only the read can raise FileNotFoundError, so keep the try narrow
        try:
            annotations_df = pd.read_parquet(annotation_file_path)
        except FileNotFoundError:
            print(f"Warning: Annotation file not found: {annotation_file_path}")
            continue

        # Add video and lab ID columns
        annotations_df['video_id'] = video_id
        annotations_df['lab_id'] = lab_id

        # Add 'behaviors_labeled'
        annotations_df['behaviors_labeled'] = row['behaviors_labeled']

        # Rename and format agent/target IDs
        annotations_df['agent_id'] = annotations_df['agent_id'].apply(lambda x: f"mouse{x}")
        annotations_df['target_id'] = annotations_df['target_id'].apply(lambda x: f"mouse{x}")

        ground_truth_list.append(annotations_df)

    if not ground_truth_list:
        # pd.concat raises ValueError on an empty list. Return an empty frame
        # carrying the columns this function adds plus the ones score() reads,
        # so downstream column selection still works.
        return pd.DataFrame(columns=[
            'agent_id', 'target_id', 'action', 'start_frame', 'stop_frame',
            'video_id', 'lab_id', 'behaviors_labeled',
        ])

    # Concatenate all the dataframes
    return pd.concat(ground_truth_list, ignore_index=True)

# Build the ground truth for the filtered training videos (train_subset)
solution = generate_ground_truth(train_subset)


# <div style="color:white;display:inline-block;border-radius:5px;background-color:#00b7ff ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ff3300;"><b> </b>Prediction</p></div>

In [None]:
# Prediction 
def _parse_behavior_triples(behaviors_labeled_str):
    """Parse a ``behaviors_labeled`` cell into (agent, target, action) triples.

    Args:
        behaviors_labeled_str (str): Text of a list literal whose items are
            comma-separated "agent,target,action" strings.

    Returns:
        tuple[list, list]: The well-formed triples (whitespace and single
        quotes stripped) in their original order, and the entries that were
        rejected because they were not strings with exactly three fields.

    Raises:
        ValueError, SyntaxError, TypeError: If the cell is not a parseable
            literal (for example a missing/NaN value) or is not a sequence.
    """
    # SECURITY: this text comes from a CSV file. literal_eval only accepts
    # Python literals, so a crafted cell cannot execute code the way eval() can.
    parsed = ast.literal_eval(behaviors_labeled_str)
    if not isinstance(parsed, (list, tuple, set)):
        raise ValueError(f"expected a list of behaviors, got {type(parsed).__name__}")

    triples, rejected = [], []
    for entry in parsed:
        if isinstance(entry, str):
            fields = tuple(part.strip() for part in entry.replace("'", "").strip().split(','))
            if len(fields) == 3:
                triples.append(fields)
                continue
        rejected.append(entry)
    return triples, rejected


def create_dummy_predictions(dataset, traintest):
    """
    Generates dummy predictions based on the video metadata, dividing the video
    into segments based on the number of behaviors.

    Each video's frame range [min(video_frame), max(video_frame) + 1) is cut
    into equal-length consecutive segments, one per labeled behavior, and each
    behavior is predicted for its segment. Frames left over by the integer
    division are not covered by any prediction. Segments are never
    zero-length: if a video has fewer frames than behaviors, the behaviors
    that no longer fit are dropped.

    Videos are skipped (with a printed message) when the tracking file is
    missing or empty, when ``behaviors_labeled`` cannot be parsed, or when it
    lists no usable behavior.

    Args:
        dataset (pd.DataFrame): The training or testing data dataframe with
            ``lab_id``, ``video_id`` and ``behaviors_labeled`` columns.
        traintest (str):  'train' or 'test', indicating the dataset type.

    Returns:
        pd.DataFrame: A dataframe of dummy predictions with the columns
        video_id, agent_id, target_id, action, start_frame, stop_frame. The
        columns are present even when no prediction was generated.
    """
    prediction_columns = ['video_id', 'agent_id', 'target_id', 'action', 'start_frame', 'stop_frame']
    predictions = []

    for _, row in tqdm(dataset.iterrows(), total=len(dataset), desc="Generating Predictions"):
        lab_id = row['lab_id']
        video_id = row['video_id']

        # Build the tracking file path
        tracking_file_path = f"/kaggle/input/MABe-mouse-behavior-detection/{traintest}_tracking/{lab_id}/{video_id}.parquet"

        try:
            # Only the frame index is needed, so avoid loading every column
            tracking_data = pd.read_parquet(tracking_file_path, columns=['video_frame'])
        except FileNotFoundError:
            print(f"Warning: Tracking file not found: {tracking_file_path}")
            continue

        if tracking_data.empty:
            # min()/max() of an empty column are NaN and would leak into the output
            print(f"Warning: Tracking file has no rows: {tracking_file_path}")
            continue

        # Extract minimum and maximum video frame numbers (stop is exclusive)
        start_frame_total = tracking_data['video_frame'].min()
        stop_frame_total = tracking_data['video_frame'].max() + 1

        # Extract behaviors
        try:
            behavior_tuples, rejected_entries = _parse_behavior_triples(row['behaviors_labeled'])
        except (ValueError, SyntaxError, TypeError) as e:
            print(f"Error evaluating behaviors_labeled: {e}. Skipping video {video_id}")
            continue

        if rejected_entries:
            print(f"Warning: Ignoring malformed behaviors_labeled entries for video {video_id}: {rejected_entries}")

        num_behaviors = len(behavior_tuples)
        if num_behaviors == 0:
            # Nothing to predict, and the division below would fail
            print(f"Warning: No behaviors labeled for video {video_id}. Skipping.")
            continue

        # Segment length per action, at least one frame
        batch_length = max(1, (stop_frame_total - start_frame_total) // num_behaviors)

        current_start_frame = start_frame_total
        for agent, target, action in behavior_tuples:
            if current_start_frame >= stop_frame_total:
                # Out of frames: any further segment would be zero-length
                break

            batch_stop_frame = min(current_start_frame + batch_length, stop_frame_total)

            # Dummy: predict the action happens in this timeframe
            predictions.append({
                'video_id': video_id,
                'agent_id': agent,
                'target_id': target,
                'action': action,
                'start_frame': current_start_frame,
                'stop_frame': batch_stop_frame
            })

            # The next action starts where this one ended
            current_start_frame = batch_stop_frame

    # Explicit columns keep the schema stable even with zero predictions
    return pd.DataFrame(predictions, columns=prediction_columns)


In [None]:
# Generate dummy predictions for the filtered training videos, reading the 'train' tracking files
submission = create_dummy_predictions(train_subset, 'train')

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#00b7ff ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ff3300;"><b> </b>Evaluation</p></div>

In [None]:
# Evaluation: compare the dummy predictions with the ground truth.
# score() does not read its third argument (labels), so an empty string is passed.
print("Evaluating Predictions...")
score(solution, submission, '')

# <div style="color:white;display:inline-block;border-radius:5px;background-color:#00b7ff ;font-family:Nexa;overflow:hidden"><p style="padding:10px;color:white;overflow:hidden;font-size:85%;letter-spacing:0.5px;margin:0;border: 6px groove #ff3300;"><b> </b>Test Data Submission</p></div>

In [None]:
# Test Data Submission: generate dummy predictions for the videos listed in test.csv
test_df = pd.read_csv('/kaggle/input/MABe-mouse-behavior-detection/test.csv')
test_submission = create_dummy_predictions(test_df, 'test')

# Prepare submission file: naming the index makes to_csv write it as the 'row_id' column
test_submission.index.name = 'row_id'
test_submission.to_csv('submission.csv')

print("Submission file created successfully!")
# Preview the first lines of the written file (IPython shell escape)
!head submission.csv
