In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns

warnings.simplefilter('ignore')


# Base directory of the Kaggle runtime; all paths below are built from it.
ROOT = Path("/kaggle")
# Folder holding the competition input files (train.csv is read from here).
DATA_DIR = ROOT / "input/MABe-mouse-behavior-detection"

# Data Overview
Since the tracked body parts differ by lab, let's check the details.

In [None]:
# Load the train.csv metadata table that the rest of the notebook analyses.
mice_recording_setups_df = pl.read_csv(DATA_DIR / "train.csv")

# Distinct values of each column of interest, collected in one pass over the column names.
u_lab_id, u_body_parts_traced, u_behaviors_labeled = (
    mice_recording_setups_df[column_name].unique().to_list()
    for column_name in ("lab_id", "body_parts_tracked", "behaviors_labeled")
)

# The "=" specifier echoes the expression text, so the variable names are part of the output.
print(f"{len(u_lab_id)=}")
print(f"{len(u_body_parts_traced)=}")
print(f"{len(u_behaviors_labeled)=}")

In [None]:
# Largest number of distinct body-part sets / behavior sets found inside any single lab.
(
    mice_recording_setups_df
    .group_by("lab_id")
    .agg(pl.col("body_parts_tracked", "behaviors_labeled").n_unique())
    .drop("lab_id")
    .max()
)

This result indicates that multiple patterns of body parts tracked and behaviors labeled exist within a single lab.

## Distribution of Mouse Strains Used by Each Lab

In [None]:
# Count rows per (lab_id, mouse1_strain) pair; pairs that never occur stay null.
pivot = mice_recording_setups_df.pivot(
    index="lab_id",
    on="mouse1_strain",
    values="mouse1_id",
    aggregate_function=pl.len(),
)

g = sns.heatmap(
    pivot.to_pandas().set_index("lab_id"),
    cmap="viridis",
    annot=True,
    # seaborn's default ".2g" shows counts >= 100 in scientific notation
    # (e.g. 6.9e+02); ".0f" prints whole counts and also accepts float cells.
    fmt=".0f",
)
g.set_xlabel("mouse1_strain")
g.set_title("Rows per lab and mouse1 strain");

- `CD-1 (ICR)`, `129/SvEvTac`, `C57Bl/6J x Ai148`, and `CFW` are each used in only one lab.
- `C57Bl/6N`, `BTBR`, and `CD1` are used in two labs.
- `C57Bl/6J` is used in many labs.

The largest amount of data comes from MABe22.  
If we train a model without considering this, there is a risk of overfitting to the MABe22 data.  
Many strains have only a small amount of data, so if there are behavioral differences between strains, it may be worth considering collecting additional data for those strains.

## Distribution of Mouse Features

In [None]:
# One figure per mouse attribute, one panel per mouse slot (mouse1 .. mouse4).
features = ["strain", "color", "sex", "age"]
for feature in features:
    fig, axes = plt.subplots(1, 4, figsize=(24, 4), sharey=True, sharex=True)
    fig.suptitle(feature)
    for mouse_idx, ax in enumerate(axes, start=1):
        sns.histplot(
            data=mice_recording_setups_df,
            x=f"mouse{mouse_idx}_{feature}",
            ax=ax,
        )
        # Rotate the labels through the tick parameters. The previous
        # set_xticklabels(get_xticklabels()) call pinned the label text without
        # fixing the tick positions, which matplotlib warns about.
        ax.tick_params(axis="x", labelrotation=90)
    plt.show()

- Most of the colors are black or black and tan. There are only a few white mice.
- Most of the mice are male.
- Most ages are between 10 and 20 weeks.

Since sex and age are dominated by specific values, further analysis of these features may not be meaningful.
Let's check if there is any relationship between strain and color.

In [None]:
# Joint strain / color counts, one panel per mouse slot (mouse1 .. mouse4).
fig, axes = plt.subplots(1, 4, figsize=(24, 4))
for mouse_idx, ax in enumerate(axes, start=1):
    sns.histplot(
        data=mice_recording_setups_df,
        x=f"mouse{mouse_idx}_strain",
        y=f"mouse{mouse_idx}_color",
        cmap="viridis",
        ax=ax,
        cbar=True,
    )
    # Rotate via tick parameters; set_xticklabels(get_xticklabels()) would pin
    # the label text without fixing tick positions, which matplotlib warns about.
    ax.tick_params(axis="x", labelrotation=90)
plt.show()

- The most common strain, `C57Bl/6J`, only appears with the color `black`.
- The next most common strain, `BTBR`, only appears with the color `black and tan`.
- No strain has multiple colors.

## Differences in Experimental Conditions

In [None]:
# Count rows per (arena_shape, arena_type) pair; pairs that never occur stay null.
pivot = mice_recording_setups_df.pivot(
    index="arena_shape",
    on="arena_type",
    values="lab_id",
    aggregate_function=pl.len(),
)

g = sns.heatmap(
    pivot.to_pandas().set_index("arena_shape"),
    cmap="viridis",
    annot=True,
    # seaborn's default ".2g" would print a count such as 7944 as 7.9e+03;
    # ".0f" shows the exact count and also accepts float cells.
    fmt=".0f",
)
g.set_xlabel("arena_type")
g.set_title("Rows per arena shape and arena type");

- Most of the combinations are (square, neutral) (n = 7944).
- There are a few (rectangular, resident-intruder) combinations (n = 692).

In [None]:
# Restrict to the two arena setups discussed above, then count how often each
# (body_parts_tracked, behaviors_labeled) combination occurs, largest first.
bpbl_pair_counts = (
    mice_recording_setups_df
    .filter(
        ((pl.col("arena_shape") == "square") & (pl.col("arena_type") == "neutral"))
        | ((pl.col("arena_shape") == "rectangular") & (pl.col("arena_type") == "resident-intruder"))
    )
    .group_by("body_parts_tracked", "behaviors_labeled")
    .agg(pl.len().alias("count"))
    .sort("count", "body_parts_tracked", "behaviors_labeled", descending=True)
)

bpbl_pair_counts.head(10)

In [None]:
# Rows from the two arena setups of interest that carry no behavior labels.
# Multiple predicates passed to filter() are combined with AND.
filtered = mice_recording_setups_df.filter(
    ((pl.col("arena_shape") == "square") & (pl.col("arena_type") == "neutral"))
    | ((pl.col("arena_shape") == "rectangular") & (pl.col("arena_type") == "resident-intruder")),
    pl.col("behaviors_labeled").is_null(),
)

# Print the distinct values of each setup column, one line per column, in the
# same "expression=value" form the f-string "=" specifier produces.
for column_name in (
    "lab_id",
    "frames_per_second",
    "video_duration_sec",
    "pix_per_cm_approx",
    "video_width_pix",
    "video_height_pix",
    "arena_width_cm",
    "arena_height_cm",
    "tracking_method",
):
    print(f"filtered['{column_name}'].unique().to_list()={filtered[column_name].unique().to_list()!r}")

In [None]:
# Number of metadata rows belonging to the two MABe22 labs.
mabe22_rows = mice_recording_setups_df.filter(
    pl.col("lab_id").is_in(["MABe22_keypoints", "MABe22_movies"])
)
mabe22_rows.shape[0]

For MABe22_keypoints and MABe22_movies, `behaviors_labeled` is `null`.

## Video Features

In [None]:
# Numeric recording-setup columns; the correlation cell further down reuses this list.
numerical_features = [
    "frames_per_second",
    "video_duration_sec",
    "pix_per_cm_approx",
    "video_width_pix",
    "video_height_pix",
    "arena_width_cm",
    "arena_height_cm",
]

# One histogram per numeric column, sharing a log-scaled y axis.
fig, axes = plt.subplots(1, len(numerical_features), figsize=(24, 4), sharey=True)
for ax, feature in zip(axes, numerical_features):
    sns.histplot(
        data=mice_recording_setups_df,
        x=feature,
        ax=ax,
    )
    ax.set_title(feature)
    ax.set_yscale("log")
plt.show()

In [None]:
# Pairwise correlation between the numeric setup columns.
corr = mice_recording_setups_df.select(numerical_features).corr().to_pandas()
# The converted frame has no row labels; reuse the column names as the index.
corr.index = corr.columns
# Hide the diagonal and upper triangle so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))

g = sns.heatmap(
    corr,
    cmap="viridis",
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    square=True,
    mask=mask,
    linewidths=0.5,
    cbar_kws={"shrink": 0.75},
)
# The trailing semicolon keeps the Axes/Text repr out of the cell output.
g.set_title("Correlation between numeric recording-setup features");


In [None]:
# For each lab: how many distinct (fps, duration) combinations it has ("count"),
# how many rows it has ("total"), and their ratio.
features = ["frames_per_second", "video_duration_sec"]

totals = mice_recording_setups_df.group_by("lab_id").len("total")

counts = (
    mice_recording_setups_df
    .select("lab_id", *features)
    .unique()
    # NOTE(review): group_by does not preserve input order by default, so this
    # sort probably has no effect on the result; kept to match the original.
    .sort("lab_id")
    .group_by("lab_id")
    .len("count")
    .join(totals, on="lab_id", how="left")
    .with_columns(ratio=pl.col("count") / pl.col("total"))
    .sort("ratio", descending=True)
)
display(counts.head(10), counts.tail(10))

- `CalMS21_*` has the second largest amount of data after `MABe22_*`, but its (`frames_per_second`, `video_duration_sec`) combinations vary across videos.
- The largest dataset, `MABe22_*`, appears to have been recorded under completely identical conditions.

In [None]:
# For each lab: how many distinct video-geometry combinations it has ("count"),
# how many rows it has ("total"), and their ratio.
features = ["pix_per_cm_approx", "video_width_pix", "video_height_pix"]

totals = mice_recording_setups_df.group_by("lab_id").len("total")

counts = (
    mice_recording_setups_df
    .select("lab_id", *features)
    .unique()
    # NOTE(review): group_by does not preserve input order by default, so this
    # sort probably has no effect on the result; kept to match the original.
    .sort("lab_id")
    .group_by("lab_id")
    .len("count")
    .join(totals, on="lab_id", how="left")
    .with_columns(ratio=pl.col("count") / pl.col("total"))
    .sort("ratio", descending=True)
)
display(counts.head(10), counts.tail(10))

It appears that the video settings during recording are fixed for `MABe22_*` and `CalMS21_*`.

In [None]:
# For each lab: how many distinct arena configurations it has ("count"),
# how many rows it has ("total"), and their ratio.
features = ["arena_width_cm", "arena_height_cm", "arena_shape", "arena_type"]

totals = mice_recording_setups_df.group_by("lab_id").len("total")

counts = (
    mice_recording_setups_df
    .select("lab_id", *features)
    .unique()
    # NOTE(review): group_by does not preserve input order by default, so this
    # sort probably has no effect on the result; kept to match the original.
    .sort("lab_id")
    .group_by("lab_id")
    .len("count")
    .join(totals, on="lab_id", how="left")
    .with_columns(ratio=pl.col("count") / pl.col("total"))
    .sort("ratio", descending=True)
)
display(counts.head(10), counts.tail(10))

It appears that the mouse experimental environment is fixed for `MABe22_*` and `CalMS21_*`.

# Literature Information

from [MABe22 at ICML 2023](https://arxiv.org/pdf/2207.10553.pdf)

> <span style="color:blue">What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)? Are there multiple types of instances (e.g., movies, users, and ratings; people and interactions between them; nodes and edges)? Please provide a description.</span>
> 
> The core element of this dataset, called a sequence, consists of raw video, tracked postures, sequence-level experimental conditions, and hand-scored actions of three mice interacting in a 52 cm x 52 cm arena, filmed from above at 30 Hz. All three mice are adult males from the same strain, either C57Bl/6J or BTBR. Postures of animals are estimated in terms of a set of twelve anatomically defined ”keypoints” that capture the detailed two-dimensional pose of the animal. Because the three mice are not easily distinguished, temporal filtering methods are used to track the identity of animals across frames. <span style="color:red">Because both of these processing steps are automated, some errors in pose estimation or swaps of mouse identity do occur in the dataset. Accompanying each sequence are frame-by-frame annotations for 8 ”hidden tasks” capturing experimental conditions, animal background, and animal behavior. The 8 hidden tasks for this dataset include four ”sequence-level” tasks where annotation values are the same for all frames in a one-minute sequence, and nine ”frame-level” tasks where annotation values vary from frame to frame. Descriptions of each task are provided in Table 12; all behaviors are defined between any given pair of animals.  The core element of a sequence is called a frame; this refers to the posture of the three animals on a particular frame of video, as well as annotations for the 8 hidden tasks.</span>


> <span style="color:blue">Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set? If the dataset is a sample, then what is the larger set? Is the sample representative of the larger set (e.g., geographic coverage)? If so, please describe how this representativeness was validated/verified. If it is not representative of the larger set, please describe why not (e.g., to cover a more diverse range of instances, because instances were withheld or unavailable).</span>
> 
> The dataset is derived from a larger experiment, in which three mice were allowed to freely interact in an open arena for a period of four days. To generate the trajectories used for this dataset, we randomly sampled up to five one-minute intervals from each recorded hour of approximately 12 such four-day experiments. In initial sampling, we observed that during the lights-on phase of the light/dark cycle the mice spent the majority of the time huddled together sleeping. <span style="color:red">As this does not generate particularly interesting behavioral data, we randomly discarded 80% of sampled one-minute intervals in which no substantial movement of the animals occurred, and replaced these with substitute samples drawn from the same one-hour time period. If after five attempts we could not randomly draw a replacement sample containing movement, we omitted the trajectory from the dataset. As a result, the dataset contains a higher proportion of trajectories with movement than is present in the source videos, and a slightly lower proportion of trajectories sampled from the light portion of the light/dark cycle.</span>


> <span style="color:blue">What data does each instance consist of? “Raw” data (e.g., unprocessed text or images) or features? In either case, please provide a description.</span>
> 
> Each sequence has three elements. 1) Keypoints are the locations of twelve body parts on each mouse: the nose tip, left and right ears, base of neck, body centroid, base, middle, and tip of tail, and the four paws. Keypoints are estimated using a modified version of HRnet documented in (Sheppard et al., 2022). 2) Annotations are sequence-level or frame-level labels of experimental conditions or animal’s actions. Definitions of these annotations are provided in Table 12. The behavior labels were generated using a series of short scripts based on features of detected animal poses; it is therefore possible that some mis-identification of behaviors occurs. Note that this dataset does not include the original raw videos from which pose estimates were produced. This is because the objective of releasing this dataset was to determine the accuracy with which animal behavior could be detected using tracked keypoints alone.

> <span style="color:blue">Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description.</span>
> 
> Pose keypoints in this dataset are produced using automated pose estimation software. <span style="color:red">The dataset was screened to remove sequences with poor pose estimation, detected as large jumps in the detected location of an animal, however some errors in pose estimation, missing keypoints, and noise in keypoint placement still occur.</span> These are most common on frames when the two animals are in close contact or moving very quickly. <span style="color:red">Frame-by-frame annotations of behavior were generated using a series of scripts that were manually tuned by a human expert. Pose estimation errors can contribute to missed bouts or false positives for behaviors in these annotations.</span>

> <span style="color:blue">Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)? If so, please provide a description. If not, you may skip the remainder of the questions in this section.</span>
> 
> No preprocessing was performed on the sequence data released in this dataset.


> <span style="color:blue">Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses?  For example, is there anything that a future user might need to know to avoid uses that could result in unfair treatment of individuals or groups (e.g., stereotyping, quality of service issues) or other undesirable harms (e.g., financial harms, legal risks) If so, please provide a description. Is there anything a future user could do to mitigate these undesirable harms?</span>
> 
> Occasional errors and identity swaps during pose estimation may impact future use of the dataset for some purposes.

from [CalMS21 at NeurIPS 2021](https://arxiv.org/pdf/2104.02710.pdf)

> <span style="color:blue">What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)? Are there multiple types of instances (e.g., movies, users, and ratings; people and interactions between them; nodes and edges)? Please provide a description.</span>
>
> The core element of this dataset, called a sequence, captures the tracked postures and actions of two mice interacting in a standard <span style="color:red">resident-intruder assay</span> filmed from above at 30Hz and manually annotated on a frame-by-frame basis for one or more behaviors. <span style="color:red">The resident in these assays is always a male mouse from strain C57Bl/6J, or from a transgenic line with C57Bl/6J background. The intruder is a male or female BALB/c mouse.  Resident mice may be either group-housed or single-housed, and either socially/sexually naive or experienced (all factors that impact the types of social behaviors animals show in this assay.)</span> The core element of a sequence is called a frame; this refers to the posture of both animals on a particular frame of video, as well as one or more labels indicating the type of behavior being performed on that frame (if any).  <span style="color:red">The dataset is divided into four sub-sets: three collections of sequences associated with Tasks 1, 2, and 3 of the MABe Challenge, and a fourth "Unlabeled" collection of sequences that have only the keypoint elements with no accompanying annotations or annotator-id (see "What data does each instance consist of?" for explanation of these values.) Tasks 1-3 are split into train and test sets. Tasks 2 and 3 are also split by annotator-id (Task 2) or behavior (Task 3).</span>


> <span style="color:blue">Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set? If the dataset is a sample, then what is the larger set? Is the sample representative of the larger set (e.g., geographic coverage)? If so, please describe how this representativeness was validated/verified. If it is not representative of the larger set, please describe why not (e.g., to cover a more diverse range of instances, because instances were withheld or unavailable).</span>
>
> The assembled dataset presented here was manually curated from a large, unreleased repository of mouse behavior videos collected across several years by multiple members of the Anderson lab. <span style="color:red">Only videos of naturally occurring (not optogenetically or chemogenetically evoked) behavior were included.</span> Selection criteria are described in the "Collection Process" section. As a result of our selection criteria, the videos included in the Tasks 1-3 datasets may not be fully representative of mouse behavior in the resident-intruder assay: videos with minimal social interactions (when the resident ignored or avoided the intruder) were omitted in favor of including a greater number of examples of the annotated behaviors of interest.

> <span style="color:blue">What data does each instance consist of? “Raw” data (e.g., unprocessed text or images) or features? In either case, please provide a description.</span>
> 
> Each sequence has three elements. 1) Keypoints are the locations of seven body parts (the nose, left and right ears, base of neck, left and right hips, and base of tail) on each of two interacting mice. Keypoints are estimated using the Mouse Action Recognition System (MARS). <span style="color:red">2) Annotations are manual, frame-wise labels of an animal’s actions, for example attack, mounting, and close investigation. Depending on the behaviors annotated, only between a few percent and up to half of frames will have an annotated action; frames that do not have an annotated action are labeled as other. The other label should not be taken to indicate that no behaviors are happening, and it should not be considered a true label category for purposes of classifier performance evaluation.</span> 3) Annotator-id is a unique numeric ID indicating which (anonymized) human annotator produced the labels in Annotations. This ID is provided primarily for use in Task 2 of the MABe Challenge, which pertains to annotator style capture.  Note that this dataset does not include the original raw videos from which pose estimates were produced. This is because the objective of releasing this dataset was to determine the accuracy with which animal behavior could be detected using tracked keypoints alone.

> <span style="color:blue">Is there a label or target associated with each instance? If so, please provide a description.</span>
> 
> In the Task 1, Task 2, and Task 3 datasets, the annotation field for a given behavior sequence consists of frame-wise labels of animal behaviors. <span style="color:red">Note that only a minority of frames have behavior labels; remaining frames are labeled as other. Only a small number of behaviors were tracked by human annotators (most typically attack, mount, and close investigation), therefore frames labeled as other are not a homogeneous category, but may contain diverse other behaviors.</span> The "Unlabeled" collection of sequences has no labels, and instead contains only keypoint tracking data.

> <span style="color:blue">Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.</span>
> 
> The dataset includes a recommended train/test split for Tasks 1, 2, and 3. In Tasks 2 and 3, the split was designed to provide a roughly consistent, small amount of training data for each sub-task. In Task 1, the split was manually selected so that the test set included sequences from a range of experimental conditions and dates.

> <span style="color:blue">Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description.</span>
> 
> Pose keypoints in this dataset are produced using automated pose estimation software (the Mouse Action Recognition System, MARS). While the entire dataset was manually screened to remove sequences with poor pose estimation, some errors in pose estimation and noise in keypoint placement still occur. These are most common on frames when the two animals are in close contact or moving very quickly. In addition, manual annotations of animal behavior are inherently subjective, and individual annotators show some variability in the precise frame-by-frame labeling of behavior sequences. An investigation of within- and between-annotator variability is included in the MARS pre-print.

> <span style="color:blue">Any other comments?</span>
>
> A subset of videos in Task 1 and the Unlabeled dataset are from animals that have been implanted with a head-mounted microendoscope or optical fiber (for fiber photometry.) Because the objective of this dataset is to learn to recognize behavior in a manner that is invariant to experimental setting, the precise preparation of the resident and intruder mice (including age, sex, past experiences, and presence of neural recording devices) is not provided in the dataset.

> <span style="color:blue">How was the data associated with each instance acquired? Was the data directly observable (e.g., raw text, movie ratings), reported by subjects (e.g., survey responses), or indirectly inferred/derived from other data (e.g., part-of-speech tags, model-based guesses for age or language)? If data was reported by subjects or indirectly inferred/derived from other data, was the data validated/verified? If so, please describe how.</span>
>
> Sequences in the dataset are derived from video of pairs of socially interacting mice engaged in a standard resident-intruder assay. <span style="color:red">In this assay, a black (C57Bl/6J) male "resident" mouse is filmed in its home cage, and a white (BALB/c) male or female "intruder" mouse is manually introduced to the cage by an experimenter.</span> The animals are then allowed to freely interact for between 1-2 and 10 minutes. If there is excessive fighting (injury to either animal) the assay is stopped and that trial is discarded. Resident mice typically undergo several (3-6) resident-intruder assays per day with different intruder animals.  Poses of both mice were estimated from top-view video using MARS, and pose sequences were cropped to only include frames where both animals were present in the arena. Manual, frame-by-frame annotation of animals’ actions were performed from top- and front-view video by trained experts.

> <span style="color:blue">If the dataset is a sample from a larger set, what was the sampling strategy (e.g., deterministic, probabilistic with specific sampling probabilities)?</span>
> 
> <span style="color:red"> The Task 1 dataset was chosen to match the training and test sets of behavior classifiers of MARS.</span> These training and test sets, in turn, were sampled from among unpublished videos collected and annotated by a member of the Anderson lab. Selection criteria for inclusion were high annotation quality (as estimated by the individual who annotated the data) and annotation completeness; videos with diverse social behaviors (mounting and attack in addition to investigation) were favored. The Tasks 2 and 3 datasets were manually selected from among previously collected (unpublished) datasets, where selection criteria were for high annotation quality, annotation completeness, and sufficient number of behavior annotations. The Unlabeled dataset consists of videos from a subset of experiments in a recent publication[30]. The subset of experiments included in this dataset was chosen at random.

> Has the dataset been used for any tasks already? If so, please provide a description.
>
> Yes: this dataset was released to accompany the three tasks of the 2021 Multi-Agent Behavior (MABe) Challenge, posted here. The challenge tasks are summarized as follows:
> - Task 1, Classical Classification: train supervised classifiers to detect instances of close investigation, mounting, and attack from labeled examples. All behaviors were annotated by the same individual.
> - Task 2, Annotation Style Transfer: given limited training examples, train classifiers to reproduce the annotation style of five additional annotators for close investigation, mounting, and attack behaviors.
> - Task 3, Learning New Behavior: given limited training examples, train classifiers to detect instances of seven additional behaviors (names of these behaviors were anonymized for this task.)

> <span style="color:blue"> Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses? For example, is there anything that a future user might need to know to avoid uses that could result in unfair treatment of individuals or groups (e.g., stereotyping, quality of service issues) or other undesirable harms (e.g., financial harms, legal risks) If so, please provide a description. Is there anything a future user could do to mitigate these undesirable harms?</span>
> 
> At time of writing there is no precise, numerical consensus definition of the mouse behaviors annotated in this dataset (and in fact even different individuals trained in the same research lab and following the same written descriptions of behavior can vary in how they define particular actions such as attack, as is evidenced in Task 2.) Future users should be aware of this limitation, and bear in mind that behavior annotations in this dataset may not always agree with the behavior annotations produced by other individuals or labs.

- MABe21: https://www.aicrowd.com/challenges/multi-agent-behavior-representation-modeling-measurement-and-applications  
- MABe22: https://www.aicrowd.com/challenges/multi-agent-behavior-challenge-2022/problems/mabe-2022-mouse-triplets#public-tasks