# DeepLabCut Data Analysis and Visualization

This notebook performs an initial exploration of pose estimation data obtained from DeepLabCut (DLC). The goal is to better understand the movement and position patterns before building a machine learning pipeline.

## 1. Load Data

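The data consists of the 2D coordinates (x, y) and a likelihood score for each keypoint tracked over the video frames, together with derived columns (a calculated centroid and speed) and rule-based behavior labels.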

In [None]:
# --- Imports: standard library, then third-party, then project-local ---
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv

from asoid_utils.preprocessing import adp_filt

# --- Plot styling (applies to every figure drawn after this cell) ---
sns.set_theme(style="whitegrid")

# --- Dataset location ---
# load_dotenv() reads key/value pairs from a local .env file (if one exists) into
# the process environment; the dataset directory is then taken from there.
load_dotenv()
DLC_DATASET_PATH = os.getenv('DLC_DATASET_PATH')
if not DLC_DATASET_PATH:
    # Fail here with an actionable message instead of a TypeError later, when the
    # path would be combined with a file name.
    raise RuntimeError(
        "DLC_DATASET_PATH is not set. Define it in the environment or in a .env "
        "file next to this notebook so the labelled DLC CSV can be located."
    )

# --- Analysis parameters ---
FRAME_RATE = 30             # presumably video frames per second — TODO confirm
SAMPLE_RATE = 10            # NOTE(review): unit/meaning not visible in this section — confirm
MULTI_ANIMAL = False        # presumably single- vs multi-animal DLC project — TODO confirm
LIKELIHOOD_THRESHOLD = 0.6  # presumably the cut-off used by the likelihood filtering step — TODO confirm

# Keypoint names; each appears as a "<name>_x/_y/_likelihood" column prefix in the data.
BODY_PARTS = ["tailbase", "earR", "earL", "msBase", "msTop", "centroid", "cleft", "cright"]

# Label columns that are split off from the pose columns below.
RULE_BASED_LABELS = ["shuttles_label_naive1", "shuttles_label_naive2", "shuttles_label_naive3", "shuttles_label_hardcoded", "freezing_label"]

In [42]:
# Read the labelled DLC export. The CSV has a two-row header, so pandas builds a
# two-level column index; the first CSV column is used as the row index.
labelled_csv_path = os.path.join(DLC_DATASET_PATH, "labelled_DLC.csv")
df_pose_labels = pd.read_csv(labelled_csv_path, header=[0, 1], index_col=0)

# Flatten the two header levels into single "<level0>_<level1>" strings
flat_columns = pd.Index(['_'.join(col).strip() for col in df_pose_labels.columns.values])

# Where the second header row is blank, pandas fills in a placeholder of the form
# "Unnamed: <i>_level_<j>". Strip exactly that trailing placeholder (anchored at the
# end of the name) so that a real column name containing "_Unnamed" is never truncated.
df_pose_labels.columns = flat_columns.str.replace(r"_Unnamed: \d+_level_\d+$", "", regex=True)

# NOTE(review): the column listing printed later in this notebook shows "msBase_x.1"
# where "msTop_x" would be expected — this looks like a duplicated header label in the
# CSV that pandas de-duplicated. Confirm against the source file before relying on it.

# Remove columns that are entirely NaN, then rows that contain any NaN
df_pose_labels = (
    df_pose_labels
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='any')
)

print("Shape of df_pose_labels:", df_pose_labels.shape)
df_pose_labels.head()

Shape of df_pose_labels: (20000, 32)


Unnamed: 0,tailbase_x,tailbase_y,tailbase_likelihood,earR_x,earR_y,earR_likelihood,earL_x,earL_y,earL_likelihood,msBase_x,...,cright_y,cright_likelihood,calc_centroid_x,calc_centroid_y,speed,shuttles_label_naive1,shuttles_label_naive2,shuttles_label_naive3,shuttles_label_hardcoded,freezing_label
0,215.430298,318.572968,0.999996,335.296844,346.048737,0.999918,351.505981,309.215546,0.999891,351.370636,...,345.997833,0.999949,313.40094,326.633553,0.0,0,0,0,0,0
1,216.124664,319.105774,0.999997,334.252045,349.912659,0.99995,351.527374,307.669678,0.99998,351.550171,...,347.752991,0.99995,313.363564,327.412323,0.779667,0,0,0,0,0
2,215.643936,319.362427,0.999998,333.88031,349.696228,0.999942,351.895233,308.665619,0.999976,352.242249,...,348.114502,0.999924,313.415432,327.838448,0.42927,0,0,0,0,0
3,216.000183,319.383545,0.999997,334.263062,349.323456,0.99992,351.224792,307.465485,0.999981,351.896088,...,347.783112,0.999937,313.346031,327.274757,0.567946,0,0,0,0,0
4,215.784775,319.112488,0.999998,334.187714,349.681763,0.999959,351.621399,307.867065,0.999985,351.576569,...,347.832947,0.99997,313.292614,327.455528,0.188498,0,0,0,0,0


In [43]:
# Keep only the rule-based label columns (all rows), in the order given by RULE_BASED_LABELS.
df_labels = df_pose_labels.loc[:, RULE_BASED_LABELS]

print("Shape of df_labels:", df_labels.shape)
df_labels.head()

Shape of df_labels: (20000, 5)


Unnamed: 0,shuttles_label_naive1,shuttles_label_naive2,shuttles_label_naive3,shuttles_label_hardcoded,freezing_label
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [44]:
# Frames on which none of the rule-based labels fired (row sum of the labels is 0)
unlabeled_data = df_labels.sum(axis=1).eq(0)

# Build a new frame with an extra "other" column: 1 for unlabeled frames, 0 otherwise.
# df_labels itself is left untouched.
df_labels_other = df_labels.assign(other=unlabeled_data.astype("int64"))

print("Shape of df_labels_other:", df_labels_other.shape)
df_labels_other.head()

Shape of df_labels_other: (20000, 6)


Unnamed: 0,shuttles_label_naive1,shuttles_label_naive2,shuttles_label_naive3,shuttles_label_hardcoded,freezing_label,other
0,0,0,0,0,0,1
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1


In [45]:
# Everything except the rule-based label columns: keypoint coordinates, likelihoods
# and the remaining derived columns.
df_pose = df_pose_labels.drop(RULE_BASED_LABELS, axis="columns")

print("Shape of df_pose:", df_pose.shape)
df_pose.head()

Shape of df_pose: (20000, 27)


Unnamed: 0,tailbase_x,tailbase_y,tailbase_likelihood,earR_x,earR_y,earR_likelihood,earL_x,earL_y,earL_likelihood,msBase_x,...,centroid_likelihood,cleft_x,cleft_y,cleft_likelihood,cright_x,cright_y,cright_likelihood,calc_centroid_x,calc_centroid_y,speed
0,215.430298,318.572968,0.999996,335.296844,346.048737,0.999918,351.505981,309.215546,0.999891,351.370636,...,0.999951,281.177429,278.16452,0.999803,275.792725,345.997833,0.999949,313.40094,326.633553,0.0
1,216.124664,319.105774,0.999997,334.252045,349.912659,0.99995,351.527374,307.669678,0.99998,351.550171,...,0.999983,281.244598,276.559937,0.999884,277.471069,347.752991,0.99995,313.363564,327.412323,0.779667
2,215.643936,319.362427,0.999998,333.88031,349.696228,0.999942,351.895233,308.665619,0.999976,352.242249,...,0.999982,282.179657,276.486542,0.999849,276.880859,348.114502,0.999924,313.415432,327.838448,0.42927
3,216.000183,319.383545,0.999997,334.263062,349.323456,0.99992,351.224792,307.465485,0.999981,351.896088,...,0.999978,283.015259,276.601379,0.999904,277.634735,347.783112,0.999937,313.346031,327.274757,0.567946
4,215.784775,319.112488,0.999998,334.187714,349.681763,0.999959,351.621399,307.867065,0.999985,351.576569,...,0.999984,282.534119,277.098724,0.999917,276.700134,347.832947,0.99997,313.292614,327.455528,0.188498


## 2. Basic Info & Summary Statistics

In [46]:
# Print the column names, non-null counts and dtypes of the pose-only frame.
df_pose.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 0 to 19999
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tailbase_x           20000 non-null  float64
 1   tailbase_y           20000 non-null  float64
 2   tailbase_likelihood  20000 non-null  float64
 3   earR_x               20000 non-null  float64
 4   earR_y               20000 non-null  float64
 5   earR_likelihood      20000 non-null  float64
 6   earL_x               20000 non-null  float64
 7   earL_y               20000 non-null  float64
 8   earL_likelihood      20000 non-null  float64
 9   msBase_x             20000 non-null  float64
 10  msBase_y             20000 non-null  float64
 11  msBase_likelihood    20000 non-null  float64
 12  msBase_x.1           20000 non-null  float64
 13  msTop_y              20000 non-null  float64
 14  msTop_likelihood     20000 non-null  float64
 15  centroid_x           20000 non-null  floa

In [47]:
# Summary statistics per column, transposed so that each column of df_pose is one row.
df_pose.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
tailbase_x,20000.0,387.713075,153.66021,69.017937,260.55703,358.448959,526.712326,788.670776
tailbase_y,20000.0,254.462539,68.018463,59.827847,208.120079,259.345657,308.617203,391.764801
tailbase_likelihood,20000.0,0.981936,0.084167,0.004577,0.999474,0.999971,0.999994,1.0
earR_x,20000.0,388.674316,176.607557,9.667383,258.944054,356.780731,522.892899,821.359802
earR_y,20000.0,266.56039,80.980086,44.658031,211.077774,292.38562,329.222214,424.513489
earR_likelihood,20000.0,0.98514,0.08737,0.008146,0.999368,0.99989,0.999971,1.0
earL_x,20000.0,393.266979,179.023002,32.378056,267.11348,370.085068,544.096985,830.715149
earL_y,20000.0,263.302823,82.131447,43.581833,209.631668,285.433441,325.312035,436.150269
earL_likelihood,20000.0,0.985476,0.090719,0.005108,0.999636,0.999923,0.999976,1.0
msBase_x,20000.0,390.333988,182.655025,15.762933,256.992073,368.558289,536.291153,845.399597


In [51]:
# For each label column (including the catch-all "other"), report how many rows are flagged with 1
for label in [*RULE_BASED_LABELS, "other"]:
    is_flagged = df_labels_other[label].eq(1)
    subtable = df_labels_other.loc[is_flagged]
    print(f"Shape of subtable for {label}: {subtable.shape}")

Shape of subtable for shuttles_label_naive1: (903, 6)
Shape of subtable for shuttles_label_naive2: (1006, 6)
Shape of subtable for shuttles_label_naive3: (415, 6)
Shape of subtable for shuttles_label_hardcoded: (1122, 6)
Shape of subtable for freezing_label: (2209, 6)
Shape of subtable for other: (16010, 6)


In [54]:
# The four shuttle-label variants whose agreement is compared below
shuttle_label_columns = [
    "shuttles_label_naive1",
    "shuttles_label_naive2",
    "shuttles_label_naive3",
    "shuttles_label_hardcoded",
]

# Boolean frame: True where the given shuttle label equals 1 on that row
shuttle_hits = df_labels[shuttle_label_columns].eq(1)

# Rows on which every shuttle label is 1
all_shuttles_label_1 = df_labels[shuttle_hits.all(axis=1)]
print(f"Shape of rows where all shuttles labels are 1: {all_shuttles_label_1.shape}")

# Rows on which at least one shuttle label is 1
any_shuttles_label_1 = df_labels[shuttle_hits.any(axis=1)]
print(f"Shape of rows where any shuttles label is 1: {any_shuttles_label_1.shape}")

Shape of rows where all shuttles labels are 1: (80, 5)
Shape of rows where any shuttles label is 1: (1781, 5)


## 3. Filter by Likelihood

Smooths out unreliable keypoint coordinates based on confidence values (likelihoods).