## Name behaviors and add location context

#### This code names syllables based on their grid movies generated in the Keypoint-MoSeq pipeline and modifies these names by adding location context from locations.ipynb.
#### If using toy data, load the CSVs from the toy data folder, NOT from locations.ipynb.

In [1]:
# Packages
import os
import pandas as pd
import glob
import numpy as np
import os

In [2]:
# Folder that holds the TOY csv files; it becomes the working directory so the
# bare file names used later in this notebook resolve against it.
path_to_csvs = r"C:\Users\irs3th\test code"

if os.path.exists(path_to_csvs):
    # Switch into the data folder
    os.chdir(path_to_csvs)
else:
    raise FileNotFoundError(f"Cannot find the path: '{path_to_csvs}'")

# Echo the active directory as a sanity check
print("Current Working Directory:", os.getcwd())

Current Working Directory: C:\Users\irs3th\test code


### For this file naming scheme, use the code below to add id and group columns from the file name and combine the videos

In [3]:
# Base names (without the ".csv" extension) of the KPMS result files to load.
# The names follow the pattern <group>_<arena>_<id>_re; group and id are parsed from them below.
file_names = ["veh_arena1_1001_re", "veh_arena2_1002_re", "dan_arena1_1001_re", "dan_arena2_1002_re"] # toy datasets

# Extracts ID and group from filename
def extract_info(file_name):
    """
    Parse the experimental group and the animal ID out of a file name.

    The name is split on underscores; the first field is the group and the
    third field is the ID, e.g. "veh_arena1_1001_re" -> ("veh", "1001").

    Parameters:
        file_name: file name (no directory, no extension) with at least three
            underscore-separated fields.

    Returns:
        (group, id_) as a tuple of strings.

    Raises:
        ValueError: if the name has fewer than three underscore-separated fields.
    """
    parts = file_name.split('_')
    if len(parts) < 3:
        raise ValueError(
            f"Cannot extract group and id from '{file_name}': "
            "expected a name like '<group>_<arena>_<id>_re'"
        )
    group = parts[0]  # e.g. "veh" or "dan"
    id_ = parts[2]  # field that follows the arena field
    return group, id_

# Per-file dataframes are collected here, together with a file-name -> ID lookup
dataframes = []
id_map = {}

for file in file_names:
    df = pd.read_csv(f"{file}.csv")

    # 1-based frame counter taken from the row position in the file
    df['frame_index'] = df.index + 1

    # Tag every row with the group and ID encoded in the file name
    group, id_ = extract_info(file)
    df['group'] = group
    df['id'] = id_
    id_map[file] = id_

    # Put 'id' and 'group' first; all other columns keep their relative order
    leading = ['id', 'group']
    trailing = [name for name in df.columns.tolist() if name not in leading]
    df = df[leading + trailing]

    dataframes.append(df)

# Stack all files into one dataframe with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)

# Quick look at the result
print(combined_df.head())

     id group  syllable  centroid x  centroid y  heading  latent_state 0  \
0  1001   veh         0       700.9        75.2     1.01             0.5   
1  1001   veh         0       700.8        75.2     1.02             0.5   
2  1001   veh         0       700.7        75.2     1.00             0.5   
3  1001   veh         0       700.6        75.2     1.01             0.5   
4  1001   veh         0       700.5        75.2     1.02             0.5   

   latent_state 1  latent_state 2  latent_state 3  frame_index  
0            -0.5            -0.6             0.6            1  
1            -0.5            -0.6             0.6            2  
2            -0.5            -0.6             0.6            3  
3            -0.5            -0.6             0.6            4  
4            -0.5            -0.6             0.6            5  


## Name syllables based on grid movies; you can decide how to name them. I commented out noisy syllables or those we are not interested in.

In [4]:
# Map each syllable index to a human-readable behavior name.
# Indices that are commented out or absent (12, 24, 77, 79, 82) have no entry, so
# Series.map() below leaves NaN in the 'behavior' column for those syllables.
syllable_behavior_map = {
    0: "run",
    1: "move forward",
    2: "run", 
    3: "pause", 
    4: "run",
    5: "run", 
    6: "swing back", # leap or move quickly back on the wheel
    7: "swing back",
    8: "run",
    9: "swing back",
    10: "run",
    11: "move forward",
    #12: "remove", # background
    13: "run",
    14: "swing back", 
    15: "turn",
    16: "swing back", 
    17: "run",
    18: "swing back", 
    19: "move forward", 
    20: "move forward",
    21: "groom/sniff", # whether sniff or groom depends on context, can better define with nose movement if wanted
    22: "groom/sniff",
    23: "groom/sniff", 
    #24: "remove", # background
    25: "turn", 
    26: "turn", 
    27: "run",
    28: "swing back", 
    29: "sniff", 
    30: "hopper interaction", # head probe 
    31: "swing back", 
    32: "swing back", 
    33: "hopper interaction",
    34: "turn",
    35: "run",
    36: "turn/groom", 
    37: "swing back", 
    38: "hopper interaction",
    39: "groom",
    40: "run",
    41: "turn", 
    42: "turn", 
    43: "sniff", 
    44: "groom/sniff",
    45: "run",
    46: "run",
    47: "run",
    48: "groom/sniff",
    49: "run",
    50: "sniff",
    51: "run",
    52: "run",
    53: "run",
    54: "run",
    55: "groom",
    56: "groom/sniff",
    57: "turn",
    58: "groom",
    59: "groom",
    60: "turn",
    61: "turn",
    62: "hopper interaction",
    63: "groom",
    64: "groom/sniff",
    65: "move forward", 
    66: "groom/sniff",
    67: "turn/groom",
    68: "run",
    69: "run",
    70: "sniff",
    71: "run",
    72: "groom/sniff",
    73: "run",
    74: "groom",
    75: "run",
    76: "run",
    # NOTE(review): 77 has no entry and no explanation -- confirm this is intentional
    78: "run",
    #79: "remove",
    80: "groom/sniff",
    81: "groom/sniff",
    # no 82? 
    83: "groom/sniff",
    84: "groom",
    85: "groom/sniff", 
    86: "groom",
    87: "groom/sniff",
    88: "run",
    89: "sniff",
    90: "groom",
    91: "groom/sniff" 
    
}


# Apply the mapping to create the 'behavior' column (NaN where the syllable has no entry)
combined_df['behavior'] = combined_df['syllable'].map(syllable_behavior_map)

# Check the result (displayed as the cell output)
combined_df 

Unnamed: 0,id,group,syllable,centroid x,centroid y,heading,latent_state 0,latent_state 1,latent_state 2,latent_state 3,frame_index,behavior
0,1001,veh,0,700.9,75.2,1.01,0.5,-0.5,-0.6,0.6,1,run
1,1001,veh,0,700.8,75.2,1.02,0.5,-0.5,-0.6,0.6,2,run
2,1001,veh,0,700.7,75.2,1.00,0.5,-0.5,-0.6,0.6,3,run
3,1001,veh,0,700.6,75.2,1.01,0.5,-0.5,-0.6,0.6,4,run
4,1001,veh,0,700.5,75.2,1.02,0.5,-0.5,-0.6,0.6,5,run
...,...,...,...,...,...,...,...,...,...,...,...,...
967,1002,dan,22,678.1,75.4,1.02,0.5,-0.5,-0.6,0.6,239,groom/sniff
968,1002,dan,22,678.0,75.4,1.00,0.5,-0.5,-0.6,0.6,240,groom/sniff
969,1002,dan,22,677.9,75.4,1.01,0.5,-0.5,-0.6,0.6,241,groom/sniff
970,1002,dan,22,677.8,75.4,1.02,0.5,-0.5,-0.6,0.6,242,groom/sniff


# Add the location data 

In [5]:
# Cast the animal ID to str so both sides of the merge use the same dtype for 'id'
combined_df['id'] = combined_df['id'].astype(str)

# Location CSVs to load (toy locations)
csv_list = [
    "location_veh_arena1_1001_new.csv",
    "location_veh_arena2_1002_new.csv",
    "location_dan_arena1_1001_new.csv",
    "location_dan_arena2_1002_new.csv",
]

# Load every file that exists; a missing file is reported and skipped
df_list = []
for file in csv_list:
    full_path = os.path.join(path_to_csvs, file)
    if not os.path.exists(full_path):
        print(f"File not found: {full_path}")
        continue

    print(f"Loading file: {full_path}")
    df = pd.read_csv(full_path)
    df['id'] = df['id'].astype(str)
    df_list.append(df)

# Nothing loaded at all is an error; otherwise stack the location tables
if not df_list:
    raise ValueError("No CSV files were loaded; check file paths and names.")
locations = pd.concat(df_list, ignore_index=True)

# Inner join: keep only rows whose (id, group, frame_index) appear in both tables
merged_df = combined_df.merge(locations, how='inner', on=['id', 'group', 'frame_index'])

# Verify merge result
print("Merge completed. Shape of merged DataFrame:", merged_df.shape)

Loading file: C:\Users\irs3th\test code\location_veh_arena1_1001_new.csv
Loading file: C:\Users\irs3th\test code\location_veh_arena2_1002_new.csv
Loading file: C:\Users\irs3th\test code\location_dan_arena1_1001_new.csv
Loading file: C:\Users\irs3th\test code\location_dan_arena2_1002_new.csv
Merge completed. Shape of merged DataFrame: (972, 15)


In [6]:
merged_df

Unnamed: 0,id,group,syllable,centroid x,centroid y,heading,latent_state 0,latent_state 1,latent_state 2,latent_state 3,frame_index,behavior,shelter,food_hopper,water
0,1001,veh,0,700.9,75.2,1.01,0.5,-0.5,-0.6,0.6,1,run,False,False,False
1,1001,veh,0,700.8,75.2,1.02,0.5,-0.5,-0.6,0.6,2,run,False,False,False
2,1001,veh,0,700.7,75.2,1.00,0.5,-0.5,-0.6,0.6,3,run,False,False,False
3,1001,veh,0,700.6,75.2,1.01,0.5,-0.5,-0.6,0.6,4,run,False,False,False
4,1001,veh,0,700.5,75.2,1.02,0.5,-0.5,-0.6,0.6,5,run,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967,1002,dan,22,678.1,75.4,1.02,0.5,-0.5,-0.6,0.6,239,groom/sniff,True,False,False
968,1002,dan,22,678.0,75.4,1.00,0.5,-0.5,-0.6,0.6,240,groom/sniff,True,False,False
969,1002,dan,22,677.9,75.4,1.01,0.5,-0.5,-0.6,0.6,241,groom/sniff,True,False,False
970,1002,dan,22,677.8,75.4,1.02,0.5,-0.5,-0.6,0.6,242,groom/sniff,True,False,False


Next, modify behavior names based on the columns that give us information about animal location.

In [7]:
# Rows whose syllable has no entry in syllable_behavior_map carry NaN in 'behavior'
# (Series.map leaves unmapped values as NaN); remove them before relabelling.
merged_df = merged_df.dropna(subset=['behavior']) # drop rows without detected behavior

# Exact-match renames applied when 'food_hopper' is True.
# Could add move forward and/or pause to this table too!
_FOOD_HOPPER_RENAMES = {
    'sniff': 'sniff by food',
    'turn': 'turn by food',
    'groom/sniff': 'sniff by food',
    'hopper interaction': 'hopper interaction by food',  # only of interest by food
}

# Exact-match renames applied when 'water' is True.
_WATER_RENAMES = {
    'groom/sniff': 'sniff/drink',
    'sniff': 'sniff/drink',
    'pause': 'pause/drink',
}


def modify_behavior(row):
    """
    Return a copy of `row` whose 'behavior' is relabelled with location context.

    Rules, applied in this order:
      1. If 'shelter' is True, " in shelter" is appended to the behavior.
      2. If 'food_hopper' is True and the behavior exactly matches a key of
         _FOOD_HOPPER_RENAMES, it is replaced by the mapped name.
      3. If 'water' is True and the behavior exactly matches a key of
         _WATER_RENAMES, it is replaced by the mapped name.

    Steps 2 and 3 are exact matches, so a behavior that already received the
    " in shelter" suffix, or that was renamed by the food-hopper step, is not
    renamed again.

    Parameters:
        row: a pandas Series (one dataframe row) with the entries 'behavior',
            'shelter', 'food_hopper' and 'water'.

    Returns:
        A new Series; the input row is left untouched because DataFrame.apply
        does not support functions that mutate the object passed to them.
    """
    updated = row.copy()
    behavior = updated['behavior']

    # Shelter context applies to any behavior (you can pick what you care is in shelter)
    if updated['shelter']:
        behavior = f"{behavior} in shelter"

    if updated['food_hopper']:
        behavior = _FOOD_HOPPER_RENAMES.get(behavior, behavior)

    if updated['water']:
        behavior = _WATER_RENAMES.get(behavior, behavior)

    updated['behavior'] = behavior
    return updated

# Apply modify_behavior to every row (axis=1). The result is named location_df:
# the behavior table with location context, used for the rest of the notebook.
location_df = merged_df.apply(modify_behavior, axis=1)

In [8]:
# Hopper interaction is only of interest by the food hopper. The plain and in-shelter
# variants are a similar movement elsewhere, so those rows are discarded.
unwanted_behaviors = ['hopper interaction in shelter', 'hopper interaction']
location_df = location_df[~location_df['behavior'].isin(unwanted_behaviors)]

In [9]:
# Get the unique values of behavior column
location_df.behavior.unique()

array(['run', 'swing back', 'move forward', 'sniff by food',
       'hopper interaction by food', 'groom in shelter', 'groom/sniff',
       'pause in shelter', 'groom/sniff in shelter'], dtype=object)

In [97]:
# Optional: drop rows after frame 180,000 if you forgot to crop the video
# location_df = location_df[location_df['frame_index'] <= 180000]

Group behaviors into larger, general categories. I used network analysis of all nodes and watched the videos to get a sense of how these should be grouped. If you want to use the original behaviors, just skip this cell.

In [10]:
# Collapse the location-aware behavior names into larger categories.
# Names with no entry here (e.g. plain 'pause') are left unchanged by rename_behaviors.
behavior_mapping = {
    'hopper interaction by food': 'food motivated',
    'sniff by food': 'food motivated',
    'turn by food': 'food motivated',
    'sniff/drink': 'drink',
    'pause/drink': 'drink',
    'move forward': 'move/explore',
    'move forward in shelter': 'move/explore - shelter',
    'run': 'move/explore',
    'run in shelter': 'move/explore - shelter',
    'swing back': 'move/explore',
    'swing back in shelter': 'move/explore - shelter',
    'turn': 'move/explore',
    'sniff': 'move/explore',
    'groom': 'groom',
    'groom/sniff': 'groom',
    'turn/groom': 'groom',
    'groom in shelter': 'rest/groom in shelter',
    'groom/sniff in shelter': 'rest/groom in shelter',
    'sniff in shelter': 'rest/groom in shelter',
    'turn/groom in shelter': 'rest/groom in shelter',
    'turn in shelter': 'rest/groom in shelter',
    'pause in shelter': 'rest/groom in shelter',
   # 'pause': 'rest' # Left out on purpose: it is more of a transition state than rest, but you could define it as rest by setting a minimum bout threshold
}

def rename_behaviors(df, mapping):
    """
    Rename behaviors based on exact match in the behavior name map.

    Parameters:
        df: dataframe with a 'behavior' column.
        mapping: dict of {current behavior name: new behavior name}.

    Returns:
        A copy of `df` whose 'behavior' values are replaced by their mapped
        names; values with no entry in `mapping` keep their current name.
        The input dataframe is not modified, which also avoids pandas'
        SettingWithCopyWarning when `df` is a filtered slice of another frame.
    """
    renamed = df.copy()
    # map() yields NaN for names missing from the mapping; fall back to the old name
    renamed['behavior'] = renamed['behavior'].map(mapping).fillna(renamed['behavior'])
    return renamed

# Replace each behavior with its broader category (names missing from the mapping are kept)
location_df = rename_behaviors(location_df, behavior_mapping)

In [11]:
location_df.behavior.unique()

array(['move/explore', 'food motivated', 'rest/groom in shelter', 'groom'],
      dtype=object)

In [12]:
# Save the per-frame table to the working directory for the time-series analysis
location_df.to_csv("location_time.csv", index=False)

# Now that we have modified behaviors of interest, calculate total time spent on each behavior and save as a csv for pie charts, STATS, and clustering. 

In [87]:
# Collect the behaviors currently present; this list is passed to
# get_behavior_duration below so that each of them gets a column.
behavior_names = location_df.behavior.unique().tolist()

# Get the unique values of behavior column
location_df.behavior.unique()

array(['move/explore', 'food motivated', 'groom', 'rest/groom in shelter'],
      dtype=object)

In [88]:
# Remove any row with a missing behavior before counting frames
location_df = location_df.dropna(subset=['behavior']) # drop rows without detected behavior

def get_behavior_duration(df, behavior_names, fps=25):
    """
    Get the duration of behaviors for each id, group, behavior (total number frames / fps = sec).
    All specified behavior names are included with a duration of 0 if not present.

    Parameters:
        df: dataframe with 'id', 'group' and 'behavior' columns, one row per frame.
        behavior_names: behavior names that must appear as columns in the result.
            Behaviors present in `df` but not listed here are left out.
        fps: frames per second used to convert frame counts to seconds
            (defaults to 25, the value that was previously hard-coded).

    Returns:
        A dataframe with one row per (id, group) pair that occurs in `df` and
        one '<behavior> sec' column per behavior name.

    Raises:
        ValueError: if `fps` is not positive.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps}")

    # Pair every *observed* (id, group) with every behavior name. A full
    # id x group x behavior product would invent all-zero rows for animals in
    # groups they never belonged to.
    observed_pairs = df[['id', 'group']].drop_duplicates()
    behaviors = pd.DataFrame({'behavior': list(behavior_names)})
    all_combinations = observed_pairs.merge(behaviors, how='cross')

    # Count the frames of each behavior for each id, group
    behavior_counts = df.groupby(['id', 'group', 'behavior']).size().reset_index(name='count')

    # Left merge so that behaviors an animal never showed get a count of 0
    behavior_counts_filled = pd.merge(
        all_combinations, behavior_counts, on=['id', 'group', 'behavior'], how='left'
    ).fillna({'count': 0})

    # Convert frame counts to seconds
    behavior_counts_filled['time (sec)'] = behavior_counts_filled['count'] / fps

    # Pivot the dataframe to have behaviors as columns
    behavior_duration_pivot = behavior_counts_filled.pivot_table(
        index=['id', 'group'],
        columns='behavior',
        values='time (sec)',
        fill_value=0
    ).reset_index()

    # Rename the behavior columns to append ' sec'
    behavior_duration_pivot.columns = [
        col if col in ['id', 'group'] else f'{col} sec'
        for col in behavior_duration_pivot.columns
    ]

    return behavior_duration_pivot

# Build the per-(id, group) duration table from the behavior/location dataframe
behavior_duration = get_behavior_duration(location_df, behavior_names) # locations and behavior names

# Write the table to the working directory
behavior_duration.to_csv('behavior_test.csv', index=False) # save as csv 

### Now move on to build_dataset.ipynb