# Update behavior annotations in datasets

#### Brandon Pratt, 10/04/2024

Description: This notebook uses manual behavioral annotations from the anipose visualizer to append new behavioral annotations to a dataset. The general use case for this code is to refine predicted behavioral annotations as needed to retrain behavioral classifiers. 

### Workflow:

1) Update behavioral annotations in the anipose visualizer (https://flyviz.biz/).

2) Download the .json file for each date on which data in the dataset were collected.

3) Create a new git branch (git branch NAME) and open this notebook.

4) Specify the directory that contains the dataset (.pq) and .json files in the code below. 

5) Run the cells below. A column called 'behavior_annotations', which contains the updated annotations, is added to the dataset, and the result is saved as a new file with the suffix '_updated_annotations.pq'.

6) Optional: Assign a numerical value to each behavior for training classifiers.

In [30]:
# python libraries
import pandas as pd
import numpy as np
import json
import glob
import pathlib
from tqdm import tqdm

In [31]:
# locate and read data
filename = 'CxHP8_gtACR1_grooming_trials.pq'
file_path = pathlib.Path.cwd().parent.joinpath('data')
data = pd.read_parquet(file_path.joinpath(filename), engine='pyarrow')

# Add a behavior column initialized with NaNs to data.
# The column is created with object dtype because string labels are written into it
# later; writing strings into a float64 column relies on implicit upcasting, which
# pandas has deprecated.
if 'behavior_annotations' in data.columns:
    # same failure mode as joining an overlapping column, but with an explicit message
    raise ValueError("'behavior_annotations' already exists in the dataset; "
                     "rename or drop it before adding updated annotations")
data['behavior_annotations'] = np.full(len(data), np.nan, dtype=object)

In [32]:
# find all filenames in dataset (np.unique returns them sorted, one entry per trial)
unique_files = np.unique(data['fullfile']).tolist()
n_files = len(unique_files)
print(f'Number of Trials = {n_files}')

# Extract the info needed to search through the jsons for behavioral labels.
# Each file name is split on '|': field 1 is used as the fly id and field 2 as the
# trial (session) id.
split_names = [name.split('|') for name in unique_files]

# fail early with the offending names instead of an IndexError in the middle of the loop
malformed = [name for name, parts in zip(unique_files, split_names) if len(parts) < 3]
if malformed:
    raise ValueError(f"'fullfile' entries need at least three '|'-separated fields: {malformed}")

# Build the table in one step from string lists, rather than writing strings cell by
# cell into a frame pre-filled with float NaNs.
info_df = pd.DataFrame({
    'fly id': [parts[1] for parts in split_names],    # fly #
    'trial id': [parts[2] for parts in split_names],  # session id
})

Number of Trials = 33


In [33]:
# Header line, followed by the cell's rich display of every trial name in the dataset.
print('Flies to annotate:')
unique_files

Flies to annotate:


['4.4.24|Fly 2_0|04042024_fly2_0 R10C1  str-cw-0 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R10C10  str-ccw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R10C5  str-cw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R10C6  str-ccw-0 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R1C14  rot-cw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R2C1  str-cw-0 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R2C15  rot-cw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R2C17  rot-ccw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R3C1  str-cw-0 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R3C9  str-ccw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R4C11  rot-cw-0 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R4C14  rot-cw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R4C17  rot-ccw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R4C3  str-cw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R5C12  rot-cw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R5C19  rot-ccw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R5C3  str-cw-1 sec',
 '4.4.24|Fly 2_0|04042024_fly2_0 R5C6  str-ccw-0 sec',
 '4.4.24

### Extract behavioral annotations from jsons

Note: Each json file is associated with only a single date of collected data, so the user needs to download all relevant json files.

In [34]:
# behavior annotation directory (all jsons will be found)
json_path = pathlib.Path.cwd().parent.joinpath('data')

# glob returns matches in arbitrary order; sort so the jsons are always searched in
# the same order regardless of platform or filesystem
json_list = sorted(glob.glob(str(json_path.joinpath('*.json'))))
if not json_list:
    print(f'No .json files found in {json_path}')

# extract behavior annotations from each json and collect them in a list
merged_jsons = []
for json_file in json_list:  # distinct name so the directory in json_path is not overwritten
    # Open and read the JSON file (explicit encoding; do not rely on the platform default)
    with open(json_file, 'r', encoding='utf-8') as file:
        merged_jsons.append(json.load(file))

In [35]:
# iterate through json files and add updated behavioral annotations to the dataset

# Behavior labels are written into this column below. Cast it to object first so the
# writes do not depend on pandas implicitly upcasting a float64 (all-NaN) column.
data['behavior_annotations'] = data['behavior_annotations'].astype(object)

for i in tqdm(range(n_files)):
    fly_id = info_df['fly id'][i]
    trial_id = info_df['trial id'][i]
    trial_flag = 0 # flag if the trial data is found

    # Row labels of this trial, in dataset order. This does not depend on the bout,
    # so it is computed once per trial instead of once per bout.
    df_indices = data.index[(data['fullfile'] == unique_files[i]).to_numpy()]

    for json_data in merged_jsons:
        # skip jsons that do not contain this fly / trial
        if fly_id not in json_data or trial_id not in json_data[fly_id]:
            continue

        trial_flag = 1 # update trial flag because data was found
        trial_data = json_data[fly_id][trial_id]

        # append each behavior bout to the data
        for bout, curr_bout in trial_data.items():
            behavior = curr_bout['behavior']
            start_frame = curr_bout['start']
            if start_frame is None: # occurs if behavior is deleted
                continue

            end_frame = curr_bout['end']
            if end_frame is None:
                # a bout without an end frame cannot be placed; report it instead of crashing
                print(f'{fly_id}_{trial_id} bout {bout} has no end frame, skipped')
                continue

            # The visualizer can report a start of -1 (error in visualizer?). Clamp any
            # negative start to 0 so the slice never wraps around to the end of the trial.
            start_frame = max(int(start_frame), 0)

            # Frames are used as positions within the trial's rows; the end frame is inclusive.
            # NOTE(review): assumes the trial's rows are stored in frame order - confirm.
            bout_rows = df_indices[start_frame:int(end_frame + 1)]
            data.loc[bout_rows, 'behavior_annotations'] = behavior

        # found data so no need to search other json files
        break

    if trial_flag == 0:
        print(f'{fly_id}_{trial_id} not found')

# save the data with updated behavior annotations
# (Path.stem drops the extension whatever its length, unlike slicing off 3 characters)
new_filename = f'{pathlib.Path(filename).stem}_updated_annotations.pq'
data.to_parquet(file_path.joinpath(new_filename))

100%|██████████| 33/33 [00:00<00:00, 154.41it/s]
