# Prepare documents to be labeled with social orientation tags by GPT4

In [2]:
%load_ext autoreload
%autoreload 2

import os
import pickle
import json
from collections import deque
import hashlib

import pandas as pd
import numpy as np
import xmltodict
import tiktoken
from tqdm import tqdm

# from charm.data import utils as charm_utils
import utils

In [3]:
# Select which chat model the prompts are sized for.
# `model` picks the tiktoken encoding and is forwarded to calculate_cost;
# `token_limit` is the context budget used to size conversation chunks.
model = 'gpt-3.5-turbo'
token_limit = 4096
# Alternative configuration (uncomment both lines to target GPT-4 instead):
# model = 'gpt-4'
# token_limit = 8192

In [4]:
# Load the cached corpus. The cells below index it as a dict keyed by file_id whose
# entries carry 'processed', 'data_type', 'utterances' and 'splits'.
# NOTE(review): presumably utils.load_pickle unpickles the file — only load caches you trust.
data = utils.load_pickle(os.path.expanduser('~/Documents/data/charm/transformed/tm3229-cache.pkl'))

## Create GPT prompts
Data preparation plan
1. Convert participant IDs to speaker numbers
1. Only annotate LDC annotated regions
1. Prepare the data for all splits in chunks of 100 conversations
    1. Prioritize internal train and val splits for starters
1. Assess prices of processing everything
1. Measure conversation length and split conversation into multiple chunks as needed
1. Save to jsonl
1. Merge in change point information (won't repeat this exercise due to concerns about the incorrectness of timestamps)

In [5]:
def id_speakers_convo(group_df):
    """Attach a numeric ``participant_id`` to every row of one conversation.

    Speakers are numbered 1, 2, 3, ... in order of first appearance in
    ``group_df``. Missing participant values are replaced by the label
    'unknown', which is then numbered like any other speaker. When the frame
    has no 'participant' column at all, every row gets the string 'unknown'
    as its ``participant_id``.

    The frame is modified in place and also returned.
    """
    has_participants = 'participant' in group_df.columns
    if not has_participants:
        group_df['participant_id'] = 'unknown'
        return group_df

    # treat missing speakers as one shared 'unknown' speaker
    participants = group_df['participant'].fillna('unknown')
    group_df['participant'] = participants

    # first-seen order -> 1, 2, 3, ...
    speaker_map = {
        speaker: number
        for number, speaker in enumerate(participants.unique(), start=1)
    }

    # translate each participant label into its speaker number
    group_df['participant_id'] = group_df['participant'].apply(
        lambda speaker: speaker_map[speaker]
    )
    return group_df

In [6]:
# create conversation turn
def create_line(row):
    """Render one utterance as the 'Speaker <id> (<utterance id>): <text>' line sent to GPT."""
    # TODO: could optionally include the time
    speaker = row['participant_id']
    turn = row['utterance_id']
    return f"Speaker {speaker} ({turn}): {row['text']}"

In [7]:
def prepare_gpt_lines(data, transcript='whisper'):
    """Add speaker numbers, utterance IDs and GPT-ready lines to every processed file.

    For each entry of ``data`` flagged as processed, the utterances are
        1. given a numeric ``participant_id`` per speaker,
        2. sorted by ``start`` and numbered with a 1-based ``utterance_id``,
        3. rendered into the ``gpt_line`` string that will be sent to GPT,
    and the enriched records are written back into ``data`` in place.

    Args:
        data: dict keyed by file_id; each value holds 'processed', 'data_type'
            and 'utterances' (a list of records for text files, otherwise a
            dict of record lists keyed by transcript name).
        transcript: which transcript to read and overwrite for non-text files.

    Returns:
        (data, unprocessed): the same dict object, plus the list of file_ids
        skipped because they are not flagged as processed.
    """
    unprocessed = []
    for file_id in tqdm(data.keys()):
        entry = data[file_id]
        if not entry['processed']:
            unprocessed.append(file_id)
            continue

        # text files store utterances directly; other types nest them per transcript
        is_text = entry['data_type'] == 'text'
        records = entry['utterances'] if is_text else entry['utterances'][transcript]

        # number the speakers first, then order by start time before numbering the turns
        temp_df = id_speakers_convo(pd.DataFrame(records))
        temp_df = temp_df.sort_values(by='start', ascending=True).reset_index(drop=True)
        temp_df['utterance_id'] = temp_df.index + 1
        temp_df['gpt_line'] = temp_df.apply(create_line, axis=1)

        # write the enriched records back into the slot they were read from
        enriched = temp_df.to_dict('records')
        if is_text:
            entry['utterances'] = enriched
        else:
            entry['utterances'][transcript] = enriched
    return data, unprocessed

In [8]:
# Enrich every processed file (the dict is updated in place and rebound);
# `unprocessed` collects the file_ids that were skipped.
data, unprocessed = prepare_gpt_lines(data)

100%|██████████| 10008/10008 [01:08<00:00, 146.56it/s]


## Create GPT chunks

In [9]:
# base instruction prompt
with open('prompt.txt', 'r') as prompt_file:
    prompt = prompt_file.read()

# addendum used when the speaker of each utterance is unknown
with open('prompt_speaker_unknown.txt', 'r') as addendum_file:
    prompt_speaker_unknown = addendum_file.read()

In [10]:
# need some descriptive stats on distribution of conversation lengths in terms of encoding length
# tokenizer for the selected model; used below to measure prompts and utterances in tokens
encoding = tiktoken.encoding_for_model(model)

In [11]:
# remove the last two lines of the prompt and add the speaker unknown prompt
# NOTE: this rebinds `prompt_speaker_unknown` from the raw addendum to the full prompt,
# so the cell is not idempotent — re-run the prompt-loading cell before re-running this one,
# otherwise the base prompt is prepended a second time.
prompt_speaker_unknown = '\n'.join(prompt.split('\n')[:-2]) + '\n' + prompt_speaker_unknown
# (kept for reference) how a full model input used to be assembled from a sample frame:
# model_input = prompt + '\n'.join(sample_df['Complete Line'].tolist()) + '\n\nOutput:\n'
# model_input_speaker_unknown = prompt_speaker_unknown + '\n'.join(sample_df['Complete Line (Unknown Speaker)'].tolist()) + '\n\nOutput:\n'

In [12]:
# Token counts of the two instruction prompts; passed to calculate_cost further down
# to separate the fixed prompt from the conversation text.
prompt_length = len(encoding.encode(prompt))
prompt_speaker_unknown_length = len(encoding.encode(prompt_speaker_unknown))

In [13]:
# Report both prompt sizes (in tokens).
print(f"Prompt length: {prompt_length}")
print(f"Prompt (speaker unknown) length: {prompt_speaker_unknown_length}")

Prompt length: 962
Prompt (speaker unknown) length: 1715


In [14]:
# The model's context window (`token_limit`) must hold the prompt, the conversation
# and the response, so leave half of what remains after the prompt for the response:
# max conversation length = (token_limit - prompt length) / 2
prompt_tokens = len(encoding.encode(prompt))
max_input_length = int((token_limit - prompt_tokens) / 2)
print(f"Max conversation input length (excluding prompt): {max_input_length}")

prompt_speaker_unknown_tokens = len(encoding.encode(prompt_speaker_unknown))
max_input_length_no_speaker = int((token_limit - prompt_speaker_unknown_tokens) / 2)
print(f"Max conversation input length (no speaker, excluding prompt): {max_input_length_no_speaker}")

Max conversation input length (excluding prompt): 1567
Max conversation input length (no speaker, excluding prompt): 1190


In [15]:
def chunk_utterances(utterances, max_size, n_overlap=10):
    """Split one conversation into consecutive, overlapping chunks of utterances.

    Utterances are appended to the current chunk until its token total exceeds
    ``max_size`` (so a chunk overshoots the budget by its final utterance) or
    the conversation ends. After a full chunk, the scan rewinds so that the
    next chunk starts with the last ``n_overlap`` utterances of the previous
    one, for continuity. The rewind is skipped when the trailing utterances
    tracked for the chunk already exceed ``max_size`` on their own, which
    guards against an infinite loop, and at the end of the conversation.

    Args:
        utterances: list of dicts, each with an integer 'encoding_length'.
        max_size: token budget per chunk.
        n_overlap: number of trailing utterances repeated in the next chunk.

    Returns:
        A list of chunks; each chunk is a list of the original utterance dicts.
    """
    chunks = []
    current_chunk = []
    current_size = 0
    # token lengths of the most recent utterances of the current chunk
    last_n = deque([], maxlen=n_overlap)
    last_idx = len(utterances) - 1
    idx_end = 0
    while idx_end != len(utterances):
        utterance = utterances[idx_end]
        current_chunk.append(utterance)
        current_size += utterance['encoding_length']
        last_n.append(utterance['encoding_length'])

        # record chunk if filled or at end of utterances
        at_end = idx_end == last_idx
        if current_size > max_size or at_end:
            # step back to get some overlap with the previous utterances, unless
            # that tail is itself over budget or there is nothing left to chunk
            if not at_end and sum(last_n) <= max_size:
                idx_end = max(0, idx_end - n_overlap)
            chunks.append(current_chunk)
            # reset trackers
            current_chunk = []
            current_size = 0
            last_n.clear()

        # advance idx_end
        idx_end += 1
    return chunks


# Measure every processed conversation in tokens, store the per-utterance lengths
# back into `data`, and split each conversation into GPT-sized chunks.
# (could probably do this faster if everything was in a single list/df)
n_overlap = 10  # utterances shared between consecutive chunks
convo_turn_lengths = []
convo_encoding_lengths = []
for file_id in tqdm(data.keys()):
    entry = data[file_id]
    if not entry['processed']:
        continue
    # text files store utterances directly; audio/video keep them under the whisper transcript
    is_text = entry['data_type'] == 'text'
    temp_df = pd.DataFrame(entry['utterances'] if is_text else entry['utterances']['whisper'])

    # filter temp_df to only annotated regions
    # not doing this because of issues with Whisper timestamps
    # temp_df = temp_df[(temp_df['start'] >= data[file_id]['start']) & (temp_df['end'] <= data[file_id]['end'])]

    convo_turn_lengths.append((file_id, entry['data_type'], len(temp_df)))

    # token length of every GPT line, plus the running total within the conversation
    encoded_content = encoding.encode_batch(temp_df['gpt_line'].values.tolist())
    encoding_lengths = [len(encoded_line) for encoded_line in encoded_content]
    temp_df['encoding_length'] = encoding_lengths
    temp_df['encoding_cumsum'] = temp_df['encoding_length'].cumsum()
    convo_encoding_lengths.append((file_id, entry['data_type'], sum(encoding_lengths)))

    # save the length info back to data, in the slot it was read from
    utterances = temp_df.to_dict('records')
    if is_text:
        entry['utterances'] = utterances
    else:
        entry['utterances']['whisper'] = utterances

    # the speaker-unknown prompt is longer, so non-text files get a smaller budget
    max_size = max_input_length if is_text else max_input_length_no_speaker

    # save chunks back to data
    entry['gpt_prompts'] = chunk_utterances(utterances, max_size, n_overlap=n_overlap)

100%|██████████| 10008/10008 [04:30<00:00, 37.02it/s]


In [16]:
# Tabulate the per-conversation turn counts and token totals collected during chunking.
convo_turn_lengths_df = pd.DataFrame(convo_turn_lengths, columns=['file_id', 'data_type', 'num_turns'])
convo_encoding_lengths_df = pd.DataFrame(convo_encoding_lengths, columns=['file_id', 'data_type', 'encoding_length'])

In [17]:
# distribution of the number of turns per conversation, by data type
convo_turn_lengths_df.groupby('data_type').describe()

Unnamed: 0_level_0,num_turns,num_turns,num_turns,num_turns,num_turns,num_turns,num_turns,num_turns
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
data_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
audio,727.0,360.05227,308.962545,16.0,215.0,293.0,374.0,3640.0
text,1767.0,83.787776,70.766406,15.0,40.0,64.0,105.0,959.0
video,7474.0,394.418651,369.629549,1.0,204.0,311.0,475.0,4905.0


In [18]:
# distribution of the total token count per conversation, by data type
convo_encoding_lengths_df.groupby('data_type').describe()

Unnamed: 0_level_0,encoding_length,encoding_length,encoding_length,encoding_length,encoding_length,encoding_length,encoding_length,encoding_length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
data_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
audio,727.0,6033.68088,5344.089944,234.0,3911.0,4519.0,5599.5,69103.0
text,1767.0,2583.804754,2253.267471,354.0,1223.0,1813.0,3077.0,23254.0
video,7474.0,7275.57787,6458.365181,215.0,3787.0,5453.0,8212.5,82887.0


In [19]:
# sanity check distribution of chunk lengths:
# one (file_id, data_type, number of chunks) row per processed file
chunk_lengths = [
    (file_id, entry['data_type'], len(entry['gpt_prompts']))
    for file_id, entry in data.items()
    if entry['processed']
]

In [20]:
# distribution of the number of chunks per conversation, by data type
pd.DataFrame(chunk_lengths, columns=['file_id', 'data_type', 'num_chunks']).groupby('data_type').describe()

Unnamed: 0_level_0,num_chunks,num_chunks,num_chunks,num_chunks,num_chunks,num_chunks,num_chunks,num_chunks
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
data_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
audio,727.0,6.35901,5.296355,1.0,4.0,5.0,6.0,69.0
text,1767.0,2.335597,2.01523,1.0,1.0,2.0,3.0,22.0
video,7474.0,7.204576,6.120104,1.0,4.0,5.0,8.0,81.0


### Create Final GPT Prompts, Break into 100 Conversation Splits, Assess Prices

In [21]:
# split by train/val/test/eval
# use the speaker unknown prompt for the video and audio data
# collect every split label that appears on a processed file
splits = set().union(*(
    entry['splits'] for entry in data.values() if entry['processed']
))

splits

{'EVALUATION_LDC2023E07', 'INTERNAL_TEST', 'INTERNAL_TRAIN', 'INTERNAL_VAL'}

In [22]:
# One list of [file_id, messages] pairs per split.
train_prompts = []
val_prompts = []
test_prompts = []
eval_prompts = []
# A file goes to the first split (in this order) that it is tagged with.
split_destinations = [
    ('INTERNAL_TRAIN', train_prompts),
    ('INTERNAL_VAL', val_prompts),
    ('INTERNAL_TEST', test_prompts),
    ('EVALUATION_LDC2023E07', eval_prompts),
]
for file_id in tqdm(data):
    entry = data[file_id]
    if not entry['processed']:
        continue
    # text files get the regular prompt; all other data types the speaker-unknown variant
    preamble = prompt if entry['data_type'] == 'text' else prompt_speaker_unknown
    file_gpt_messages = []
    for chunk in entry['gpt_prompts']:
        conversation = '\n'.join([utterance['gpt_line'] for utterance in chunk])
        final_prompt = preamble + conversation + '\n\nOutput:\n'
        # format GPT messages
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": final_prompt},
        ]
        file_gpt_messages.append([file_id, messages])

    for split_name, destination in split_destinations:
        if split_name in entry['splits']:
            destination.extend(file_gpt_messages)
            break

100%|██████████| 10008/10008 [00:01<00:00, 7618.29it/s]


In [23]:
# tally up unique file ids in each split to make sure we have the right number:
# every processed file must have landed in exactly one of the four splits
train_file_id_count = len({file_id for file_id, _ in train_prompts})
val_file_id_count = len({file_id for file_id, _ in val_prompts})
test_file_id_count = len({file_id for file_id, _ in test_prompts})
eval_file_id_count = len({file_id for file_id, _ in eval_prompts})
processed_file_count = sum(1 for entry in data.values() if entry['processed'])
split_file_total = train_file_id_count + val_file_id_count + test_file_id_count + eval_file_id_count
assert split_file_total == processed_file_count

In [24]:
# split each split into chunks of 100 file_ids randomly by using a hash function
# These counts are only printed for inspection; assign_to_hash_bucket derives the
# number of buckets again from the prompts it is given.
train_hash_buckets = train_file_id_count // 100
val_hash_buckets = val_file_id_count // 100
test_hash_buckets = test_file_id_count // 100
eval_hash_buckets = eval_file_id_count // 100
print(train_hash_buckets, val_hash_buckets, test_hash_buckets, eval_hash_buckets)

33 5 5 56


In [25]:
def assign_to_hash_bucket(prompts, num_file_ids_per_bucket=100):
    """Shard prompts into buckets of roughly ``num_file_ids_per_bucket`` files each.

    The bucket of a prompt is the SHA-256 hash of its file_id modulo the number
    of buckets, so all prompts of one file land in the same bucket and the
    assignment is reproducible across runs.

    Args:
        prompts: iterable of ``[file_id, messages]`` pairs (file_id is a str).
        num_file_ids_per_bucket: target number of distinct files per bucket.

    Returns:
        dict mapping bucket index to a list of ``[file_id, messages]`` pairs.
        Buckets that receive no file are absent; empty input gives ``{}``.
    """
    file_id_count = len({file_id for file_id, _ in prompts})
    # Always use at least one bucket: with fewer files than num_file_ids_per_bucket
    # the integer division gives 0 and the modulo below would raise ZeroDivisionError.
    num_hash_buckets = max(1, file_id_count // num_file_ids_per_bucket)
    # loop through files and assign to hash buckets
    shards = {}
    for file_id, messages in prompts:
        digest = hashlib.sha256(file_id.encode('utf-8')).hexdigest()
        hash_bucket = int(digest, 16) % num_hash_buckets
        shards.setdefault(hash_bucket, []).append([file_id, messages])
    return shards

In [26]:
# Shard each split with the default of 100 file_ids per bucket.
train_shards = assign_to_hash_bucket(train_prompts)
val_shards = assign_to_hash_bucket(val_prompts)
test_shards = assign_to_hash_bucket(test_prompts)
eval_shards = assign_to_hash_bucket(eval_prompts)

In [27]:
# sharding must neither drop nor duplicate prompts: for every split, the messages
# across all of its shards must add up to the split's prompt count
for shard_map, prompt_list in [
    (train_shards, train_prompts),
    (val_shards, val_prompts),
    (test_shards, test_prompts),
    (eval_shards, eval_prompts),
]:
    assert sum(len(bucket) for bucket in shard_map.values()) == len(prompt_list)

In [28]:
# calculate total cost
def calculate_cost(data, shard, prompt_length, prompt_speaker_unknown_length, model='gpt-4'):
    """Estimate the dollar cost of sending one shard of prompts to the model.

    Input cost is based on the token count of every message list in the shard.
    Output size is not known in advance, so it is estimated as half of the
    non-instruction part of each input (probably an overestimate, since
    responses seem to be much shorter than prompts).

    Args:
        data: dict keyed by file_id; only each entry's 'data_type' is read.
        shard: iterable of ``(file_id, messages)`` pairs.
        prompt_length: token length of the regular prompt (used for text files).
        prompt_speaker_unknown_length: token length of the speaker-unknown
            prompt (used for every other data type).
        model: 'gpt-4' selects prices of 0.03 / 0.06 per 1000 prompt / response
            tokens; any other value uses 0.002 for both.

    Returns:
        Estimated input cost plus estimated output cost.
    """
    # cost per 1000 tokens
    is_gpt4 = model == 'gpt-4'
    prompt_cost, response_cost = (0.03, 0.06) if is_gpt4 else (0.002, 0.002)

    total_prompt_tokens = 0
    estimated_output_tokens = 0
    for file_id, messages in shard:
        # NOTE(review): tokens are always counted with model='gpt-4', whatever
        # `model` is — confirm this is intended.
        input_len = utils.num_tokens_from_messages(messages, model='gpt-4')
        total_prompt_tokens += input_len
        if data[file_id]['data_type'] == 'text':
            preamble_len = prompt_length
        else:
            preamble_len = prompt_speaker_unknown_length
        estimated_output_tokens += (input_len - preamble_len) / 2

    input_cost = (total_prompt_tokens / 1000) * prompt_cost
    output_cost = (estimated_output_tokens / 1000) * response_cost
    return input_cost + output_cost

In [29]:
# estimated cost of every shard: {split name: {shard id: cost}}
shard_costs = {
    name: {
        shard: calculate_cost(data, shard_prompts, prompt_length, prompt_speaker_unknown_length, model=model)
        for shard, shard_prompts in split.items()
    }
    for name, split in [('train', train_shards), ('val', val_shards), ('test', test_shards), ('eval', eval_shards)]
}

In [30]:
# convert shard costs to a dataframe (one column per split, one row per shard id)
shard_costs_df = pd.DataFrame(shard_costs)

In [31]:
# per-shard cost distribution for each split
shard_costs_df.describe()

Unnamed: 0,train,val,test,eval
count,33.0,5.0,5.0,56.0
mean,4.14906,3.693544,3.622777,4.783058
std,0.52464,0.454993,0.7518,0.649478
min,2.829947,2.995734,2.628138,3.182713
25%,3.781194,3.549116,3.277541,4.397372
50%,4.192907,3.743929,3.464696,4.757459
75%,4.531407,4.043519,4.31982,5.148559
max,5.023166,4.13542,4.42369,6.51306


In [32]:
# total estimated cost per split
shard_costs_df.sum()

train    136.918984
val       18.467718
test      18.113885
eval     267.851243
dtype: float64

In [42]:
# retrieve 1 document of each type from train_shard 0
# (when a type occurs several times, the last file seen wins)
sample_ids = {'video': None, 'text': None, 'audio': None}
for file_id, _ in train_shards[0]:
    data_type = data[file_id]['data_type']
    if data_type in sample_ids:
        sample_ids[data_type] = file_id
video = sample_ids['video']
text = sample_ids['text']
audio = sample_ids['audio']

# retain all the messages for these selected files
video_messages = [[file_id, messages] for file_id, messages in train_prompts if file_id == video]
audio_messages = [[file_id, messages] for file_id, messages in train_prompts if file_id == audio]
text_messages = [[file_id, messages] for file_id, messages in train_prompts if file_id == text]

In [43]:
# create data directory (no error if it already exists); the jsonl files below are written here
os.makedirs('data', exist_ok=True)

In [44]:
# save these messages to a jsonl file, train_shard_0_sample.jsonl
# (one JSON-encoded [file_id, messages] pair per line)
with open('./data/train_shard_0_sample.jsonl', 'w') as f:
    all_messages = video_messages + audio_messages + text_messages
    f.writelines(json.dumps(sample) + "\n" for sample in all_messages)

In [36]:
def count_utterances(data, shard):
    """Count the utterances stored for a shard's files and those present in its prompts.

    Args:
        data: dict keyed by file_id with 'data_type' and 'utterances' (a list
            for text files, otherwise a dict holding the list under 'whisper').
        shard: iterable of ``(file_id, messages)`` pairs; ``messages[1]['content']``
            is the prompt, whose conversation follows the last 'Input:\\n' marker
            and is followed by three trailing lines.

    Returns:
        (actual_utterances, gpt_utterances): utterances stored for the unique
        files of the shard, and conversation lines counted over all prompts.
        Each prompt is counted separately.
    """
    def stored_utterances(file_id):
        entry = data[file_id]
        if entry['data_type'] == 'text':
            return entry['utterances']
        return entry['utterances']['whisper']

    # every file counts once, however many prompts it was split into
    unique_file_ids = {file_id for file_id, _ in shard}
    actual_utterances = sum(len(stored_utterances(file_id)) for file_id in unique_file_ids)

    # conversation lines = everything after 'Input:\n', minus the trailing three lines
    gpt_utterances = 0
    for _, messages in shard:
        conversation = messages[1]['content'].split('Input:\n')[-1]
        gpt_utterances += len(conversation.split('\n')[:-3])

    return actual_utterances, gpt_utterances

In [37]:
# tally up how many utterances are in each shard
# (only the stored-utterance count is kept; the prompt-side count is computed but not recorded)
shard_utterances_counts = {}
for name, split in [('train', train_shards), ('val', val_shards), ('test', test_shards), ('eval', eval_shards)]:
    per_shard_counts = {}
    for shard, shard_prompts in split.items():
        actual_utterances, gpt_utterances = count_utterances(data, shard_prompts)
        per_shard_counts[shard] = actual_utterances
    shard_utterances_counts[name] = per_shard_counts

In [38]:
# one column per split, one row per shard id
utterance_counts = pd.DataFrame(shard_utterances_counts)

In [39]:
# distribution of stored utterances per shard, for each split
utterance_counts.describe()

Unnamed: 0,train,val,test,eval
count,33.0,5.0,5.0,56.0
mean,26043.484848,30825.2,29979.2,39182.839286
std,3461.614048,3491.59687,6456.704322,5274.111271
min,18954.0,25032.0,21889.0,28260.0
25%,23331.0,30891.0,26640.0,35823.75
50%,25981.0,31050.0,28578.0,38715.5
75%,28687.0,33507.0,34680.0,42641.75
max,34865.0,33646.0,38109.0,52794.0


In [41]:
# save shards to disk: one jsonl file per shard, one [file_id, messages] pair per line
for name, split in [('train', train_shards), ('val', val_shards), ('test', test_shards), ('eval', eval_shards)]:
    for shard, shard_prompts in split.items():
        shard_path = f'./data/{name}_shard_{shard}.jsonl'
        with open(shard_path, 'w') as f:
            f.writelines(
                json.dumps([file_id, messages]) + "\n"
                for file_id, messages in shard_prompts
            )


In [52]:
# save updated pickle file (the enriched `data` dict, including the gpt_prompts chunks)
# NOTE(review): the content is a pickle but the file name ends in '.json' — readers must
# load it with pickle, not json. Consider renaming once downstream consumers are checked.
with open(os.path.expanduser('~/Documents/data/charm/transformed/tm3229-cache-updated.json'), 'wb') as f:
    pickle.dump(data, f)

In [40]:
# delete files, as needed
# import glob
# for name in ['train', 'val', 'test', 'eval']:
#     for file in glob.glob(f'./data/{name}_shard_*'):
#         os.remove(file)

### Graveyard (old code) - though some of this may be much faster (e.g. using a DF instead of a list of dicts)

In [76]:
# df['Complete Line Length'] =  df['Complete Line'].apply(lambda x: len(encoding.encode(x)))
# def group_cumsum(group_df):
#     group_df['line_len_cumsum'] = group_df['Complete Line Length'].cumsum()
#     return group_df
# df = df.groupby('file_id', group_keys=False).apply(group_cumsum)
# # for each conversation, create a message
# temp_df = df[df['file_id'] == 'M01000GE2']
# conversation_string = '\n'.join(temp_df['Complete Line'].values)
# messages = [
#   {"role": "system", "content": "You are a helpful assistant."},
#   {"role": "user", "content": prompt},
# ]

# # check length, first check with just the prompt
# prompt_len = num_tokens_from_messages(messages)

# model_input = prompt + conversation_string
# # then check the whole convo and get the diff
# messages = [
#   {"role": "system", "content": "You are a helpful assistant."},
#   {"role": "user", "content": model_input},
# ]

# input_len = num_tokens_from_messages(messages)

# leftover = 4_096 - input_len
# # want capacity of about 2500 tokens for the model, which means convo must be less than
# # 4,096 - 2000 - prompt_len
# convo_target_len = 4_096 - 2250 - prompt_len
# convo_target_len
# # loop over conversations and generate complete prompts
# prompts = []
# for file_id in df['file_id'].unique():
#     file_df = df[df['file_id'] == file_id]

#     # identify indices where convo chunk is approx convo_target_len
#     start = 0
#     end = convo_target_len
#     idx_end = 0
#     while idx_end+1 != len(file_df):
#         chunk_df = file_df[(file_df['line_len_cumsum'] > start) & (file_df['line_len_cumsum'] <= end)]
#         idx_start = chunk_df.iloc[0].name
#         idx_end = chunk_df.iloc[-1].name
        
#         # create model input
#         conversation_string = '\n'.join(file_df.iloc[idx_start:idx_end+1]['Complete Line'].values)
#         model_input = prompt + conversation_string
        
#         messages = [
#           {"role": "system", "content": f"You are a helpful assistant."},
#           {"role": "user", "content": model_input},
#         ]
#         prompts.append([file_id, messages])

#         # update start and end
#         start = chunk_df.iloc[-1]['line_len_cumsum']
#         end = start + convo_target_len
# prompts[0][1]
# print(prompts[0][-1][-1]['content'])
# # estimate cost
# token_count = 0
# for p in prompts:
#     token_count += num_tokens_from_messages(p[-1])
# # price
# (token_count / 1000) * 0.002
# os.makedirs('data', exist_ok=True)
# # save to disk
# # filename = "data/gpt_requests.jsonl"

# # with open(filename, "w") as f:
# #     for p in prompts:
# #         json_string = json.dumps(p)
# #         f.write(json_string + "\n")
# df['filename'].nunique()
# len(df)
# df.merge(meta_df.drop_duplicates(subset=['file_uid']), left_on='file_id', right_on='file_uid', how='left')['release'].value_counts()
# df
# # save the final df to disk
# df.rename(columns={'social_orientation': 'social_orientation_random'}, inplace=True)
# circumplex_dir = os.path.join(data_dir, 'transformed/circumplex')
# os.makedirs(circumplex_dir, exist_ok=True)
# save_filepath = os.path.join(circumplex_dir, 'gpt_prompts_r1_mini_eval_text.csv')
# df.to_csv(save_filepath, index=False)
# # run GPT on the prompts
# ## Merge change points into utterances
# # for each file_id, convert participants to numbers
# df = df.groupby('filename', group_keys=False).apply(id_speakers)
# df['@begin_offset'] = df['@begin_offset'].astype(int)
# df['@char_length'] = df['@char_length'].astype(int)
# def merge_changepoints(group_df, change_point_anno_df):
#     # identify file_i
#     file_id = group_df['file_id'].iloc[0]
#     file_df = change_point_anno_df[change_point_anno_df['file_id'] == file_id].sort_values(by='timestamp')
#     # merge in changepoint data
#     merged_df = pd.merge_asof(group_df, file_df[['timestamp', 'impact_scalar', 'comment']], left_on='@begin_offset', right_on='timestamp', direction='nearest')
#     # remove invalid matches
#     # TODO: this doesn't solve for the issue of multiple changepoints in one utterance
#     greater_equal = merged_df['@begin_offset'] <= merged_df['timestamp']
#     less = merged_df['timestamp'] < (merged_df['@begin_offset'] + merged_df['@char_length'])
#     merged_df.loc[~(greater_equal & less), ['timestamp', 'impact_scalar', 'comment']] = np.nan
#     return merged_df
# from functools import partial
# merge_changepoints_partial = partial(merge_changepoints, change_point_anno_df=change_point_anno_df)
# change_point_anno_df['timestamp'] = change_point_anno_df['timestamp'].astype(int)
# file_id = 'M01000GZR'
# file_df = change_point_anno_df[change_point_anno_df['file_id'] == file_id]
# df = df.groupby('file_id', group_keys=False).apply(merge_changepoints_partial)
# df['timestamp'].notnull().sum()
# change_point_anno_df['file_id'].isin(text_file_ids).sum()
# # check that all change points available were used
# # assert df['timestamp'].notnull().sum() == change_point_anno_df['file_id'].isin(text_file_ids).sum()