# Prepare a sample from the BOLT text message corpus for Amazon Mechanical Turk labeling

In [2]:
import os

import pandas as pd
import numpy as np
import xmltodict
import tiktoken

from charm.data import utils

In [3]:
# Load the corpus metadata table; the following cells filter it down to text
# conversations that have changepoint labels.
# The data root can be overridden through the CHARM_DATA_DIR environment variable so
# the notebook is not tied to one machine; the default keeps the original location.
data_dir = os.environ.get('CHARM_DATA_DIR', '/home/iron-man/Documents/data/charm')
meta_df = pd.read_csv(os.path.join(data_dir, 'transformed/metadata.csv'))

In [4]:
change_point_annotated = meta_df['changepoint_count'] > 0
text_modality = meta_df['modality'] == 'text'
text_anno_df = meta_df[change_point_annotated & text_modality]

In [5]:
text_anno_df['release'].value_counts().sum()

624

In [6]:
r1 = 'LDC2022E11_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R1'
r1_text_dir = os.path.join(data_dir, f'raw/{r1}/data/text')
ltf_dir = os.path.join(r1_text_dir, 'ltf')
psm_dir = os.path.join(r1_text_dir, 'psm')

In [7]:
# mini eval dir
mini_eval = 'LDC2022E22_CCU_TA1_Mandarin_Chinese_Mini_Evaluation_Source_Data'
mini_eval_text_dir = os.path.join(data_dir, f'raw/{mini_eval}/data/text')
mini_eval_ltf_dir = os.path.join(mini_eval_text_dir, 'ltf')
mini_eval_psm_dir = os.path.join(mini_eval_text_dir, 'psm')

In [8]:
def load_conversation(ltf_file, psm_file):
    """Read the raw text of an LTF file and of its companion PSM file.

    Args:
        ltf_file: Path to the LTF XML file. It must exist.
        psm_file: Path to the PSM XML file. It may be missing.

    Returns:
        A ``(ltf_content, psm_content)`` tuple. ``ltf_content`` is the file text;
        ``psm_content`` is the file text, or ``False`` when the PSM file does not
        exist (callers test for that sentinel).

    Raises:
        FileNotFoundError: If ``ltf_file`` does not exist.
    """
    # Decode explicitly instead of relying on the platform's locale default, which can
    # fail or corrupt non-ASCII (Mandarin) text.
    # NOTE(review): assumes the source XML files are UTF-8 encoded -- confirm.
    with open(ltf_file, 'r', encoding='utf-8') as f:
        ltf_content = f.read()

    try:
        with open(psm_file, 'r', encoding='utf-8') as f:
            psm_content = f.read()
    except FileNotFoundError:
        # A missing PSM file is reported through the sentinel rather than an exception.
        psm_content = False
    return ltf_content, psm_content

def unpack_attributes(attribute):
    """Flatten parsed ``attribute`` entries into a ``{name: value}`` dict.

    Args:
        attribute: The parsed ``attribute`` children of one PSM entry. Normally a
            list of mappings with ``'@name'`` and ``'@value'`` keys. A single mapping
            is also accepted, because xmltodict returns a bare dict rather than a
            one-element list when an element has only one such child. ``None`` or
            NaN (no attributes at all) is accepted as well.

    Returns:
        dict mapping each attribute's ``'@name'`` to its ``'@value'``; empty when
        there are no attributes.
    """
    # Missing attributes: None, or the NaN pandas inserts for an absent column value.
    if attribute is None or (isinstance(attribute, float) and attribute != attribute):
        return {}
    # A lone child element is parsed as a dict; iterating it would yield its keys.
    if isinstance(attribute, dict):
        attribute = [attribute]
    return {attr['@name']: attr['@value'] for attr in attribute}

def merge_metadata(ltf_content, psm_content):
    """Join the LTF segments of one conversation with their PSM message attributes.

    Args:
        ltf_content: XML text of the LTF file.
        psm_content: XML text of the PSM file.

    Returns:
        DataFrame built from the LTF ``SEG`` entries, left-joined with the PSM
        ``message`` entries (and their unpacked attributes) on the segment's
        ``@start_char`` == the message's ``@begin_offset``. Segments without a PSM
        match keep NaN in the PSM columns.

    Raises:
        AssertionError: If, after dropping zero-length PSM matches, the row count
            differs from the number of LTF segments.
    """
    ltf_doc = xmltodict.parse(ltf_content)
    psm_doc = xmltodict.parse(psm_content)

    # Only the PSM entries typed as 'message' carry per-message attributes.
    message_entries = [
        entry for entry in psm_doc['psm']['string'] if entry['@type'] == 'message'
    ]
    psm_df = pd.DataFrame(message_entries)

    # Expand each message's attribute entries into columns of their own.
    unpacked_rows = psm_df['attribute'].apply(unpack_attributes).values.tolist()
    psm_attr_df = pd.DataFrame(unpacked_rows)
    psm_df = pd.concat((psm_df.drop(columns=['attribute']), psm_attr_df), axis=1)

    ltf_df = pd.DataFrame(ltf_doc['LCTL_TEXT']['DOC']['TEXT']['SEG'])

    # Left join keeps every LTF segment, matched on its starting character offset.
    merged = pd.merge(
        ltf_df,
        psm_df,
        left_on='@start_char',
        right_on='@begin_offset',
        how='left',
    )

    # Drop matches whose PSM content length is '0'; afterwards there must be exactly
    # one row per LTF segment.
    has_content = merged['@char_length'] != '0'
    merged = merged[has_content].reset_index(drop=True)
    assert (len(merged) == len(ltf_df))
    # Attributes may still be missing (NaN) for some messages.
    return merged

In [9]:
ltf_files = [os.path.join(ltf_dir, f) for f in os.listdir(ltf_dir) if f != '.DS_Store']
psm_files = [os.path.join(psm_dir, f) for f in os.listdir(psm_dir) if f != '.DS_Store']

In [10]:
# load mini-eval data too
ltf_files += [os.path.join(mini_eval_ltf_dir, f) for f in os.listdir(mini_eval_ltf_dir) if f != '.DS_Store']
psm_files += [os.path.join(mini_eval_psm_dir, f) for f in os.listdir(mini_eval_psm_dir) if f != '.DS_Store']

In [11]:
len(ltf_files)

1360

In [12]:
len(psm_files)

1360

In [13]:
# Parse every LTF file together with its PSM companion and collect one frame per
# conversation. Files that cannot be used are recorded in `errors` instead of
# aborting the run.
dfs = []
errors = []
for ltf_file in ltf_files:
    # Build the PSM path from the last two path components only (.../ltf/X.ltf.xml ->
    # .../psm/X.psm.xml). Replacing 'ltf' across the whole path would also rewrite
    # any parent directory that happens to contain that substring.
    ltf_parent, ltf_name = os.path.split(ltf_file)
    text_dir, ltf_dirname = os.path.split(ltf_parent)
    psm_file = os.path.join(
        text_dir, ltf_dirname.replace('ltf', 'psm'), ltf_name.replace('ltf', 'psm')
    )

    ltf_content, psm_content = load_conversation(ltf_file, psm_file)
    # load_conversation signals a missing PSM file with the sentinel False; use an
    # identity check so other falsy-but-valid contents are not misclassified.
    if psm_content is False:
        errors.append((ltf_file, psm_file, 'PSM file not found'))
        continue
    conversation_df = merge_metadata(ltf_content, psm_content)

    # Skip conversations in which any message lacks its participant attribute.
    if conversation_df['participant'].isna().any():
        errors.append((ltf_file, psm_file, 'Attributes missing'))
        continue

    # retain filename
    conversation_df.insert(0, 'filename', ltf_file)
    dfs.append(conversation_df)

In [14]:
error_df = pd.DataFrame(errors, columns=['ltf_file', 'psm_file', 'error'])

In [15]:
error_df['error'].value_counts()

Attributes missing    5
Name: error, dtype: int64

In [16]:
len(dfs)

1355

In [17]:
df = pd.concat(dfs)
df = df.reset_index(drop=True)

In [18]:
df['filename'] = df['filename'].apply(lambda x: os.path.split(x)[-1])

In [19]:
# number of conversations
df['filename'].nunique()

1355

In [20]:
sample_file = text_anno_df.iloc[0]['file_uid'] + text_anno_df.iloc[0]['data_type']

In [21]:
sample_df = df[df['filename'] == sample_file]

In [22]:
sample_df = sample_df[['ORIGINAL_TEXT', 'time', 'participant']]

In [23]:
sample_df = sample_df.rename(columns={'ORIGINAL_TEXT':'Original Text', 'time': 'Time', 'participant': 'Participant'})

In [24]:
# Map each participant, in order of first appearance, to a speaker letter (A, B, C, ...).
# Letters are generated rather than read from a fixed two-element list, so a
# conversation with more than two participants no longer raises an IndexError.
speaker_map = {}
for idx, participant in enumerate(sample_df['Participant'].unique()):
    speaker_map[participant] = chr(ord('A') + idx)

In [25]:
sample_df['Participant'] = sample_df['Participant'].apply(lambda x: speaker_map[x])

In [26]:
sample_df = sample_df.reset_index(drop=True)
sample_df.index.name = 'Utterance ID'
sample_df = sample_df.reset_index()

In [27]:
def create_line(row):
    """Format one utterance row as ``Speaker <participant> (<utterance id>):  <text>``.

    Reads the 'Participant', 'Utterance ID' and 'Original Text' entries of ``row``.
    """
    # TODO: could optionally include the time
    speaker = row['Participant']
    utterance_id = row['Utterance ID']
    text = row['Original Text']
    return f"Speaker {speaker} ({utterance_id}):  {text}"

sample_df['Complete Line'] = sample_df.apply(create_line, axis=1)

In [28]:
conversation_string = '\n'.join(sample_df['Complete Line'].values.tolist())

In [29]:
# load prompt to prepend:
with open('prompt.txt', 'r') as f:
    prompt = f.read()

In [30]:
# get 10 utterances at a clip to fit within ChatGPT
prompts = []
for i in range(0, len(sample_df), 10):
    conversation_string = '\n'.join(sample_df['Complete Line'].iloc[i: i+10].values.tolist())
    prompts.append(prompt + '\n\n' + conversation_string)

In [31]:
# Load the ChatGPT output, keeping one entry per non-blank line.
# Each line is stripped so the trailing newline does not end up inside the parsed
# explanations, and whitespace-only lines are skipped like empty ones.
chat_gpt_lines = []
with open('ChatGPT_output.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            chat_gpt_lines.append(line)

In [32]:
chat_gpt_labels = [x.split(': ')[1].split(' -')[0] for x in chat_gpt_lines]

In [33]:
# The explanation is everything after the first ' - ' separator; split only once so
# an explanation that itself contains ' - ' is not cut short.
chat_gpt_explanations = [x.split(' - ', 1)[1] for x in chat_gpt_lines]

In [34]:
sample_df['chat_gpt_labels'] = chat_gpt_labels

In [35]:
sample_df['chat_gpt_explanations'] = chat_gpt_explanations

In [36]:
sample_df['character_count'] = sample_df['Original Text'].apply(lambda x: len(x))

In [37]:
sample_df['character_count_cumsum'] = sample_df['character_count'].cumsum()

In [38]:
# get start/stop character intervals for each utterance
# intervals will be [start, end)
# start should be previous row character_count_cumsum
# end should be start + character_count
sample_df['start_character'] = sample_df['character_count_cumsum'].shift(1, fill_value=0.0)
sample_df['end_character'] = sample_df['start_character'] + sample_df['character_count']

In [39]:
sample_file_id = text_anno_df.iloc[0]['file_uid']

In [41]:
# load annotations
result = utils.load_ldc_annotations(os.path.join(data_dir, 'raw'))

In [42]:
changepoint_dfs = {}
for anno in result:
    changepoint_dfs[anno] = result[anno]['anno_dfs']['changepoint.tab']

In [43]:
change_point_anno_df = pd.concat(changepoint_dfs.values())

In [44]:
change_point_anno_df['timestamp'] = change_point_anno_df['timestamp'].astype(int)

In [45]:
change_point_sample_anno_df = change_point_anno_df[change_point_anno_df['file_id'] == text_anno_df.iloc[0]['file_uid']]

In [46]:
# Attach to each utterance the first changepoint at or after its start character.
# - merge_asof requires the right frame to be sorted on its key, so sort by timestamp.
# - direction='forward' is used instead of 'nearest': 'nearest' can pick a changepoint
#   from the previous utterance, which the validity filter in the next cell then
#   discards, silently dropping a changepoint that does lie inside this utterance.
sample_df = pd.merge_asof(
    sample_df,
    change_point_sample_anno_df[['timestamp', 'impact_scalar', 'comment']].sort_values(by='timestamp'),
    left_on='start_character',
    right_on='timestamp',
    direction='forward',
)

In [47]:
# Keep a merged changepoint only when its timestamp lies inside the utterance's
# [start_character, end_character) interval; otherwise blank out the merged columns.
# TODO: this doesn't solve for the issue of multiple changepoints in one utterance
at_or_after_start = sample_df['timestamp'] >= sample_df['start_character']
before_end = sample_df['end_character'] > sample_df['timestamp']
outside_utterance = ~at_or_after_start | ~before_end
sample_df.loc[outside_utterance, ['timestamp', 'impact_scalar', 'comment']] = np.nan

In [48]:
save_df = sample_df[['Utterance ID', 'Participant', 'Time', 'Original Text', 'chat_gpt_labels', 'chat_gpt_explanations','timestamp', 'impact_scalar', 'comment']]

In [49]:
# save_df.to_csv(f'{sample_file_id}_social_orientation.csv', index=False)

### Save complete dataset

In [50]:
r1_transformed_dir = os.path.join(data_dir, f'transformed/{r1}/data/text')
os.makedirs(r1_transformed_dir, exist_ok=True)
df.to_csv(os.path.join(r1_transformed_dir, 'text.csv'), index=False)

In [51]:
r1_transformed_dir

'/home/iron-man/Documents/data/charm/transformed/LDC2022E11_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R1/data/text'

### Create and save a Circumplex version of the dataset

In [52]:
import random

circumplex_labels = ['Assured-Dominant', 'Gregarious-Extraverted', 'Warm-Agreeable', 'Unassuming-Ingenuous', 'Unassured-Submissive', 'Aloof-Introverted', 'Cold', 'Arrogant-Calculating']
# Generate random placeholder labels for now. A dedicated, seeded generator makes the
# saved file reproducible across runs without touching the global random state.
label_rng = random.Random(42)
df['social_orientation'] = label_rng.choices(circumplex_labels, k=len(df))

In [53]:
df.to_csv(os.path.join(r1_transformed_dir, 'text_circumplex_random.csv'), index=False)

### Create GPT prompts

In [54]:
import json

os.makedirs('data', exist_ok=True)

### Data preparation plan
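1. Only annotate changepoint-annotated conversations (originally R1 only; the R1 filter below is currently commented out, so Mini-Eval conversations are included as well)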
1. Convert participant IDs to speaker letters
1. Merge in change point information
1. Measure conversation length and split conversation into multiple chunks as needed
1. Save to jsonl

In [55]:
# label everything
# text_anno_df = text_anno_df[text_anno_df['release'] == 'R1']

In [56]:
text_file_ids = set(text_anno_df['file_uid'].unique())

In [57]:
# filter conversations df to these file_ids
df['file_id'] = df['filename'].apply(lambda x: x.split('.')[0])

In [58]:
df = df[df['file_id'].isin(text_file_ids)]

In [59]:
temp_df = df[df['filename'] == 'M01000GZR.ltf.xml']

In [60]:
def id_speakers(group_df):
    """Replace participant identifiers with speaker numbers for one conversation.

    Participants are numbered 1, 2, 3, ... in order of first appearance in
    ``group_df['participant']``.

    Args:
        group_df: DataFrame for a single conversation, with a 'participant' column.

    Returns:
        A copy of ``group_df`` whose 'participant' column holds the speaker numbers.
        The input frame is left untouched: pandas documents that functions passed
        to ``groupby.apply`` should not mutate the group they receive.
    """
    participants = group_df['participant']
    speaker_map = {
        participant: number
        for number, participant in enumerate(participants.unique(), start=1)
    }

    numbered_df = group_df.copy()
    numbered_df['participant'] = participants.map(speaker_map)
    return numbered_df

In [61]:
# for each file_id, convert participants to numbers
df = df.groupby('filename', group_keys=False).apply(id_speakers)

In [62]:
df['@begin_offset'] = df['@begin_offset'].astype(int)
df['@char_length'] = df['@char_length'].astype(int)

In [63]:
def merge_changepoints(group_df, change_point_anno_df):
    """Attach changepoint annotations to the utterances of one conversation.

    Args:
        group_df: Utterances of a single file, with 'file_id', integer
            '@begin_offset' and integer '@char_length' columns.
        change_point_anno_df: Changepoint annotations for all files, with 'file_id',
            integer 'timestamp', 'impact_scalar' and 'comment' columns.

    Returns:
        A new DataFrame (fresh index) with the utterances ordered by '@begin_offset'
        plus 'timestamp', 'impact_scalar' and 'comment' columns. These are filled
        only where a changepoint timestamp falls inside the utterance's
        [@begin_offset, @begin_offset + @char_length) interval, and NaN elsewhere.
    """
    annotation_cols = ['timestamp', 'impact_scalar', 'comment']

    # identify the file and select its annotations; merge_asof needs sorted keys
    file_id = group_df['file_id'].iloc[0]
    is_this_file = change_point_anno_df['file_id'] == file_id
    file_df = change_point_anno_df.loc[is_this_file, annotation_cols].sort_values(by='timestamp')

    # merge_asof also requires the left key to be sorted; a stable sort is a no-op
    # when the utterances already arrive in offset order.
    utterances = group_df.sort_values(by='@begin_offset', kind='stable')

    # Match each utterance with the first changepoint at or after its start.
    # 'forward' replaces 'nearest': 'nearest' could select a changepoint belonging to
    # the previous utterance, which is then rejected below, silently dropping a
    # changepoint that does lie inside this utterance.
    merged_df = pd.merge_asof(
        utterances,
        file_df,
        left_on='@begin_offset',
        right_on='timestamp',
        direction='forward',
    )

    # remove invalid matches, i.e. changepoints outside the utterance's span
    # TODO: this doesn't solve for the issue of multiple changepoints in one utterance
    utterance_end = merged_df['@begin_offset'] + merged_df['@char_length']
    inside = (merged_df['@begin_offset'] <= merged_df['timestamp']) & (merged_df['timestamp'] < utterance_end)
    merged_df.loc[~inside, annotation_cols] = np.nan
    return merged_df

In [64]:
from functools import partial

In [65]:
merge_changepoints_partial = partial(merge_changepoints, change_point_anno_df=change_point_anno_df)

In [66]:
change_point_anno_df['timestamp'] = change_point_anno_df['timestamp'].astype(int)

In [67]:
file_id = 'M01000GZR'
file_df = change_point_anno_df[change_point_anno_df['file_id'] == file_id]

In [68]:
df = df.groupby('file_id', group_keys=False).apply(merge_changepoints_partial)

In [69]:
df['timestamp'].notnull().sum()

205

In [70]:
change_point_anno_df['file_id'].isin(text_file_ids).sum()

206

In [71]:
# check that all change points available were used
# assert df['timestamp'].notnull().sum() == change_point_anno_df['file_id'].isin(text_file_ids).sum()

In [72]:
# create utterance ID for each file
def create_utterance_id(group_df):
    """Number the rows of one conversation 1..N in an 'Utterance ID' column.

    Args:
        group_df: DataFrame holding the utterances of a single file, in order.

    Returns:
        A copy of ``group_df`` with the added 'Utterance ID' column. The input is
        not modified: pandas documents that functions passed to ``groupby.apply``
        should not mutate the group they receive.
    """
    utterance_ids = range(1, len(group_df) + 1)
    return group_df.assign(**{'Utterance ID': utterance_ids})

In [73]:
df = df.groupby('file_id', group_keys=False).apply(create_utterance_id)

In [74]:
# create conversation turn
def create_line(row):
    """Format one utterance row as ``Speaker <participant> (<utterance id>):  <text>``.

    Reads the 'participant', 'Utterance ID' and 'ORIGINAL_TEXT' entries of ``row``.
    """
    # TODO: could optionally include the time
    speaker = row['participant']
    utterance_id = row['Utterance ID']
    text = row['ORIGINAL_TEXT']
    return f"Speaker {speaker} ({utterance_id}):  {text}"

df['Complete Line'] = df.apply(create_line, axis=1)

In [75]:
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0301')

In [76]:
df['Complete Line Length'] =  df['Complete Line'].apply(lambda x: len(encoding.encode(x)))

In [77]:
def group_cumsum(group_df):
    """Add the running total of 'Complete Line Length' as 'line_len_cumsum'.

    Args:
        group_df: DataFrame for a single conversation with a numeric
            'Complete Line Length' column.

    Returns:
        A copy of ``group_df`` with the added 'line_len_cumsum' column. The input
        is not modified: pandas documents that functions passed to
        ``groupby.apply`` should not mutate the group they receive.
    """
    running_total = group_df['Complete Line Length'].cumsum()
    return group_df.assign(line_len_cumsum=running_total)

In [78]:
df = df.groupby('file_id', group_keys=False).apply(group_cumsum)

In [79]:
# measure prompt length and split as needed
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a list of messages.

    Args:
        messages: Iterable of dicts mapping field names (e.g. "role", "content",
            "name") to string values.
        model: Model name used to select the tokenizer. Only "gpt-3.5-turbo-0301"
            is supported for counting.

    Returns:
        int token count: 4 per message, plus the encoded length of every field
        value, minus 1 for each "name" field, plus 2 for the reply priming.

    Raises:
        NotImplementedError: If ``model`` is not "gpt-3.5-turbo-0301".
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # model unknown to tiktoken: fall back to the cl100k_base encoding
        encoding = tiktoken.get_encoding("cl100k_base")

    if model != "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.
    See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")

    total = 0
    for message in messages:
        total += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
        for field, text in message.items():
            total += len(encoding.encode(text))
            if field == "name":  # if there's a name, the role is omitted
                total -= 1  # role is always required and always 1 token
    total += 2  # every reply is primed with <im_start>assistant
    return total

In [80]:
# load prompt
with open('prompt.txt', 'r') as f:
    prompt = f.read()

In [81]:
# for each conversation, create a message
temp_df = df[df['file_id'] == 'M01000GE2']

In [82]:
conversation_string = '\n'.join(temp_df['Complete Line'].values)

In [83]:
messages = [
  {"role": "system", "content": "You are a helpful assistant."},
  {"role": "user", "content": prompt},
]

# check length, first check with just the prompt
prompt_len = num_tokens_from_messages(messages)

model_input = prompt + conversation_string
# then check the whole convo and get the diff
messages = [
  {"role": "system", "content": "You are a helpful assistant."},
  {"role": "user", "content": model_input},
]

input_len = num_tokens_from_messages(messages)

leftover = 4_096 - input_len

In [84]:
# reserve 2,250 of the 4,096-token budget for the model, which means each
# conversation chunk must be at most 4,096 - 2,250 - prompt_len tokens
convo_target_len = 4_096 - 2250 - prompt_len

In [85]:
convo_target_len

1122

In [86]:
# loop over conversations and generate complete prompts
# Each conversation is cut into consecutive chunks of whole lines whose combined token
# count is at most convo_target_len. Chunk boundaries are computed positionally, so
# they do not depend on the frame's index labels matching row positions. A single line
# longer than the budget becomes a chunk of its own instead of producing an empty
# chunk (which previously raised an IndexError).
prompts = []
for file_id, file_df in df.groupby('file_id', sort=False):
    cumulative_tokens = file_df['line_len_cumsum'].to_numpy()
    lines = file_df['Complete Line'].to_numpy()
    n_lines = len(lines)

    pos_start = 0
    while pos_start < n_lines:
        # tokens already consumed by the previous chunks of this conversation
        tokens_consumed = cumulative_tokens[pos_start - 1] if pos_start else 0
        # exclusive end position: number of lines whose running total fits the budget
        pos_end = int(np.searchsorted(cumulative_tokens, tokens_consumed + convo_target_len, side='right'))
        # always make progress, even when the next line alone exceeds the budget
        pos_end = max(pos_end, pos_start + 1)

        # create model input
        conversation_string = '\n'.join(lines[pos_start:pos_end])
        model_input = prompt + conversation_string

        messages = [
          {"role": "system", "content": "You are a helpful assistant."},
          {"role": "user", "content": model_input},
        ]
        prompts.append([file_id, messages])

        # the next chunk starts where this one ended
        pos_start = pos_end

In [87]:
prompts[0][1]

[{'role': 'system', 'content': 'You are a helpful assistant.'},
 {'role': 'user',
  'content': 'Circumplex theory is a social psychology based theory that characterizes social interactions between speakers. The social orientation tagset includes: {Assured-Dominant, Gregarious-Extraverted, Warm-Agreeable, Unassuming-Ingenuous, Unassured-Submissive, Aloof-Introverted, Cold, Arrogant-Calculating}, which are defined below in more detail.\n\nAssured-Dominant - Demands to be the center of interest / Demands attention, Does most of the talking, Speaks loudly, Is firm, Is self-confident, Is forceful, Is ambitious, Is assertive, Is persistent, Is domineering, Not self-conscious\n\nGregarious-Extraverted - Feels comfortable around people, Starts conversations, Talks to a lot of different people, Loves large groups, Is friendly, Is enthusiastic, Is warm, Is extraverted, Is good-natured, Is cheerful / happy, Is pleasant, Is outgoing, Is approachable, Is not shy, Is "lively"\n\nWarm-Agreeable - Is 

In [88]:
print(prompts[0][-1][-1]['content'])

Circumplex theory is a social psychology based theory that characterizes social interactions between speakers. The social orientation tagset includes: {Assured-Dominant, Gregarious-Extraverted, Warm-Agreeable, Unassuming-Ingenuous, Unassured-Submissive, Aloof-Introverted, Cold, Arrogant-Calculating}, which are defined below in more detail.

Assured-Dominant - Demands to be the center of interest / Demands attention, Does most of the talking, Speaks loudly, Is firm, Is self-confident, Is forceful, Is ambitious, Is assertive, Is persistent, Is domineering, Not self-conscious

Gregarious-Extraverted - Feels comfortable around people, Starts conversations, Talks to a lot of different people, Loves large groups, Is friendly, Is enthusiastic, Is warm, Is extraverted, Is good-natured, Is cheerful / happy, Is pleasant, Is outgoing, Is approachable, Is not shy, Is "lively"

Warm-Agreeable - Is interested in people, Reassures others, Inquires about others' well-being, Gets along well with others

In [89]:
# estimate cost: total token count over the message list of every prompt
# (each entry of `prompts` is [file_id, messages], so the messages are the last item)
token_count = sum(num_tokens_from_messages(entry[-1]) for entry in prompts)

In [90]:
# price
(token_count / 1000) * 0.002

2.674286

In [91]:
# save to disk
# filename = "data/gpt_requests.jsonl"

# with open(filename, "w") as f:
#     for p in prompts:
#         json_string = json.dumps(p)
#         f.write(json_string + "\n")

In [95]:
df['filename'].nunique()

311

In [96]:
len(df)

34558

In [114]:
df.merge(meta_df.drop_duplicates(subset=['file_uid']), left_on='file_id', right_on='file_uid', how='left')['release'].value_counts()

R1           20864
Mini-Eval    13694
Name: release, dtype: int64

In [93]:
# save the final df to disk
df.rename(columns={'social_orientation': 'social_orientation_random'}, inplace=True)
df.to_csv('data/df.csv', index=False)

In [None]:
# run GPT on the prompts