# Prepare a sample from the BOLT text message corpus for Amazon Mechanical Turk labeling

In [112]:
import os

import pandas as pd
import xmltodict

from charm.data import utils

In [2]:
# load metadata and identify text conversations that have changepoint labels
# The data root can be overridden with the CHARM_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the original location remains the default.
data_dir = os.environ.get('CHARM_DATA_DIR', '/home/iron-man/Documents/data/charm')
meta_df = pd.read_csv(os.path.join(data_dir, 'transformed/metadata.csv'))

In [9]:
# Boolean masks over the metadata rows: at least one changepoint annotation, and text modality.
change_point_annotated = meta_df['changepoint_count'].gt(0)
text_modality = meta_df['modality'].eq('text')
# Keep only the rows that satisfy both conditions.
text_anno_df = meta_df.loc[change_point_annotated & text_modality]

In [135]:
# Total number of selected rows with a non-null 'release' (value_counts skips NaN by default).
text_anno_df['release'].value_counts().sum()

624

In [12]:
# Text portion of the R1 development source release; the LTF and PSM files are read from
# the sibling 'ltf' and 'psm' sub-directories.
r1_text_dir = os.path.join(data_dir, 'raw/LDC2022E11_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R1/data/text')
ltf_dir = os.path.join(r1_text_dir, 'ltf')
psm_dir = os.path.join(r1_text_dir, 'psm')

In [13]:
def load_conversation(ltf_file, psm_file, ltf_dir, psm_dir):
    """Read the raw LTF and PSM XML text for one conversation.

    Args:
        ltf_file: File name of the LTF document inside ``ltf_dir``.
        psm_file: File name of the companion PSM document inside ``psm_dir``.
        ltf_dir: Directory containing the LTF files.
        psm_dir: Directory containing the PSM files.

    Returns:
        A ``(ltf_content, psm_content)`` tuple of strings. ``psm_content`` is the
        sentinel ``False`` when the PSM file does not exist.

    Raises:
        FileNotFoundError: If the LTF file itself is missing.
    """
    ltf_path = os.path.join(ltf_dir, ltf_file)
    psm_path = os.path.join(psm_dir, psm_file)

    # The conversations contain Chinese text, so decode explicitly as UTF-8 rather than
    # relying on the platform default encoding, which is not UTF-8 everywhere.
    with open(ltf_path, 'r', encoding='utf-8') as f:
        ltf_content = f.read()

    try:
        with open(psm_path, 'r', encoding='utf-8') as f:
            psm_content = f.read()
    except FileNotFoundError:
        # Only a missing PSM file is tolerated; it is reported through the sentinel.
        psm_content = False
    return ltf_content, psm_content

def unpack_attributes(attribute):
    """Flatten parsed ``<attribute name=... value=...>`` elements into one dict.

    Args:
        attribute: The parsed attribute element(s) of one message: a list of dicts with
            ``'@name'`` and ``'@value'`` keys, a single such dict, or ``None``/NaN when
            the message has no attributes.

    Returns:
        A dict mapping each attribute name to its value (empty when there are none).
    """
    # No attributes at all: None from the parser, or NaN where pandas filled a gap.
    if attribute is None or (isinstance(attribute, float) and attribute != attribute):
        return {}
    # xmltodict returns a bare dict rather than a one-element list when an element has
    # exactly one child of a given tag; iterating it would walk its keys instead.
    if isinstance(attribute, dict):
        attribute = [attribute]
    return {attr['@name']: attr['@value'] for attr in attribute}

def merge_metadata(ltf_content, psm_content):
    """Join the LTF segments of one conversation with their PSM message attributes.

    Args:
        ltf_content: XML text of the LTF document.
        psm_content: XML text of the PSM document.

    Returns:
        A DataFrame with one row per LTF ``SEG`` carrying the LTF columns plus the PSM
        message columns and unpacked attributes. Attribute columns may still be NaN for
        segments that had no matching PSM message.

    Raises:
        AssertionError: If the joined frame does not have exactly one row per LTF segment.
    """
    ltf_doc = xmltodict.parse(ltf_content)
    psm_doc = xmltodict.parse(psm_content)

    # Only PSM entries typed as 'message' describe individual messages.
    message_records = [entry for entry in psm_doc['psm']['string'] if entry['@type'] == 'message']

    # Expand each message's nested attribute list into ordinary columns.
    messages = pd.DataFrame(message_records)
    unpacked = messages['attribute'].apply(unpack_attributes).values.tolist()
    attributes = pd.DataFrame(unpacked)
    psm_df = pd.concat((messages.drop(columns=['attribute']), attributes), axis=1)

    ltf_df = pd.DataFrame(ltf_doc['LCTL_TEXT']['DOC']['TEXT']['SEG'])

    # Left join keeps every LTF segment; the key is the segment's starting character offset.
    merged = pd.merge(ltf_df, psm_df, left_on='@start_char', right_on='@begin_offset', how='left')

    # Discard rows whose '@char_length' is the string '0' (presumably zero-length PSM
    # messages that share an offset with a real message), then verify that exactly one
    # row per LTF segment remains.
    merged = merged[merged['@char_length'] != '0'].reset_index(drop=True)
    assert (len(merged) == len(ltf_df))
    return merged

In [14]:
# os.listdir returns entries in arbitrary order; sort so the processing order (and hence
# the row order of the combined frame) is reproducible across runs and machines.
ltf_files = sorted(f for f in os.listdir(ltf_dir) if f != '.DS_Store')
psm_files = sorted(f for f in os.listdir(psm_dir) if f != '.DS_Store')

In [15]:
# Number of LTF files found (excluding '.DS_Store').
len(ltf_files)

976

In [16]:
# Number of PSM files found (excluding '.DS_Store'); expected to match the LTF count.
len(psm_files)

976

In [17]:
# Build one merged frame per conversation. Conversations that cannot be fully merged are
# recorded in `errors` as (ltf_file, psm_file, reason) and skipped.
dfs = []
errors = []
for ltf_file in ltf_files:
    # Swap only the last 'ltf' (the extension part) so that an 'ltf' occurring earlier in
    # the file name is left untouched.
    psm_file = 'psm'.join(ltf_file.rsplit('ltf', 1))
    ltf_content, psm_content = load_conversation(ltf_file, psm_file, ltf_dir, psm_dir)
    # load_conversation signals a missing PSM file with the sentinel False.
    if psm_content is False:
        errors.append((ltf_file, psm_file, 'PSM file not found'))
        continue
    df = merge_metadata(ltf_content, psm_content)

    # A conversation is unusable if any message lacks a participant, including the case
    # where no message carried one and the column is absent altogether.
    if 'participant' not in df.columns or df['participant'].isna().any():
        errors.append((ltf_file, psm_file, 'Attributes missing'))
        continue

    # retain filename
    df.insert(0, 'filename', ltf_file)
    dfs.append(df)

In [18]:
# Tabulate the skipped conversations, one row per (ltf_file, psm_file, error) record.
error_df = pd.DataFrame(errors, columns=['ltf_file', 'psm_file', 'error'])

In [19]:
# How many conversations were skipped for each failure reason.
error_df['error'].value_counts()

Attributes missing    2
Name: error, dtype: int64

In [20]:
# Number of conversations that merged successfully.
len(dfs)

974

In [22]:
# Stack the per-conversation frames and replace their repeated per-frame indices with a
# single fresh 0..n-1 index.
df = pd.concat(dfs).reset_index(drop=True)

In [23]:
# number of distinct conversations (one per LTF file name) in the combined frame
df['filename'].nunique()

974

In [54]:
# Pick the first changepoint-annotated text conversation as the sample.
# NOTE(review): assumes file_uid + data_type reproduces the LTF file name stored in
# df['filename'] — confirm against the metadata.
sample_file = text_anno_df.iloc[0]['file_uid'] + text_anno_df.iloc[0]['data_type']

In [55]:
# Rows of the combined frame that belong to the sample conversation (empty if the file
# was skipped during merging or is not part of this release).
sample_df = df[df['filename'] == sample_file]

In [56]:
# Keep only the message text, time and participant columns.
sample_df = sample_df[['ORIGINAL_TEXT', 'time', 'participant']]

In [57]:
# Human-readable column names, used below when building the prompt lines.
sample_df = sample_df.rename(columns={'ORIGINAL_TEXT':'Original Text', 'time': 'Time', 'participant': 'Participant'})

In [63]:
# Anonymise participants: the first distinct participant becomes 'A', the second 'B'.
# Indexing `speakers` raises IndexError if the conversation has more than two participants.
speakers = ['A', 'B']
speaker_map = {
    participant: speakers[idx]
    for idx, participant in enumerate(sample_df['Participant'].unique())
}

In [66]:
# Replace raw participant ids with their speaker letters.
# NOTE: not idempotent — once the column holds the letters, re-running this cell raises
# KeyError unless the raw ids are themselves keys of speaker_map.
sample_df['Participant'] = sample_df['Participant'].apply(lambda x: speaker_map[x])

In [67]:
# Number the utterances 0..n-1 and expose that numbering as an 'Utterance ID' column.
sample_df = (
    sample_df
    .reset_index(drop=True)
    .rename_axis('Utterance ID')
    .reset_index()
)

In [69]:
def create_line(row):
    """Render one utterance row as a single prompt line.

    Args:
        row: Mapping-like row with 'Participant', 'Utterance ID' and 'Original Text'.

    Returns:
        A string of the form ``Speaker <participant> (<utterance id>):  <text>``
        (two spaces after the colon).
    """
    # TODO: could optionally include the time
    speaker = row['Participant']
    utterance_id = row['Utterance ID']
    text = row['Original Text']
    return f"Speaker {speaker} ({utterance_id}):  {text}"

In [70]:
# Render every utterance row as one prompt line.
sample_df['Complete Line'] = sample_df.apply(create_line, axis=1)

In [71]:
# Whole conversation as newline-separated prompt lines.
conversation_string = '\n'.join(sample_df['Complete Line'].tolist())

In [87]:
# load prompt to prepend:
# Decode explicitly as UTF-8 so the result does not depend on the platform default encoding.
with open('prompt.txt', 'r', encoding='utf-8') as f:
    prompt = f.read()

In [88]:
# get 10 utterances at a clip to fit within ChatGPT
UTTERANCES_PER_PROMPT = 10
prompts = []
for i in range(0, len(sample_df), UTTERANCES_PER_PROMPT):
    # Use a dedicated name for the chunk so the full-conversation `conversation_string`
    # is not overwritten with the last chunk.
    chunk_string = '\n'.join(sample_df['Complete Line'].iloc[i: i + UTTERANCES_PER_PROMPT].values.tolist())
    prompts.append(prompt + '\n\n' + chunk_string)

In [95]:
# Inspect one generated prompt (index 6, i.e. the seventh chunk of the conversation).
print(prompts[6])

Circumplex theory is a social psychology based theory that characterizes social interactions between speakers. The social orientation tagset includes: {Assured-Dominant, Gregarious-Extraverted, Warm-Agreeable, Unassuming-Ingenuous, Unassured-Submissive, Aloof-Introverted, Cold, Arrogant-Calculating}, which are defined below in more detail.

Assured-Dominant - Demands to be the center of interest / Demands attention, Does most of the talking, Speaks loudly, Is firm, Is self-confident, Is forceful, Is ambitious, Is assertive, Is persistent, Is domineering, Not self-conscious

Gregarious-Extraverted - Feels comfortable around people, Starts conversations, Talks to a lot of different people, Loves large groups, Is friendly, Is enthusiastic, Is warm, Is extraverted, Is good-natured, Is cheerful / happy, Is pleasant, Is outgoing, Is approachable, Is not shy, Is "lively"

Warm-Agreeable - Is interested in people, Reassures others, Inquires about others' well-being, Gets along well with others

In [99]:
# load chat GPT output
chat_gpt_labels = []
with open('ChatGPT_output.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip every blank line, including whitespace-only lines and '\r\n' endings,
        # which an exact comparison with '\n' would let through.
        if not line.strip():
            continue
        # Drop the line terminator so it cannot leak into a parsed label.
        chat_gpt_labels.append(line.rstrip('\r\n'))

In [105]:
# Extract the label from each line: the text after the first ': ' and before the first ' -'.
# NOTE(review): presumably each line looks like '<speaker line>: <label> - <explanation>' — confirm.
# NOTE: not idempotent — re-running on already parsed labels raises IndexError because
# they no longer contain ': '.
chat_gpt_labels = [x.split(': ')[1].split(' -')[0] for x in chat_gpt_labels]

In [109]:
# Attach the parsed labels positionally; pandas raises ValueError if the number of labels
# differs from the number of utterances.
sample_df['chat_gpt_labels'] = chat_gpt_labels

In [128]:
# Length of each utterance in characters.
sample_df['character_count'] = sample_df['Original Text'].apply(len)

In [131]:
# Running total of characters up to and including each utterance.
sample_df['character_count_cumsum'] = sample_df['character_count'].cumsum()

In [134]:
# Show the utterances whose running character total exceeds 100.
sample_df[sample_df['character_count_cumsum'] > 100]

Unnamed: 0,Utterance ID,Original Text,Time,Participant,Complete Line,chat_gpt_labels,character_count,character_count_cumsum
11,11,嗯深受影响。。。,2014-09-25 15:43:57 UTC,B,Speaker B (11): 嗯深受影响。。。,Unassured-Submissive,8,107
12,12,世界上还有另外一种不叫caraway 的菜有香菜味,2014-09-25 15:55:01 UTC,A,Speaker A (12): 世界上还有另外一种不叫caraway 的菜有香菜味,Unassuming-Ingenuous,25,132
13,13,哈哈,2014-09-25 15:55:12 UTC,B,Speaker B (13): 哈哈,Unassuming-Ingenuous,2,134
14,14,我不是之前数据处理得不合理吗，我就有在办公室里说我这个处理方法完全不对（completely...,2014-09-25 16:49:04 UTC,A,Speaker A (14): 我不是之前数据处理得不合理吗，我就有在办公室里说我这个处理...,Unassured-Submissive,212,346
15,15,是啊 摸头,2014-09-25 16:51:30 UTC,B,Speaker B (15): 是啊 摸头,Warm-Agreeable,5,351
16,16,不开心,2014-09-25 16:51:46 UTC,A,Speaker A (16): 不开心,Unassured-Submissive,3,354
17,17,谦虚但不要自贬,2014-09-25 16:51:50 UTC,B,Speaker B (17): 谦虚但不要自贬,Warm-Agreeable,7,361
18,18,换个角度想 人家也就是照实说了啊 别放在心上 以后自己注意就好了,2014-09-25 16:52:19 UTC,B,Speaker B (18): 换个角度想 人家也就是照实说了啊 别放在心上 以后自己注意就好了,Unassuming-Ingenuous,32,393
19,19,摸头,2014-09-25 16:52:22 UTC,B,Speaker B (19): 摸头,Warm-Agreeable,2,395
20,20,哎是的。我也确实经常表现出对自己做的东西没有信心，总是贬低,2014-09-25 16:52:51 UTC,A,Speaker A (20): 哎是的。我也确实经常表现出对自己做的东西没有信心，总是贬低,Unassured-Submissive,29,424


In [115]:
# load annotations from the raw data directory
# NOTE(review): the return structure is defined in charm.data.utils; the following cell
# indexes it as result[<annotation set>]['anno_dfs'][<table name>].
result = utils.load_ldc_annotations(os.path.join(data_dir, 'raw'))

In [124]:
# Changepoint annotation table from the 'Annotation-1' set.
change_point_anno_df = result['Annotation-1']['anno_dfs']['changepoint.tab']

In [127]:
# Changepoint annotations of the sample conversation (file_id matched against its
# file_uid), shown as a raw array.
change_point_anno_df[change_point_anno_df['file_id'] == text_anno_df.iloc[0]['file_uid']].values

array([[212, 'M01000G9A', 162, 4,
        'Pre-change: Speakers were chit-chatting. Both speakers were casual and relaxed. Shift: One speaker recounted a bad experience and felt upset. Both speakers were more serious. Evidence: One speaker mentioned feeling very uncomfortable and not happy. Both speakers used longer messages'],
       [212, 'M01000G9A', 635, 5,
        'Pre-change: One speaker was upset and the other speaker was comforting them. The mood was quite down. Shift: Speakers changed the topic to plans for dinner and the evening. Both speakers were casual and more relaxed. Evidence: Both speakers used shorter messages and sounded more upbeat.']],
      dtype=object)

In [46]:
# # write to disk
# with open('sample_convo.txt', 'w') as f:
#     f.write(conversation_string)

In [None]:
# prepare prompts one at a time

### Save complete dataset

In [35]:
# Write next to the other transformed artifacts under `data_dir` rather than to an
# absolute path that belongs to a different machine.
df.to_csv(os.path.join(data_dir, 'transformed/BOLT_dev_conversations.csv'))