# Patch pickled dataset with speaker information for text conversations

In [1]:
import os
import pickle

import pandas as pd
import numpy as np
import xmltodict
import tiktoken
from tqdm import tqdm

import utils

## Load data
1. load the pickle file containing the complete dataset
1. load the metadata associated with text message conversations so we can include speaker IDs in the prompt

In [2]:
# Load the cached dataset through the project helper. The cells below use
# `data` as a dict keyed by file_id whose values are per-file metadata dicts
# (they read its 'data_type', 'status_in_corpora' and 'utterances' entries).
data = utils.load_pickle('/mnt/swordfish-pool2/ccu/amith-cache.pkl')

In [3]:
# Keep only the text conversations: every entry of `data` whose data_type is 'text'.
# (Whether the matching source release is marked as present is checked in a later cell.)
text_conversations = {
    conv_id: conv_meta
    for conv_id, conv_meta in data.items()
    if conv_meta['data_type'] == 'text'
}

In [4]:
# Sort every text conversation into one of three buckets based on its
# 'status_in_corpora' entries:
#   * not exactly one entry              -> more_than_one_corpora_issues
#   * one entry whose status != 'present' -> not_present_issues
#   * one entry whose status == 'present' -> its release name is added to `releases`
releases = set()
more_than_one_corpora_issues = []
not_present_issues = []
for file_id, meta in text_conversations.items():
    corpora_status = meta['status_in_corpora']
    if len(corpora_status) != 1:
        more_than_one_corpora_issues.append(file_id)
    elif corpora_status[0][1] != 'present':
        not_present_issues.append(file_id)
    else:
        releases.add(corpora_status[0][0])

In [5]:
len(not_present_issues)

191

In [6]:
releases

{'LDC2022E11_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R1',
 'LDC2022E22_CCU_TA1_Mandarin_Chinese_Mini_Evaluation_Source_Data',
 'LDC2023E03_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R4_V1.0',
 'LDC2023E07_CCU_TA1_Mandarin_Chinese_Evaluation_Source_Data_V1.0'}

In [7]:
len(text_conversations)

1767

In [8]:
# Gather the ltf and psm metadata files of every release collected above.
# `releases` is a set and os.listdir returns entries in arbitrary order, so both
# are sorted here: the resulting file lists (and everything derived from their
# order, e.g. `ltf_files[:1]`) are then identical on every run.
ltf_files = []
psm_files = []
for release in sorted(releases):
    release_dir = os.path.join('/mnt/swordfish-pool2/ccu', release)
    ltf_dir = os.path.join(release_dir, 'data/text/ltf')
    psm_dir = os.path.join(release_dir, 'data/text/psm')
    # skip macOS Finder artefacts that may have been copied along with the data
    release_ltf_files = [os.path.join(ltf_dir, f) for f in sorted(os.listdir(ltf_dir)) if f != '.DS_Store']
    release_psm_files = [os.path.join(psm_dir, f) for f in sorted(os.listdir(psm_dir)) if f != '.DS_Store']
    ltf_files.extend(release_ltf_files)
    psm_files.extend(release_psm_files)

In [9]:
len(ltf_files), len(psm_files)

(4294, 4294)

In [10]:
# the following functions retrieve speaker information for the conversation utterances
def load_conversation(ltf_file, psm_file):
    """Read the raw text of an ltf file and of its companion psm file.

    Both files are decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, which is not guaranteed to be able to decode
    the Chinese text these files contain.

    Args:
        ltf_file: path to the ltf XML file; it must exist.
        psm_file: path to the psm XML file; it may be missing.

    Returns:
        A ``(ltf_content, psm_content)`` tuple. ``ltf_content`` is a str.
        ``psm_content`` is a str, or ``False`` when ``psm_file`` does not exist.

    Raises:
        FileNotFoundError: if ``ltf_file`` does not exist.
    """
    with open(ltf_file, 'r', encoding='utf-8') as f:
        ltf_content = f.read()

    try:
        with open(psm_file, 'r', encoding='utf-8') as f:
            psm_content = f.read()
    except FileNotFoundError:
        # The sentinel stays False (not None) because callers compare against False.
        psm_content = False
    return ltf_content, psm_content

def unpack_attributes(attribute):
    """Flatten parsed ``attribute`` entries into a ``{name: value}`` dict.

    Args:
        attribute: the parsed ``attribute`` child(ren) of one psm ``string``
            element. Accepted forms:
            * an iterable of dicts, each with '@name' and '@value' keys;
            * a single such dict (xmltodict does not wrap a lone child
              element in a list);
            * ``None`` or NaN when the element has no attribute children.

    Returns:
        A dict mapping every '@name' to its '@value'; empty when there are no
        attributes. If a name occurs more than once, the last value wins.
    """
    # A missing attribute shows up as None, or as NaN inside a DataFrame column.
    if attribute is None or (isinstance(attribute, float) and attribute != attribute):
        return {}
    # A lone <attribute> is parsed as a dict; iterating it would yield its keys.
    if isinstance(attribute, dict):
        attribute = [attribute]
    return {attr['@name']: attr['@value'] for attr in attribute}

def merge_metadata(ltf_content, psm_content):
    """Join the segments of an ltf document with the message attributes of its psm document.

    Args:
        ltf_content: XML text of the ltf file.
        psm_content: XML text of the psm file.

    Returns:
        A DataFrame with one row per ltf segment (``LCTL_TEXT/DOC/TEXT/SEG``),
        left-joined to the psm entries of type 'message' (or, when there are
        none, of type 'post') on ``@start_char == @begin_offset``. The psm
        ``attribute`` children are unpacked into one column per attribute
        name. Segments without a matching psm entry keep NaN in the psm
        columns.

    Raises:
        KeyError: if the psm document has neither 'message' nor 'post'
            entries (there is then no 'attribute' column to unpack).
        AssertionError: if the join does not yield exactly one row per segment.
    """
    def _as_list(node):
        """Return an xmltodict child node as a list.

        xmltodict parses a lone child element as a dict rather than a
        one-element list, and an absent/empty one as None.
        """
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    ltf = xmltodict.parse(ltf_content)
    psm = xmltodict.parse(psm_content)

    psm_strings = _as_list(psm['psm']['string'])

    # filter psm list to message attributes
    message_attr = [d for d in psm_strings if d['@type'] == 'message']

    # no message attributes, then data is not in BOLT format but rather
    # from a discussion forum (see Release 4 README.md for more details)
    if len(message_attr) == 0:
        message_attr = [d for d in psm_strings if d['@type'] == 'post']

    # unpack the message attributes into one column per attribute name
    psm_df = pd.DataFrame(message_attr)
    psm_attr_df = pd.DataFrame(psm_df['attribute'].apply(unpack_attributes).values.tolist())

    psm_df = pd.concat((psm_df.drop(columns=['attribute']), psm_attr_df), axis=1)

    # a document with a single segment is parsed as a dict, hence _as_list
    ltf_df = pd.DataFrame(_as_list(ltf['LCTL_TEXT']['DOC']['TEXT']['SEG']))

    # left join ltf and psm on the segment's start offset (both are strings here)
    df = pd.merge(ltf_df, psm_df, left_on='@start_char', right_on='@begin_offset', how='left')

    # Drop psm entries of length 0; presumably these share an offset with a real
    # message and would otherwise duplicate its segment. Unmatched segments have
    # NaN here and are kept.
    df = df[df['@char_length'] != '0'].reset_index(drop=True)
    assert len(df) == len(ltf_df), (
        f'expected one row per ltf segment ({len(ltf_df)}), got {len(df)}'
    )
    # may still be missing attributes for each message
    return df

In [11]:
ltf_files[:1]

['/mnt/swordfish-pool2/ccu/LDC2022E11_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R1/data/text/ltf/M01000GAM.ltf.xml']

In [12]:
# Build one DataFrame per conversation (ltf segments joined with psm speaker
# metadata) and record every problem encountered in `errors` as
# (ltf_file, psm_file, message).
dfs = []
errors = []
for ltf_file in tqdm(ltf_files):
    # Derive the sibling psm path from the path components. A blanket
    # str.replace('ltf', 'psm') would also rewrite any 'ltf' that happens to
    # occur in a parent directory name or in the file id.
    ltf_dir, ltf_name = os.path.split(ltf_file)
    psm_file = os.path.join(os.path.dirname(ltf_dir), 'psm', ltf_name.replace('.ltf.', '.psm.'))
    ltf_content, psm_content = load_conversation(ltf_file, psm_file)
    # load_conversation returns False (not an empty string) for a missing psm file
    if psm_content is False:
        errors.append((ltf_file, psm_file, 'PSM file not found'))
        continue
    conversation_df = merge_metadata(ltf_content, psm_content)

    # for BOLT style conversations, the participant field is the speaker
    if 'participant' in conversation_df.columns:
        if conversation_df['participant'].isna().any():
            # recorded for inspection, but the conversation is still kept
            errors.append((ltf_file, psm_file, 'participant field missing'))
    elif 'author' in conversation_df.columns:
        # TODO: we're missing authorship information for many of the lines
        # need to better understand the data format
        conversation_df = conversation_df.rename(columns={'author': 'participant'})
    # retain filename
    conversation_df.insert(0, 'filename', ltf_file)
    dfs.append(conversation_df)

100%|██████████| 4294/4294 [01:14<00:00, 57.54it/s]


In [13]:
# Tabulate the problems recorded while loading the conversations:
# one row per (ltf_file, psm_file, error message) tuple in `errors`.
error_df = pd.DataFrame(errors, columns=['ltf_file', 'psm_file', 'error'])

In [14]:
error_df['error'].value_counts()

participant field missing    5
Name: error, dtype: int64

In [15]:
len(dfs)

4294

In [16]:
# Stack all per-conversation frames into a single frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

In [17]:
df.head()

Unnamed: 0,filename,@id,@start_char,@end_char,ORIGINAL_TEXT,@type,@begin_offset,@char_length,id,participant,time,TOKEN,datetime
0,/mnt/swordfish-pool2/ccu/LDC2022E11_CCU_TA1_Ma...,M01000GAM_0000,0,7,你房子找咋样了？,message,0,10,m0000,135850,2013-05-04 00:24:19 UTC,,
1,/mnt/swordfish-pool2/ccu/LDC2022E11_CCU_TA1_Ma...,M01000GAM_0001,10,15,好郁闷啊……,message,10,8,m0001,135610,2013-05-04 00:27:40 UTC,,
2,/mnt/swordfish-pool2/ccu/LDC2022E11_CCU_TA1_Ma...,M01000GAM_0002,18,22,你几个人找,message,18,7,m0002,135850,2013-05-04 00:28:20 UTC,,
3,/mnt/swordfish-pool2/ccu/LDC2022E11_CCU_TA1_Ma...,M01000GAM_0003,25,32,我们人数波动……,message,25,10,m0003,135610,2013-05-04 00:29:21 UTC,,
4,/mnt/swordfish-pool2/ccu/LDC2022E11_CCU_TA1_Ma...,M01000GAM_0004,35,39,来我这住吧,message,35,7,m0004,135850,2013-05-04 00:29:49 UTC,,


In [18]:
len(df)

388730

In [19]:
# The conversation id is the file name up to its first '.',
# e.g. 'M01000GAM' for '.../M01000GAM.ltf.xml'.
df['file_id'] = df['filename'].map(lambda path: os.path.basename(path).split('.')[0])

In [20]:
# number of conversations
df['file_id'].nunique()

4294

In [21]:
# Convert @start_char to int: it is used below as the join key against the
# utterances' 'start' field (presumably an integer offset; confirm against the pickle).
df['@start_char'] = df['@start_char'].astype(int)

## Merge speaker information back into pickled dataset

In [25]:
# Attach the speaker ('participant') to every utterance of every text
# conversation by matching the utterance 'start' offset to the segment's
# @start_char, then write the utterances back into `data` as a list of records.
# Utterances without a matching segment get NaN as participant.

# Split the speaker table by conversation once, instead of scanning the whole
# frame again for every file_id.
speaker_cols = ['@start_char', 'participant']
speakers_by_file = {
    fid: rows[speaker_cols]
    for fid, rows in df[['file_id'] + speaker_cols].groupby('file_id', sort=False)
}
# Used for conversations that have no rows in `df`; keeps the column dtypes.
no_speakers = df[speaker_cols].iloc[:0]

for file_id in tqdm(text_conversations):
    temp_df = pd.DataFrame(data[file_id]['utterances'])
    # Make the cell safe to re-run: without this, a second run would merge onto
    # an existing 'participant' column and produce participant_x / participant_y.
    # NOTE(review): assumes the pickle has no 'participant' field of its own.
    temp_df = temp_df.drop(columns=['participant'], errors='ignore')
    temp_df = temp_df.merge(
        speakers_by_file.get(file_id, no_speakers),
        left_on='start',
        right_on='@start_char',
        how='left',
        validate='1:1',
    ).drop(columns=['@start_char'])
    data[file_id]['utterances'] = temp_df.to_dict(orient='records')


100%|██████████| 1767/1767 [00:52<00:00, 33.92it/s]


In [26]:
# Persist the patched dataset as a pickle. This writes to a different file
# than the one loaded at the top, so the original cache is left untouched.
output_path = '/mnt/swordfish-pool2/ccu/tm3229-cache.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(data, out_file)