In [1]:
import os

import pandas as pd
import xmltodict

In [2]:
ltf_dir = '/Users/tmorrill002/Documents/datasets/charm/raw/ltf'
psm_dir = '/Users/tmorrill002/Documents/datasets/charm/raw/psm'
ltf_sample = os.path.join(ltf_dir, 'M01000G9C.ltf.xml')
psm_sample = os.path.join(psm_dir, 'M01000G9C.psm.xml')

In [68]:
def load_conversation(ltf_file, psm_file, ltf_dir, psm_dir):
    ltf_file = os.path.join(ltf_dir, ltf_file)
    psm_file = os.path.join(psm_dir, psm_file)
    with open(ltf_file, 'r') as f:
        ltf_content = f.read()
    
    try:
        with open(psm_file, 'r') as f:
            psm_content = f.read()
    except FileNotFoundError:
        psm_content = False
    return ltf_content, psm_content

def unpack_attributes(attribute):
    attr_dict = {}
    for attr in attribute:
        attr_dict[attr['@name']] = attr['@value']
    return attr_dict

def merge_metadata(ltf_content, psm_content):
    ltf = xmltodict.parse(ltf_content)
    psm = xmltodict.parse(psm_content)
    
    # filter psm list to message attributes
    message_attr = []
    for d in psm['psm']['string']:
        if d['@type'] == 'message':
            message_attr.append(d)
    
    # unpack the message attributes
    psm_df = pd.DataFrame(message_attr)
    psm_attr_df = pd.DataFrame(psm_df['attribute'].apply(unpack_attributes).values.tolist())
    
    psm_df = pd.concat((psm_df.drop(columns=['attribute']), psm_attr_df), axis=1)
    
    ltf_df = pd.DataFrame(ltf['LCTL_TEXT']['DOC']['TEXT']['SEG'])
    
    # join ltf and psm on start_char
    df = pd.merge(ltf_df, psm_df, left_on='@start_char', right_on='@begin_offset', how='left')
    
    # filter out messages where content length is 0 for a clean inner join
    df = df[df['@char_length'] != '0'].reset_index(drop=True)
    assert (len(df) == len(ltf_df))
    # may still be missing attributes for each message
    return df

In [69]:
ltf_files = [f for f in os.listdir(ltf_dir) if f != '.DS_Store']
psm_files = [f for f in os.listdir(psm_dir) if f != '.DS_Store']

In [70]:
dfs = []
errors = []
for ltf_file in ltf_files:
    psm_file = ltf_file.replace('ltf', 'psm')
    ltf_content, psm_content = load_conversation(ltf_file, psm_file, ltf_dir, psm_dir)
    if psm_content == False:
        errors.append((ltf_file, psm_file, 'PSM file not found'))
        continue
    df = merge_metadata(ltf_content, psm_content)
    
    if len(df[df['participant'].isna()]) > 0:
        errors.append((ltf_file, psm_file, 'Attributes missing'))
        continue
    
    # retain filename
    df.insert(0, 'filename', ltf_file)
    dfs.append(df)

In [71]:
error_df = pd.DataFrame(errors, columns=['ltf_file', 'psm_file', 'error'])

In [72]:
error_df['error'].value_counts()

PSM file not found    62
Attributes missing     2
Name: error, dtype: int64

In [73]:
len(dfs)

274

### Descriptive statistics
- distribution of conversation length
- distribution of utterance length
- distribution of conversation duration (could be useful for breaking conversations up)
- number of speakers
- dist of number of speakers per convo
- dist of number of turns per speaker per convo

In [74]:
df = pd.concat(dfs)
df = df.reset_index(drop=True)

In [75]:
# number of conversations
df['filename'].nunique()

274

In [76]:
# distribution of conversation length
turn_counts = df.groupby('filename').agg(**{'turn_count':('@id', 'count')})

In [77]:
# most conversations are about 82 turns
turn_counts.describe

Unnamed: 0,turn_count
count,274.0
mean,107.715328
std,85.921034
min,16.0
25%,58.0
50%,81.5
75%,121.0
max,704.0


In [78]:
# most utterances are about 8 characters
df['ORIGINAL_TEXT'].apply(len).to_frame().describe()

Unnamed: 0,ORIGINAL_TEXT
count,29514.0
mean,10.177678
std,11.069686
min,1.0
25%,4.0
50%,8.0
75%,13.0
max,666.0


In [113]:
# conversation duration typically about 3 hours
df['time'] = pd.to_datetime(df['time'])
duration_df = df.groupby('filename').agg(**{'start_time': ('time', 'min'), 'end_time': ('time', 'max')})

duration_df['diff'] = duration_df['end_time'] - duration_df['start_time']
duration_df['minutes'] = duration_df['diff'].apply(lambda x: x.seconds / 60)

duration_df['minutes'].describe()

count     274.000000
mean      460.530414
std       524.094847
min         0.000000
25%        57.291667
50%       187.058333
75%       846.341667
max      1439.916667
Name: minutes, dtype: float64

In [None]:
# TODO: investigate natural split points in the conversation for notions of
# intra/inter conversational communcation change points

In [116]:
# fewer speakers than conversations
df['participant'].nunique()

244

In [126]:
# 2 speakers per convo
participants_df = df.groupby('filename').agg(**{'participant_count': ('participant', 'nunique')})
participants_df.describe()

Unnamed: 0,participant_count
count,274.0
mean,2.021898
std,0.362473
min,2.0
25%,2.0
50%,2.0
75%,2.0
max,8.0


In [125]:
# one conversation with 8 participants
participants_df[participants_df['participant_count'].apply(lambda x: x>2)]

Unnamed: 0_level_0,participant_count
filename,Unnamed: 1_level_1
M01000GNK.ltf.xml,8


In [132]:
# number of turns per speaker per convo
turn_df = df.groupby(['filename', 'participant']).agg(**{'turn_count': ('participant', 'count')})

In [139]:
turn_df = turn_df.groupby('filename').agg(**{'turn_counts':('turn_count', list)})

In [141]:
# remove convos with more than 2 participants
turn_df = turn_df[turn_df['turn_counts'].apply(lambda x: len(x) == 2)]

In [144]:
turn_df['max_pct'] = turn_df['turn_counts'].apply(lambda x: max(x) / sum(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  turn_df['max_pct'] = turn_df['turn_counts'].apply(lambda x: max(x) / sum(x))


In [146]:
turn_df['min_pct'] = 1 - turn_df['max_pct']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  turn_df['min_pct'] = 1 - turn_df['max_pct']


In [148]:
# typically one speaker does more of the talking
turn_df.describe()

Unnamed: 0,max_pct,min_pct
count,273.0,273.0
mean,0.58768,0.41232
std,0.075033,0.075033
min,0.5,0.04698
25%,0.530612,0.377246
50%,0.571429,0.428571
75%,0.622754,0.469388
max,0.95302,0.5


### Select a representative conversation for annotation

In [152]:
turn_counts[turn_counts['turn_count'] == 82]

Unnamed: 0_level_0,turn_count
filename,Unnamed: 1_level_1
M01000GEW.ltf.xml,82
M01000GK0.ltf.xml,82
M01000GOY.ltf.xml,82
M01000GQK.ltf.xml,82
M01000GSZ.ltf.xml,82


In [154]:
sample_df = df[df['filename'] == 'M01000GEW.ltf.xml']

In [156]:
sample_df = sample_df[['ORIGINAL_TEXT', 'time', 'participant']]

In [159]:
sample_df = sample_df.rename(columns={'ORIGINAL_TEXT':'Original Text', 'time': 'Time', 'participant': 'Participant'})

In [163]:
sample_df['Participant'] = sample_df['Participant'].apply(lambda x: {'135882': 'A', '138485': 'B'}[x])

In [180]:
# ValueError: Excel does not support datetimes with timezones. Please ensure that datetimes are timezone unaware before writing to Excel.
sample_df['Time'] = sample_df['Time'].dt.tz_localize(None)

In [181]:
sample_df.to_excel('/Users/tmorrill002/Documents/datasets/charm/transformed/sample_M01000GEW.xlsx', index=False)