# Labeled Data EDA

In [1]:
import os
import pandas as pd
import numpy as np
from IPython.display import display

## Load the annotations from LDC

In [2]:
home_dir = os.path.expanduser('~')

In [3]:
# source: https://drive.google.com/drive/folders/1aL7bcLWQmUskR3dmj3K1jdXQsb_nIcv2
anno_dir = os.path.join(home_dir, 'Documents/datasets/charm/raw/LDC2022E18_CCU_TA1_Mandarin_Chinese_Development_Annotation_V1.0/data')
# os.listdir returns entries in arbitrary order, so sort for a reproducible load/display order.
# Skip every hidden entry (.DS_Store, .ipynb_checkpoints, ...) rather than only .DS_Store,
# since none of them are annotation tables and they would break the read_csv step.
anno_files = [os.path.join(anno_dir, x) for x in sorted(os.listdir(anno_dir)) if not x.startswith('.')]

In [4]:
# annotation files
for f in anno_files:
    print(os.path.split(f)[-1])

valence_arousal.tab
changepoint.tab
norms.tab
emotions.tab


In [5]:
# Read every tab-separated annotation table into a dict keyed by its file name (e.g. 'norms.tab').
anno_dfs = {os.path.basename(path): pd.read_csv(path, sep='\t') for path in anno_files}

In [6]:
for f in anno_dfs:
    print(f)
    display(anno_dfs[f].head(2))
    print()

valence_arousal.tab


Unnamed: 0,user_id,file_id,segment_id,valence_continuous,valence_binned,arousal_continuous,arousal_binned
0,212,M0100053I,M0100053I_0001,403,3,330,2
1,212,M0100053I,M0100053I_0002,623,4,596,3



changepoint.tab


Unnamed: 0,user_id,file_id,timestamp,impact_scalar,comment
0,212,M01003YN6,38,1,Pre-change: The female served the male fruit t...
1,212,M01003MTK,64,5,Pre-change: female introduced the male to her ...



norms.tab


Unnamed: 0,user_id,file_id,segment_id,norm,status
0,314,M0100053I,M0100053I_0001,none,EMPTY_NA
1,314,M0100053I,M0100053I_0002,none,EMPTY_NA



emotions.tab


Unnamed: 0,user_id,file_id,segment_id,emotion,multi_speaker
0,212,M0100053I,M0100053I_0001,sadness,False
1,212,M0100053I,M0100053I_0002,"anticipation,trust",True





In [7]:
# load segment information
segment_filepath = os.path.join(home_dir, 'Documents/datasets/charm/raw/LDC2022E18_CCU_TA1_Mandarin_Chinese_Development_Annotation_V1.0/docs/segments.tab')
segment_df = pd.read_csv(segment_filepath, delimiter='\t')

In [8]:
# Distinct file ids covered by each annotation table, keyed by table name.
anno_files = {name: table['file_id'].unique() for name, table in anno_dfs.items()}
# Sorted list of every file id that appears in at least one annotation table.
anno_files_list = sorted(set().union(*(set(ids) for ids in anno_files.values())))

In [9]:
anno_files['changepoint.tab']

array(['M01003YN6', 'M01003MTK', 'M01000AJ9', 'M01000AJA', 'M01003M18',
       'M01000FN8', 'M01000FO1', 'M01000FT6', 'M01003PKW', 'M01003W6M',
       'M01003PLL'], dtype=object)

In [10]:
anno_files_list[:2]

['M0100053I', 'M0100053J']

## Load annotations from Columbia

In [11]:
# source: https://docs.google.com/spreadsheets/d/1LhDzrUO2yFKXEWKqC_W-xx7jMmO5EUKgYT-Hb0LVgRk/edit#gid=1013745441
columbia_anno_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/annotations/CCU LDC Data Release R2 Annotation - Annotations.csv')

In [12]:
cu_anno_df = pd.read_csv(columbia_anno_filepath, skiprows=3)

In [13]:
rename = {'Unnamed: 0': 'user_id', 'Unnamed: 1': 'file_id', 'Unnamed: 2':'url'}
cu_anno_df = cu_anno_df.rename(columns=rename)

In [14]:
cu_anno_df['user_id'] = cu_anno_df['user_id'].ffill()

In [15]:
cu_anno_df.head(2)

Unnamed: 0,user_id,file_id,url,Timestamp,Direction,Timestamp.1,Direction.1,Timestamp.2,Direction.2,Timestamp.3,...,Timestamp.16,Emotion Shift .9,Timestamp.17,Emotion Shift .10,Timestamp.18,Emotion Shift .11,Timestamp.19,Emotion Shift .12,Timestamp.20,Emotion Shift .13
0,Jeff,M01004MLF,https://www.bilibili.com/video/BV1Xe4y1X7Q1,No Change - All Success,,,,,,,...,,,,,,,,,,
1,Jeff,M01004OWY,https://www.bilibili.com/video/BV1mR4y177NS,1:30,Success to Failure,5:35,Success to Failure,6:29,Success to Failure,9:05,...,,,,,,,,,,


In [16]:
core_cols = ['user_id', 'file_id', 'url']

### Prepare change point annotations

In [17]:
# change point cols: seven (timestamp, direction) column pairs; pandas leaves the first
# pair unsuffixed and suffixes the repeats with .1 through .6
changepoint_cols = [
    ['Timestamp', 'Direction'] if k == 0 else [f'Timestamp.{k}', f'Direction.{k}']
    for k in range(7)
]

# For each (timestamp, direction) pair, take the core columns plus that pair, strip the
# '.N' suffix so every slice shares the same column names, then stack the slices.
change_point_dfs = [
    cu_anno_df[core_cols + pair].rename(columns={name: name.split('.')[0] for name in pair})
    for pair in changepoint_cols
]

change_point_df = pd.concat(change_point_dfs)

In [18]:
# making an assumption here, that our success to failure maps to a 1 and failure to success maps to 5
# which may not be accurate. Also treating no annotation as a 0
impact_scalar_map = {'Success to Failure':1, 'Failure to Success':5}

In [19]:
change_point_df = change_point_df.sort_values(by='user_id')

In [20]:
change_point_df['Direction'] = change_point_df['Direction'].apply(lambda x: impact_scalar_map[x] if x in impact_scalar_map else 0)

In [21]:
change_point_df = change_point_df.dropna(subset='Timestamp')

In [22]:
# null out timestamps for no change files
change_point_df['Timestamp'] = change_point_df['Timestamp'].replace('No Change - All Success', np.nan)

In [23]:
assert change_point_df['file_id'].nunique() == 25

In [24]:
change_point_df = change_point_df.reset_index(drop=True)

In [25]:
change_point_df = change_point_df.rename(columns={'Timestamp': 'timestamp', 'Direction': 'impact_scalar'})

In [26]:
def convert_to_seconds(timestamp):
    """Convert a clock-style timestamp string to a whole number of seconds.

    Accepts both ``"M:SS"`` and ``"H:MM:SS"`` (surrounding whitespace is ignored).
    Non-string values, such as NaN for a missing annotation, are returned unchanged.

    Raises:
        ValueError: if the string does not have two or three colon-separated
            integer fields.
    """
    # Nulls (NaN / None) pass straight through so the column can be cast later.
    if not isinstance(timestamp, str):
        return timestamp
    parts = timestamp.strip().split(':')
    if not 2 <= len(parts) <= 3:
        raise ValueError(f"expected 'M:SS' or 'H:MM:SS', got {timestamp!r}")
    total = 0
    # Each field is worth 60x the one to its right: hours -> minutes -> seconds.
    for part in parts:
        total = total * 60 + int(part)
    return total

In [27]:
change_point_df['timestamp'] = change_point_df['timestamp'].apply(convert_to_seconds)

In [28]:
# convert to int type
change_point_df['timestamp'] = change_point_df['timestamp'].astype('O')
change_point_df.loc[change_point_df['timestamp'].notna(), 'timestamp'] = change_point_df[change_point_df['timestamp'].notna()]['timestamp'].apply(lambda x: int(x))

In [29]:
change_point_df = change_point_df.drop(columns=['url'])

In [30]:
change_point_df['comment'] = np.nan

In [31]:
change_point_df = change_point_df.sort_values(by=['user_id', 'file_id']).reset_index(drop=True)

In [32]:
change_point_df.head()

Unnamed: 0,user_id,file_id,timestamp,impact_scalar,comment
0,Jeff,M01003LPD,,0,
1,Jeff,M01003N7R,,0,
2,Jeff,M01004MLF,,0,
3,Jeff,M01004MXF,,0,
4,Jeff,M01004OWY,545.0,1,


In [33]:
save_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/annotations/changepoint_columbia.tab')
change_point_df[anno_dfs['changepoint.tab'].columns].to_csv(save_filepath, sep='\t', index=False)

### Prepare emotion annotations

In [34]:
cu_anno_df.columns

Index(['user_id', 'file_id', 'url', 'Timestamp', 'Direction', 'Timestamp.1',
       'Direction.1', 'Timestamp.2', 'Direction.2', 'Timestamp.3',
       'Direction.3', 'Timestamp.4', 'Direction.4', 'Timestamp.5',
       'Direction.5', 'Timestamp.6', 'Direction.6', 'Unnamed: 17',
       'Unnamed: 18', 'Timestamp.7', 'Emotion Shift ', 'Timestamp.8',
       'Emotion Shift .1', 'Timestamp.9', 'Emotion Shift .2', 'Timestamp.10',
       'Emotion Shift .3', 'Timestamp.11', 'Emotion Shift .4', 'Timestamp.12',
       'Emotion Shift .5', 'Timestamp.13', 'Emotion Shift .6', 'Timestamp.14',
       'Emotion Shift .7', 'Timestamp.15', 'Emotion Shift .8', 'Timestamp.16',
       'Emotion Shift .9', 'Timestamp.17', 'Emotion Shift .10', 'Timestamp.18',
       'Emotion Shift .11', 'Timestamp.19', 'Emotion Shift .12',
       'Timestamp.20', 'Emotion Shift .13'],
      dtype='object')

In [35]:
# emotion shift cols: fourteen (timestamp, emotion) column pairs, running from
# ('Timestamp.7', 'Emotion Shift ') to ('Timestamp.20', 'Emotion Shift .13')
emotion_cols = [
    [f'Timestamp.{7 + k}', 'Emotion Shift ' if k == 0 else f'Emotion Shift .{k}']
    for k in range(14)
]

In [36]:
# For each (timestamp, emotion) pair, take the core columns plus that pair, rename the pair
# positionally to the shared names below, then stack the slices.
emotion_col_names = ['Timestamp', 'Emotion Shift']
emotion_dfs = [
    cu_anno_df[core_cols + pair].rename(columns=dict(zip(pair, emotion_col_names)))
    for pair in emotion_cols
]

emotion_df = pd.concat(emotion_dfs)

In [37]:
emotion_df = emotion_df.rename(columns={'Timestamp': 'timestamp', 'Emotion Shift': 'emotion'})

In [38]:
emotion_df['emotion'] = emotion_df['emotion'].apply(lambda x: x.lower() if isinstance(x, str) else 'none')

In [39]:
typos = {'anticapation': 'anticipation', 'confused': 'surprise', 'non speech': 'noann', 'anticipating': 'anticipation', 'surprise ': 'surprise'}

In [40]:
emotion_df['emotion'] = emotion_df['emotion'].apply(lambda x: typos[x] if x in typos else x)

In [41]:
valid_emotions = ['fear', 'anger', 'sadness', 'joy', 'disgust', 'surprise', 'trust', 'anticipation', 'none', 'noann']

In [42]:
# make sure all emotion labels are valid
assert len(emotion_df[~emotion_df['emotion'].isin(valid_emotions)]) == 0

In [43]:
emotion_df = emotion_df.reset_index(drop=True)

In [44]:
emotion_df['timestamp'] = emotion_df['timestamp'].apply(lambda x: convert_to_seconds(x))

In [45]:
# convert to int type
emotion_df['timestamp'] = emotion_df['timestamp'].astype('O')
emotion_df.loc[emotion_df['timestamp'].notna(), 'timestamp'] = emotion_df[emotion_df['timestamp'].notna()]['timestamp'].apply(lambda x: int(x))

In [46]:
anno_dfs['emotions.tab'].columns

Index(['user_id', 'file_id', 'segment_id', 'emotion', 'multi_speaker'], dtype='object')

In [47]:
emotion_df = emotion_df.drop(columns=['url'])

#### Attempt to pull in segment ids

In [48]:
# Look up, for every emotion annotation, the LDC segment whose [start, end) window contains
# its timestamp. Exactly one entry is appended per row so segment_ids stays aligned with
# emotion_df, even if a timestamp falls inside more than one segment (first match wins).
segment_ids = []
for _, row in emotion_df.iterrows():
    # rows without an int timestamp (nulls) are handled later
    if not isinstance(row['timestamp'], int):
        segment_ids.append(np.nan)
        continue
    # restrict to the segments of this row's file
    candidates = segment_df[segment_df['file_id'] == row['file_id']]
    in_window = (candidates['start'] <= row['timestamp']) & (row['timestamp'] < candidates['end'])
    matches = candidates.loc[in_window, 'segment_id']
    # np.nan (not np.NaN, which NumPy 2.0 removed) marks "no segment found"
    segment_ids.append(matches.iloc[0] if len(matches) else np.nan)

In [49]:
# no segment ids found
for seg in segment_ids:
    if isinstance(seg, str):
        print(seg)

In [50]:
# The segment lookup above printed no matches, so build a segment id by hand instead:
# bucket each int timestamp into 15-second windows, numbered from 0001, and format it as
# "<file_id>_<4-digit window number>". Rows whose timestamp is not an int keep that
# (null) timestamp value as their segment_id.
# NOTE(review): assumes LDC segments are fixed 15-second windows starting at 0 — confirm.
emotion_df['segment_id'] = emotion_df.apply(lambda x: f"{x['file_id']}_{int((x['timestamp'] // 15 + 1)):04d}" if isinstance(x['timestamp'], int) else x['timestamp'], axis=1)

In [51]:
def comma_separate(sequence):
    """Join the int and str members of ``sequence`` with commas, skipping everything else (e.g. NaN)."""
    kept = (str(item) for item in sequence if isinstance(item, (int, str)))
    return ','.join(kept)

In [52]:
no_emotion_df = emotion_df[emotion_df['timestamp'].isna()]

In [53]:
# group the same way that LDC did
emotion_df = emotion_df.groupby(['user_id', 'file_id', 'segment_id'], as_index=False, sort=False).agg(**{'emotion':('emotion', comma_separate), 'timestamp':('timestamp', comma_separate)})

In [54]:
# didn't annotate this
emotion_df['multi_speaker'] = np.nan

In [55]:
# based on README.txt from LDC
emotion_df.loc[emotion_df['emotion'].str.contains('noann'), 'multi_speaker'] = 'noann'

In [56]:
no_emotion_df = no_emotion_df.drop_duplicates()

In [57]:
# based on README.txt from LDC
no_emotion_df['multi_speaker'] = 'EMPTY_NA'

In [58]:
no_emotion_df = no_emotion_df.sort_values(by='user_id').reset_index(drop=True)

In [59]:
# add timestamp back in (all null for the no-emotion rows);
# np.nan rather than np.NaN, which was removed in NumPy 2.0
no_emotion_df['timestamp'] = np.nan

In [60]:
col_order = list(anno_dfs['emotions.tab'].columns) + ['timestamp']
full_emotion_df = pd.concat((emotion_df[col_order], no_emotion_df[col_order]))

In [61]:
full_emotion_df = full_emotion_df.sort_values(by=['user_id', 'file_id', 'segment_id']).reset_index(drop=True)

In [62]:
# all videos are in there a second time, remove the extra anno
full_emotion_df[full_emotion_df['file_id'] == 'M01003JTT']

Unnamed: 0,user_id,file_id,segment_id,emotion,multi_speaker,timestamp
91,Yanda,M01003JTT,M01003JTT_0002,joy,,28.0
92,Yanda,M01003JTT,M01003JTT_0008,anticipation,,113.0
93,Yanda,M01003JTT,M01003JTT_0011,joy,,160.0
94,Yanda,M01003JTT,M01003JTT_0014,sadness,,202.0
95,Yanda,M01003JTT,M01003JTT_0019,joy,,275.0
96,Yanda,M01003JTT,M01003JTT_0028,anticipation,,410.0
97,Yanda,M01003JTT,M01003JTT_0036,surprise,,525.0
98,Yanda,M01003JTT,M01003JTT_0052,joy,,770.0
99,Yanda,M01003JTT,,none,EMPTY_NA,


In [63]:
# Each file can carry a placeholder 'none' row (null segment_id) left over from its unused
# annotation slots. Drop those placeholders only for files that also have real,
# segment-tagged rows; files with no emotion tags at all keep their placeholder.
# Selecting by null segment_id instead of "the last row of the group" means this does not
# depend on sort order and can never delete a real annotation.
final_groups = []
for key, group_df in full_emotion_df.groupby('file_id'):
    has_segment = group_df['segment_id'].notna()
    if has_segment.any():
        final_groups.append(group_df[has_segment])
    else:
        final_groups.append(group_df)

In [64]:
full_emotion_df = pd.concat(final_groups).sort_values(by=['user_id', 'file_id', 'segment_id']).reset_index(drop=True)

In [65]:
# these are the only 3 videos with no emotion tags
full_emotion_df[full_emotion_df['segment_id'].isna()]

Unnamed: 0,user_id,file_id,segment_id,emotion,multi_speaker,timestamp
57,Todd,M01003XTU,,none,EMPTY_NA,
113,Yukun,M01003WXG,,none,EMPTY_NA,
114,Yukun,M01004ECQ,,none,EMPTY_NA,


In [69]:
full_emotion_df[full_emotion_df['emotion'] == 'none']

Unnamed: 0,user_id,file_id,segment_id,emotion,multi_speaker,timestamp
57,Todd,M01003XTU,,none,EMPTY_NA,
113,Yukun,M01003WXG,,none,EMPTY_NA,
114,Yukun,M01004ECQ,,none,EMPTY_NA,


In [66]:
full_emotion_df.sample(5)

Unnamed: 0,user_id,file_id,segment_id,emotion,multi_speaker,timestamp
48,Sara,M01004QOU,M01004QOU_0008,anticipation,,109
70,Todd,M01004R1K,M01004R1K_0068,sadness,,1016
52,Sara,M01004QOU,M01004QOU_0016,"joy,anticipation",,230238
111,Yukun,M01003RBS,M01003RBS_0034,joy,,500
108,Yukun,M01003NA1,M01003NA1_0034,anticipation,,503


In [67]:
assert full_emotion_df['file_id'].nunique() == 25

In [68]:
save_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/annotations/emotion_columbia.tab')
full_emotion_df.to_csv(save_filepath, sep='\t', index=False)

## Load ASR transcriptions

In [70]:
asr_dirs = [os.path.join(home_dir, 'Documents/datasets/charm/transformed/R2/ldc-r2-batch1-tom-n79'),
os.path.join(home_dir, 'Documents/datasets/charm/transformed/R1/audio_processed'),
os.path.join(home_dir, 'Documents/datasets/charm/transformed/R1/video_processed')]

In [75]:
# Collect, in parallel, the path of every ASR .json file and the file id it belongs to
# (the part of the file name before the first underscore).
asr_files, file_ids = [], []
for asr_dir in asr_dirs:
    json_names = (name for name in os.listdir(asr_dir) if name.endswith('.json'))
    for name in json_names:
        asr_files.append(os.path.join(asr_dir, name))
        file_ids.append(name.partition('_')[0])

In [79]:
len(file_ids)

245

## Determine which annotated files are missing transcriptions

In [88]:
len(anno_files_list)

173

In [105]:
# which files do we have transcriptions for?
ldc_intersection = set(anno_files_list).intersection(set(file_ids))

In [106]:
len(ldc_intersection)

96

In [107]:
# LDC-annotated files with no ASR transcription yet, i.e. the ones that still need to be transcribed
ldc_missing = set(anno_files_list) - set(file_ids)

In [108]:
len(ldc_missing)

77

In [98]:
assert (len(ldc_intersection) + len(ldc_missing)) == len(anno_files_list)

In [101]:
# determine which of our own (Columbia) labeled files already have transcriptions
cu_intersection = set(full_emotion_df['file_id'].unique()).intersection(set(file_ids))
# and which ones we're missing
cu_missing = set(full_emotion_df['file_id'].unique()) - set(file_ids)

In [119]:
print(len(cu_intersection))
print(len(cu_missing))

2
23


In [104]:
assert (len(cu_intersection) + len(cu_missing)) == len(full_emotion_df['file_id'].unique())

In [111]:
# what's the intersection between ldc_missing and cu_missing
ldc_missing.intersection(cu_missing)
# disjoint

set()

In [110]:
# take the union of ldc missing and cu missing to make a list of all files that need to be transcribed
missing = ldc_missing.union(cu_missing)

In [112]:
len(missing)

100

In [114]:
# Write the file ids that still need transcription, one per line, in sorted order.
asr_request_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/asr_request.txt')
with open(asr_request_filepath, 'w') as fh:
    fh.writelines(f'{file_id}\n' for file_id in sorted(missing))

## Determine which transcribed files are missing translations

In [115]:
# based on Google drive, no translations exist for the labeled data
# https://drive.google.com/drive/folders/1rhRJhBgtBuMSpcWn8nQHWmlMUGqfmAba

In [117]:
# how many annotated files already have a transcription (i.e. are ready for translation)
len(ldc_intersection.union(cu_intersection))

96

## Compare to Tom's requests

In [134]:
import pickle

# Load the lists of file ids from the R2 and R3 ASR requests.
r2_request = os.path.join(home_dir, 'Documents/datasets/charm/transformed/r2_asr_files.pkl')
r3_request = os.path.join(home_dir, 'Documents/datasets/charm/transformed/r3_asr_files.pkl')
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load these
# files if they come from a trusted source.
with open(r2_request, 'rb') as f:
    r2_request_files = pickle.load(f)
with open(r3_request, 'rb') as f:
    r3_request_files = pickle.load(f)

In [142]:
print(len(r2_request_files))
print(len(r3_request_files))
print(len(r2_request_files) + len(r3_request_files))

79
66
145


In [143]:
toms_request_files = set(r2_request_files).union(set(r3_request_files))

In [144]:
# out of Tom's request list, which files can't I find in the ASR'd data?
leftover = toms_request_files - set(file_ids)

In [145]:
# Tom's request is totally covered?
len(leftover)

67

In [147]:
# almost all of the missing data is based on the r3 request 
len(leftover.intersection(set(r3_request_files)))

66

In [158]:
# what's the extra file? this is the file Sukrit said was corrupted
leftover - leftover.intersection(set(r3_request_files))

{'M01003JET'}

In [148]:
# now look at annotated files - tom's request - what we have
missing = (set(anno_files_list) - toms_request_files) - set(file_ids)

In [150]:
# still have 10 files unaccounted for
len(missing)

10

In [168]:
missing

{'M01000G9A',
 'M01000G9B',
 'M01000G9D',
 'M01000G9E',
 'M01000G9F',
 'M01000G9G',
 'M01000G9H',
 'M01000G9J',
 'M01000G9K',
 'M01000G9L'}

In [157]:
# now account for the cu missing data (cu annotated - files we have transcribed = 23 files), which has no overlap with Tom's request
len(cu_missing.intersection(toms_request_files))

0

In [159]:
# cu_missing set is totally disjoint from what we have transcriptions for, Tom's request, and LDC data
# totally net new
len(cu_missing)

23

In [165]:
# 10 missing + r3_request + 1 corrupted + 23 cu_missing == 100
todds_request = missing.union({'M01003JET'}).union(set(r3_request_files)).union(cu_missing)
len(todds_request)

100

In [167]:
# fully accounted for my request
assert todds_request == ldc_missing.union(cu_missing)