In [95]:
import os
import hashlib
import yaml

import pandas as pd
import numpy as np

from charm.eval.eval import mapping, categorize_pairs, precision, recall, load_data
from charm.data import utils

In [2]:
raw_dir = '/home/iron-man/Documents/data/charm/raw'
transformed_dir = '/home/iron-man/Documents/data/charm/transformed'
r2 = 'LDC2022E19_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R2_V2.0'
r2_dir = os.path.join(raw_dir, r2)

In [3]:
# load predictions
annotations_dir = '/home/iron-man/Documents/data/charm/transformed/annotations'
genglin = os.path.join(annotations_dir, 'Circumplex Theory Annotations - Genglin - Sheet1.csv')
jialiang = os.path.join(annotations_dir, 'Circumplex Theory Annotations - Jialiang - Sheet1.csv')
yukun = os.path.join(annotations_dir, 'Circumplex Theory Annotations - Yukun - Sheet1.csv')

genglin_df = pd.read_csv(genglin, skiprows=1, usecols=range(1, 9))
jialiang_df = pd.read_csv(jialiang, skiprows=1, usecols=range(1, 9))
yukun_df = pd.read_csv(yukun, skiprows=1, usecols=range(1, 9))

In [4]:
# load ground truth
anno_dir = '/home/iron-man/Documents/data/charm/raw/LDC2022E18_CCU_TA1_Mandarin_Chinese_Development_Annotation_V5.0/'
anno_dfs, segment_df, version_df = utils.load_ldc_annotation(anno_dir)

In [5]:
# load metadata
meta_filepath = '/home/iron-man/Documents/data/charm/transformed/metadata.csv'
meta_df = pd.read_csv(meta_filepath)

In [6]:
modality_df = anno_dfs['changepoint.tab']

In [7]:
# set thresholds
delta = 10
llr_threshold = 0

In [8]:
def convert_to_seconds(timestamp):
    mins, secs = timestamp.split(':')
    return int(mins)*60 + int(secs)

In [9]:
def clean_df(df):
    # drop rows where timestamp is empty
    df = df[df['Timestamp'].notnull()].reset_index(drop=True)
    df['llr'] = 1.0
    df = df.rename(columns={'File ID': 'file_id', 'Timestamp': 'timestamp', 'Annotator Notes': 'annotator_notes', 'Tag': 'tag', 'Speaker Descriptor (if needed)': 'speaker'}, errors='ignore')
    cols = ['file_id', 'timestamp', 'llr', 'tag', 'speaker', 'annotator_notes', 'URL']
    df = df[cols]
    df['timestamp'] = df['timestamp'].apply(convert_to_seconds)
    return df

In [10]:
genglin_df = clean_df(genglin_df)
jialiang_df = clean_df(jialiang_df)
yukun_df = clean_df(yukun_df)

### Approach 1: assume all annotated points are change points

In [11]:
# genglin_df['file_id'].unique()
# yukun_df['file_id'].unique()
# modality_df[modality_df['file_id'].isin(yukun_df['file_id'].unique())]['file_id'].unique()

In [12]:
# verify all file_ids have annotations
file_ids = genglin_df['file_id'].unique().tolist() + yukun_df['file_id'].unique().tolist()

In [123]:
genglin_set = set(genglin_df['file_id'].unique().tolist())
genglin_set

{'M01003JLO', 'M01003M18', 'M01003S1K', 'M01003VVI'}

In [124]:
yukun_set = set(yukun_df['file_id'].unique().tolist())
yukun_set

{'M01003JQV', 'M01003M18', 'M01003M20', 'M01003MTK', 'M01003YN6'}

In [122]:
genglin_set.intersection(yukun_set)

{'M01003M18'}

In [13]:
# assert that changepoint annotations exist for these files
assert (version_df[version_df['file_id'].isin(file_ids)]['changepoint_count'] >= 1).all()

In [14]:
file_ids

['M01003S1K',
 'M01003M18',
 'M01003VVI',
 'M01003JLO',
 'M01003M18',
 'M01003YN6',
 'M01003M20',
 'M01003JQV',
 'M01003MTK']

In [15]:
file_id = 'M01003JLO'

In [16]:
meta_df[meta_df['file_uid'] == 'M01003JLO']

Unnamed: 0,release,catalog_id,file_uid,url,modality,start,end,transcribed,utterance_count,valence_arousal_count,...,unwrapped_md5,download_date,content_date,status_in_corpus,legacy_catalog_id,original_file_id,type,file_path,length,version
2127,R2,LDC2022E19_R2,M01003JLO,http://vd2.bdstatic.com/mda-ngt14pajd86dszc9/c...,video,9.0,309.0,True,101.0,3.0,...,ba2106e462cd9d8c7634b79c8bd453aa,2022-09-13,na,present,,,,,,V1.0


In [17]:
# load release 2 file_info.tab
uid_list_filepath = '/home/iron-man/Documents/data/charm/raw/LDC2022E19_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R2_V2.0/docs/uid_list.tab'
uid_list_df = pd.read_csv(uid_list_filepath, delimiter='\t')

# save this to the dl_tool folder
dl_tool_dir = '/home/iron-man/Documents/data/charm/raw/LDC2022E19_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R2_V2.0/tools/dl_tool/'
uid_filepath = os.path.join(dl_tool_dir, 'uid_sublist.tab')
uid_sublist_df = uid_list_df[uid_list_df['file_uid'] == 'M01003JLO']
uid_sublist_df.to_csv(uid_filepath, sep='\t', index=False)

In [19]:
# in dl_tool folder run
# ./get_urls.sh uid_sublist.tab

In [20]:
# transcribe this file with whisper
input_filepath = os.path.join(r2_dir, f'tools/dl_tool/out/{file_id}.mp4.ldcc')

In [21]:
# read first 16 bytes and determine the size of the header
with open(input_filepath, 'rb') as f:
    first_bytes = f.read(16).decode()

header_size = int(first_bytes.split('\n')[1].strip())

# read header size bytes, strip off first 16 bytes and last 8 bytes and pass remainder to a YAML parser
with open(input_filepath, 'rb') as f:
    header = f.read(header_size).decode()
    complete_content = f.read()

header_dict = yaml.safe_load(header[16:-8])

assert hashlib.md5(complete_content).hexdigest() == header_dict['data_md5']

In [22]:
print(header[16:-8])

---
source_uid: S0C
parent_uid: na
has_siblings: 'false'
root_uid: na
data_bytes: 24573458
data_md5: ba2106e462cd9d8c7634b79c8bd453aa
data_type: mp4
data_url: http://vd2.bdstatic.com/mda-ngt14pajd86dszc9/cae_h264/1658970258963454944/mda-ngt14pajd86dszc9.mp4
data_uid: M01003JLO
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 


In [23]:
# save the mp4 file to disk and remove the ldcc file to save disk space
# create data/video directory
data_dir = os.path.join(r2_dir, 'data/video')
os.makedirs(data_dir, exist_ok=True)
output_filepath = os.path.join(data_dir, f'{file_id}.mp4')
with open(output_filepath, 'wb') as f:
    f.write(complete_content)

In [24]:
# create translations folder per data release
translation_dir = os.path.join(transformed_dir, 'translations', r2, 'video')
os.makedirs(translation_dir, exist_ok=True)

In [34]:
translation_filepath = os.path.join(translation_dir, f'{file_id}.json')
result = utils.transcribe(output_filepath, translation_filepath, task='translate', strip_ldc=False)

In [35]:
# remove the LDCC file to save disk space
os.remove(input_filepath)

FileNotFoundError: [Errno 2] No such file or directory: '/home/iron-man/Documents/data/charm/raw/LDC2022E19_CCU_TA1_Mandarin_Chinese_Development_Source_Data_R2_V2.0/tools/dl_tool/out/M01003JLO.mp4.ldcc'

In [33]:
# for seg in result['segments']:
#     print(f"start: {seg['start']}, end: {seg['end']}, \t {seg['text']}")

In [77]:
genglin_df[genglin_df['file_id'] == file_id].loc[18].values

array(['M01003JLO', 60, 1.0, 'Arrogant-Calculating',
       'woman in pink shirt',
       'She is being very dismissive and aggressive to the male, snaps at him',
       'http://vd2.bdstatic.com/mda-ngt14pajd86dszc9/cae_h264/1658970258963454944/mda-ngt14pajd86dszc9.mp4'],
      dtype=object)

In [70]:
modality_df[modality_df['file_id'] == file_id]

Unnamed: 0,user_id,file_id,timestamp,impact_scalar,comment
23,212,M01003JLO,150,4,Pre-change: The female was very upset with the...
24,212,M01003JLO,298,1,Pre-change: The speakers discussed ways to sol...


In [71]:
segment_df[segment_df['file_id'] == file_id]

Unnamed: 0,file_id,segment_id,start,end
25759,M01003JLO,M01003JLO_0001,9.0,24.0
25760,M01003JLO,M01003JLO_0002,24.0,39.0
25761,M01003JLO,M01003JLO_0003,39.0,54.0
25762,M01003JLO,M01003JLO_0004,54.0,69.0
25763,M01003JLO,M01003JLO_0005,69.0,84.0
25764,M01003JLO,M01003JLO_0006,84.0,99.0
25765,M01003JLO,M01003JLO_0007,99.0,114.0
25766,M01003JLO,M01003JLO_0008,114.0,129.0
25767,M01003JLO,M01003JLO_0009,129.0,144.0
25768,M01003JLO,M01003JLO_0010,144.0,159.0


In [48]:
# break predictions apart by file_id
pred_df = genglin_df
def breakdown_pred_df(pred_df):
    """Breaks a single df into one df per file_id."""
    preds_dfs = {}
    for file_id in pred_df['file_id'].unique():
        preds_dfs[file_id] = pred_df[pred_df['file_id'] == file_id].reset_index(drop=True)
    return preds_dfs

In [49]:
genglin_preds_dfs = breakdown_pred_df(genglin_df)
jialiang_preds_dfs = breakdown_pred_df(jialiang_df)
yukun_preds_dfs = breakdown_pred_df(yukun_df)

In [72]:
def filter_predictions(preds_dfs, segment_df):
    """Filter system predictions to only regions that were annotated."""
    # filter system_predictions down to annotated regions
    # TODO: this can probably be optimized
    preds_filt_dfs = {}
    for file_id in preds_dfs:
        preds_df = preds_dfs[file_id]
        preds_df['start'] = np.nan
        preds_df['end'] = np.nan
        file_id = preds_df['file_id'].unique()[0]
        segments = segment_df[segment_df['file_id'] == file_id][['start', 'end']]
        for i, row in preds_df.iterrows():
            for _, segment in segments.iterrows():
                if row['timestamp'] >= segment['start'] and row['timestamp'] <= segment['end']:
                    preds_df.loc[i, 'start'] = segment['start']
                    preds_df.loc[i, 'end'] = segment['end']
                    break
        preds_df = preds_df[preds_df['start'].notna()].reset_index(drop=True)
        preds_filt_dfs[file_id] = preds_df
    return preds_filt_dfs

In [78]:
# filter all predictions
genglin_preds_filt_dfs = filter_predictions(genglin_preds_dfs, segment_df)
jialiang_preds_filt_dfs = filter_predictions(jialiang_preds_dfs, segment_df)
yukun_preds_filt_dfs = filter_predictions(yukun_preds_dfs, segment_df)

In [81]:
def evaluate(dfs, modality_df, delta=10, llr_threshold=0):
    mappings = {}
    file_counts = {}
    threshold_counts = {
        'correct': 0,
        'false_positive': 0,
        'false_negative': 0,
    }
    for file_id in dfs:
        df = dfs[file_id]
        system_dict = df.to_dict('records')
        reference_dict = modality_df[modality_df['file_id'] ==
                                     file_id].to_dict('records')

        correct_pairs, system_misses, reference_misses = mapping(
            system_dict, reference_dict, delta)
        mappings[file_id] = {
            'correct_pairs': correct_pairs,
            'system_misses': system_misses,
            'reference_misses': reference_misses,
        }

        file_counts_ = categorize_pairs(**mappings[file_id], threshold=llr_threshold)
        file_counts[file_id] = file_counts_
        # add file counts to threshold counts
        for key in threshold_counts:
            threshold_counts[key] += file_counts_[key]

    return precision(threshold_counts), recall(threshold_counts), mappings

In [91]:
genglin_precision, genglin_recall, genglin_mappings = evaluate(genglin_preds_filt_dfs, modality_df, delta=delta, llr_threshold=llr_threshold)
jialiang_precision, jialiang_recall, jialiang_mappings = evaluate(jialiang_preds_filt_dfs, modality_df, delta=delta, llr_threshold=llr_threshold)
yukun_precision, yukun_recall, yukun_mappings = evaluate(yukun_preds_filt_dfs, modality_df, delta=delta, llr_threshold=llr_threshold)

In [94]:
print(f'Yukun - Precision: {yukun_precision:.2f}, Recall: {yukun_recall:.2f}')
print(f'Genglin - Precision: {genglin_precision:.2f}, Recall: {genglin_recall:.2f}')
print(f'Jialiang - Precision: {jialiang_precision:.2f}, Recall: {jialiang_recall:.2f}')

Yukun - Precision: 0.53, Recall: 0.47
Genglin - Precision: 0.29, Recall: 0.20
Jialiang - Precision: 0.50, Recall: 0.20


In [89]:
fps = 0
for key in genglin_mappings.keys():
    print(key)
    fps += len(genglin_mappings[key]['system_misses'])
    print(genglin_mappings[key]['system_misses'])
    print()
fps

M01003S1K
[]

M01003M18
[{'file_id': 'M01003M18', 'timestamp': 100, 'llr': 1.0, 'tag': 'Unassuming-Ingenuous', 'speaker': 'bald guy in blue shirt', 'annotator_notes': "He's introducing a job to her", 'URL': 'http://vd2.bdstatic.com/mda-nh4bibyv4v3ns4s0/cae_h264/1659758632534327528/mda-nh4bibyv4v3ns4s0.mp4', 'start': 96.5, 'end': 111.5}]

M01003VVI
[{'file_id': 'M01003VVI', 'timestamp': 360, 'llr': 1.0, 'tag': 'Warm-Agreeable', 'speaker': 'white guy in colorful coat', 'annotator_notes': 'They are discussing the food at the restaurant and the white guy is agreeing that the food is good', 'URL': 'https://www.bilibili.com/video/BV1KP4y1A7AY', 'start': 357.0, 'end': 372.0}]

M01003JLO
[{'file_id': 'M01003JLO', 'timestamp': 130, 'llr': 1.0, 'tag': 'Unassured-Submissive', 'speaker': 'Man in white shirt', 'annotator_notes': 'we finally got to see the second speaker but the tag doesnt change', 'URL': 'http://vd2.bdstatic.com/mda-ngt14pajd86dszc9/cae_h264/1658970258963454944/mda-ngt14pajd86dszc9

5

### Evaluate best performing system

In [96]:
best_submission_dir = os.path.join(transformed_dir, 'predictions/CCU_P1_TA1_CD_COL_LDC2022E22-V1_20221121_125014')
reference_dir = os.path.join(raw_dir, 'LDC2023E01_CCU_TA1_Mandarin_Chinese_Mini_Evaluation_Annotation_Unsequestered')
system_predictions, anno_dfs, segment_df, versions_df = load_data(best_submission_dir, reference_dir)

In [98]:
# filter predictions to labeled segments
system_predictions_filt = filter_predictions(system_predictions, segment_df)

In [105]:
modality_filter = meta_df['modality'] == 'video'
release_filter = meta_df['release'] == 'Mini-Eval'
file_ids = meta_df[modality_filter & release_filter]['file_uid'].unique()
# get file_ids from versions_df
labeled_file_ids = versions_df[versions_df['changepoint_count'] > 0]['file_id'].unique()
file_ids = list(set(file_ids).intersection(set(labeled_file_ids)))

In [107]:
change_point_df = anno_dfs['changepoint.tab']

In [108]:
# filter change point (label) df to only include labeled modality files
modality_df = change_point_df[change_point_df['file_id'].isin(file_ids)]

In [110]:
# filter to labeled file_ids
system_predictions_filt = {k: v for k, v in system_predictions_filt.items() if k in file_ids}

In [114]:
system_precision, system_recall, system_mappings = evaluate(system_predictions_filt, modality_df, delta=delta, llr_threshold=llr_threshold)

In [115]:
print(f'System - Precision: {system_precision:.2f}, Recall: {system_recall:.2f}')

System - Precision: 0.05, Recall: 0.77
