# Identify videos for manual labeling and evaluate results

In [1]:
import os
import json

import pandas as pd
import numpy as np

import utils

%load_ext autoreload
%autoreload 2

## Load data

In [2]:
home_dir = os.path.expanduser('~')
anno_dir = os.path.join(home_dir, 'Documents/datasets/charm/raw/LDC2022E18_CCU_TA1_Mandarin_Chinese_Development_Annotation_V2.0/data')

In [3]:
# anno_dir was already built on top of home_dir where it is defined, so pass it as-is.
# Re-joining it with home_dir was redundant (os.path.join discards earlier segments when a
# later one is absolute) and would duplicate the prefix if home_dir were ever relative.
anno_dfs, segment_df, versions_df = utils.load_ldc_annotations(anno_dir)

In [4]:
anno_dfs.keys()

dict_keys(['valence_arousal.tab', 'changepoint.tab', 'norms.tab', 'emotions.tab'])

In [5]:
changepoint_df = anno_dfs['changepoint.tab']

In [6]:
changepoint_df['impact_scalar'].value_counts()

4    22
5    19
1    15
2    13
Name: impact_scalar, dtype: int64

In [7]:
changepoint_df.head()

Unnamed: 0,user_id,file_id,timestamp,impact_scalar,comment
0,212,M01000FT6,287,5,Pre-change: Host talked about patterns of dirt...
1,212,M01000FT6,353,2,Pre-change: Host questioned why the male guest...
2,212,M01003MTK,64,5,Pre-change: female introduced the male to her ...
3,212,M01003MTK,239,1,Pre-change: The female and male speakers were ...
4,212,M01003MTK,267,5,Pre-change: The group chit-chat with the male ...


In [8]:
# files whose `changepoint_count` in versions_df is positive
# (presumably the number of changepoint annotation passes for the file -- TODO confirm)
change_anno_files = set(versions_df[versions_df['changepoint_count'] > 0]['file_id'].unique())
# files that have at least one row in the changepoint table
change_pos_anno_files = set(changepoint_df['file_id'].unique())
# set difference: positive changepoint_count but no changepoint rows
change_neg_anno_files = change_anno_files - change_pos_anno_files

In [9]:
len(change_pos_anno_files)

38

In [10]:
len(change_neg_anno_files)

32

## Identify 2 median length videos for annotation

In [11]:
output_dir = os.path.join(home_dir, 'Documents/datasets/charm/transformed/translations')

In [12]:
# load transcriptions/translations
translation_files, data_dfs, data = utils.load_translated_files(translation_dir=output_dir, return_data=True)

In [13]:
# identify 2 videos that are median length (that we have transcriptions/translations for)
# and that we have a high number of changepoint annotations for

In [14]:
changepoint_files = changepoint_df['file_id'].value_counts().to_frame()

In [15]:
# Two columns: the file id and its number of changepoint rows.
# Column names are assigned by position instead of via rename(): the labels produced by
# value_counts().to_frame().reset_index() differ across pandas versions
# ('index'/'file_id' before 2.0, 'file_id'/'count' from 2.0 on), and the previous
# rename mapping would produce two 'count' columns on pandas >= 2.0, which breaks the
# later merge on 'file_id'.
changepoint_counts_df = changepoint_files.reset_index()
changepoint_counts_df.columns = ['file_id', 'count']

In [16]:
# one (file_id, row count) pair per loaded translation DataFrame
changepoint_lens = [(key, len(data_dfs[key])) for key in data_dfs]

In [17]:
changepoint_lens_df = pd.DataFrame(changepoint_lens, columns=['file_id', 'length'])

In [18]:
merged_df = pd.merge(changepoint_lens_df, changepoint_counts_df, how='inner', on='file_id')

In [19]:
# identify median length of conversations
changepoint_lens_df['length'].describe()

count      96.000000
mean      220.885417
std       185.040187
min        28.000000
25%       100.750000
50%       162.500000
75%       279.250000
max      1104.000000
Name: length, dtype: float64

In [20]:
# top 7 files by number of changepoint annotations; the videos used for manual
# annotation are picked from this candidate list
merged_df.sort_values(by=['count'], ascending=False).iloc[:7]

Unnamed: 0,file_id,length,count
8,M01003M18,107,4
15,M01003YN6,60,4
0,M01000AJ9,78,3
17,M01003M20,65,3
14,M01003MTK,93,3
10,M01003JUU,494,3
18,M01003JQV,73,3


In [21]:
# M01003M18, M01003YN6, M01003M20, M01003JQV
# sample annotation: M01003MTK

In [22]:
label_file_ids = merged_df.sort_values(by=['count'], ascending=False)['file_id'].iloc[:7].values

In [23]:
label_file_ids

array(['M01003M18', 'M01003YN6', 'M01000AJ9', 'M01003M20', 'M01003MTK',
       'M01003JUU', 'M01003JQV'], dtype=object)

In [24]:
metadata_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/metadata.csv')
metadata_df = pd.read_csv(metadata_filepath)

In [25]:
metadata_df[metadata_df['file_uid'].isin(label_file_ids)]

Unnamed: 0,release,file_uid,modality,url,emotion_count,valence_arousal_count,norms_count,changepoint_count,start,end,...,translated,catalog_id,version,data_type,lang_id_manual,wrapped_md5,unwrapped_md5,download_date,content_date,status_in_corpus
5,R1,M01000AJ9,video,na,3.0,3.0,1.0,1.0,0.0,300.0,...,True,LDC2022E11,V1.0,.mp4.ldcc,cmn,c739b30eaa58ea837433ffdfdd90ab29,3c3664043119cd44cb7b62a000db94c6,na,na,present
2131,R2,M01003JQV,video,http://vd2.bdstatic.com/mda-nh21qmizd0kf08gt/c...,3.0,3.0,1.0,1.0,29.0,329.0,...,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,e07a4251d24d7a8f0e65eb00bf191de7,2022-09-13,na,present
2139,R2,M01003JUU,video,https://www.bilibili.com/video/BV1Ks411r79F,3.0,3.0,1.0,1.0,1008.5,1308.5,...,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,daef705d7342a0756a18ac74c08a1014,2022-09-13,na,present
2311,R2,M01003M18,video,http://vd2.bdstatic.com/mda-nh4bibyv4v3ns4s0/c...,3.0,3.0,1.0,1.0,51.5,351.5,...,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,d0fd0c3a21694c1fb49197f18587a92b,2022-09-14,na,present
2315,R2,M01003M20,video,http://vd3.bdstatic.com/mda-nh1gbwd5gvki016y/c...,3.0,3.0,1.0,1.0,0.0,300.0,...,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,19cf5700aeea5e90a6ca3dc42c747dfb,2022-09-14,na,present
2347,R2,M01003MTK,video,http://vd2.bdstatic.com/mda-ncqdcucf3m6zgjzz/3...,3.0,3.0,1.0,1.0,0.0,300.0,...,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,bcc5b3f482b28fa9784b3961f9668b89,2022-09-14,na,present
3026,R2,M01003YN6,video,http://vd2.bdstatic.com/mda-ngf7np46s18zdrv8/3...,3.0,3.0,1.0,1.0,5.0,305.0,...,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,a121c9a6b04e69106f0025444d154c2e,2022-09-22,na,present


In [26]:
metadata_df[metadata_df['file_uid'].isin(label_file_ids)][['release', 'file_uid', 'url']].values

array([['R1', 'M01000AJ9', 'na'],
       ['R2', 'M01003JQV',
        'http://vd2.bdstatic.com/mda-nh21qmizd0kf08gt/cae_h264/1659489487157368122/mda-nh21qmizd0kf08gt.mp4'],
       ['R2', 'M01003JUU', 'https://www.bilibili.com/video/BV1Ks411r79F'],
       ['R2', 'M01003M18',
        'http://vd2.bdstatic.com/mda-nh4bibyv4v3ns4s0/cae_h264/1659758632534327528/mda-nh4bibyv4v3ns4s0.mp4'],
       ['R2', 'M01003M20',
        'http://vd3.bdstatic.com/mda-nh1gbwd5gvki016y/cae_h264/1659441381156335870/mda-nh1gbwd5gvki016y.mp4'],
       ['R2', 'M01003MTK',
        'http://vd2.bdstatic.com/mda-ncqdcucf3m6zgjzz/360p/h264_delogo/1648202114325469972/mda-ncqdcucf3m6zgjzz.mp4'],
       ['R2', 'M01003YN6',
        'http://vd2.bdstatic.com/mda-ngf7np46s18zdrv8/360p/h264/1657949089719494346/mda-ngf7np46s18zdrv8.mp4']],
      dtype=object)

## Evaluation

In [33]:
# NOTE(review): this cell was last executed as In [33], i.e. after the cells below that add
# `binary_impact_scalar` and `changepoint_occurred`; on a fresh top-to-bottom run those
# two columns will not exist yet at this point.
changepoint_df[changepoint_df['file_id'] == 'M01003MTK']

Unnamed: 0,user_id,file_id,timestamp,impact_scalar,comment,binary_impact_scalar,changepoint_occurred
2,212,M01003MTK,64,5,Pre-change: female introduced the male to her ...,1,True
3,212,M01003MTK,239,1,Pre-change: The female and male speakers were ...,0,True
4,212,M01003MTK,267,5,Pre-change: The group chit-chat with the male ...,1,True


In [28]:
# 0: good to bad, 1: bad to good
binary_map = {1: 0, 2: 0, 3: 0, 4: 1, 5: 1}

In [29]:
changepoint_df['binary_impact_scalar'] = changepoint_df['impact_scalar'].apply(lambda x: binary_map[x])

In [30]:
# binary changepoint flag
changepoint_df['changepoint_occurred'] = True

In [31]:
# One independent copy of the ground-truth rows per annotator: find_matches adds the
# 'matched' and 'binary_pred' columns to the frame it is given, so the two evaluations
# must not share a frame.
yanda_anno_file_df = changepoint_df[changepoint_df['file_id'] == 'M01003MTK'].copy()
yukun_anno_file_df = changepoint_df[changepoint_df['file_id'] == 'M01003MTK'].copy()

In [32]:
# load manual annotations
yanda_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/annotations/Circumplex Theory Annotations - Yanda - Sheet1.csv')
yukun_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/annotations/Circumplex Theory Annotations - Yukun - Sheet1.csv')

In [34]:
def convert_to_seconds(timestamp):
    """Convert a colon-separated timestamp string to a whole number of seconds.

    Accepts 'MM:SS' (e.g. '4:27' -> 267) and 'HH:MM:SS' (e.g. '1:02:03' -> 3723).

    Raises:
        ValueError: if the string does not have two or three colon-separated fields,
            or if a field is not an integer.
    """
    parts = timestamp.split(':')
    if len(parts) not in (2, 3):
        raise ValueError("expected 'MM:SS' or 'HH:MM:SS', got %r" % (timestamp,))

    # fold from the most significant field down: each step shifts the running total by 60
    total = 0
    for part in parts:
        total = total * 60 + int(part)
    return total

In [35]:
def load_manual_annotation(filepath):
    """Load a manual annotation CSV and add the columns used for evaluation.

    The file is read with its first line skipped and its first column dropped. The
    remaining frame must contain a 'Timestamp' column (parsed by convert_to_seconds)
    and a 'Tag' column whose values are keys of the tag mapping below.

    Added columns:
        timestamp_seconds: 'Timestamp' converted to seconds.
        binary_impact_scalar: 0.0 / 1.0 looked up from 'Tag'.
        matched: initialised to False for every row.

    Raises:
        KeyError: if a 'Tag' value is not present in the tag mapping.
    """
    raw_df = pd.read_csv(filepath, skiprows=1)
    # drop first col; take an explicit copy so the column assignments below write to an
    # independent frame rather than to a selection of raw_df (avoids SettingWithCopyWarning)
    df = raw_df[raw_df.columns[1:]].copy()

    # binary impact scalar
    # TODO(review): are these correct? - especially aloof-introverted, unassured-submissive,
    # assured-dominant
    tag_mapping = {'Gregarious-Extraverted': 1, 'Warm-Agreeable': 1, 'Arrogant-Calculating': 0, 
                   'Unassured-Submissive': 1, 'Cold': 0, 'Unassuming-Ingenuous': 1, 
                   'Aloof-Introverted': 0, 'Assured-Dominant': 0}

    df['timestamp_seconds'] = df['Timestamp'].apply(convert_to_seconds)
    # direct dict indexing (not .map) so an unknown tag fails loudly instead of becoming NaN
    df['binary_impact_scalar'] = df['Tag'].apply(lambda x: tag_mapping[x]).astype(float)
    df['matched'] = False
    return df

In [36]:
yanda_df = load_manual_annotation(yanda_filepath)
yukun_df = load_manual_annotation(yukun_filepath)

In [37]:
# TODO: generalize this
# filter Yukun's preds to the file being evaluated; take an explicit copy because
# find_matches later writes the 'matched' column of this frame in place, which on a bare
# boolean-mask selection can trigger pandas' SettingWithCopyWarning
yukun_df = yukun_df[yukun_df['File ID'] == 'M01003MTK'].copy()

In [38]:
def find_matches(anno_df, label_df, window_size=10):
    """Pair ground-truth changepoints with manually labelled changepoints by time.

    For each row of ``anno_df`` (in row order), labels whose ``timestamp_seconds`` lies
    within ``timestamp`` +/- ``window_size`` (both ends inclusive) are candidates. The
    first candidate, in ``label_df`` row order, that has not already been paired with an
    earlier ground-truth row in this call is taken as the match, so each label is used
    at most once per call.

    Both frames are modified in place and also returned.

    Args:
        anno_df: ground-truth frame with a 'timestamp' column.
        label_df: label frame with 'timestamp_seconds' and 'binary_impact_scalar'
            columns; a 'matched' column is created (all False) if absent.
        window_size: half-width of the matching window, in the units of the timestamps.

    Returns:
        (anno_df, label_df) where anno_df has gained 'matched' (True/False per row) and
        'binary_pred' (the matched label's 'binary_impact_scalar', or -1 when nothing
        matched), and label_df['matched'] is True for every label that was paired.
    """
    if 'matched' not in label_df.columns:
        label_df['matched'] = False

    claimed = []  # label_df index values already paired during this call
    match_indicator = []  # true/false
    binary_pred = []
    for _, row in anno_df.iterrows():
        start = row['timestamp'] - window_size
        end = row['timestamp'] + window_size
        in_window = (label_df['timestamp_seconds'] >= start) & (label_df['timestamp_seconds'] <= end)
        # skip labels that an earlier ground-truth row already consumed
        candidates = label_df[in_window & ~label_df.index.isin(claimed)]
        if len(candidates) > 0:
            match_indicator.append(True)
            binary_pred.append(candidates.iloc[0]['binary_impact_scalar'])
            claimed.append(candidates.index[0])
        else:
            match_indicator.append(False)
            binary_pred.append(-1)

    # Only ever raise flags to True: other labels in a window keep whatever value they
    # had, so a label paired with one changepoint is not reset by a neighbouring window.
    if claimed:
        label_df.loc[claimed, 'matched'] = True

    anno_df['matched'] = match_indicator
    anno_df['binary_pred'] = binary_pred
    return anno_df, label_df

In [39]:
yanda_anno_file_df, yanda_df = find_matches(yanda_anno_file_df, yanda_df)
yukun_anno_file_df, yukun_df = find_matches(yukun_anno_file_df, yukun_df)

In [40]:
def complete_predictions(anno_df, label_df):
    """Build aligned ground-truth / prediction lists for scoring.

    The first ``len(anno_df)`` entries of every list come from the ground-truth rows.
    One further entry is appended per label whose 'matched' flag is False: these count
    as predicted changepoints with no ground truth (ground truth False / -1).

    Returns:
        dict with keys 'changepoint_ground_truth', 'changepoint_preds',
        'binary_ground_truth' and 'binary_preds', each a list of equal length.
    """
    # labels that were never paired with a ground-truth changepoint
    unmatched_mask = ~label_df['matched']
    spurious_scalars = label_df.loc[unmatched_mask, 'binary_impact_scalar'].astype(float).values.tolist()
    n_spurious = len(spurious_scalars)

    def column(name):
        # ground-truth column as a plain Python list
        return anno_df[name].values.tolist()

    results = {}
    results['changepoint_ground_truth'] = column('changepoint_occurred') + [False] * n_spurious
    results['changepoint_preds'] = column('matched') + [True] * n_spurious
    results['binary_ground_truth'] = column('binary_impact_scalar') + [-1] * n_spurious
    results['binary_preds'] = column('binary_pred') + spurious_scalars
    return results

In [41]:
yanda_results = complete_predictions(yanda_anno_file_df, yanda_df)
yukun_results = complete_predictions(yukun_anno_file_df, yukun_df)

In [42]:
yanda_results

{'changepoint_ground_truth': [True,
  True,
  True,
  False,
  False,
  False,
  False,
  False],
 'changepoint_preds': [False, True, False, True, True, True, True, True],
 'binary_ground_truth': [1, 0, 1, -1, -1, -1, -1, -1],
 'binary_preds': [-1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}

In [43]:
yukun_results

{'changepoint_ground_truth': [True, True, True, False, False, False],
 'changepoint_preds': [True, True, True, True, True, True],
 'binary_ground_truth': [1, 0, 1, -1, -1, -1],
 'binary_preds': [1.0, 0.0, 1.0, 0.0, 0.0, 1.0]}

In [44]:
from sklearn.metrics import classification_report

In [45]:
# Yanda changepoint preds
# zero_division=0 matches the other three report calls and avoids an
# UndefinedMetricWarning when a class has no predicted or no true samples
print(classification_report(y_true=yanda_results['changepoint_ground_truth'], y_pred=yanda_results['changepoint_preds'], zero_division=0))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00         5
        True       0.17      0.33      0.22         3

    accuracy                           0.12         8
   macro avg       0.08      0.17      0.11         8
weighted avg       0.06      0.12      0.08         8



In [53]:
# Yanda binary preds
print(classification_report(y_true=yanda_results['binary_ground_truth'], y_pred=yanda_results['binary_preds'], zero_division=0))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       5.0
           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       8.0
   macro avg       0.00      0.00      0.00       8.0
weighted avg       0.00      0.00      0.00       8.0



In [54]:
# Yukun changepoint preds
print(classification_report(y_true=yukun_results['changepoint_ground_truth'], y_pred=yukun_results['changepoint_preds'], zero_division=0))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00         3
        True       0.50      1.00      0.67         3

    accuracy                           0.50         6
   macro avg       0.25      0.50      0.33         6
weighted avg       0.25      0.50      0.33         6



In [55]:
# Yukun binary preds
print(classification_report(y_true=yukun_results['binary_ground_truth'], y_pred=yukun_results['binary_preds'], zero_division=0))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         3
           0       0.33      1.00      0.50         1
           1       0.67      1.00      0.80         2

    accuracy                           0.50         6
   macro avg       0.33      0.67      0.43         6
weighted avg       0.28      0.50      0.35         6

