# Identify 2 videos to manually label for changepoint

In [41]:
import os
import json

import pandas as pd

import utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data

In [14]:
# Resolve the current user's home directory; the dataset paths in this notebook hang off it.
home_dir = os.path.expanduser('~')
# Location of the LDC development-annotation release that is loaded in the next cell.
anno_dir = os.path.join(
    home_dir,
    'Documents/datasets/charm/raw/LDC2022E18_CCU_TA1_Mandarin_Chinese_Development_Annotation_V2.0/data',
)

In [21]:
# Load the LDC annotation tables.
# anno_dir is already rooted at home_dir, so it is passed as-is instead of being joined
# with home_dir a second time (that join only worked because anno_dir is absolute).
# NOTE(review): the three return values are taken to be (dict of annotation frames keyed by
# .tab file name, segment table, versions table) — confirm against utils.load_ldc_annotations.
anno_dfs, segment_df, versions_df = utils.load_ldc_annotations(anno_dir)

In [16]:
# Show which annotation tables were loaded (keyed by their .tab file name).
anno_dfs.keys()

dict_keys(['valence_arousal.tab', 'changepoint.tab', 'norms.tab', 'emotions.tab'])

In [18]:
# Pull out the changepoint annotation table; the rest of the notebook works from it.
changepoint_df = anno_dfs['changepoint.tab']

In [20]:
# Distribution of impact_scalar values across all changepoint annotation rows.
changepoint_df['impact_scalar'].value_counts()

4    22
5    19
1    15
2    13
Name: impact_scalar, dtype: int64

In [30]:
# Files whose versions_df entry reports a positive changepoint_count.
# NOTE(review): presumably this means "annotated for changepoints" — confirm the column's semantics.
change_anno_files = set(
    versions_df.loc[versions_df['changepoint_count'] > 0, 'file_id'].unique()
)
# Files that have at least one row in the changepoint table.
change_pos_anno_files = set(changepoint_df['file_id'].unique())
# Files counted in versions_df that have no row in the changepoint table.
change_neg_anno_files = change_anno_files.difference(change_pos_anno_files)

In [33]:
# Number of files with at least one row in the changepoint table.
len(change_pos_anno_files)

38

In [32]:
# Number of files with changepoint_count > 0 in versions_df but no row in the changepoint table.
len(change_neg_anno_files)

32

## Identify 2 median length videos for annotation

In [37]:
# Directory handed to utils.load_translated_files below as translation_dir.
output_dir = os.path.join(home_dir, 'Documents/datasets/charm/transformed/translations')

In [83]:
# Load the transcriptions/translations found under output_dir.
# NOTE(review): the meaning of the three returned objects is defined in utils. Judging by how
# data_dfs is used further down, it maps file_id -> a sized per-file object — confirm.
translation_files, data_dfs, data = utils.load_translated_files(
    translation_dir=output_dir,
    return_translations=True,
)

In [57]:
# identify 2 videos that are median length (that we have transcriptions/translations for)
# and that we have a high number of changepoint annotations for

In [61]:
# Count changepoint annotation rows per file_id (value_counts sorts most frequent first).
# NOTE: the index/column labels of this frame depend on the pandas version
# (pandas >= 2.0 names the counts column 'count' and the index 'file_id').
changepoint_files = changepoint_df['file_id'].value_counts().to_frame()

In [64]:
# Flatten the per-file counts into a two-column (file_id, count) table.
# Labels are assigned by position rather than by renaming 'index'/'file_id': value_counts()
# labels its result differently on pandas < 2.0 and >= 2.0, and the name-based rename yields
# two 'count' columns (and no 'file_id') on pandas >= 2.0, which breaks the later merge.
changepoint_counts_df = changepoint_files.reset_index().set_axis(['file_id', 'count'], axis=1)

In [59]:
# Pair every key of data_dfs with the len() of its entry.
# NOTE(review): despite the name, this measures the size of each loaded file's data,
# not anything from the changepoint table.
changepoint_lens = [(file_id, len(data_dfs[file_id])) for file_id in data_dfs]

In [66]:
# Tabulate the (file_id, length) pairs; 'length' is the len() of each file's entry in data_dfs.
changepoint_lens_df = pd.DataFrame(changepoint_lens, columns=['file_id', 'length'])

In [71]:
# Keep only files that have both a length and a changepoint count (inner join on file_id).
merged_df = changepoint_lens_df.merge(
    changepoint_counts_df,
    how='inner',
    on='file_id',
)

In [70]:
# Summary statistics of the per-file length; the 50% row is the median.
# Note this is computed over every file in changepoint_lens_df, not only the files in merged_df.
changepoint_lens_df['length'].describe()

count      96.000000
mean      220.885417
std       185.040187
min        28.000000
25%       100.750000
50%       162.500000
75%       279.250000
max      1104.000000
Name: length, dtype: float64

In [131]:
# Candidate pool: the 7 files with the most changepoint annotations.
# The 5 videos used for manual annotation (4 to label plus 1 sample, listed in the next cell)
# are picked from these rows.
merged_df.sort_values(by=['count'], ascending=False).head(7)

Unnamed: 0,file_id,length,count
8,M01003M18,107,4
15,M01003YN6,60,4
0,M01000AJ9,78,3
17,M01003M20,65,3
14,M01003MTK,93,3
10,M01003JUU,494,3
18,M01003JQV,73,3


In [None]:
# Selected for manual annotation: M01003M18, M01003YN6, M01003M20, M01003JQV
# Used as the sample annotation: M01003MTK

In [132]:
# file_ids of the 7 files with the most changepoint annotations, as a NumPy array.
label_file_ids = (
    merged_df
    .sort_values(by='count', ascending=False)
    .loc[:, 'file_id']
    .head(7)
    .to_numpy()
)

In [133]:
# Display the candidate file_ids.
label_file_ids

array(['M01003M18', 'M01003YN6', 'M01000AJ9', 'M01003M20', 'M01003MTK',
       'M01003JUU', 'M01003JQV'], dtype=object)

In [134]:
# Build the metadata path relative to home_dir, the same way anno_dir and output_dir are built.
# The second argument used to be an absolute path for one specific user account; os.path.join
# discards home_dir when a later component is absolute, so the cell failed on any other machine.
metadata_filepath = os.path.join(home_dir, 'Documents/datasets/charm/transformed/metadata.csv')
metadata_df = pd.read_csv(metadata_filepath)

In [135]:
# Metadata rows for the candidate files (the metadata table identifies files by 'file_uid').
metadata_df.loc[metadata_df['file_uid'].isin(label_file_ids)]

Unnamed: 0,release,file_uid,modality,url,emotion_count,valence_arousal_count,norms_count,changepoint_count,transcribed,translated,catalog_id,version,data_type,lang_id_manual,wrapped_md5,unwrapped_md5,download_date,content_date,status_in_corpus
5,R1,M01000AJ9,video,na,3.0,3.0,1.0,1.0,True,True,LDC2022E11,V1.0,.mp4.ldcc,cmn,c739b30eaa58ea837433ffdfdd90ab29,3c3664043119cd44cb7b62a000db94c6,na,na,present
2131,R2,M01003JQV,video,http://vd2.bdstatic.com/mda-nh21qmizd0kf08gt/c...,3.0,3.0,1.0,1.0,True,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,e07a4251d24d7a8f0e65eb00bf191de7,2022-09-13,na,present
2139,R2,M01003JUU,video,https://www.bilibili.com/video/BV1Ks411r79F,3.0,3.0,1.0,1.0,True,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,daef705d7342a0756a18ac74c08a1014,2022-09-13,na,present
2311,R2,M01003M18,video,http://vd2.bdstatic.com/mda-nh4bibyv4v3ns4s0/c...,3.0,3.0,1.0,1.0,True,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,d0fd0c3a21694c1fb49197f18587a92b,2022-09-14,na,present
2315,R2,M01003M20,video,http://vd3.bdstatic.com/mda-nh1gbwd5gvki016y/c...,3.0,3.0,1.0,1.0,True,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,19cf5700aeea5e90a6ca3dc42c747dfb,2022-09-14,na,present
2347,R2,M01003MTK,video,http://vd2.bdstatic.com/mda-ncqdcucf3m6zgjzz/3...,3.0,3.0,1.0,1.0,True,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,bcc5b3f482b28fa9784b3961f9668b89,2022-09-14,na,present
3026,R2,M01003YN6,video,http://vd2.bdstatic.com/mda-ngf7np46s18zdrv8/3...,3.0,3.0,1.0,1.0,True,True,LDC2022E19_R2,V1.0,.mp4.ldcc,cmn,tbd,a121c9a6b04e69106f0025444d154c2e,2022-09-22,na,present


In [136]:
# Release, id and source URL of each candidate file, shown as a plain array
# (the array repr prints the URLs in full, unlike the frame display).
metadata_df.loc[
    metadata_df['file_uid'].isin(label_file_ids),
    ['release', 'file_uid', 'url'],
].values

array([['R1', 'M01000AJ9', 'na'],
       ['R2', 'M01003JQV',
        'http://vd2.bdstatic.com/mda-nh21qmizd0kf08gt/cae_h264/1659489487157368122/mda-nh21qmizd0kf08gt.mp4'],
       ['R2', 'M01003JUU', 'https://www.bilibili.com/video/BV1Ks411r79F'],
       ['R2', 'M01003M18',
        'http://vd2.bdstatic.com/mda-nh4bibyv4v3ns4s0/cae_h264/1659758632534327528/mda-nh4bibyv4v3ns4s0.mp4'],
       ['R2', 'M01003M20',
        'http://vd3.bdstatic.com/mda-nh1gbwd5gvki016y/cae_h264/1659441381156335870/mda-nh1gbwd5gvki016y.mp4'],
       ['R2', 'M01003MTK',
        'http://vd2.bdstatic.com/mda-ncqdcucf3m6zgjzz/360p/h264_delogo/1648202114325469972/mda-ncqdcucf3m6zgjzz.mp4'],
       ['R2', 'M01003YN6',
        'http://vd2.bdstatic.com/mda-ngf7np46s18zdrv8/360p/h264/1657949089719494346/mda-ngf7np46s18zdrv8.mp4']],
      dtype=object)