# Create a metadata file containing information about all releases

In [2]:
%load_ext autoreload
%autoreload 2
import os

import pandas as pd
import numpy as np

from charm.data import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load file_ids by release

In [3]:
# Build the absolute path of the raw data directory, which is given
# relative to the current user's home directory.
data_dir = 'Documents/data/charm/raw'
home_dir = os.path.expanduser('~')
raw_data_dir = os.path.join(home_dir, data_dir)

In [4]:
# Load the per-release metadata from the raw data directory via the project helper.
# The result is passed straight to pd.concat in the next cell.
release_dfs = utils.load_release_metadata(raw_data_dir)



In [5]:
# Stack the per-release metadata into a single frame; ignore_index=True discards
# the original row labels and renumbers the rows from 0.
meta_df = pd.concat(release_dfs, ignore_index=True)

In [6]:
# Preview the first rows of the combined metadata.
meta_df.head()

Unnamed: 0,release,catalog_id,version,file_uid,data_type,url,lang_id_manual,wrapped_md5,unwrapped_md5,download_date,content_date,status_in_corpus,legacy_catalog_id,original_file_id,type,file_path,length
0,R1,LDC2022E11,V1.0,M010009A4,.mp4.ldcc,na,cmn,0e9942346f2972d73815ab63d2074efb,1c797fd8bd832fe4c7244a7b9b0aa2a7,na,na,present,LDC2015R18,VVC008300,,,
1,R1,LDC2022E11,V1.0,M010009BC,.mp4.ldcc,na,cmn,784156bdb456fa40aed2b13333dbdf2c,4b121a497725dcf1f232d00c119bd823,na,na,present,LDC2015R18,VVC016445,,,
2,R1,LDC2022E11,V1.0,M010009BE,.mp4.ldcc,na,cmn,6a19a627430da0a84fd35102c2f4fd4d,83d42079d22abe1f576be92afc182474,na,na,present,LDC2015R18,VVC020973,,,
3,R1,LDC2022E11,V1.0,M010009CZ,.mp4.ldcc,na,cmn,29e852f7f1be0cfa163cde11cf24202a,07843653514dad5894b5782fe1475ee4,na,na,present,LDC2015R18,VVC011554,,,
4,R1,LDC2022E11,V1.0,M010009D0,.mp4.ldcc,na,cmn,6d39cd8df0fdaf8e4fad55b1e29a964f,f0a8d0563a8d6480675f21df6c0acbd2,na,na,present,LDC2015R18,VVC011561,,,


In [13]:
# Sanity check: a file_uid must not appear in more than one release.
# Count rows per (file_uid, release), pivot the releases into columns and treat
# combinations that never occur as a count of zero.
rows_per_file_release = meta_df.groupby(['file_uid', 'release']).agg(count=('catalog_id', 'count'))
file_by_release_df = rows_per_file_release.unstack().fillna(value=0)

# Number of releases in which each file has a non-zero count.
releases_per_file = file_by_release_df.astype(bool).sum(axis=1)
assert not (releases_per_file > 1).any()

In [14]:
# Count how many rows there are for each data_type value.
meta_df['data_type'].value_counts()

.mp4.ldcc     6477
.ltf.xml      1360
.psm.xml      1360
.flac.ldcc     317
Name: data_type, dtype: int64

In [15]:
# Inspect a random sample of rows. random_state is fixed so that re-running the
# notebook (Restart & Run All) displays the same rows every time.
meta_df.sample(10, random_state=0)

Unnamed: 0,release,catalog_id,version,file_uid,data_type,url,lang_id_manual,wrapped_md5,unwrapped_md5,download_date,content_date,status_in_corpus,legacy_catalog_id,original_file_id,type,file_path,length
8748,Mini-Eval,LDC2022E22,V1.0,M01000EYA,.ltf.xml,na,cmn,na,4a11f0a4a698fd993df0055e7afb31d2,na,na,present,na,na,text,data/text/ltf/M01000EYA.ltf.xml,672.0
698,R1,LDC2022E11,V1.0,M01000GNZ,.ltf.xml,na,cmn,na,7e68b564fd719cd3f53d2de0ac1703ae,na,na,present,LDC2018T15,CHT_CMN_20121030.0022,,,
4972,R2,LDC2022E19_R2,V2.0,M01004KGK,.mp4.ldcc,https://www.bilibili.com/video/BV13K4y1U7vL,cmn,tbd,6bf386cd34c8259aaad4dd82c72c8de0,2022-10-06,na,present,,,,,
3546,R2,LDC2022E19_R2,V2.0,M010044NQ,.mp4.ldcc,https://www.bilibili.com/video/BV1rE411j7hK,cmn,tbd,1fa504e8e0b090fae117891bc8223e6f,2022-09-25,na,present,,,,,
8934,Mini-Eval,LDC2022E22,V1.0,M01000H3D,.ltf.xml,na,cmn,na,d4dad2e8c78143b9704af87da15190c0,na,na,present,na,na,text,data/text/ltf/M01000H3D.ltf.xml,874.0
3894,R2,LDC2022E19_R2,V2.0,M01004894,.mp4.ldcc,https://www.bilibili.com/video/BV12v411g7mk,cmn,tbd,c06b8df1a4be05836d696831ab663ad4,2022-09-27,na,present,,,,,
5980,R2,LDC2022E19_R2,V1.0,M01004Q7T,.mp4.ldcc,https://www.bilibili.com/video/BV1WD4y1z7Ss,cmn,tbd,f59c1dfcf2ac33682dc22122954d7ca3,2022-10-11,na,present,,,,,
9368,Mini-Eval,LDC2022E22,V1.0,M01000HAD,.psm.xml,na,cmn,na,0c8ca8eb442ce477e9a68382191a7572,na,na,present,na,na,text,data/text/psm/M01000HAD.psm.xml,1153.0
3800,R2,LDC2022E19_R2,V2.0,M010047FK,.mp4.ldcc,https://www.bilibili.com/video/BV1Nf4y1u7ve,cmn,tbd,9c87a9a88908090f0d5a11f274161adf,2022-09-27,na,present,,,,,
7946,Mini-Eval,LDC2022E22,V1.0,M01003WO0,.mp4.ldcc,na,cmn,0cd9befa976933ef10c1932b932d9162,c4f37f3e8538dd9590b949e2795190d3,na,na,present,na,na,video,data/video/M01003WO0.mp4.ldcc,376.0


In [16]:
# add in easy to understand data types
# NB: '.flac.ldcc' files are FLAC audio, so they are labelled 'audio' (not 'text').
modalities = {'.mp4.ldcc': 'video', '.ltf.xml': 'text', '.psm.xml': 'text', '.flac.ldcc': 'audio'}

# Fail loudly on a data type that has no modality assigned instead of silently
# writing NaN into the metadata (a missing data_type is reported here as well).
unknown_types = set(meta_df['data_type'].unique()) - set(modalities)
if unknown_types:
    raise KeyError(f'no modality defined for data types: {sorted(unknown_types, key=str)}')

meta_df['modality'] = meta_df['data_type'].map(modalities)

In [17]:
# # sanity check the numbers found in the README.txt files
# r1_count = 1143 + 976 # text files have 2 corresponding files, need to double count the text files
# r2_count = 4914
# r3_count = len(r3_df) # no README.txt
# file_count = r1_count + r2_count + r3_count
# assert file_count == len(meta_df)

## Add annotation and segment information

In [18]:
# Load the LDC annotation tables from the raw data directory via the project helper.
# The cells below index the result by key and read a 'versions_df' and a
# 'segment_df' table from every entry.
anno_dfs = utils.load_ldc_annotations(raw_data_dir)

In [21]:
# Gather the version info (i.e. number of unique annotation versions per file,
# which captures that there were multiple annotators) from every annotation set
# and stack it into a single table with a fresh 0..n-1 index.
version_tables = [anno_dfs[name]['versions_df'] for name in anno_dfs]
versions_df = pd.concat(version_tables, ignore_index=True)

# every file_id must occupy exactly one row, i.e. no overlap between releases
n_unique_files = versions_df['file_id'].nunique()
assert n_unique_files == len(versions_df)

In [22]:
# Attach the version info to the metadata. The annotation table names its key
# 'file_id', so rename it to the metadata's 'file_uid' before joining; the left
# join keeps every metadata row, including files without any annotation.
versions_by_uid = versions_df.rename(columns={'file_id': 'file_uid'})
meta_df = meta_df.merge(versions_by_uid, on='file_uid', how='left')

### Add segment information

In [83]:
# create segment info (i.e. segments annotated)
segment_df = pd.concat({k: anno_dfs[k]['segment_df'] for k in anno_dfs}, ignore_index=True)

# Annotated span per file: the earliest segment start and the latest segment end.
seg_df = (
    segment_df
    .groupby('file_id')
    .agg(start=('start', 'min'), end=('end', 'max'))
    .reset_index()
)

# Some of these files have very long contiguous stretches of annotations.
# Spot checking suggests that they are contiguous, but some may not be.
# NB: the segments are not perfectly contiguous (there are typically gaps of a
# few seconds for music, etc.)
# The long-span files are kept in a variable: a bare expression in the middle of
# a cell is evaluated and then thrown away, so it is never displayed.
LONG_SPAN_THRESHOLD = 310  # same units as 'start'/'end' (presumably seconds - TODO confirm)
long_span_df = seg_df[(seg_df['end'] - seg_df['start']) > LONG_SPAN_THRESHOLD]

# merge segments in; any 'start'/'end' columns left over from an earlier run of
# this cell are dropped first, so re-running it cannot create 'start_x'/'start_y'
meta_df = pd.merge(
    meta_df.drop(columns=['start', 'end'], errors='ignore'),
    seg_df.rename(columns={'file_id': 'file_uid'}),
    on='file_uid',
    how='left',
)

## TODO: Add manual audit data

## Add OLIVE transcription and translation status

In [107]:
# https://drive.google.com/drive/u/0/folders/1rhRJhBgtBuMSpcWn8nQHWmlMUGqfmAba
# NOTE(review): presumably the Drive folder these transcriptions were copied from - confirm.
# Location of the transcription output, relative to the home directory
# (it is joined with home_dir in the next cell).
transcriptions_dir = 'Documents/data/charm/transformed/transcriptions'

In [121]:
# Release folders that hold the transcription output. Release 1 is listed last
# because it is handled separately below (via asr_dirs[-1]).
asr_dirs = ['LDC Mini-Eval Release', 'LDC Release 2', 'LDC Release 3', 'LDC Release 1']
asr_dirs = [os.path.join(home_dir, transcriptions_dir, asr_dir) for asr_dir in asr_dirs]

# get all subdirs for each release folder except release 1
# (os.listdir returns entries in arbitrary order, so they are sorted to give the
# same directory order on every run)
asr_subdirs = [
    os.path.join(asr_dir, item)
    for asr_dir in asr_dirs[:-1]
    for item in sorted(os.listdir(asr_dir))
    if os.path.isdir(os.path.join(asr_dir, item))
]

# manually add in Release 1 subdirs
asr_subdirs.append(os.path.join(asr_dirs[-1], 'audio_processed'))
asr_subdirs.append(os.path.join(asr_dirs[-1], 'video_processed'))

asr_data = utils.load_transcribed_files(asr_subdirs, return_data=True)
asr_files, file_ids, files_by_dir, dir_by_file, raw_asr_data, asr_data_dfs = asr_data

# one (key, length of its entry in asr_data_dfs) pair per transcribed file
asr_len_data = [(key, len(asr_data_dfs[key])) for key in asr_data_dfs]

# add transcription status
asrd_df = pd.DataFrame(asr_len_data, columns=['file_uid', 'utterance_count'])
asrd_df['transcribed'] = True

# asrd_df is already keyed by 'file_uid', so no rename is needed before the join.
# Columns left behind by an earlier run of this cell are dropped first, so that
# re-running it cannot create '_x'/'_y' duplicates. Rows without a transcription
# keep NaN in both new columns (left join).
meta_df = pd.merge(
    meta_df.drop(columns=['utterance_count', 'transcribed'], errors='ignore'),
    asrd_df,
    on='file_uid',
    how='left',
)

### Add translation status

In [30]:
# # add translation status
# translation_dir = os.path.join(home_dir, 'Documents/datasets/charm/transformed/translations')
# translation_files = utils.load_translated_files(translation_dir)

# translation_file_ids = [os.path.split(x)[1].split('.')[0] for x in translation_files]
# translated_df = pd.DataFrame(translation_file_ids, columns=['file_id'])
# translated_df['translated'] = True
# meta_df = pd.merge(meta_df, translated_df.rename(columns={'file_id':'file_uid'}), left_on='file_uid', right_on='file_uid', how='left', )

## Save metadata

In [140]:
# Final column order of the saved file; columns that are not listed are dropped.
# NOTE(review): both 'emotion_count' and 'emotions_count' are listed - confirm that
# both columns really exist upstream and that one of them is not a typo.
col_order = [
    'release', 'catalog_id', 'file_uid', 'url', 'modality', 'start', 'end',
    'transcribed', 'utterance_count', 'emotion_count', 'valence_arousal_count',
    'norms_count', 'changepoint_count', 'emotions_count', 'data_type',
    'lang_id_manual', 'wrapped_md5', 'unwrapped_md5', 'download_date',
    'content_date', 'status_in_corpus', 'legacy_catalog_id',
    'original_file_id', 'type', 'file_path', 'length', 'version',
]

# Selecting with a list raises KeyError if any listed column is missing.
meta_df = meta_df[col_order]

# (A bare `meta_df.head()` used to sit here; in the middle of a cell its result is
# discarded and never displayed, so preview the frame in a cell of its own.)

# save to transformed dir, creating it first so that to_csv cannot fail on a
# missing folder
meta_filepath = os.path.join(home_dir, 'Documents/data/charm/transformed/metadata.csv')
os.makedirs(os.path.dirname(meta_filepath), exist_ok=True)
meta_df.to_csv(meta_filepath, index=False)