# Identify known changepoints for annotation
- identify 30 videos and define 1 span containing a change point and 1 span containing a known negative segment

TODO:
- download updates
- update metadata

In [85]:
import os
import zipfile
from io import StringIO
import random
import hashlib
import subprocess
import json

import yaml
import pandas as pd
import numpy as np

from charm.eval.eval import mapping, categorize_pairs, precision, recall
from charm.data import utils

In [2]:
# load metadata
meta_df = pd.read_csv('/home/iron-man/Documents/data/charm/transformed/metadata.csv')

# load annotations
anno_dict = utils.load_ldc_annotations('/home/iron-man/Documents/data/charm/raw')

In [3]:
change_point_df = meta_df[meta_df['changepoint_count'] >= 1.0]

In [4]:
change_point_df['release'].value_counts()

R3           1197
Mini-Eval    1088
R2           1051
R1            413
Name: release, dtype: int64

In [5]:
change_point_df = change_point_df[(change_point_df['release'] == 'Mini-Eval') & (change_point_df['modality'] == 'video')]

In [6]:
change_point_df = change_point_df.sample(n=30, random_state=42)

In [7]:
mini_eval_df = anno_dict['Mini-Eval-Annotations']['anno_dfs']['changepoint.tab']

In [8]:
# merge in meta_df
mini_eval_df = mini_eval_df.merge(meta_df, how='left', left_on='file_id', right_on='file_uid')

In [9]:
mini_eval_df = mini_eval_df[mini_eval_df['modality'] == 'video']

In [10]:
# sample one change point from each file_id, then sample 30 changepoints
sample_df = mini_eval_df.groupby('file_id').sample(n=1, random_state=42).sample(n=30, random_state=42)

In [11]:
# get all file_ids from sample_df, and then remove 40 second windows (+/- 20 secs) around each change point
# among remaining intervals, chunk into 40 second intervals
# randomly select 1
sample_annos_df = mini_eval_df[mini_eval_df['file_id'].isin(sample_df['file_id'].unique())]

In [12]:
sample_annos_df['timestamp_start_raw'] = sample_annos_df['timestamp'].apply(lambda x: max(float(x) - 20, 0)) 
sample_annos_df['timestamp_end_raw'] = sample_annos_df['timestamp'].apply(lambda x: float(x) + 20) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_annos_df['timestamp_start_raw'] = sample_annos_df['timestamp'].apply(lambda x: max(float(x) - 20, 0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_annos_df['timestamp_end_raw'] = sample_annos_df['timestamp'].apply(lambda x: float(x) + 20)


In [13]:
# ensure we start and end within the LDC annotated region
sample_annos_df['timestamp_start'] =  sample_annos_df[['timestamp_start_raw', 'start']].max(axis=1)
sample_annos_df['timestamp_end'] =  sample_annos_df[['timestamp_end_raw', 'end']].min(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_annos_df['timestamp_start'] =  sample_annos_df[['timestamp_start_raw', 'start']].max(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_annos_df['timestamp_end'] =  sample_annos_df[['timestamp_end_raw', 'end']].min(axis=1)


In [14]:
def create_segments(group_df):
    # create tuples out of start and end interval
    start = group_df['timestamp_start'].values.tolist()
    end = group_df['timestamp_end'].values.tolist()
    timestamps = group_df['timestamp'].values.tolist()
    impact_scalars = group_df['impact_scalar'].values.tolist()
    start_end = list(zip(start, end))
    all_vals = list(zip(start_end, timestamps, impact_scalars))
    # sort by timestamp
    sorted_vals = sorted(all_vals, key=lambda x: x[1])
    # now unzip to separate columns
    return pd.Series(zip(*sorted_vals))

In [230]:
# get annotated segments for each file
interval_df = sample_annos_df.groupby(['file_id'], as_index=True).apply(create_segments).rename(columns={0: 'intervals', 1: 'timestamp', 2: 'impact_scalar'})

In [231]:
# # intervals column contains changepoint labeled intervals
# # sort by starting times
# interval_df['intervals'] = interval_df['intervals'].apply(lambda x: sorted(x, key=lambda x: x[0]))

In [232]:
# if intervals overlap or if gap between intervals is less than 40 seconds, merge
def merge_intervals(intervals):
    final_intervals = [intervals[0]]
    for start, end in intervals[1:]:
        # start of next interval isn't more than 40 seconds after the end of the previous interval, merge
        if start < final_intervals[-1][1] + 40:
            new_start = final_intervals[-1][0]
            new_end = end
            final_intervals[-1] = (new_start, new_end)
        else:
            final_intervals.append((start, end))
    return final_intervals

In [233]:
interval_df['merged_intervals'] = interval_df['intervals'].apply(lambda x: merge_intervals(x))

In [234]:
# merge in start/end times to extract 40 second intervals without annotations
interval_df = interval_df.reset_index()
interval_df = interval_df.merge(sample_annos_df[['file_id', 'start', 'end']].drop_duplicates(), how='left', on='file_id')

In [235]:
interval_df['start_end'] = list(zip(interval_df['start'], interval_df['end']))

In [236]:
def create_chunks(interval, chunk_size):
    """Create chunk_size blocks of time based on interval=(start, end)."""
    chunks = []
    for i in np.arange(interval[0], interval[1], chunk_size):
        # drop last chunk
        if i + chunk_size > interval[1]:
            continue
        chunks.append((i, i + chunk_size))
    return chunks

In [237]:
def get_free_intervals(row):
    """Identify all regions not tagged as a change point and create 40 second chunks."""
    free_intervals = []
    start_time = row['start_end'][0]
    for start, end in row['merged_intervals']:
        if start - start_time > 40:
            free_intervals.extend(create_chunks((start_time, start), 40))
        start_time = end


    # add any time at the end
    end = row['merged_intervals'][-1][1]
    if row['start_end'][1] - end > 40:
        free_intervals.extend(create_chunks((end, row['start_end'][1]), 40))
    return free_intervals

In [238]:
interval_df['free_intervals'] = interval_df.apply(lambda x: get_free_intervals(x), axis=1)

### TODO: Select one labeled interval and one unlabeled interval
Determine if we need to tie back to a known change point for evaluation purposes

In [239]:
random.seed(42)
interval_df['chosen_interval'] = interval_df['intervals'].apply(lambda x: random.randint(0, len(x)-1))

In [240]:
# retain timestamp and impact_scalar for provenance
interval_df['change_point'] = interval_df.apply(lambda x: x['intervals'][x['chosen_interval']], axis=1)
interval_df['timestamp_selected'] = interval_df.apply(lambda x: x['timestamp'][x['chosen_interval']], axis=1)
interval_df['impact_scalar_selected'] = interval_df.apply(lambda x: x['impact_scalar'][x['chosen_interval']], axis=1)

In [241]:
# randomly choose a free interval as a non change point
interval_df['non_change_point'] = interval_df['free_intervals'].apply(lambda x: random.choice(x))

In [242]:
# verify that change points don't overlap
change_after_non = interval_df['change_point'].apply(lambda x: x[0]) >= interval_df['non_change_point'].apply(lambda x: x[1])
change_before_non = interval_df['change_point'].apply(lambda x: x[1]) <= interval_df['non_change_point'].apply(lambda x: x[0])
assert len(interval_df[~(change_after_non | change_before_non)]) == 0

In [243]:
interval_df = interval_df[['file_id', 'change_point', 'non_change_point', 'timestamp_selected', 'impact_scalar_selected']]

In [244]:
interval_df.head()

Unnamed: 0,file_id,change_point,non_change_point,timestamp_selected,impact_scalar_selected
0,M01003JVY,"(147.0, 187.0)","(74.3, 114.3)",167.0,3
1,M01003L5X,"(292.0, 332.0)","(372.0, 412.0)",312.0,4
2,M01003LV3,"(291.0, 331.0)","(226.0, 266.0)",311.0,2
3,M01003MK7,"(524.0, 564.0)","(383.2, 423.2)",544.0,4
4,M01003N2J,"(453.0, 493.0)","(283.6, 323.6)",473.0,4


In [245]:
interval_df = pd.melt(interval_df, id_vars=['file_id', 'timestamp_selected', 'impact_scalar_selected'], value_vars=['change_point', 'non_change_point'], value_name='interval')

In [246]:
interval_df['interval_start'] = interval_df['interval'].apply(lambda x: x[0])

In [247]:
interval_df = interval_df.sort_values(by=['file_id', 'interval_start'])

In [248]:
interval_df.loc[interval_df['variable'] == 'non_change_point', ['timestamp_selected', 'impact_scalar_selected']] = np.nan

In [249]:
len(interval_df)

60

In [250]:
# drop cols
interval_df.drop(columns=['interval_start'], inplace=True)

### Create a list of these file_ids, download them, push to Google Drive, and get links

In [251]:
# assert that all videos are not urls
assert (meta_df[meta_df['file_uid'].isin(interval_df['file_id'].unique())]['url'] != 'na').sum() == 0

In [252]:
# push these files to Google Drive
from charm.data.gdrive.upload import create_folder, upload_basic

In [41]:
# # create a folder within the Circumplex Theory folder
# parent_dir = ['1w5L4T9LN0imrMhuTSXtPKODSzSKEHE2C']
# folder_name = 'Annotation Videos'
# dir_id = create_folder(folder_name, parents=parent_dir)

In [47]:
dir_id = '1ts2M2iVGrYYNZBjTTufyWkCGjLcoaBTv'

In [43]:
# upload files from mini-eval folder
meta_df[meta_df['file_uid'].isin(interval_df['file_id'].unique())]['release'].value_counts()

Mini-Eval    30
Name: release, dtype: int64

In [146]:
mini_eval_dir = '/home/iron-man/Documents/data/charm/raw/LDC2022E22_CCU_TA1_Mandarin_Chinese_Mini_Evaluation_Source_Data/data/video'
# save back to mini_eval_dir and remove the .ldcc formatted file to save disk space
# TODO: remove ldcc formatted file
# TODO: loaders should first try to load .mp4 and will remove .ldcc headers if not found

# loop over files and strip LDC header
for file_id in interval_df['file_id']:
    filename = f'{file_id}.mp4.ldcc'
    file_path = os.path.join(mini_eval_dir, filename)
    out_filepath = utils.strip_ldc_header(file_path, mini_eval_dir)

In [147]:
# this will take some time
file_paths = []
for file_id in interval_df['file_id'].unique():
    filename = f'{file_id}.mp4'
    file_path = os.path.join(mini_eval_dir, filename)
    file_paths.append(file_path)
    probe_command = ['ffprobe', 
               '-show_format', 
               '-show_streams', '-loglevel',
               'quiet',
               '-print_format',
               'json',
               file_path]

    out = subprocess.check_output(probe_command)
    file_info = json.loads(out.decode())
    codec_name = None
    for stream in file_info['streams']:
        if stream['codec_type'] == 'video':
            codec_name = stream['codec_name']
    
    print(file_id, codec_name)
    
    # if problematic codec, convert using ffmpeg
    if codec_name in ['av1', 'hevc']:
        print(f'File: {filename} had {codec_name} codec. Converting..')
        # load raw data
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        ffmpeg_command = ['ffmpeg',
                  '-y',
                  '-i',
                  '-',
                  '-loglevel',
                  'quiet',
                  '-vcodec',
                  'libx264',
                  file_path]
        proc = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        out, err = proc.communicate(input=raw_data)

M01003JVY av1
File: M01003JVY.mp4 had av1 codec. Converting..
M01003L5X hevc
File: M01003L5X.mp4 had hevc codec. Converting..
M01003LV3 h264
M01003MK7 av1
File: M01003MK7.mp4 had av1 codec. Converting..
M01003N2J av1
File: M01003N2J.mp4 had av1 codec. Converting..
M01003N5G h264
M01003N7T h264
M01003N9I av1
File: M01003N9I.mp4 had av1 codec. Converting..
M01003OZ6 h264
M01003P4V av1
File: M01003P4V.mp4 had av1 codec. Converting..
M01003PP6 av1
File: M01003PP6.mp4 had av1 codec. Converting..
M01003Q62 av1
File: M01003Q62.mp4 had av1 codec. Converting..
M01003QIW h264
M01003QOD h264
M01003R7U h264
M01003TO9 h264
M01003UIN h264
M01003VU6 av1
File: M01003VU6.mp4 had av1 codec. Converting..
M01003WSU h264
M01003XPK h264
M01003YUC h264
M01003ZFK h264
M01004D5B h264
M01004GB3 h264
M01004I6Q h264
M01004KDW h264
M01004KEK h264
M01004NKE h264
M01004V1Z h264
M01004W7Z h264


In [149]:
# piped input
# probe_command = ['ffprobe', 
#            '-show_format', 
#            '-show_streams', '-loglevel',
#            'quiet',
#            '-print_format',
#            'json',
#            '-']

# proc = subprocess.Popen(probe_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
# out, err = proc.communicate(input=complete_content) 

In [150]:
# https://drive.google.com/file/d/1ZTbIwiA2a3IxPl1onMmmd-a7XQFo8Vs0/view?usp=share_link
# f'https://drive.google.com/open?id={id_}'

In [152]:
# store file_ids linked to gdrive ids
# TODO: check if we've already uploaded
uploads = []
for file_path in file_paths:
    file_id = os.path.basename(file_path).split('.')[0]
    gdrive_file_id = upload_basic(file_path, parents=[dir_id])
    uploads.append((file_id, gdrive_file_id))

File ID: 1bzGkh4qEDNjiemuZcPU26hJb76iczBxc
File ID: 1EF0flwbdQacRr_GjIY2Dn6Tm-YpmqO5c
File ID: 10jd5nuY-KAc9u1dYHUgBkIRTcHlmYW4b
File ID: 1Ogr0UDivdOxMyO_9hlgVuhTVTT4yF3Xo
File ID: 1D8340xsBZMmOefciMgKsIUENC2OK7B3q
File ID: 1VS37ePAocRjclQEF0WT2z9QGr62_QpdA
File ID: 19OrkAR0Nxyb8uYS_PLm_P87Aw4eGkJkl
File ID: 1x0T7Fm9kMRCzOLo7QBVZOoAsihGHqfQy
File ID: 1N05E655E1_PuC0tugGIrQ8tlWFiInBes
File ID: 1iQk13eSE6KQY8lj98gQm2jOU0zOzvkm6
File ID: 11VkRAsf3_IyXtNXC8pK-3sm4sV1ZpwZc
File ID: 15saY3puv35IwEpoL3n48s2HWPnZcBzMp
File ID: 1VrZgdzo9YbvCCFkYPvSj0z7qZhftOput
File ID: 1hg22NgE-lDdE5DGumU6BKflGGNURNe29
File ID: 1NnsCcre8xQ9-B-9l6JlQ3F9Ihssw9gxN
File ID: 1dTBCzb4Smu6Xlz7vLLDJi-MEpRcecbBI
File ID: 1IOgnkwqTciWtI7u78LB3a3bjhn4wcZIZ
File ID: 1YDjg0s_MDbryrkfimQfd4Rha7bAAAXY4
File ID: 1ruiLL3nQQZk5R4IWKyNH1YVy1hGSCmU4
File ID: 1k_xS_6GiKzTXjpviJ9l8FCvhaK3qPg-6
File ID: 1cHECuyVko7-upcSE0GLycWwgUbdBvo1c
File ID: 1K7aGvZaN4d6LiJLjnrQu_L6xEk7QwQYd
File ID: 10Q9qaKlPFCtcN7mSBXNIz9kegskYx7AI
File ID: 1S

In [153]:
gdrive_df = pd.DataFrame(uploads, columns=['file_id', 'gdrive_id'])

In [154]:
gdrive_df['url'] = gdrive_df['gdrive_id'].apply(lambda x: f'https://drive.google.com/open?id={x}')

In [155]:
gdrive_df.head()

Unnamed: 0,file_id,gdrive_id,url
0,M01003JVY,1bzGkh4qEDNjiemuZcPU26hJb76iczBxc,https://drive.google.com/open?id=1bzGkh4qEDNji...
1,M01003L5X,1EF0flwbdQacRr_GjIY2Dn6Tm-YpmqO5c,https://drive.google.com/open?id=1EF0flwbdQacR...
2,M01003LV3,10jd5nuY-KAc9u1dYHUgBkIRTcHlmYW4b,https://drive.google.com/open?id=10jd5nuY-KAc9...
3,M01003MK7,1Ogr0UDivdOxMyO_9hlgVuhTVTT4yF3Xo,https://drive.google.com/open?id=1Ogr0UDivdOxM...
4,M01003N2J,1D8340xsBZMmOefciMgKsIUENC2OK7B3q,https://drive.google.com/open?id=1D8340xsBZMmO...


In [156]:
gdrive_df['url'].iloc[0]

'https://drive.google.com/open?id=1bzGkh4qEDNjiemuZcPU26hJb76iczBxc'

In [253]:
# merge into interval df
interval_df = interval_df.merge(gdrive_df, how='left', on='file_id')

In [254]:
interval_df.columns

Index(['file_id', 'timestamp_selected', 'impact_scalar_selected', 'variable',
       'interval', 'gdrive_id', 'url'],
      dtype='object')

In [255]:
column_map = {'file_id': 'File ID', 'interval': 'Interval', 'url': 'URL'}
interval_df = interval_df.rename(columns=column_map)

In [256]:
# add in Annotator, Speaker ID, Start Tag, End Tag, Speaker Descriptor, Annotator Notes
new_cols = ['Annotator', 'Speaker ID', 'Start Tag', 'End Tag', 'Speaker Descriptor', 'Annotator Notes']
interval_df = interval_df.reindex(columns=interval_df.columns.tolist() + new_cols)

In [257]:
final_col_order = ['Annotator', 'File ID', 'URL', 'Interval', 'Speaker ID', 'Start Tag', 'End Tag', 'Speaker Descriptor', 'Annotator Notes', 'timestamp_selected', 'impact_scalar_selected', 'variable', 'gdrive_id']

In [258]:
interval_df = interval_df[final_col_order]

In [259]:
# clean up formatting
# convert start/stop seconds to minute:seconds
def convert_seconds(start_end):
    interval = ['(']
    for idx, seconds in enumerate(start_end):
        m, s = divmod(seconds, 60)
        interval.append(f'{int(m)}:{s:02.0f}')
        if idx == 0:
            interval.append(', ')
    interval.append(')')
    return ''.join(interval)

In [260]:
interval_df['Interval'] = interval_df['Interval'].apply(lambda x: convert_seconds(x))

In [262]:
# save a version containing all metadata
labeled_filepath = '/home/iron-man/Documents/data/charm/transformed/annotations/circumplex_60_intervals_labeled.csv'
interval_df.to_csv(labeled_filepath, index=False)

In [263]:
# and a version containing only annotator data
unlabeled_filepath = '/home/iron-man/Documents/data/charm/transformed/annotations/circumplex_60_intervals.csv'
interval_df[['Annotator', 'File ID', 'URL', 'Interval', 'Speaker ID', 'Start Tag', 'End Tag', 'Speaker Descriptor', 'Annotator Notes']].to_csv(unlabeled_filepath, index=False)