In [1]:
import csv
import cv2
import json
import pandas as pd

In [2]:
def read_csv(csv_path):
    """Read a CSV file into a list of rows.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    list of list of str
        One inner list per CSV record, in file order. The header row, if the
        file has one, is returned as the first element like any other row.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded inside quoted
    # fields are rewritten by universal-newline translation.
    with open(csv_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, quoting=csv.QUOTE_ALL)
        return list(reader)


def list_to_dict(lst):
    """Group label rows by video name.

    Each row is indexed positionally: 1 = video name, 2/3 = begin/end frame,
    4/5 = begin/end timestamp (converted with ``float``), 6 = shot type,
    11 = split.

    Returns a dict mapping each video name to the list of its entries, in the
    order the rows were given. Every entry has the keys ``segment``,
    ``frames``, ``shot_type`` and ``split``.
    """
    grouped = {}
    for record in lst:
        video_name = record[1]
        entry = {
            'segment': [float(record[4]), float(record[5])],
            'frames': [record[2], record[3]],
            'shot_type': record[6],
            'split': record[11],
        }
        # setdefault creates the per-video list the first time a name is seen.
        grouped.setdefault(video_name, []).append(entry)
    return grouped


def frame_to_ts(frame_number, fps):
    """Convert a frame index to a timestamp in seconds, rounded to 2 decimals.

    ``frame_number`` may be anything ``float()`` accepts (number or numeric
    string); ``fps`` is the frame rate used as the divisor.
    """
    seconds = float(frame_number) / fps
    return round(float(seconds), 2)

## Load deduplicated labels

In [3]:
# Input: CSV of ground-truth labels, read below into a list of rows.
labels_csvfile = '/usr/local/data02/zahra/datasets/Tempuckey/labels/tempuckey_ground_truth_labels.csv'
# Output: JSON annotations file written later in the notebook.
annotations_path = '/usr/local/data02/zahra/datasets/Tempuckey/labels/tempuckey_ground_truth_annotations.json'
# Output: CSV of per-label video info including the assigned split.
labels_info_path = '/usr/local/data02/zahra/datasets/Tempuckey/labels/tempuckey_video_info_and_gt_labels_split.csv'

# Directory holding the video files referenced by the label rows.
videos_path = '/usr/local/data02/zahra/datasets/Tempuckey/videos'
# All rows of the labels CSV as lists of strings (header row included).
labels_with_frame = read_csv(labels_csvfile)
# Accumulator for the enriched label rows built further down.
labels_with_ts = []

## Sample train/valid/test sets (stratified by the `type` column, i.e. shot type)

In [4]:
# Load the label rows into a DataFrame: the first row supplies the column
# names and the remaining rows become the data.
raw_table = pd.DataFrame(labels_with_frame)
headers = raw_table.iloc[0]
df = pd.DataFrame(raw_table.values[1:], columns=headers)

In [5]:
### Split into train/test/valid sets with a 60/20/20 ratio.
### Sampling is done per `type` group with a fixed seed so it is repeatable.

# Test set: 20% of every `type` group.
df_test = (
    df.groupby('type')
      .apply(lambda grp: grp.sample(frac=0.2, random_state=42))
      .reset_index(drop=True)
)

# Train + valid pool: the remaining 80%, i.e. every row whose faceoff_ID
# was not drawn for the test set.
in_test = df['faceoff_ID'].isin(df_test['faceoff_ID'])
df_train_and_valid = df.loc[~in_test]

# Validation set: 25% of the remaining 80%, which is 20% of the original data.
df_valid = (
    df_train_and_valid.groupby('type')
      .apply(lambda grp: grp.sample(frac=0.25, random_state=42))
      .reset_index(drop=True)
)

# Train set: whatever is left of the pool (about 60% of the original data).
in_valid = df_train_and_valid['faceoff_ID'].isin(df_valid['faceoff_ID'])
df_train = df_train_and_valid.loc[~in_valid]

## Create video info CSV and annotations JSON

In [6]:
# Distinct faceoff IDs belonging to each split.
train_faceoff_IDs = df_train['faceoff_ID'].unique()
test_faceoff_IDs = df_test['faceoff_ID'].unique()
valid_faceoff_IDs = df_valid['faceoff_ID'].unique()

# Map faceoff_ID -> split name. Splits are applied in the order train, test,
# valid, so an ID present in more than one list keeps the last name assigned.
splits = {}
for split_name, split_ids in (
    ('train', train_faceoff_IDs),
    ('test', test_faceoff_IDs),
    ('valid', valid_faceoff_IDs),
):
    splits.update(dict.fromkeys(split_ids, split_name))



In [7]:
# Column order of every row appended to labels_with_ts in this cell.
header = ['faceoff_ID', 'video_name', 'beg_frame', 'end_frame', 'beg_ts', 'end_ts', 'shot_type', 'fps', 'rfps', 'frame_count', 'video_duration', 'split']


def _probe_video(video_path):
    """Return ``(fps, frame_count)`` for a video file, as reported by OpenCV.

    Raises IOError if the file cannot be opened and ValueError if the reported
    frame rate is not positive (which would make timestamps undefined).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError('Could not open video: {}'.format(video_path))
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Always free the decoder handle, even when probing fails.
        cap.release()
    if fps <= 0:
        raise ValueError('Invalid fps {} reported for video: {}'.format(fps, video_path))
    return fps, frame_count


# Start from an empty list so re-running this cell does not duplicate rows.
labels_with_ts = []
# Cache of probed properties so each video file is opened only once, even
# when several label rows refer to it.
video_props = {}

for line in labels_with_frame:
    # Skip the CSV header row.
    if line[0] == 'faceoff_ID':
        continue

    fid = line[0]
    vidname = line[1]
    beg_frame = line[2]
    end_frame = line[3]
    # NOTE(review): assumes column 4 of the labels CSV holds the shot type —
    # confirm against the file's header row.
    shot_type = line[4]

    if vidname not in video_props:
        path_ = '{}/{}'.format(videos_path, vidname)
        video_props[vidname] = _probe_video(path_)
    fps, total_frame_count = video_props[vidname]

    rfps = round(fps)
    # Video duration in seconds.
    duration = float(total_frame_count) / fps

    video_info = [
        fid,
        vidname,
        beg_frame,
        end_frame,
        frame_to_ts(float(beg_frame), fps),  # beg_ts
        frame_to_ts(float(end_frame), fps),  # end_ts
        shot_type,
        fps,
        rfps,
        total_frame_count,  # total frame count in the video
        duration,
        splits[fid],
    ]

    labels_with_ts.append(video_info)

In [8]:
# Group the enriched label rows by video name (see list_to_dict).
labels_json = list_to_dict(labels_with_ts)

In [9]:
# Wrap the per-video annotations in the top-level structure of the
# annotations file and write it out as JSON.
labels_json_annotation = {
    'version': 'VERSION 1.0',
    'results': labels_json,
    'external_data': {},
}
# The context manager guarantees the file is flushed and closed once the
# dump finishes, instead of leaving an open handle behind.
with open(annotations_path, 'w') as annotations_file:
    json.dump(labels_json_annotation, annotations_file)

In [10]:
# Tabular view of the enriched label rows, with `header` as column names.
labels_df = pd.DataFrame(labels_with_ts, columns = header)

In [11]:
# Write the label/video info table to CSV without the DataFrame index.
labels_df.to_csv(labels_info_path, sep = ',', index = False)

In [12]:
# Number of labelled rows (faceoff IDs) in each split.
labels_df.groupby('split')['faceoff_ID'].count()

split
test      64
train    194
valid     64
Name: faceoff_ID, dtype: int64

In [13]:
# Spot-check the annotation entries recorded for a single video.
# NOTE(review): the recorded output shows two segments of this one video
# assigned to different splits (test and valid) — confirm that splitting per
# segment rather than per video is intended.
labels_json_annotation['results']['video_FACEOFF_000055.mp4']

[{'frames': ['0', '135'],
  'segment': [0.0, 4.5],
  'shot_type': 'close',
  'split': 'test'},
 {'frames': ['135', '390'],
  'segment': [4.5, 13.01],
  'shot_type': 'close',
  'split': 'valid'}]

## Video info CSV for custom input

In [14]:
# Reduced table: only the video-level columns and the split, written to its
# own CSV file without the DataFrame index.
bsn_video_info_path = '/usr/local/data02/zahra/datasets/Tempuckey/labels/tempuckey_video_info_split.csv'
bsn_columns = ['faceoff_ID', 'video_name', 'frame_count', 'video_duration', 'fps', 'rfps', 'split']
bsn_video_info_df = labels_df.loc[:, bsn_columns]
bsn_video_info_df.to_csv(bsn_video_info_path, index=False)