#### This notebook contains the code required to prepare IC15-Train and YVT datasets

In [None]:
import os, sys, subprocess, json, shutil
import xml.etree.ElementTree as etree
import cv2
import pickle

In [None]:
# Auxiliary function to split video into PNG frames. Assumes you have 'ffmpeg' installed
def video_to_frames(path_to_video, path_to_frames_dir):
    """Split a video into numbered PNG frames by invoking ffmpeg.

    Frames are written into ``path_to_frames_dir`` using ffmpeg's
    '%05d.png' output pattern (five-digit, zero-padded file names).

    Args:
        path_to_video: Path to the input video file.
        path_to_frames_dir: Directory that receives the PNG frames. The
            caller in this file creates it before calling this function.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        OSError: If the 'ffmpeg' executable cannot be started.
    """
    frame_pattern = os.path.join(path_to_frames_dir, '%05d.png')
    # check_call instead of call: a failed extraction is reported here rather
    # than surfacing later as a missing-frame error in the caller.
    subprocess.check_call(['ffmpeg', '-i', path_to_video, frame_pattern])

In [None]:
# Loads IDs of objects from ground truth text files in IC15-Train
def load_ic15_gt_ids(path_to_gt_txt):
    """Load the object-ID -> transcription mapping from a ground truth text file.

    Every non-blank line is split at its FIRST comma into an ID part and a
    transcription part; surrounding whitespace and double quotes are removed
    from both parts.

    Args:
        path_to_gt_txt: Path to the UTF-8 encoded ground truth text file.

    Returns:
        dict mapping object ID (str) to transcription (str). If an ID occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    ids = {}
    # 'utf-8-sig' also reads plain UTF-8, but drops a leading byte-order mark
    # that would otherwise end up glued to the first object ID.
    with open(path_to_gt_txt, encoding='utf-8-sig') as fs:
        for raw_line in fs:
            line = raw_line.strip()
            if not line:
                # tolerate empty lines (e.g. trailing newlines at end of file)
                continue
            # Split only once so commas inside the transcription are preserved.
            obj_id, obj_gt = (part.strip().strip('"') for part in line.split(',', 1))
            ids[obj_id] = obj_gt
    return ids

In [None]:
# Loads coordinates of text objects from XML ground truth files of IC15-Train
def load_ic15_gt_xml(path_to_gt_xml, ids):
    """Collect per-frame object polygons from an XML ground truth file.

    The root element's children are treated as frames (integer 'ID'
    attribute); each frame's children are objects ('ID' attribute, optional
    'Mirrored' attribute); each object's children are points with integer
    'x' and 'y' attributes.

    Args:
        path_to_gt_xml: Path to the XML ground truth file.
        ids: Mapping from object ID to its transcription; objects whose ID
            is not a key of this mapping are skipped.

    Returns:
        List of ``(frame_id, object_id, transcription, points)`` tuples in
        document order, where ``points`` is a list of ``(x, y)`` int tuples.
    """
    annotations = []
    for frame_node in etree.parse(path_to_gt_xml).getroot():
        frame_number = int(frame_node.attrib['ID'])
        for object_node in frame_node:
            attributes = object_node.attrib
            object_id = attributes['ID']
            # only objects with a known transcription are of interest
            if object_id not in ids:
                continue
            # objects explicitly flagged as mirrored are dropped
            if attributes.get('Mirrored') == 'Mirrored':
                continue
            corners = [
                (int(point.attrib['x']), int(point.attrib['y']))
                for point in object_node
            ]
            annotations.append((frame_number, object_id, ids[object_id], corners))
    return annotations

In [None]:
# Splits video from IC15-Train into separate text object clips
def populate_dataset_from_ic15_video(path_to_dataset, path_to_video, path_to_gt_txt, path_to_gt_xml):
    """Cut every annotated text object of one video into per-object frame crops.

    The video is split into PNG frames inside a temporary directory
    ('temp_dir', relative to the current working directory). For every
    annotated object a directory ``<video name>_<object id>`` is created in
    ``path_to_dataset`` together with a sibling ``<video name>_<object id>.json``
    holding the lower-cased transcription under the key 'gt'. The axis-aligned
    bounding box of each object polygon is cropped from its frame and saved
    as ``<frame id, 5 digits>.png`` inside the object directory.

    Objects whose bounding box does not lie completely inside the frame are
    skipped for that frame.

    Args:
        path_to_dataset: Existing output directory.
        path_to_video: Path to the source video.
        path_to_gt_txt: Ground truth text file with object transcriptions.
        path_to_gt_xml: Ground truth XML file with object polygons.

    Raises:
        FileExistsError: If 'temp_dir' already exists.
    """
    TEMP_DIR = 'temp_dir'
    os.mkdir(TEMP_DIR)

    # try/finally so the extracted frames are removed even when processing
    # fails; a left-over directory would make the next call fail in os.mkdir.
    try:
        objects = load_ic15_gt_xml(path_to_gt_xml, load_ic15_gt_ids(path_to_gt_txt))
        video_to_frames(path_to_video, TEMP_DIR)
        frame_files = [os.path.join(TEMP_DIR, x) for x in sorted(os.listdir(TEMP_DIR))]
        video_name = os.path.split(path_to_video)[-1].split('.')[0]

        # load_ic15_gt_xml emits objects frame by frame, so consecutive
        # objects usually share a frame: decode each frame only once.
        loaded_frame_id = None
        frame = None

        for frame_id, obj_id, gt, quad in objects:
            obj_dir = os.path.join(path_to_dataset, video_name + '_' + obj_id)
            if not os.path.exists(obj_dir):
                os.mkdir(obj_dir)
                with open(obj_dir + '.json', 'w') as js:
                    js.write(json.dumps({'gt': gt.lower()}, indent=2))

            if frame_id != loaded_frame_id:
                # frame IDs are 1-based while the sorted frame list is 0-based
                frame = cv2.imread(frame_files[frame_id - 1])
                loaded_frame_id = frame_id

            # axis-aligned bounding box of the polygon; max bounds exclusive
            xs = [p[0] for p in quad]
            ys = [p[1] for p in quad]
            xmin, xmax = min(xs), max(xs) + 1
            ymin, ymax = min(ys), max(ys) + 1

            # image arrays are indexed (row, column): shape[0] is the height
            height, width = frame.shape[:2]
            if xmin < 0 or xmax > width:
                continue
            if ymin < 0 or ymax > height:
                continue

            cropped = frame[ymin:ymax, xmin:xmax]
            cv2.imwrite(os.path.join(obj_dir, '%05d.png' % frame_id), cropped)
    finally:
        shutil.rmtree(TEMP_DIR)

In [None]:
# Splits IC15-Train dataset into separate text object clips
def populate_ic15(path_to_src, path_to_dst):
    """Process every '.mp4' video found directly inside ``path_to_src``.

    For a video ``<name>.mp4`` the ground truth files ``<name>_GT.txt`` and
    ``<name>_GT.xml`` in the same directory are passed, together with the
    video, to ``populate_dataset_from_ic15_video``. Videos are processed in
    sorted file-name order.

    Args:
        path_to_src: Directory containing the videos and ground truth files.
        path_to_dst: Existing output directory.
    """
    video_suffix = '.mp4'
    for file_name in sorted(os.listdir(path_to_src)):
        # require the dot so that names merely ending in "mp4" are ignored
        if not file_name.endswith(video_suffix):
            continue
        video = os.path.join(path_to_src, file_name)
        # Strip only the trailing extension; str.replace would also rewrite
        # any other '.mp4' occurring earlier in the path.
        stem = video[:-len(video_suffix)]
        print('Processing %s...' % video)
        populate_dataset_from_ic15_video(path_to_dst, video, stem + '_GT.txt', stem + '_GT.xml')

In [None]:
# To split IC15-Train dataset into separate clips: run populate_ic15 with two parameters:
#   1: path to ch3_train directory of IC15 Text in Videos dataset
#   2: path to an output directory (should be created beforehand)

# populate_ic15('/path/to/ch3_train',\
#               '/path/to/ic15_separate_clips')

In [None]:
# Loads paths to YVT frames
def load_yvt_frames(path_to_frames_dir):
    """Index the frame files stored two directory levels below a root.

    The expected layout is ``<root>/<level 1>/<level 2>/<frame file>``.

    Args:
        path_to_frames_dir: Root directory of the frame tree.

    Returns:
        dict mapping each frame's base name up to its first '.' to the full
        path of the frame file. On duplicate names the file visited last
        (in sorted traversal order) wins.
    """
    def sorted_children(directory):
        # full paths of all entries of `directory`, in sorted name order
        return [os.path.join(directory, entry) for entry in sorted(os.listdir(directory))]

    first_level = sorted_children(path_to_frames_dir)
    second_level = [path for parent in first_level for path in sorted_children(parent)]
    frame_paths = [path for parent in second_level for path in sorted_children(parent)]

    frames_by_name = {}
    for frame_path in frame_paths:
        frame_name = os.path.basename(frame_path).split('.')[0]
        frames_by_name[frame_name] = frame_path
    return frames_by_name

In [None]:
# Loads YVT ground truth objects
def load_yvt_gt(path_to_gt, target_size=720, source_size=1280):
    """Parse a whitespace-separated annotation file into object records.

    Fields used per line (0-based): 0 object ID, 1-4 xmin/ymin/xmax/ymax,
    5 frame ID, 6 "lost" flag, 7 "occluded" flag, 9 transcription.
    Annotations whose lost or occluded flag is not '0' are dropped, and
    blank lines are ignored.

    Coordinates are multiplied by ``target_size / source_size`` and rounded
    to the nearest integer. The defaults reproduce the factor 720 / 1280.
    NOTE(review): presumably this maps annotation resolution onto the
    resolution of the extracted frames — confirm against the dataset.

    Args:
        path_to_gt: Path to the UTF-8 encoded annotation file.
        target_size: Numerator of the coordinate scale factor.
        source_size: Denominator of the coordinate scale factor.

    Returns:
        List of ``(frame_id, obj_id, gt, xmin, ymin, xmax, ymax)`` tuples.
        ``frame_id`` and ``obj_id`` are kept as strings, ``gt`` is the
        lower-cased transcription with surrounding double quotes removed.

    Raises:
        IndexError: If a non-blank line has fewer than 10 fields.
    """
    def rescale(value):
        # +0.5 followed by int() rounds non-negative values to nearest
        return int(float(value) * target_size / source_size + 0.5)

    objects = []
    with open(path_to_gt, encoding='utf8') as fs:
        for raw_line in fs:
            fields = raw_line.split()
            if not fields:
                # blank line: nothing to parse
                continue
            # skip annotations flagged as lost (field 6) or occluded (field 7)
            if fields[6] != '0' or fields[7] != '0':
                continue
            frame_id = fields[5]
            xmin, ymin, xmax, ymax = (rescale(v) for v in fields[1:5])
            obj_id = fields[0]
            gt = fields[9].lower().strip('"')
            objects.append((frame_id, obj_id, gt, xmin, ymin, xmax, ymax))
    return objects

In [None]:
# Splits video from YVT into separate text object clips
def populate_dataset_from_yvt_video(path_to_dataset, path_to_frames_dir, path_to_gt):
    """Cut every annotated text object of one clip into per-object frame crops.

    For every annotated object a directory ``<clip name>_<object id>`` is
    created in ``path_to_dataset`` together with a sibling
    ``<clip name>_<object id>.json`` holding the lower-cased transcription
    under the key 'gt'. The annotated box is cropped from the matching frame
    and saved as ``<frame id, 5 digits>.png`` inside the object directory.

    Objects whose box does not lie completely inside the frame are skipped
    for that frame.

    Args:
        path_to_dataset: Existing output directory.
        path_to_frames_dir: Directory with the clip's frames (layout as
            expected by ``load_yvt_frames``).
        path_to_gt: Annotation file of the clip.

    Raises:
        KeyError: If an annotation refers to a frame ID with no frame file.
    """
    frame_files = load_yvt_frames(path_to_frames_dir)
    objects = load_yvt_gt(path_to_gt)
    clip_name = os.path.split(path_to_frames_dir)[-1].split('.')[0]

    for frame_id, obj_id, gt, xmin, ymin, xmax, ymax in objects:
        obj_dir = os.path.join(path_to_dataset, clip_name + '_' + obj_id)
        if not os.path.exists(obj_dir):
            os.mkdir(obj_dir)
            with open(obj_dir + '.json', 'w') as js:
                js.write(json.dumps({'gt': gt.lower()}, indent=2))

        frame = cv2.imread(frame_files[frame_id])
        # turn the inclusive max coordinates into exclusive slice bounds
        xmax = xmax + 1
        ymax = ymax + 1

        # image arrays are indexed (row, column): shape[0] is the height
        height, width = frame.shape[:2]
        if xmin < 0 or xmax > width:
            continue
        if ymin < 0 or ymax > height:
            continue

        cropped = frame[ymin:ymax, xmin:xmax]
        cv2.imwrite(os.path.join(obj_dir, '%05d.png' % int(frame_id)), cropped)

In [None]:
# Splits YVT dataset into separate text object clips
def populate_yvt(path_to_src, path_to_dst):
    """Process every clip of the dataset rooted at ``path_to_src``.

    Clips are the directories ``<src>/frames/<subset>/<clip>``; the matching
    annotation file is ``<src>/annotations/<subset>/<clip>.txt``. Clips
    without an annotation file are skipped, as are entries that are not
    directories. Each remaining clip is handed to
    ``populate_dataset_from_yvt_video``.

    Args:
        path_to_src: Dataset root containing 'frames' and 'annotations'.
        path_to_dst: Existing output directory.
    """
    frames_root = os.path.join(path_to_src, 'frames')
    annotations_root = os.path.join(path_to_src, 'annotations')

    for subset_name in sorted(os.listdir(frames_root)):
        subset_dir = os.path.join(frames_root, subset_name)
        if not os.path.isdir(subset_dir):
            # stray file next to the subset directories
            continue
        for clip_name in sorted(os.listdir(subset_dir)):
            clip_dir = os.path.join(subset_dir, clip_name)
            # Build the annotation path from its components instead of
            # string-replacing 'frames' in the full path, which would also
            # rewrite any other "frames" occurring in path_to_src.
            annotation = os.path.join(annotations_root, subset_name, clip_name + '.txt')
            if not os.path.isdir(clip_dir):
                continue
            if not os.path.exists(annotation):
                continue
            print('Processing %s...' % clip_dir)
            populate_dataset_from_yvt_video(path_to_dst, clip_dir, annotation)

In [None]:
# To split YVT dataset into separate clips: run populate_yvt with two parameters:
#   1: path to the YVT dataset directory
#   2: path to an output directory (should be created beforehand)

# populate_yvt('/path/to/YVT',\
#              '/path/to/yvt_separate_clips')

In [None]:
# Filters objects with alphanumeric characters, splits text object clips into subclips with 30 frames
def filter_and_convert_to_clips(path_to_src, path_to_dst, subclip_len=30):
    """Filter object clips by transcription and cut them into fixed-length subclips.

    Every entry of ``path_to_src`` not ending in '.json' is treated as a clip
    directory whose transcription is stored under the key 'gt' in the
    sibling file ``<clip dir>.json``. A clip is discarded if its
    transcription contains any character outside ``0-9a-z`` or if it has
    fewer than ``subclip_len`` frames. Remaining clips are cut, in sorted
    frame order, into consecutive non-overlapping subclips of exactly
    ``subclip_len`` frames; left-over frames at the end are dropped. Subclip
    ``k`` is copied to ``<dst>/<clip name>__<k, 2 digits>___<transcription>``.

    Args:
        path_to_src: Directory with clip directories and their JSON files.
        path_to_dst: Existing output directory.
        subclip_len: Number of frames per subclip (default 30).

    Raises:
        ValueError: If ``subclip_len`` is smaller than 1.
        FileExistsError: If a subclip directory already exists.
    """
    if subclip_len < 1:
        raise ValueError('subclip_len must be a positive integer')

    clip_dirs = [os.path.join(path_to_src, x)
                 for x in sorted(os.listdir(path_to_src)) if not x.endswith('.json')]

    # Read every transcription up front, so a missing or broken JSON file is
    # detected before anything is written to the output directory.
    clip_gts = []
    for clip_dir in clip_dirs:
        with open(clip_dir + '.json') as js:
            clip_gts.append(json.load(js)['gt'])

    alphabet = set('0123456789abcdefghijklmnopqrstuvwxyz')
    for clip_dir, clip_gt in zip(clip_dirs, clip_gts):
        # discarding clip if it has non-alphabet character in it
        if any(c not in alphabet for c in clip_gt):
            continue

        frames_list = [os.path.join(clip_dir, x) for x in sorted(os.listdir(clip_dir))]

        # discarding clip if it is too short for even one subclip
        if len(frames_list) < subclip_len:
            continue

        clip_name = os.path.split(clip_dir)[-1]
        for i_subclip in range(len(frames_list) // subclip_len):
            subclip_name = clip_name + '__%02d___%s' % (i_subclip, clip_gt)
            subclip_dir = os.path.join(path_to_dst, subclip_name)
            os.mkdir(subclip_dir)
            first = i_subclip * subclip_len
            for frame_path in frames_list[first:first + subclip_len]:
                shutil.copy(frame_path, os.path.join(subclip_dir, os.path.split(frame_path)[-1]))

In [None]:
# To convert text object clips into filtered truncated clips, call 'filter_and_convert_to_clips' twice with two parameters:
#  1: Path to split clips, prepared in the previous steps
#  2: Path to output directory (which should be created beforehand)

# filter_and_convert_to_clips('/path/to/ic15_separate_clips',\
#                             '/path/to/ic15_filtered_clips')
# filter_and_convert_to_clips('/path/to/yvt_separate_clips',\
#                             '/path/to/yvt_filtered_clips')

In [None]:
# This function launches recognition of each clip using clovaai model (https://github.com/clovaai/deep-text-recognition-benchmark),
# extracts character membership estimations, and converts the dataset to a set of pickled files which work as an 
# input for stopping rules evaluation.

# To run this, you need to do some preparations:
# 1. Clone the git repository for clovaai model (https://github.com/clovaai/deep-text-recognition-benchmark)
# 2. Install all dependencies as per the instructions in the clovaai github's README
# 3. Download the pretrained model which will be used for recognition. We used TPS-ResNet-BiLSTM-Attn.pth. Link 
#    for downloading the pretrained model is in the clovaai github's README
# 4. Create directories 'images' and 'results' in the directory with cloned repo
# 5. Some modifications need to be made in the demonstration code, in order to extract class membership estimations (all changes are in demo.py):

# diff --git a/demo.py b/demo.py
# index 45f3d04..b5fe9f6 100755
# --- a/demo.py
# +++ b/demo.py
# @@ -1,5 +1,6 @@
#  import string
#  import argparse
# +import pickle
 
#  import torch
#  import torch.backends.cudnn as cudnn
# @@ -11,6 +12,16 @@ from dataset import RawDataset, AlignCollate
#  from model import Model
#  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
# +ALPHABET='$^0123456789abcdefghijklmnopqrstuvwxyz'
# +def decode_character(prob):
# +    _, index = prob.max(dim=0)
# +    return ALPHABET[index]
# +
# +def make_ocrcell(prob):
# +    pruned = [float(x) for x in prob[2:]]
# +    s = sum(pruned)
# +    normalized = [x / s for x in pruned]
# +    return {c : p for c, p in zip(ALPHABET[2:], normalized)}
 
#  def demo(opt):
#      """ model configuration """
# @@ -77,7 +88,7 @@ def demo(opt):
 
#              preds_prob = F.softmax(preds, dim=2)
#              preds_max_prob, _ = preds_prob.max(dim=2)
# -            for img_name, pred, pred_max_prob in zip(image_path_list, preds_str, preds_max_prob):
# +            for img_name, pred, pred_max_prob, pred_prob in zip(image_path_list, preds_str, preds_max_prob, preds_prob):
#                  if 'Attn' in opt.Prediction:
#                      pred_EOS = pred.find('[s]')
#                      pred = pred[:pred_EOS]  # prune after "end of sentence" token ([s])
# @@ -86,6 +97,16 @@ def demo(opt):
#                  # calculate confidence score (= multiply of pred_max_prob)
#                  confidence_score = pred_max_prob.cumprod(dim=0)[-1]
 
# +                ocr_cells = []
# +                for pred_prob_char in pred_prob:
# +                    if decode_character(pred_prob_char) == '^':
# +                        break
# +                    ocr_cells.append(make_ocrcell(pred_prob_char))
# +                res_name = img_name.replace('images', 'results') + '.pkl'
# +                with open(res_name, 'wb') as ps:
# +                    pickle.dump(ocr_cells, ps)
# +
# +
#                  print(f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}')
#                  log.write(f'{img_name:25s}\t{pred:25s}\t{confidence_score:0.4f}\n')

def recognize_and_convert_to_final_dataset(path_to_clips, path_to_final_dataset):
    """Run the external recognition demo on every clip and pickle the results.

    Requires PATH_TO_MODEL_DEMO, PATH_TO_MODEL_FILE, PATH_TO_MODEL_INPUT and
    PATH_TO_MODEL_OUTPUT to be defined (see the commented template below).

    A directory 'none' is created inside ``path_to_final_dataset``. For each
    clip directory in ``path_to_clips`` (sorted order) the model input and
    output directories are emptied, the clip's frames are copied into the
    input directory, the demo script is run, and every file found afterwards
    in the output directory is unpickled. The per-frame results are stored as
    ``<final dataset>/none/<clip name>.pkl``. Clips for which no result file
    was produced are skipped. The expected text ('ideal') is the part of the
    clip directory name after its last '___'.

    Args:
        path_to_clips: Directory containing one sub-directory per clip.
        path_to_final_dataset: Existing output directory.
    """
    # These paths should be defined and un-commented
#     PATH_TO_MODEL_DEMO = '/path/to/cloned/repo/deep-text-recognition-benchmark/demo.py'
#     PATH_TO_MODEL_FILE = '/path/to/downloaded/pretrained/model/TPS-ResNet-BiLSTM-Attn.pth'
#     PATH_TO_MODEL_INPUT = '/path/to/cloned/repo/deep-text-recognition-benchmark/images'
#     PATH_TO_MODEL_OUTPUT = '/path/to/cloned/repo/deep-text-recognition-benchmark/results'

    def sorted_entries(directory):
        # full paths of all entries of `directory`, in sorted name order
        return [os.path.join(directory, name) for name in sorted(os.listdir(directory))]

    pkls_path = os.path.join(path_to_final_dataset, 'none')
    os.mkdir(pkls_path)

    for clip_path in sorted_entries(path_to_clips):
        clip_id = os.path.basename(clip_path)
        ideal = clip_id.split('___')[-1]

        # start from empty model input & output directories
        leftovers = sorted_entries(PATH_TO_MODEL_INPUT) + sorted_entries(PATH_TO_MODEL_OUTPUT)
        for leftover in leftovers:
            os.remove(leftover)

        # hand the clip's frames to the model
        for frame_path in sorted_entries(clip_path):
            shutil.copy(frame_path, os.path.join(PATH_TO_MODEL_INPUT, os.path.basename(frame_path)))

        # run recognition as a separate process
        command = ['python', PATH_TO_MODEL_DEMO]
        command += ['--Transformation', 'TPS']
        command += ['--FeatureExtraction', 'ResNet']
        command += ['--SequenceModeling', 'BiLSTM']
        command += ['--Prediction', 'Attn']
        command += ['--image_folder', PATH_TO_MODEL_INPUT]
        command += ['--saved_model', PATH_TO_MODEL_FILE]
        subprocess.call(command)

        # collect one unpickled result per file in the output directory
        # (these pickles are produced by the demo run above)
        per_frame_results = []
        for result_path in sorted_entries(PATH_TO_MODEL_OUTPUT):
            with open(result_path, 'rb') as ps:
                per_frame_results.append(pickle.load(ps))

        # nothing recognized for this clip: do not write an output file
        if not per_frame_results:
            continue

        clip_record = {
            'clip_id': clip_id,
            'field_name': clip_id,
            'field_type': 'none',
            'ideal': ideal,
            'clip': per_frame_results,
        }
        with open(os.path.join(pkls_path, clip_id + '.pkl'), 'wb') as ps:
            pickle.dump(clip_record, ps)

In [None]:
# Finally, to convert the filtered truncated clips into final datasets for analysis, 
#   'recognize_and_convert_to_final_dataset' should be called with two parameters:
#  1: Path to filtered clips
#  2: Path to output directory (which should be created beforehand)

# recognize_and_convert_to_final_dataset('/path/to/ic15_filtered_clips',\
#                                        '/path/to/data_ic15')
# recognize_and_convert_to_final_dataset('/path/to/yvt_filtered_clips',\
#                                        '/path/to/data_yvt')