### This notebook contains the code for precalculating estimation values in order to further analyze them

In [1]:
import os, sys, json, time
import multiprocessing
import pickle

from metrics import *
from combination import *
from combination_with_estimation import *

In [2]:
# Dataset to run the precalculation for; swap the two lines below to
# process the other dataset.
# DATASET = 'midv500'
DATASET = 'midv2019'

# Field types; each one has its own input directory.
FIELD_TYPES = ['docnum', 'date', 'latin', 'mrz']

# Input directory per field type: ./data_<DATASET>/<field_type>
DATASET_DIRECTORIES = {
    field_type: './data_%s/%s' % (DATASET, field_type)
    for field_type in FIELD_TYPES
}

# Supported precalculation methods.
METHODS = ['base', 'summation', 'treap']

# Output directory per method: ./precalc_<method>_<DATASET>
PRECALC_DIRECTORIES = {
    method: './precalc_%s_%s' % (method, DATASET)
    for method in METHODS
}

# Creating precalc directories if there are none. exist_ok avoids the
# check-then-create race when several processes execute this module-level
# code at the same time (e.g. spawned multiprocessing workers).
for method in METHODS:
    os.makedirs(PRECALC_DIRECTORIES[method], exist_ok=True)

In [3]:
def convert_ocrstring(serialized_ocrstring):
    '''
    Converts a serialized text string recognition result to a list of Cells.

    serialized_ocrstring: iterable of serialized cells (dict-like mappings).
    Each cell is shallow-copied before being handed to Cell, and a copy that
    has no '@' key gets '@': 0.0 added. The caller's serialized data is left
    untouched, so the same serialized cell can safely be converted many times.
    NOTE(review): '@' presumably denotes the "empty" alternative of a cell -
    confirm against the Cell implementation.

    Returns a list with one Cell per input cell, in input order.
    '''
    ret = []
    for serialized_ocrcell in serialized_ocrstring:
        # working on a copy so that the input is not mutated
        varmap = dict(serialized_ocrcell)
        varmap.setdefault('@', 0.0)
        ret.append(Cell(varmap))
    return ret

In [4]:
def process_clip(job):
    '''
    Precalculates modelling sums for each prefix of a normalized clip and
    writes the results to a newly generated JSON file in the precalculation
    directory of the used method.

    job: tuple (pickled_data, method), where pickled_data is the path to a
         pickled clip and method is one of 'base', 'summation', 'treap'.
         A single tuple argument is used so that the function can be passed
         directly to multiprocessing.Pool.map.

    The loaded clip is read through the keys 'clip', 'ideal', 'field_type',
    'clip_id' and 'field_name'.

    The output file contains one entry per processed frame:
    (error level of the combined result against the ideal,
     modelling sum, time of combination, time of modelling), times in seconds.

    Raises ValueError if method is unknown.
    '''
    pickled_data, method = job

    # loading pickled clip
    # NOTE: unpickling can execute arbitrary code - only use trusted dataset files
    with open(pickled_data, 'rb') as ps:
        loaded_clip = pickle.load(ps)

    # converting text string recognition results and bringing clip to length 30
    # (frames are repeated cyclically and then cut to the first 30)
    clip_strings = [convert_ocrstring(x) for x in (loaded_clip['clip'] * 30)[:30]]

    if method == 'base':
        alignment = Alignment(0.6)
    elif method == 'summation':
        alignment = AlignmentWithEstimation(0.6, ListBasedSequenceStructure)
    elif method == 'treap':
        alignment = AlignmentWithEstimation(0.6, TreapBasedSequenceStructure)
    else:
        raise ValueError('unknown method %s' % method)
    is_base = method == 'base'

    # precalculation results: [(error_level, modelling sum, time of combination, time of modelling)]
    ret = []

    for i_frame, frame_string in enumerate(clip_strings):
        # combining frame_string with currently accumulated results;
        # perf_counter is used as it is monotonic and has the best available
        # resolution for measuring short durations
        if is_base:
            combination_start = time.perf_counter()
            alignment.add_string(frame_string, 1.0)
            combination_end = time.perf_counter()
        else: # for this AlignmentWithEstimation implementation there are no input sample weights
            combination_start = time.perf_counter()
            alignment.add_string(frame_string)
            combination_end = time.perf_counter()

        combined_result_string = alignment.get_string_result()

        if is_base:
            modelling_sum = 0.0
            modelling_start = time.perf_counter()
            # computing the modelling sum directly by test-combining previous samples
            for j_frame in range(i_frame + 1):
                copied_alignment = alignment.clone()
                copied_alignment.add_string(clip_strings[j_frame], 1.0)
                modelling_sum += levmetric_ocr(copied_alignment.base, alignment.base)
            modelling_end = time.perf_counter()
        else:
            modelling_start = time.perf_counter()
            modelling_sum = alignment.get_modelling_sum()
            modelling_end = time.perf_counter()

        ret.append((levmetric(combined_result_string, loaded_clip['ideal']),
                    modelling_sum,
                    combination_end - combination_start,
                    modelling_end - modelling_start))

    output_filename = '%s_%s_%s_%s_precalc.json' % (loaded_clip['field_type'],
                                                    loaded_clip['clip_id'],
                                                    loaded_clip['field_name'],
                                                    method)

    with open(os.path.join(PRECALC_DIRECTORIES[method], output_filename), 'w') as js:
        json.dump(ret, js, indent=2)

In [5]:
def precalculate(field_types, method, parallel_processes):
    '''
    Runs precalculation for clips of the types in the 'field_types' list, using
    method 'method' and a certain number of parallel processes.

    field_types: iterable of keys of DATASET_DIRECTORIES; every file in the
                 corresponding directories is processed, in sorted file name
                 order within each directory.
    method: method name, passed on to process_clip.
    parallel_processes: 1 to process the clips sequentially in this process,
                        any other value is passed to multiprocessing.Pool.

    It needs to be run with parallel_processes = 1 if time measurements have
    to be obtained.
    '''
    datafiles = []
    for field_type in field_types:
        dataset_directory = DATASET_DIRECTORIES[field_type]
        datafiles.extend(os.path.join(dataset_directory, x)
                         for x in sorted(os.listdir(dataset_directory)))

    jobs = [(datafile, method) for datafile in datafiles]
    if parallel_processes == 1:
        for job in jobs:
            process_clip(job)
    else:
        pool = multiprocessing.Pool(parallel_processes)
        try:
            pool.map(process_clip, jobs)
        finally:
            # releasing worker processes even if one of the jobs has failed
            pool.close()
            pool.join()