In [64]:
import os, requests, yaml, json
from mimetypes import guess_type
from concurrent.futures import ThreadPoolExecutor

SAMPLE_DIR = './samples_floor'
ENDPOINT_URL = 'https://ty5whne207.execute-api.us-east-1.amazonaws.com/dev/Benefits-AI-ID-Extract-Handler'
BENCHMARK_YAML_PATH = 'benchmark_floor.yaml'
TEST_YAML_PATH = 'test_floor.yaml'

In [65]:
def process_image(image_file, timeout=60):
    """POST one sample image to the extraction endpoint.

    Args:
        image_file: File name of the image, relative to ``SAMPLE_DIR``.
        timeout: Seconds ``requests`` waits for the endpoint before raising
            ``requests.exceptions.Timeout``. Without it a single stalled
            request would block its worker thread forever.

    Returns:
        A ``(stem, response_text)`` tuple, where ``stem`` is the file's base
        name up to its first dot and ``response_text`` is the raw response body.
    """
    image_path = os.path.join(SAMPLE_DIR, image_file)
    content_type, _ = guess_type(image_path)
    content_type = content_type or 'application/octet-stream'  # Default MIME type if unknown
    # Read the bytes first so the file is closed before the network round trip.
    with open(image_path, 'rb') as image_handle:
        image_data = image_handle.read()
    # Send the detected MIME type rather than a hard-coded 'image/jpeg',
    # so PNG samples are labelled correctly.
    headers = {'Content-Type': content_type}
    response = requests.post(ENDPOINT_URL, data=image_data, headers=headers, timeout=timeout)
    # NOTE(review): the HTTP status is not checked; a non-2xx body is returned
    # as-is and will reach the JSON parsing step downstream.
    return os.path.basename(image_path).split(".")[0], response.text

In [66]:
# Pick out the sample files whose extension marks them as images
# (the comparison is case-insensitive).
files = os.listdir(SAMPLE_DIR)
image_files = []
for candidate in files:
    if candidate.lower().endswith(('.png', '.jpg', '.jpeg')):
        image_files.append(candidate)

# Send every image to the endpoint on a 12-thread pool and collect the
# (stem, response text) pairs into a dict keyed by stem.
results_dict = dict()
with ThreadPoolExecutor(max_workers=12) as pool:
    results = pool.map(process_image, image_files)
    results_dict = {stem: body for stem, body in results}

print(results_dict)

{'112': '{"ADDITIONAL_INFORMATION": {}, "ID_TYPE": "", "FIRST_NAME": "FIRST", "MIDDLE_NAME": "M", "LAST_NAME": "LAST", "DATE_OF_BIRTH": "", "ADDRESS": "1100 314", "ORGANIZATION": "cca commonwealth care alliance cca commonwealth care", "ZIP_CODE_IN_ADDRESS": ""}', '15': '{"ADDITIONAL_INFORMATION": {"MIDDLE_NAME": "CHAMONTS", "SUFFIX": "", "CITY_IN_ADDRESS": "SAN DEGOCA", "STATE_IN_ADDRESS": "", "STATE_NAME": "MEXICO", "DOCUMENT_NUMBER": "91912", "EXPIRATION_DATE": "09 MAR 2019", "DATE_OF_ISSUE": "09 MAR 2019", "ENDORSEMENTS": "", "VETERAN": "", "RESTRICTIONS": "", "CLASS": "", "COUNTY": "", "PLACE_OF_BIRTH": "", "MRZ_CODE": ""}, "FIRST_NAME": "SAMANTHA", "LAST_NAME": "", "ZIP_CODE_IN_ADDRESS": "91912", "DATE_OF_BIRTH": "", "ID_TYPE": "DRIVER LICENSE FRONT", "ADDRESS": ""}', '23': '{"ADDITIONAL_INFORMATION": {"MIDDLE_NAME": "", "SUFFIX": "", "CITY_IN_ADDRESS": "ANYTOWN", "STATE_IN_ADDRESS": "NY", "STATE_NAME": "NEVADA", "DOCUMENT_NUMBER": "123456789123", "EXPIRATION_DATE": "07/01/2014", 

In [67]:
# Function to convert JSON to YAML
def convert_json_to_yaml(result_dict: dict):
    """Turn a mapping of numeric names to JSON strings into a YAML document.

    Each key of ``result_dict`` is converted with ``int`` and each value is
    parsed with ``json.loads``. Entries are processed in ascending key order,
    and the ``ADDITIONAL_INFORMATION`` field is left out of every entry.

    Returns:
        The YAML text produced by ``yaml.dump`` in block style.
    """
    # Convert every key up front so a non-numeric name fails before any parsing.
    payload_by_number = {int(name): result_dict[name] for name in result_dict}
    parsed_data = {}
    for number in sorted(payload_by_number):
        record = json.loads(payload_by_number[number])
        parsed_data[number] = {
            field: record.get(field)
            for field in record
            if field != 'ADDITIONAL_INFORMATION'
        }
    return yaml.dump(parsed_data, default_flow_style=False)
# Serialise the collected endpoint responses to YAML text.
yaml_result = convert_json_to_yaml(results_dict)
# Write that text to TEST_YAML_PATH, replacing any previous content.
with open(TEST_YAML_PATH, mode="w") as yaml_file:
    yaml_file.write(yaml_result)

In [68]:
import yaml

# Running accuracy tallies keyed by ID type. Each value is the tuple
# (total_entries, diff_entries, total_fields, diff_fields) maintained by add_statistic.
stats_dict = {}

def load_yaml(file_path):
    """Read the file at ``file_path`` and return what ``yaml.safe_load`` parses from it."""
    with open(file_path, mode='r') as stream:
        document = yaml.safe_load(stream)
    return document
    
def add_statistic(id_type, entry_field_count, diff_field_count):
    """Fold one compared entry into the ``stats_dict`` tally for ``id_type``.

    Args:
        id_type: Bucket name the entry is counted under.
        entry_field_count: Number of fields compared for the entry.
        diff_field_count: Number of those fields that differed.

    The stored tuple is
    ``(total_entries, diff_entries, total_fields, diff_fields)``; an entry
    counts as "diff" when at least one of its fields differed.
    """
    # An unseen bucket starts from all-zero counters.
    entries, diff_entries, fields, diff_fields = stats_dict.get(id_type, (0, 0, 0, 0))
    entry_differs = 0 if diff_field_count == 0 else 1
    stats_dict[id_type] = (
        entries + 1,
        diff_entries + entry_differs,
        fields + entry_field_count,
        diff_fields + diff_field_count,
    )

def compare_yaml(benchmark_path, test_path):
    """Compare the test YAML against the benchmark YAML, entry by entry.

    For every benchmark entry that also exists in the test file, each
    benchmark field is compared with the test field of the same name.
    Mismatches are printed, and per-entry counts are recorded through
    ``add_statistic``. Benchmark entries absent from the test file are
    reported as skipped and are not counted.

    Args:
        benchmark_path: Path of the YAML file holding the expected values.
        test_path: Path of the YAML file holding the values under test.

    Raises:
        Exception: If the benchmark YAML is empty.
    """
    benchmark_data, test_data = load_yaml(benchmark_path), load_yaml(test_path)
    if not benchmark_data:
        raise Exception('Need to populate benchmark YAML')
    # yaml.safe_load returns None for an empty document. Treat that as
    # "no entries" so every benchmark entry is skipped instead of the
    # membership test below raising TypeError.
    test_data = test_data or {}
    for entry_name in benchmark_data:
        print(f'Now reviewing entry {entry_name}')
        if entry_name not in test_data:
            print(f'Skipped {entry_name}')
            continue
        entry_field_counter, diff_field_counter = 0, 0
        # An entry written as a bare key with no fields also loads as None.
        benchmark_entry = benchmark_data[entry_name] or {}
        test_entry = test_data[entry_name] or {}
        id_type = ''
        for field_name in benchmark_entry:
            entry_field_counter += 1
            if field_name not in test_entry:
                # A field missing from the test entry counts as a mismatch.
                diff_field_counter += 1
                continue
            benchmark_field, test_field = benchmark_entry[field_name], test_entry[field_name]
            if field_name == 'ID_TYPE':
                # NOTE(review): the bucket is taken from the value under test,
                # not from the benchmark, so a misread ID_TYPE creates its own
                # bucket. Confirm whether grouping by the benchmark is intended.
                id_type = test_field
            if benchmark_field != test_field:
                print(f'{test_field if test_field else "[EMPTY]"} should be: {benchmark_field if benchmark_field else "[EMPTY]"}')
                diff_field_counter += 1
        # Entries without a usable ID_TYPE are tallied under 'Misc'.
        add_statistic(id_type or 'Misc', entry_field_counter, diff_field_counter)

# Compare the files and print the number of different entries
compare_yaml(BENCHMARK_YAML_PATH, TEST_YAML_PATH)

Now reviewing entry 1
69 RUE BIG HAMMER should be: 69 RUE BIG HAMMER MONTREAL
[EMPTY] should be: 03/30/1996
Now reviewing entry 2
[EMPTY] should be: 6989 SUMMERSIDE BLVD PORTLAND ME 04101
02/14/1974 should be: 02/17/1981
[EMPTY] should be: JULIE
[EMPTY] should be: SUMMERS
Now reviewing entry 3
123 MAIN STREET should be: 123 MAIN STREET INDIANAPOLIS IN 46204-0000
Now reviewing entry 4
123 YOUR ST should be: 123 YOUR ST SIOUX FALLS SD 57105
Now reviewing entry 5
1234 MAIN STREET RD should be: 1234 MAIN STREET RD RIVERDALE GA 30274
Now reviewing entry 6
123 MAIN STREET should be: 123 MAIN STREET ANYTOWN GA 39999
Now reviewing entry 7
[EMPTY] should be: ADDRESS LINE 1 BOISE IO
[EMPTY] should be: FIRSTNAME
[EMPTY] should be: LASTNAME
Now reviewing entry 8
[EMPTY] should be: 123 ANY STREET GLEN BURNIE MD 21062
SAMPLE should be: JANE
[EMPTY] should be: SAMPLE
Now reviewing entry 9
1284 WHITTY DR should be: 1284 WHITTY DR SLIDELL LA
Now reviewing entry 10
[EMPTY] should be: 312 15TH STREET DUN

In [69]:
def _format_accuracy(diff_count, total_count):
    """Return the accuracy as a percentage string, or 'n/a' when nothing was counted."""
    if not total_count:
        # Nothing to divide by; avoid ZeroDivisionError.
        return 'n/a'
    return f'{round(float(100 - (100 * diff_count / total_count)), 2)}%'


def report_results(stats=None, id_types=('DRIVER LICENSE FRONT', 'PASSPORT')):
    """Print entry-level and field-level accuracy per ID type, then a total.

    Args:
        stats: Mapping of ID type to the tuple
            ``(total_entries, diff_entries, total_fields, diff_fields)``.
            Defaults to the module-level ``stats_dict``.
        id_types: Collection of ID types to report and include in the total.
            Pass ``None`` to report every ID type found in ``stats``.

    An accuracy is printed as ``n/a`` when its denominator is zero, for
    example when none of the selected ID types is present in ``stats``.
    """
    if stats is None:
        stats = stats_dict
    total_entries, total_diff_entries, total_fields, total_diff_fields = 0, 0, 0, 0
    for id_type in stats:
        if id_types is not None and id_type not in id_types:
            continue
        entries, diff_entries, fields, diff_fields = stats[id_type]
        entries_acc = _format_accuracy(diff_entries, entries)
        fields_acc = _format_accuracy(diff_fields, fields)
        print(f'For {id_type}:\n\tEntries Accuracy:\t{entries_acc}\n\tFields Accuracy:\t{fields_acc}')
        total_entries += entries
        total_diff_entries += diff_entries
        total_fields += fields
        total_diff_fields += diff_fields
    total_entries_acc = _format_accuracy(total_diff_entries, total_entries)
    total_fields_acc = _format_accuracy(total_diff_fields, total_fields)
    print(f'Total:\n\tEntries Accuracy:\t{total_entries_acc}\n\tFields Accuracy:\t{total_fields_acc}')
report_results()

For DRIVER LICENSE FRONT:
	Entries Accuracy:	28.33%
	Fields Accuracy:	79.67%
For DRIVER'S LICENCE and SERVICES CARD:
	Entries Accuracy:	0.0%
	Fields Accuracy:	0.0%
For COLORADO DRIVER LICENSE:
	Entries Accuracy:	0.0%
	Fields Accuracy:	60.0%
For LICA of ANGOLA:
	Entries Accuracy:	0.0%
	Fields Accuracy:	0.0%
For PASSPORT:
	Entries Accuracy:	65.22%
	Fields Accuracy:	91.3%
For PC ADE K11000944 HISEYHU:
	Entries Accuracy:	0.0%
	Fields Accuracy:	50.0%
For SUNDESRIPUBLIK DEUTSCHLAND:
	Entries Accuracy:	0.0%
	Fields Accuracy:	40.0%
For Passport:
	Entries Accuracy:	0.0%
	Fields Accuracy:	0.0%
For LIETUVOS RESPUBLIKA:
	Entries Accuracy:	0.0%
	Fields Accuracy:	50.0%
For ACORTA FLORES FEDERACO ROSA CITIALS MERICARN:
	Entries Accuracy:	0.0%
	Fields Accuracy:	0.0%
For Republica - / Nation Print - - . - Max - 2013 - Tank Signature Andracco:
	Entries Accuracy:	0.0%
	Fields Accuracy:	0.0%
For gb:
	Entries Accuracy:	0.0%
	Fields Accuracy:	20.0%
For Misc:
	Entries Accuracy:	10.0%
	Fields Accuracy:	32.0%
