In [59]:
import os, requests, yaml, json
from mimetypes import guess_type
from concurrent.futures import ThreadPoolExecutor

# Directory whose image files are listed and submitted by the cells below.
SAMPLE_DIR = './samples'
# HTTP endpoint that process_image POSTs each image's raw bytes to.
ENDPOINT_URL = 'https://ty5whne207.execute-api.us-east-1.amazonaws.com/dev/Benefits-AI-ID-Extract-Handler'

In [60]:
def process_image(image_file, timeout=60):
    """POST one sample image to ENDPOINT_URL and return its key and response.

    Args:
        image_file: File name (not a full path) of an image inside SAMPLE_DIR.
        timeout: Seconds to wait for the endpoint before requests raises a
            timeout error, so a stalled connection cannot block a worker
            thread forever.

    Returns:
        A ``(stem, response_text)`` tuple, where ``stem`` is the file name up
        to its first dot and ``response_text`` is the raw response body. The
        HTTP status code is not inspected.
    """
    image_path = os.path.join(SAMPLE_DIR, image_file)
    content_type, _ = guess_type(image_path)
    content_type = content_type or 'application/octet-stream'  # Default MIME type if unknown

    # Read the bytes up front so the file is closed before the network call.
    with open(image_path, 'rb') as image_handle:
        image_data = image_handle.read()

    # Send the MIME type that matches the file instead of always claiming
    # JPEG, so PNG samples are labelled correctly.
    # NOTE(review): confirm the endpoint accepts every type produced here.
    headers = {'Content-Type': content_type}
    response = requests.post(ENDPOINT_URL, data=image_data, headers=headers, timeout=timeout)
    return os.path.basename(image_path).split(".")[0], response.text

In [61]:
# Gather every PNG/JPEG file name in the sample directory (extension match is
# case-insensitive).
files = os.listdir(SAMPLE_DIR)
image_files = list(
    filter(lambda name: name.lower().endswith(('.png', '.jpg', '.jpeg')), files)
)

# Submit the images concurrently; each call yields a (key, response text)
# pair, and the pairs are collected into a dict.
results_dict = {}
with ThreadPoolExecutor(max_workers=12) as executor:
    results = executor.map(process_image, image_files)
    results_dict = dict(results)

print(results_dict)

{'112': '{"ADDITIONAL_INFORMATION": {}, "ID_TYPE": "Medicare", "FIRST_NAME": "<FIRST>", "LAST_NAME": "LAST", "DATE_OF_BIRTH": "610802", "ADDRESS": "CCA sample NVTD $3.100", "ZIP_CODE_IN_ADDRESS": "80117981"}', '15': '{"ADDITIONAL_INFORMATION": {"MIDDLE_NAME": "CHAMONTS", "SUFFIX": "", "CITY_IN_ADDRESS": "SAN DEGOCA", "STATE_IN_ADDRESS": "", "STATE_NAME": "MEXICO", "DOCUMENT_NUMBER": "91912", "EXPIRATION_DATE": "09 MAR 2019", "DATE_OF_ISSUE": "09 MAR 2019", "ENDORSEMENTS": "", "VETERAN": "", "RESTRICTIONS": "", "CLASS": "", "COUNTY": "", "PLACE_OF_BIRTH": "", "MRZ_CODE": ""}, "FIRST_NAME": "SAMANTHA", "LAST_NAME": "", "ZIP_CODE_IN_ADDRESS": "91912", "DATE_OF_BIRTH": "", "ID_TYPE": "DRIVER LICENSE FRONT", "ADDRESS": ""}', '23': '{"ADDITIONAL_INFORMATION": {"MIDDLE_NAME": "", "SUFFIX": "", "CITY_IN_ADDRESS": "ANYTOWN", "STATE_IN_ADDRESS": "NY", "STATE_NAME": "NEVADA", "DOCUMENT_NUMBER": "123456789123", "EXPIRATION_DATE": "07/01/2014", "DATE_OF_ISSUE": "07/01/2009", "ENDORSEMENTS": "", "VE

In [62]:
# Function to convert JSON to YAML
def convert_json_to_yaml(result_dict: dict):
    """Summarise raw JSON responses as a YAML document.

    Args:
        result_dict: Mapping whose keys are convertible with ``int()`` and
            whose values are JSON object strings.

    Returns:
        A block-style YAML string mapping each integer key to the five
        summary fields of its response; a field absent from the response is
        emitted as null.

    Raises:
        ValueError: If a key is not an integer string or a value is not
            valid JSON.
    """
    wanted_fields = ('ID_TYPE', 'FIRST_NAME', 'LAST_NAME', 'DATE_OF_BIRTH', 'ADDRESS')

    # Convert every key first, so a bad key fails before any JSON is parsed.
    payload_by_number = {}
    for name, payload in result_dict.items():
        payload_by_number[int(name)] = payload

    summary = {}
    for number in sorted(payload_by_number):
        record = json.loads(payload_by_number[number])
        summary[number] = {field: record.get(field) for field in wanted_fields}

    return yaml.dump(summary, default_flow_style=False)
# Build the YAML summary from the collected endpoint responses.
yaml_result = convert_json_to_yaml(results_dict)
# Save the summary as test.yaml in the working directory.
with open("test.yaml", "w") as f:
    print(yaml_result, end="", file=f)

In [70]:
import yaml

def load_yaml(file_path):
    """Open ``file_path`` for reading and return its content parsed by ``yaml.safe_load``."""
    with open(file_path, 'r') as handle:
        parsed = yaml.safe_load(handle)
    return parsed

def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    if not whole:
        return 0.0
    return float(part * 100 / whole)


def compare_yaml(benchmark_path, test_path):
    """Compare a test YAML file against a benchmark YAML file.

    Both files are loaded with ``load_yaml`` and are expected to map entry
    names to mappings of field name -> value. Every benchmark entry is looked
    up in the test data and each of its fields is compared for equality; each
    mismatch is printed.

    An entry counts as different when it is missing from the test data or
    when any of its benchmark fields is missing or unequal. Fields of an
    entry that is missing entirely are not added to the field totals.

    Args:
        benchmark_path: Path of the YAML file holding the expected values.
        test_path: Path of the YAML file holding the values under test.

    Returns:
        A ``(entry_percentage, field_percentage)`` tuple of floats: the share
        of benchmark entries that differ, and the share of compared fields
        that differ. A percentage is 0.0 when nothing was counted for it,
        rather than raising ZeroDivisionError.
    """
    # An empty YAML document loads as None; treat it as having no entries.
    benchmark_data = load_yaml(benchmark_path) or {}
    test_data = load_yaml(test_path) or {}

    total_entry_counter, total_field_counter, diff_entry_counter, diff_field_counter = 0, 0, 0, 0
    for entry_name in benchmark_data:
        print(f'Now reviewing entry {entry_name}')
        is_entry_diff = False
        total_entry_counter += 1
        if entry_name not in test_data:
            diff_entry_counter += 1
            continue
        # An entry written with no fields also loads as None.
        benchmark_entry = benchmark_data[entry_name] or {}
        test_entry = test_data[entry_name] or {}
        for field_name in benchmark_entry:
            total_field_counter += 1
            if field_name not in test_entry:
                diff_field_counter += 1
                is_entry_diff = True
                continue
            benchmark_field, test_field = benchmark_entry[field_name], test_entry[field_name]
            if benchmark_field != test_field:
                print(f'{test_field if test_field else "[EMPTY]"} should be: {benchmark_field if benchmark_field else "[EMPTY]"}')
                diff_field_counter += 1
                is_entry_diff = True
        if is_entry_diff:
            diff_entry_counter += 1
    return (
        _percentage(diff_entry_counter, total_entry_counter),
        _percentage(diff_field_counter, total_field_counter),
    )

# Benchmark (expected values) and test (values under evaluation) YAML files.
file_path_a, file_path_b = 'benchmark.yaml', 'test.yaml'

# Run the comparison and report both inaccuracy percentages to one decimal.
entry_per, field_per = compare_yaml(benchmark_path=file_path_a, test_path=file_path_b)
print(f'{round(entry_per, 1)}% of entries are inaccurate, {round(field_per, 1)}% of fields are inaccurate')


Now reviewing entry 1
69 RUE BIG HAMMER should be: 69 RUE BIG HAMMER MONTREAL
[EMPTY] should be: 03/30/1996
Now reviewing entry 2
[EMPTY] should be: 6989 SUMMERSIDE BLVD PORTLAND ME 04101
02/14/1974 should be: 02/17/1981
[EMPTY] should be: JULIE
[EMPTY] should be: SUMMERS
Now reviewing entry 3
123 MAIN STREET should be: 123 MAIN STREET INDIANAPOLIS IN 46204-0000
Now reviewing entry 4
123 YOUR ST should be: 123 YOUR ST SIOUX FALLS SD 57105
Now reviewing entry 5
1234 MAIN STREET RD should be: 1234 MAIN STREET RD RIVERDALE GA 30274
Now reviewing entry 6
123 MAIN STREET should be: 123 MAIN STREET ANYTOWN GA 39999
Now reviewing entry 7
[EMPTY] should be: ADDRESS LINE 1 BOISE IO
[EMPTY] should be: FIRSTNAME
[EMPTY] should be: LASTNAME
Now reviewing entry 8
[EMPTY] should be: 123 ANY STREET GLEN BURNIE MD 21062
SAMPLE should be: JANE
[EMPTY] should be: SAMPLE
Now reviewing entry 9
1284 WHITTY DR should be: 1284 WHITTY DR SLIDELL LA
Now reviewing entry 10
[EMPTY] should be: 312 15TH STREET DUN