# Redact Text from Images
The info_types variable is a list of strings naming the info
types that should be redacted. In the example below, we are redacting PERSON_NAME, DATE, 
PHONE_NUMBER, and so on. Once this cell has been run, go to the next cell for further instructions.

In [2]:
import mimetypes

# Google Cloud project id; handed to redact_image as its `project` argument.
project_id = 'mortgagedocscv'

# Built-in DLP info types to look for and redact, one name per entry.
info_types = [
    "PERSON_NAME",
    "DATE",
    "PHONE_NUMBER",
    "STREET_ADDRESS",
    "LOCATION",
    "US_SOCIAL_SECURITY_NUMBER",
    "US_STATE",
    "AGE",
    "CREDIT_CARD_NUMBER",
    "DATE_OF_BIRTH",
    "US_DRIVERS_LICENSE_NUMBER",
    "US_EMPLOYER_IDENTIFICATION_NUMBER",
    "US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER",
    "US_VEHICLE_IDENTIFICATION_NUMBER",
    "US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER",
    "PASSPORT",
]

# Custom info types. This one reuses the name DATE with a regex that matches
# any single digit -- presumably so every digit is reported (and redacted)
# as a DATE finding; confirm that is the intent.
custom_info_types = [
    {
        "info_type": {"name": "DATE"},
        "regex": {"pattern": "[0-9]"},
    },
]

This next cell sets up the redact_image function which will be called in the cell after.

In [3]:
def redact_image(
    project,
    filename,
    output_filename,
    info_types,
    custom_info_types,
    min_likelihood=None,
    mime_type=None,
):
    """Uses the Data Loss Prevention API to redact protected data in an image.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.
        info_types: A list of strings representing info types to look for,
            or None to send no built-in info types. A full list of info type
            categories can be fetched from the API.
        custom_info_types: Custom info type definitions (a list of
            dictionaries); passed unchanged as the "custom_info_types" entry
            of the inspect config.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
    Returns:
        None; the redacted image is written to output_filename and the number
        of bytes written is printed.
    """
    # Imported locally so the function does not depend on which notebook
    # cells have already been run.
    import mimetypes

    # Import the client library. Importing google.cloud.dlp also makes the
    # google.cloud.dlp_v2 submodule reachable, which is what is used below.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the list of strings into the list of {"name": ...} dictionaries
    # the API expects (protos are also accepted), and build one redaction
    # config per info type. The color is omitted, so the default (black) is
    # used. A None info_types is passed through instead of raising TypeError.
    if info_types is not None:
        info_type_dicts = [{"name": name} for name in info_types]
        image_redaction_configs = [
            {"info_type": entry} for entry in info_type_dicts
        ]
    else:
        info_type_dicts = None
        image_redaction_configs = []
    # NOTE(review): only the built-in info types get a redaction config; a
    # custom info type whose name is not also in info_types has none here.
    # Confirm that is intended.

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "custom_info_types": custom_info_types,
        "info_types": info_type_dicts,
        "min_likelihood": min_likelihood,
    }

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or "application/octet-stream"

    # Select the content type index from the list of supported types.
    # Anything not listed falls back to 0 ("Unspecified").
    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        "image/svg+xml": 4,  # what mimetypes actually reports for .svg
        "text/plain": 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode="rb") as source_file:
        byte_item = {"type": content_type_index, "data": source_file.read()}

    # Convert the project id into a full resource id.
    # NOTE(review): project_path() and the positional `parent` argument look
    # like the pre-2.0 google-cloud-dlp calling convention -- confirm the
    # installed client version before upgrading the library.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        inspect_config=inspect_config,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item,
    )

    # Write out the results.
    with open(output_filename, mode="wb") as redacted_file:
        redacted_file.write(response.redacted_image)
    print(
        "\nWrote {byte_count} to {filename}".format(
            byte_count=len(response.redacted_image), filename=output_filename
        )
    )

# Set Path for Images

In [4]:
# Folders of original images to process; each one contains the word
# "original", which the redaction loop uses to derive the output sub-folder.
path = [
    r'C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\testing',
    r'C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\validation\page1',
    r'C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\validation\other',
    r'C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\training\page1',
    r'C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\training\other',
]
print('path: ', path)

path:  ['C:\\Users\\pwalsh\\env-google-cloud\\Lib\\site-packages\\images\\original\\testing', 'C:\\Users\\pwalsh\\env-google-cloud\\Lib\\site-packages\\images\\original\\validation\\page1', 'C:\\Users\\pwalsh\\env-google-cloud\\Lib\\site-packages\\images\\original\\validation\\other', 'C:\\Users\\pwalsh\\env-google-cloud\\Lib\\site-packages\\images\\original\\training\\page1', 'C:\\Users\\pwalsh\\env-google-cloud\\Lib\\site-packages\\images\\original\\training\\other']


In [51]:
# path = []
# path.append(r'C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\test')
# print('path: ',path)

path:  ['C:\\Users\\pwalsh\\env-google-cloud\\Lib\\site-packages\\images\\original\\test']


# Redact Images

Here, we call the redact_image function once for every .jpg image in the folders listed in path. In addition to the project_id set in the first cell, this function takes 4 inputs: 
1. the path to the image which needs redacting.
2. the path to where you want the redacted version of the image saved.
3. the info_types variable that we set in the first cell of this notebook.
4. the custom_info_types variable that we set in the first cell of this notebook.

In [None]:
# input parameters:
# project, filename, output_filename, info_types, custom_info_types, min_likelihood=None, mime_type=None
import os
import shutil

# Redact every .jpg in each folder of `path`. The redacted image is first
# written next to the original as "<name>_redact.jpg" and then moved to the
# matching sub-folder under images\redacted, keeping the original file name.
for folder in path:
    os.chdir(folder)
    print(os.getcwd())
    files = os.listdir()

    # Everything after the first "original" in the folder path (for example
    # "\testing"); appended to the redacted root to mirror the layout.
    subfolder = folder.split("original", 1)[1]
    redacted_dir = r"C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\redacted" + subfolder

    # Restart the progress counter for each folder.
    counter = 1
    for filename in files:
        img, extension = os.path.splitext(filename)
        if extension != ".jpg":
            continue

        # NOTE(review): every path below is absolute, so this chdir looks
        # unnecessary for file access; it is kept in case the client library
        # import or credentials lookup depends on it -- confirm before removing.
        os.chdir(r"C:\Users\pwalsh\env-google-cloud\Lib\site-packages")

        # Raw strings keep the backslash literal without an invalid escape.
        original_file = r"{}\{}.jpg".format(folder, img)
        redacted_file = r"{}\{}_redact.jpg".format(folder, img)

        # Make sure the destination exists before paying for the API call,
        # so the move afterwards cannot fail on a missing directory.
        os.makedirs(redacted_dir, exist_ok=True)

        # call redact_image function
        redact_image(project_id,
                     original_file,
                     redacted_file,
                     info_types, custom_info_types)
        print("image no. {}".format(counter))
        counter += 1
        shutil.move(redacted_file,
                    redacted_dir + r"\{}.jpg".format(img))

C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\testing

Wrote 634220 to C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\testing\acbp_other_test_3040w_11_redact.jpg
image no. 1

Wrote 630199 to C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\testing\ance_other_test_3036w_8_redact.jpg
image no. 2

Wrote 71297 to C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\testing\aqnj_other_test_3013w_17_redact.jpg
image no. 3

Wrote 584317 to C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\testing\atrs_other_test_3029w_5_redact.jpg
image no. 4

Wrote 129583 to C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\original\testing\awpf_other_test_3508_5_redact.jpg
image no. 5


In [2]:
# # input parameters:
# # project, filename, output_filename, info_types, custom_info_types, min_likelihood=None, mime_type=None
# img = 'bnmg_page1_test_3505_1'
# redact_image(project_id,
#              r"C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\{}.jpg".format(img),
#              r"C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\{}_redact13.jpg".format(img),
#              info_types,custom_info_types)                                                                                 


info_types: [{'name': 'DATE'}, {'name': 'PERSON_NAME'}, {'name': 'US_STATE'}]

custom_info_types: [{'info_type': {'name': 'DATE'}, 'regex': {'pattern': '[0-9]'}}]

image_redaction_configs: [{'info_type': {'name': 'DATE'}}, {'info_type': {'name': 'PERSON_NAME'}}, {'info_type': {'name': 'US_STATE'}}]

inspect_config: {'custom_info_types': [{'info_type': {'name': 'DATE'}, 'regex': {'pattern': '[0-9]'}}], 'info_types': [{'name': 'DATE'}, {'name': 'PERSON_NAME'}, {'name': 'US_STATE'}], 'min_likelihood': None}

Wrote 501921 to C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\bnmg_page1_test_3505_1_redact13.jpg
