# Redact Text from Images
The `info_types` variable is a list of dictionaries, each holding a `name` key whose value is
an info type that should be redacted. In the example below, we look for PERSON_NAME, DATE,
PHONE_NUMBER, and so on. The cell also inspects a short sample string with these info types and prints
the findings. Once this cell has been run, go to the next cell for further instructions.

In [4]:
# Import the client library
import google.cloud.dlp

# Create the DLP service client used for the inspection request below.
dlp_client = google.cloud.dlp_v2.DlpServiceClient()

# Sample text to inspect, wrapped in the item structure passed to the API.
content = "Robert Frost"
item = {"value": content}

# Names of the info types to search for in the content. Required.
# Each name is wrapped in a {"name": ...} dictionary right after this tuple.
info_type_names = (
    "PERSON_NAME",
    "DATE",
    "PHONE_NUMBER",
    "STREET_ADDRESS",
    "LOCATION",
    "US_SOCIAL_SECURITY_NUMBER",
    "US_STATE",
    "AGE",
    "CREDIT_CARD_NUMBER",
    "DATE_OF_BIRTH",
    "GENERIC_ID",
    "IBAN_CODE",
    "ORGANIZATION_NAME",
    "TIME",
    "US_DRIVERS_LICENSE_NUMBER",
    "US_EMPLOYER_IDENTIFICATION_NUMBER",
    "US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER",
    "US_TOLLFREE_PHONE_NUMBER",
    "US_VEHICLE_IDENTIFICATION_NUMBER",
    "ICD9_CODE",
    "ICD10_CODE",
    "IMEI_HARDWARE_ID",
    "MAC_ADDRESS",
    "PASSPORT",
    "SWIFT_CODE",
    "AMERICAN_BANKERS_CUSIP_ID",
    "FDA_CODE",
    "US_BANK_ROUTING_MICR",
    "US_DEA_NUMBER",
    "US_HEALTHCARE_NPI",
    "US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER",
    "PASSWORD",
)
info_types = [{"name": info_type_name} for info_type_name in info_type_names]

# NOTE: a regex-based custom info type (name "C_MRN", pattern
# "[1-9]{3}-[1-9]{1}-[1-9]{5}", likelihood "POSSIBLE") was previously sketched
# here for a "custom_info_types" entry in inspect_config. It is not enabled.

# Optional request settings:
#   min_likelihood - the minimum likelihood to constitute a match
#   max_findings   - the maximum number of findings to report (0 = server maximum)
#   include_quote  - whether to include the matching string in the results
min_likelihood = "LIKELIHOOD_UNSPECIFIED"
max_findings = 0
include_quote = True

# Project that the request is issued against.
project_id = "mortgagedocscv"

# Assemble the inspection configuration. Keys which are None may
# optionally be omitted entirely.
inspect_config = dict(
    info_types=info_types,
    min_likelihood=min_likelihood,
    include_quote=include_quote,
    limits={"max_findings_per_request": max_findings},
)
print("inspect_config:",inspect_config)

# Expand the project id into the full resource id expected by the API.
parent = dlp_client.project_path(project_id)
print("parent:",parent)

# Send the inspection request.
response = dlp_client.inspect_content(parent, inspect_config, item)
print("response:",response)

# Report each finding, or say so when there are none.
findings = response.result.findings
if not findings:
    print("No findings.")
else:
    for finding in findings:
        # A finding without a quote attribute is reported without the quote line.
        try:
            quote = finding.quote
        except AttributeError:
            pass
        else:
            print("Quote: {}".format(quote))
        print("Info type: {}".format(finding.info_type.name))
        # Translate the numeric likelihood into its enum name via the
        # Finding message descriptor.
        likelihood_field = google.cloud.dlp.types.Finding.DESCRIPTOR.fields_by_name[
            "likelihood"
        ]
        likelihood = likelihood_field.enum_type.values_by_number[
            finding.likelihood
        ].name
        print("Likelihood: {}".format(likelihood))

inspect_config: {'info_types': [{'name': 'PERSON_NAME'}, {'name': 'DATE'}, {'name': 'PHONE_NUMBER'}, {'name': 'STREET_ADDRESS'}, {'name': 'LOCATION'}, {'name': 'US_SOCIAL_SECURITY_NUMBER'}, {'name': 'US_STATE'}, {'name': 'AGE'}, {'name': 'CREDIT_CARD_NUMBER'}, {'name': 'DATE_OF_BIRTH'}, {'name': 'GENERIC_ID'}, {'name': 'IBAN_CODE'}, {'name': 'ORGANIZATION_NAME'}, {'name': 'TIME'}, {'name': 'US_DRIVERS_LICENSE_NUMBER'}, {'name': 'US_EMPLOYER_IDENTIFICATION_NUMBER'}, {'name': 'US_INDIVIDUAL_TAXPAYER_IDENTIFICATION_NUMBER'}, {'name': 'US_TOLLFREE_PHONE_NUMBER'}, {'name': 'US_VEHICLE_IDENTIFICATION_NUMBER'}, {'name': 'ICD9_CODE'}, {'name': 'ICD10_CODE'}, {'name': 'IMEI_HARDWARE_ID'}, {'name': 'MAC_ADDRESS'}, {'name': 'PASSPORT'}, {'name': 'SWIFT_CODE'}, {'name': 'AMERICAN_BANKERS_CUSIP_ID'}, {'name': 'FDA_CODE'}, {'name': 'US_BANK_ROUTING_MICR'}, {'name': 'US_DEA_NUMBER'}, {'name': 'US_HEALTHCARE_NPI'}, {'name': 'US_PREPARER_TAXPAYER_IDENTIFICATION_NUMBER'}, {'name': 'PASSWORD'}], 'min_lik

The next cell defines the `redact_image` function, which is called in the cell after it.

In [157]:
import mimetypes



def redact_image(
    project, filename, output_filename, info_types, min_likelihood=None, mime_type=None
):
    """Uses the Data Loss Prevention API to redact protected data in an image.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to redact.
        output_filename: The path to which the redacted image will be written.
        info_types: The info types to look for, or None. Each entry may be
            either an info type name string (e.g. "PERSON_NAME") or an
            already-built dictionary such as {"name": "PERSON_NAME"}; name
            strings are wrapped into dictionaries before the request is built.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        mime_type: The MIME type of the file. If not specified, the type is
            inferred via the Python standard library's mimetypes module.
    Returns:
        None; the redacted image bytes are written to output_filename and a
        one-line summary is printed.
    """
    # Import the client library. dlp_v2 is imported explicitly because it is
    # the module referenced below.
    import google.cloud.dlp_v2

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Normalise info_types so callers may pass plain name strings as well as
    # dictionaries; non-string entries are passed through unchanged.
    if info_types is not None:
        info_types = [
            {"name": info_type} if isinstance(info_type, str) else info_type
            for info_type in info_types
        ]

    # Prepare image_redaction_configs, a list of dictionaries. Each dictionary
    # contains an info_type and optionally the color used for the replacement.
    # The color is omitted here, so the API default is used.
    image_redaction_configs = [
        {"info_type": info_type} for info_type in (info_types or [])
    ]

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {"min_likelihood": min_likelihood, "info_types": info_types}

    # If mime_type is not specified, guess it from the filename.
    if mime_type is None:
        mime_guess = mimetypes.MimeTypes().guess_type(filename)
        mime_type = mime_guess[0] or "application/octet-stream"

    # Select the content type index from the list of supported types.
    # Anything not listed falls back to 0 ("Unspecified").
    supported_content_types = {
        None: 0,  # "Unspecified"
        "image/jpeg": 1,
        "image/bmp": 2,
        "image/png": 3,
        "image/svg": 4,
        # Standard MIME name for SVG, which is what mimetypes guesses for .svg.
        "image/svg+xml": 4,
        "text/plain": 5,
    }
    content_type_index = supported_content_types.get(mime_type, 0)

    # Construct the byte_item, containing the file's byte data.
    with open(filename, mode="rb") as f:
        byte_item = {"type": content_type_index, "data": f.read()}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        inspect_config=inspect_config,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item,
    )

    # Write out the results.
    with open(output_filename, mode="wb") as f:
        f.write(response.redacted_image)
    print(
        "Wrote {byte_count} to {filename}".format(
            byte_count=len(response.redacted_image), filename=output_filename
        )
    )


Here, we call the `redact_image` function with four arguments:
1. the project id (`project_id`) that we set in the first cell of this notebook.
2. the path to the image which needs redacting.
3. the path where you want the redacted version of the image saved.
4. the `info_types` variable that we set in the first cell of this notebook.

In [158]:
# redact_image signature, for reference:
# project, filename, output_filename, info_types, min_likelihood=None, mime_type=None

# Image to redact and the destination for the redacted copy.
source_image_path = r"C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\bnmg_page1_test_3505_1.jpg"
redacted_image_path = r"C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\bnmg_page1_test_3505_1_redact.jpg"

# project_id and info_types come from the first code cell of this notebook.
redact_image(
    project=project_id,
    filename=source_image_path,
    output_filename=redacted_image_path,
    info_types=info_types,
)

Wrote 494926 to C:\Users\pwalsh\env-google-cloud\Lib\site-packages\images\bnmg_page1_test_3505_1_redact.jpg
