In [None]:
"""
    Anonymouus pseudonymizes files by translating existing pseudocodes into new ones. It looks for, and replaces,
    codes that match a specific pattern in the content of text-based file types, and in their filenames, including
    the file path. Besides replacing pseudocodes, the program can also replace date- or date/time-strings. The
    program does not alter original files, but instead makes pseudonymized copies.
    
    Pseudocodes are recognized by matching strings to a regular expression. Date(time)-strings are matched against
    a list of regular expressions, representing different possible date(time)-formats.
    
    Anonymouus can retrieve the relationship between old and new pseudocodes from an existing keyfile, or create
    entirely new codes on the fly, and store them in a new keyfile.
    
    Supported filetypes:
    - Text-based files (txt, html, htm, xml, json). Anonymouus looks for matches anywhere in the file.
    - Spreadsheets (csv, odf, odt, ods). Idem, plus the possibility of specifying in which columns to look.
      Additionally, you can specify columns that should be completely excluded from the output.
    - Archives (zip, gz, gzip). These are decompressed, pseudonymized, and then re-compressed.

    
    Possible pseudonymization scenarios:
    
    1. Generate new pseudocodes on the fly, and write them to a new keyfile. Pseudocodes will be unique within 
       a session, while being consistent across different files within the same session.
    2. Use an existing keyfile that maps existing pseudocodes to new ones. This can be a keyfile from a previous
       session following scenario #1, or be a keyfile from another origin (handmade). Only pseudocodes that exist
       in the keyfile are replaced; pseudocodes that are not in it remain unchanged in the pseudonymized files.
    3. Hybrid: as 2), but missing pseudocodes are replaced on the fly (NOT YET IMPLEMENTED)
    
    
    Pseudonymizing fields other than pseudocodes: if you can regex it, you can pseudonymize it.
    
"""

In [21]:
import json
import os
import re
import sys
import uuid
from pathlib import Path
sys.path.append('..')
from anonymouus import Anonymize
from utils import get_logger

In [22]:
# Source data to read and destination for the pseudonymized copies.
input_folder = "/data/youth/data_test/"
output_folder = "/data/youth/pseudonimized_auto/"

# Path of the key file (translation table between old and new pseudocodes).
result_keyfile = "/data/youth/translation_table_both.csv"

# Regular expression recognizing an existing pseudocode: "A" or "B" followed by
# five digits, not directly preceded or followed by a letter or digit
# (an underscore or any non-word character is accepted as a delimiter).
pattern = r"(?<![^\W_])(A|B)(\d){5}(?![^\W_])"

# Identifier handed to Anonymize for this run.
session_id = "test1"

In [23]:
# Create the substitution mapper, giving it a logger named after the class.
# NOTE(review): DynamicSubstitution is not among the imports visible in this
# notebook - confirm which module provides it and import it before running.
mapper = DynamicSubstitution(
    logger=get_logger(name="DynamicSubstitution"),
)

In [24]:
# Switch: when True, install a custom pseudocode generator on the mapper.
use_custom_pseudocode = True

if use_custom_pseudocode:

    pseudocode_length = 8
    pseudocode_prefix = 'p_'

    class PseudoCodeCreator:
        """Produce prefixed random pseudocodes that never repeat per instance.

        Each code is ``prefix`` followed by the first ``code_len`` characters
        of the string form of a random UUID4. For ``code_len`` above 8 that
        slice includes the UUID's hyphens.
        """

        def __init__(self, code_len: int, prefix: str = "") -> None:
            """Store the code layout and start with no issued codes.

            Args:
                code_len: Number of UUID characters appended to the prefix.
                prefix: Text placed in front of every generated code.

            Raises:
                ValueError: If ``code_len`` is smaller than 1. With a length
                    of 0 every code would equal the prefix, so the second
                    call to ``get_code`` would never terminate.
            """
            if code_len < 1:
                raise ValueError(f"code_len must be at least 1, got {code_len}")
            self.code_len = code_len
            self.prefix = prefix
            # A set keeps the uniqueness check in get_code O(1) per lookup
            # instead of scanning a growing list.
            self.codes = set()

        def generate_code(self) -> str:
            """Return one candidate code; uniqueness is not checked here."""
            return f"{self.prefix}{str(uuid.uuid4())[:self.code_len]}"

        def get_code(self, x: str) -> str:
            """Return a code not handed out before by this instance.

            Args:
                x: Ignored; presumably the original value being replaced,
                    supplied by the caller of the code generator.
            """
            code = self.generate_code()
            # Redraw on the (unlikely) collision with an already issued code.
            while code in self.codes:
                code = self.generate_code()
            self.codes.add(code)
            return code


    creator = PseudoCodeCreator(pseudocode_length, pseudocode_prefix)
    mapper.set_code_generator(creator.get_code)

In [25]:
# Configure the anonymizer: match `pattern` case-insensitively and let the
# mapper's `subtitute` callable (spelled as the mapper exposes it) supply the
# replacement for each match.
# NOTE(review): the recorded run of this notebook ended with
# "TypeError: subtitute() got an unexpected keyword argument 'session_id'";
# confirm the installed mapper accepts the session id before relying on this.
anon = Anonymize(
    pattern=pattern,
    flags=re.IGNORECASE,
    mapping=mapper.subtitute,
    session_id=session_id,
    use_word_boundaries=False,
)

In [26]:
# anon.set_cols_pseudonymize(["Pseudocode", "SubjectNo"])
# anon.set_cols_exclude(["IP Address", "Date", "Date last action", "Date started", "Date submitted", "End time", "Primary assistant", "Secondary assistant", "Start time", "AGE", "DATE_DT", "DateLastSavedByUser", "DOB_DT", "FilledFormID", "InclusionDate", "TODAY.._DAT", "TODAY_DT", "TREATMENT_HOSP_NAME", "TREATMENT_HOSP_PLACE"])
# anon.set_cols_case_sensitive(False)
# anon.set_spread_sheets_pseudonymize(["Sheet 1"])
# anon.set_spread_sheets_exclude(["Sheet 2"])

In [27]:
# Process everything under input_folder, writing the results to output_folder.
anon.substitute(
    Path(input_folder),
    Path(output_folder),
)

TypeError: subtitute() got an unexpected keyword argument 'session_id'

In [None]:


# sys.tracebacklimit = 0

# argparse is not imported anywhere else in the visible file; import it here
# so building the parser does not fail with a NameError.
import argparse

# Command-line interface of the pseudonymizer. Input folder, output folder and
# session id are mandatory; the remaining options are validated further below.
parser = argparse.ArgumentParser(description='Pseudonymizer')
parser.add_argument("--config-file", type=str)
parser.add_argument("--input-folder", type=str, required=True)
parser.add_argument("--output-folder", type=str, required=True)
parser.add_argument("--mapping-file", type=str)
parser.add_argument("--mapping-result-file", type=str)
parser.add_argument("--log-file", type=str)
parser.add_argument("--session-id", type=str, required=True)

# parser.add_argument("--mapping-only",action='store_true')
args = parser.parse_args()

# Fail fast on unusable arguments before any processing starts.

# The configuration file is optional, but must exist when given.
if args.config_file:
    if not os.path.exists(args.config_file):
        raise FileNotFoundError(f"Configuration file not found: {args.config_file}")

# The input location is mandatory and must exist.
if not os.path.exists(args.input_folder):
    raise FileNotFoundError(f"Input folder not found: {args.input_folder}")

# At least one of the two mapping options has to be supplied.
if not (args.mapping_file or args.mapping_result_file):
    raise ValueError("Need either a mapping file (pre-defined mapping) or a mapping result file (output for ad hoc generated mapping)")

# A pre-defined mapping file must exist when given.
if args.mapping_file:
    if not os.path.exists(args.mapping_file):
        raise FileNotFoundError(f"Mapping file not found: {args.mapping_file}")

# Load the optional JSON configuration; cfg stays None when no file was given.
if args.config_file:
    print(f"Reading config file '{args.config_file}'")
    # The context manager closes the file even when json.load raises on
    # malformed content. JSON text is read as UTF-8 rather than the
    # platform's default encoding.
    with open(args.config_file, encoding="utf-8") as f:
        cfg = json.load(f)
else:
    cfg = None

if args.mapping_file:

    # Pre-defined mapping: hand the mapping file straight to Anonymize.
    anon = Anonymize(
        mapping=args.mapping_file,
        log_file=args.log_file,
        use_word_boundaries=False,
        session_id=args.session_id
    )

else:

    # No mapping file: generate new pseudocodes on the fly through a mapper.
    mapper = DynamicSubstitution(log_file=args.log_file)

    import uuid

    class PseudoCreator:
        """Produce prefixed random pseudocodes that never repeat per instance.

        Each code is ``prefix`` followed by the first ``code_len`` characters
        of the string form of a random UUID4. For ``code_len`` above 8 that
        slice includes the UUID's hyphens.
        """

        def __init__(self, code_len: int, prefix: str = "") -> None:
            """Store the code layout and start with no issued codes.

            Args:
                code_len: Number of UUID characters appended to the prefix.
                prefix: Text placed in front of every generated code.

            Raises:
                ValueError: If ``code_len`` is smaller than 1. With a length
                    of 0 every code would equal the prefix, so the second
                    call to ``get_code`` would never terminate.
            """
            if code_len < 1:
                raise ValueError(f"code_len must be at least 1, got {code_len}")
            self.code_len = code_len
            self.prefix = prefix
            # A set keeps the uniqueness check in get_code O(1) per lookup
            # instead of scanning a growing list.
            self.codes = set()

        def generate_code(self) -> str:
            """Return one candidate code; uniqueness is not checked here."""
            return f"{self.prefix}{str(uuid.uuid4())[:self.code_len]}"

        def get_code(self, x: str) -> str:
            """Return a code not handed out before by this instance.

            Args:
                x: Ignored; presumably the original value being replaced,
                    supplied by the caller of the code generator.
            """
            code = self.generate_code()
            # Redraw on the (unlikely) collision with an already issued code.
            while code in self.codes:
                code = self.generate_code()
            self.codes.add(code)
            return code


    creator = PseudoCreator(8, "psd_")
    mapper.set_code_generator(creator.get_code)

    if cfg and "pattern" in cfg:
        # A pattern from the configuration file takes precedence. Previously
        # it was read and then unconditionally overwritten by the default.
        pattern = cfg["pattern"]
    else:
        # Default: "A" or "B" followed by five digits, delimited by anything
        # that is not a letter or digit. A plain \b...\b pattern is not used
        # because \b treats the underscore as part of a word, so codes
        # delimited by underscores would not match.
        pattern = r'(?<![^\W_])(A|B)(\d){5}(?![^\W_])'

    anon = Anonymize(
        flags=re.IGNORECASE,
        pattern=pattern,
        mapping=mapper.subtitute,
        use_word_boundaries=False,
        session_id=args.session_id,
        log_file=args.log_file
    )

# Forward the optional entries of the configuration's "settings" section to
# the anonymizer; entries that are absent are skipped.
if cfg:

    if "settings" in cfg:
        settings = cfg["settings"]

        # Column selection.
        if "cols_pseudonymize" in settings:
            anon.set_cols_pseudonymize(settings["cols_pseudonymize"])

        if "cols_exclude" in settings:
            anon.set_cols_exclude(settings["cols_exclude"])

        # The configuration value is compared to 1 to obtain the boolean.
        if "cols_case_sensitive" in settings:
            anon.set_cols_case_sensitive(settings["cols_case_sensitive"] == 1)

        # Sheet selection for spreadsheets.
        if "spreadsheet_sheets_pseudonymize" in settings:
            anon.set_spread_sheets_pseudonymize(settings["spreadsheet_sheets_pseudonymize"])

        if "spreadsheet_sheets_exclude" in settings:
            anon.set_spread_sheets_exclude(settings["spreadsheet_sheets_exclude"])

# Run the substitution from the input folder into the output folder.
anon.substitute(
    Path(args.input_folder),
    Path(args.output_folder),
)

# `mapper` is only bound by the branch that generates codes on the fly; when
# it exists, persist its translation table to the requested result file.
if "mapper" in locals():
    mapper.write_translation_table(args.mapping_result_file)

# anon.set_cols_match_style('starts_with')
