Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the SR anonymisation #1524

Merged
merged 23 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
17c4058
CTP_SRAnonTool.sh - accept -y default.yaml
Apr 19, 2023
17d3022
CTP_SRAnonTool.sh - if output file does not exist then copy input file
Apr 19, 2023
cecd7f4
Added modify_SR.py to randomly add real names into a DICOM SR
Apr 19, 2023
0489c9d
Added news
Apr 20, 2023
d59b341
StructuredReport - added a list of all tags which contain names (vr i…
Apr 20, 2023
865bbd3
DicomText - output a list of all names (tags where vr is PN) in the […
Apr 20, 2023
ceb7a9a
StructuredReport - handle a contentsequence value of dict { 'Alphabet…
Apr 20, 2023
5738ce5
StructuredReport - oops, call self._SR_parse_key()
Apr 20, 2023
871232a
StructuredReport - test a PersonName tag
Apr 20, 2023
479bdf1
Dicom - add 'atomic' parameter to tag_val() so it can extract text fr…
Apr 20, 2023
3febacb
CTP_DicomToText.py - decode SOPInstanceUID better when constructing f…
Apr 20, 2023
b3ede85
StructuredReport - output a set of [[Other Names]] headers for all th…
Apr 20, 2023
f801f55
StructuredReport - comment out print
Oct 24, 2023
401189b
Merge branch 'master' into feature/srmod
Oct 24, 2023
d647172
StructuredReport - find more Names, and add test
Oct 25, 2023
b4b128c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 25, 2023
a0c3da0
CTP_SRAnonTool.sh - don't override PYTHONPATH
Oct 25, 2023
ab79d36
CTP_SRAnonTool.sh - pass on custom semehr_dir option to semehr_anon
Oct 25, 2023
99095e5
DicomText.py - redact dates to 1111 not 9999 so that they validate ok
Oct 25, 2023
1a5a345
CTP_XMLToDicom.py - also takes args --replace-html and --replace-newl…
Oct 26, 2023
0c7e3ce
DicomText.py - refactor common code into function
Oct 26, 2023
623343f
DicomText.py - fix parsing headers
Oct 31, 2023
20f4d62
Ensure output file is writeable
Nov 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/1524-bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
StructuredReport improvements - collect names from anywhere in text body not just in header
5 changes: 3 additions & 2 deletions src/applications/SRAnonTool/CTP_DicomToText.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import re
from deepmerge import Merger # for deep merging yaml dictionaries
from SmiServices import Mongo
from SmiServices import Dicom
from SmiServices import DicomText
from SmiServices import StructuredReport as SR
from SmiServices import IdentifierMapper
Expand Down Expand Up @@ -81,10 +82,10 @@ def extract_mongojson(mongojson, output, metadata_output=None, DicomTextArgs = N
DicomTextArgs = {}

if os.path.isdir(output):
filename = mongojson['SOPInstanceUID'] + '.txt'
filename = Dicom.tag_val(mongojson,'SOPInstanceUID', atomic=True) + '.txt'
output = os.path.join(output, filename)
if metadata_output and os.path.isdir(metadata_output):
mfilename = mongojson['SOPInstanceUID'] + '.json'
mfilename = Dicom.tag_val(mongojson,'SOPInstanceUID', atomic=True) + '.json'
metadata_output = os.path.join(metadata_output, mfilename)
logging.info('Parse %s' % mongojson.get('header',{}).get('DicomFilePath','<NoFilePath?>'))
if 'PatientID' in mongojson:
Expand Down
32 changes: 21 additions & 11 deletions src/applications/SRAnonTool/CTP_SRAnonTool.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@

prog=$(basename "$0")
progdir=$(dirname "$0")
usage="usage: ${prog} [-d] [-v] [-e virtualenv] [-s semehr_root] -i read_from.dcm -o write_into.dcm"
options="dve:s:i:o:"
usage="usage: ${prog} [-d] [-v] [-e virtualenv] [-s semehr_root] [-y yaml] -i read_from.dcm -o write_into.dcm"
options="dve:s:y:i:o:"
semehr_dir="/opt/semehr"
virtenv=""
debug=0
Expand Down Expand Up @@ -58,14 +58,17 @@ tidy_exit()

# Default executable PATHs and Python libraries
export PATH=${PATH}:${SMI_ROOT}/bin:${SMI_ROOT}/scripts:${progdir}
export PYTHONPATH=${SMI_ROOT}/lib/python3:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib/python3.6/site-packages:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib64/python3.6/site-packages
if [ "$PYTHONPATH" == "" ]; then
export PYTHONPATH=${SMI_ROOT}/lib/python3:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib/python3.6/site-packages:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib64/python3.6/site-packages
fi

# Command line arguments
while getopts ${options} var; do
case $var in
d) debug=1;;
v) verbose=1;;
e) virtenv="$OPTARG";;
y) default_yaml0="$OPTARG";;
i) input_dcm="$OPTARG";;
o) output_dcm="$OPTARG";;
s) semehr_dir="$OPTARG";;
Expand All @@ -78,7 +81,9 @@ if [ ! -f "$input_dcm" ]; then
tidy_exit 2 "ERROR: cannot read input file '${input_dcm}'"
fi
if [ ! -f "$output_dcm" ]; then
tidy_exit 3 "ERROR: cannot write to ${output_dcm} because it must already exist"
#tidy_exit 3 "ERROR: cannot write to ${output_dcm} because it must already exist"
cp "$input_dcm" "$output_dcm"
chmod +w "$output_dcm"
fi

# Activate the virtual environment
Expand All @@ -91,15 +96,20 @@ if [ "$virtenv" != "" ]; then
fi
fi

# Find the config files
if [ -d $SMI_ROOT/configs ]; then
default_yaml0="$SMI_ROOT/configs/smi_dataLoad_mysql.yaml"
default_yaml1="$SMI_ROOT/configs/smi_dataExtract.yaml"
else
default_yaml0="${progdir}/../../../data/microserviceConfigs/default.yaml"
# Find the config files: if none was specified with -y, try the SMI defaults, otherwise fall back to the default in the repo
if [ "$default_yaml0" == "" ]; then
if [ -f "$SMI_ROOT/configs/smi_dataExtract.yaml" ]; then
default_yaml0="$SMI_ROOT/configs/smi_dataLoad_mysql.yaml"
default_yaml1="$SMI_ROOT/configs/smi_dataExtract.yaml"
else
default_yaml0="${progdir}/../../../data/microserviceConfigs/default.yaml"
fi
fi
if [ "$default_yaml1" == "" ]; then
default_yaml1="$default_yaml0"
fi


# ---------------------------------------------------------------------
# Determine the SemEHR filenames - create per-process directories
semehr_input_dir=$(mktemp -d -t input_docs.XXXX --tmpdir=${semehr_dir}/data)
Expand Down Expand Up @@ -132,7 +142,7 @@ CTP_DicomToText.py -y $default_yaml0 -y $default_yaml1 \
# Reads $input_doc
# Writes $anon_doc, and $anon_xml via the --xml option
#
semehr_anon.py -i "${input_doc}" -o "${anon_doc}" --xml || tidy_exit 5 "Error running SemEHR-anon given ${input_doc} from ${input_dcm}"
semehr_anon.py -s "${semehr_dir}" -i "${input_doc}" -o "${anon_doc}" --xml || tidy_exit 5 "Error running SemEHR-anon given ${input_doc} from ${input_dcm}"
# If there's still no XML file then exit
if [ ! -f "$anon_xml" ]; then
tidy_exit 6 "ERROR: SemEHR-anon failed to convert $input_doc to $anon_xml"
Expand Down
22 changes: 21 additions & 1 deletion src/applications/SRAnonTool/CTP_XMLToDicom.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
parser.add_argument('-i', dest='input_dcm', action="store", help='Path to raw DICOM file')
parser.add_argument('-x', dest='input_xml', action="store", help='Path to annotation XML file')
parser.add_argument('-o', dest='output_dcm', action="store", help='Path to anonymised DICOM file to have redacted text inserted')
parser.add_argument('--replace-html', action="store", help='replace HTML with a character, default is dot (.), or "squash" to eliminate')
parser.add_argument('--replace-newlines', action="store", help='replace carriage returns and newlines with a character (e.g. a space) or "squash" to eliminate')
args = parser.parse_args()
if not args.input_dcm or not args.input_xml or not args.output_dcm:
parser.print_help()
Expand Down Expand Up @@ -71,9 +73,27 @@
logging.error('ERROR: no such file named {} (redacted text is written into this so it must exist)'.format(args.output_dcm))
exit(1)

# ---------------------------------------------------------------------
# If the file is a DICOM then DicomText has options to change the output format.
# These are passed to the DicomText and StructuredReport constructors.
# Defaults handed to the DicomText constructor; other keys it understands
# (left at their own defaults here) include 'include_header' and
# 'replace_HTML_entities'.
DicomTextArgs = {
    'replace_HTML_char': '.',
    'replace_newline_char': '\n',
}
# Each command-line override replaces the matching default; the literal
# word "squash" means "replace with nothing". An option that was not given
# (None) or given as an empty string leaves the default untouched.
for cli_value, dicomtext_key in (
        (args.replace_html, 'replace_HTML_char'),
        (args.replace_newlines, 'replace_newline_char')):
    if cli_value:
        DicomTextArgs[dicomtext_key] = '' if cli_value == "squash" else cli_value

# ---------------------------------------------------------------------
# Read the original DICOM file and parse the original text
dicomtext = DicomText.DicomText(args.input_dcm)
dicomtext = DicomText.DicomText(args.input_dcm, **DicomTextArgs)
dicomtext.parse()

# Read the annotated XML file
Expand Down
34 changes: 34 additions & 0 deletions src/applications/SRAnonTool/test/modify_SR.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env python
# Read in report02.dcm and write report02mod.dcm
# with all instances of "The patient" replaced by one of the actual names
# found in the DICOM tags.

import random
import pydicom

ds = pydicom.dcmread('report02.dcm')

# Collect the distinct values of every PersonName (VR 'PN') element;
# iterall() also walks into nested sequences.
names = {str(elem.value) for elem in ds.iterall() if elem.VR == 'PN'}

# Split each name into its '^'-separated components (family name complex,
# given name complex, middle name, name prefix, name suffix) and then into
# words, keeping only words longer than 3 characters.
name_parts = sorted({
    word
    for name in names
    for component in name.split('^')
    for word in component.split(' ')
    if len(word) > 3
})
print(name_parts)

# Without at least one usable name there is nothing to insert, so stop with
# a clear message instead of an opaque ValueError from the random module.
if not name_parts:
    raise SystemExit('ERROR: no names longer than 3 characters found in report02.dcm')

print(random.choice(name_parts))

for elem in ds.iterall():
    original_value = elem.value
    # Only rewrite plain single-valued text. Sequences, multi-valued and
    # binary elements would otherwise be overwritten by the string form
    # of their value.
    if not isinstance(original_value, str):
        continue
    new_value = original_value
    # One randomly chosen name per phrase variant per element.
    for phrase in ('The patient', 'the patient'):
        if phrase in new_value:
            new_value = new_value.replace(phrase, random.choice(name_parts))
    if new_value != original_value:
        elem.value = new_value

ds.save_as('report02mod.dcm')
10 changes: 5 additions & 5 deletions src/common/Smi_Common_Python/SmiServices/Dicom.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def tag_alt(tag):
alt = '{:0>8X}'.format(pydicom.datadict.tag_for_keyword(tag))
return(alt)

def tag_val(dicomdict, tagname):
def tag_val(dicomdict, tagname, atomic = False):
""" Look up dicomdict['tagname']
where tagname can be a hex string or a name string
and the dicomdict can hold either the hex string or the name
Expand All @@ -48,7 +48,7 @@ def tag_val(dicomdict, tagname):
retval = dicomdict[tagname]
elif alt_tagname in dicomdict:
retval = dicomdict[alt_tagname]
# The dcm2jsom or pydicom style has 'vr' and 'Value' keys
# The dcm2json or pydicom style has 'vr' and 'Value' keys
# so extract the Value (also sometimes has vr but no Value).
if isinstance(retval, Mapping):
if 'vr' in retval:
Expand All @@ -57,9 +57,9 @@ def tag_val(dicomdict, tagname):
val = retval.get('val', '') # but I've also seen val
retval = val
# Single element list reduced to just the first element
# but doing this breaks the assertions below.
#if isinstance(retval, list) and len(retval)==1:
# retval = retval[0]
# only if you explicitly request this with atomic=True.
if isinstance(retval, list) and len(retval)==1 and atomic:
retval = retval[0]
return(retval)

def tag_is(tagA, tagB):
Expand Down
73 changes: 52 additions & 21 deletions src/common/Smi_Common_Python/SmiServices/DicomText.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pydicom
import re
import random
from SmiServices import Dicom
from SmiServices.StructuredReport import sr_keys_to_extract, sr_keys_to_ignore
from SmiServices.StringUtils import string_match_ignore_linebreak, redact_html_tags_in_string

Expand Down Expand Up @@ -124,29 +125,54 @@ def tag(self, tagname):
else:
return ''

def _dataset_read_callback(self, dataset, data_element):
""" Internal function called during a walk of the dataset.
Builds a class-member string _p_text as it goes.
def list_of_PNAMEs(self):
    """ Return a sorted list of the distinct, non-empty values (as strings)
    of all tags with a VR of PN found anywhere in the DICOM dataset,
    including inside nested sequences.
    The list is sorted so that the result (and anything built from it)
    is the same on every run; the iteration order of a set of strings
    can differ between interpreter runs because of hash randomisation.
    """
    names = {
        str(elem.value)
        for elem in self._dicom_raw.iterall()
        if elem.VR == 'PN' and str(elem.value)
    }
    return sorted(names)

def _data_element_parser(self, data_element):
""" Internal function called by the parse and redact callbacks
to consistently convert the data_element into the string which
will be returned, in both raw and html-redacted versions.
Returns the tuple (rc, rc_parsed).
If html redaction is disabled then rc_parsed == rc.
"""
rc = ''
rc_parsed = ''
if data_element.VR in ['SH', 'CS', 'SQ', 'UI']:
# "SH" Short String, "CS" Code String, "SQ" Sequence, "UI" UID ignored
pass
elif data_element.VR == 'LO':
# "LO" Long String typically used for headings
rc = rc + ('# %s' % str(data_element.value)) + '\n'
rc = rc_parsed = rc + ('# %s' % str(data_element.value)) + '\n'
else:
rc = rc + ('%s' % (str(data_element.value)))
rc += '\n'
# Replace HTML tags with spaces
if self._replace_HTML_entities and '<' in rc:
rc = redact_html_tags_in_string(rc,
rc_parsed = redact_html_tags_in_string(rc,
replace_char = self._replace_HTML_char,
replace_newline = self._replace_newline_char)
rc += '\n'
if rc == '':
else:
rc_parsed = rc
return (rc, rc_parsed)

def _dataset_read_callback(self, dataset, data_element):
""" Internal function called during a walk of the dataset.
Builds a class-member string _p_text as it goes.
"""
rc, rc_parsed = self._data_element_parser(data_element)
if rc_parsed == '':
return
self._offset_list.append( { 'offset':len(self._p_text), 'string': rc} )
self._p_text = self._p_text + rc
self._offset_list.append( {
'offset':len(self._p_text),
'string': rc_parsed
} )
self._p_text = self._p_text + rc_parsed

def parse(self):
""" Walk the dataset to extract the text which can then be
Expand All @@ -157,10 +183,16 @@ def parse(self):
# except explicitly do not include TextValue, handled below
list_of_tagname_desired = [ k['tag'] for k in sr_keys_to_extract ]
if self._include_header:
# Add all the known [[something]] headers
for srkey in sr_keys_to_extract:
if srkey['tag'] in self._dicom_raw and srkey['tag'] != 'TextValue':
line = '[[%s]] %s\n' % (srkey['label'], srkey['decode_func'](str(self._dicom_raw[srkey['tag']].value)))
self._p_text = self._p_text + line
# Collect all names in the whole document and add [[Other Names]] header
names_list = self.list_of_PNAMEs()
for name in names_list:
line = '[[Other Names]] %s\n' % Dicom.sr_decode_PNAME(name)
self._p_text = self._p_text + line
# Now read ALL tags and use a blacklist (and ignore already done in whitelist).
# Private tags will have tagname='' so ignore those too.
if self._include_header:
Expand Down Expand Up @@ -210,6 +242,10 @@ def redact_string(self, plaintext, offset, rlen, VR):
redact_char = DicomText._redact_char_digit
if DicomText._redact_random_length:
redact_length = random.randint(-int(rlen/2), int(rlen/2))
# Replace all dates with 11111111 so that they validate OK
if VR in ['DA', 'DT']:
redact_length = 8
redact_char = '1'
redacted_part = redact_char.rjust(redact_length, redact_char) if redact_char else ''
rc = plaintext[0:offset] + redacted_part + plaintext[offset+rlen:]
return rc
Expand All @@ -219,22 +255,17 @@ def _dataset_redact_callback(self, dataset, data_element):
Builds a class-member string _r_text as it goes.
Uses the annotation list in self._annotations to redact text.
"""

rc = ''
if data_element.VR in ['SH', 'CS', 'SQ', 'UI']:
pass
elif data_element.VR == 'LO':
rc = rc + ('# %s' % str(data_element.value)) + '\n'
else:
rc = rc + ('%s' % (str(data_element.value))) + '\n'
if rc == '':
rc, rc_parsed = self._data_element_parser(data_element)
if rc_parsed == '':
return
rc_without_html = rc_parsed
# The current string is now len(self._r_text) ..to.. +len(rc)
current_start = len(self._r_text)
current_end = current_start + len(rc)
replacement = rc
replacedAny = False
#print('At %d = %s' % (current_start, str(data_element.value)))
# Check every annotation to see, if not already done, if it appears in this rc
for annot in self._annotations:
# Sometimes it reports text:None so ignore
if not annot['text'] or (annot['start_char'] == annot['end_char']):
Expand All @@ -252,9 +283,6 @@ def _dataset_redact_callback(self, dataset, data_element):
# SemEHR may have an extra line at the start so start_char offset need adjusting
for offset in [self._redact_offset] + list(range(-32, 32)):
# Do the comparison using text without html but replace inside text with html
rc_without_html = redact_html_tags_in_string(rc,
replace_char = self._replace_HTML_char,
replace_newline = self._replace_newline_char) if self._replace_HTML_entities else rc
if string_match_ignore_linebreak(rc_without_html[annot_at+offset : annot_end+offset], annot['text']):
replacement = self.redact_string(replacement, annot_at+offset, annot_end-annot_at, data_element.VR)
replaced = replacedAny = True
Expand All @@ -270,8 +298,10 @@ def _dataset_redact_callback(self, dataset, data_element):
# Always fully redact the content of PersonName and Date tags
replacement = self.redact_string(rc, 0, len(rc), data_element.VR)
replacedAny = True
# Put this replacement value back into the DICOM
if replacedAny:
data_element.value = replacement
# _r_text is the original, _redacted_text has been redacted
self._r_text = self._r_text + rc
self._redacted_text = self._redacted_text + replacement
return replacement if replacedAny else None
Expand Down Expand Up @@ -401,6 +431,7 @@ def test_DicomText():
[[Patient Birth Date]] 19781024
[[Patient Sex]] M
[[Referring Physician Name]]
[[Other Names]] John R Walz
[[ContentSequence]]
# Request
MRI: Knee
Expand Down
Loading
Loading