SMI · jas88 · Nov 20, 2023 · Apr 19, 2023 · Apr 19, 2023 · Apr 19, 2023
diff --git a/news/1524-bugfix.md b/news/1524-bugfix.md
@@ -0,0 +1 @@
+StructuredReport improvements - collect names from anywhere in text body not just in header
diff --git a/src/applications/SRAnonTool/CTP_DicomToText.py b/src/applications/SRAnonTool/CTP_DicomToText.py
@@ -29,6 +29,7 @@
 import re
 from deepmerge import Merger    # for deep merging yaml dictionaries
 from SmiServices import Mongo
+from SmiServices import Dicom
 from SmiServices import DicomText
 from SmiServices import StructuredReport as SR
 from SmiServices import IdentifierMapper
@@ -81,10 +82,10 @@ def extract_mongojson(mongojson, output, metadata_output=None, DicomTextArgs = N
         DicomTextArgs = {}
 
     if os.path.isdir(output):
-        filename = mongojson['SOPInstanceUID'] + '.txt'
+        filename = Dicom.tag_val(mongojson,'SOPInstanceUID', atomic=True) + '.txt'
         output = os.path.join(output, filename)
     if metadata_output and os.path.isdir(metadata_output):
-        mfilename = mongojson['SOPInstanceUID'] + '.json'
+        mfilename = Dicom.tag_val(mongojson,'SOPInstanceUID', atomic=True) + '.json'
         metadata_output = os.path.join(metadata_output, mfilename)
     logging.info('Parse %s' % mongojson.get('header',{}).get('DicomFilePath','<NoFilePath?>'))
     if 'PatientID' in mongojson:

diff --git a/src/applications/SRAnonTool/CTP_SRAnonTool.sh b/src/applications/SRAnonTool/CTP_SRAnonTool.sh
@@ -10,8 +10,8 @@
 
 prog=$(basename "$0")
 progdir=$(dirname "$0")
-usage="usage: ${prog} [-d] [-v] [-e virtualenv] [-s semehr_root] -i read_from.dcm  -o write_into.dcm"
-options="dve:s:i:o:"
+usage="usage: ${prog} [-d] [-v] [-e virtualenv] [-s semehr_root] [-y yaml] -i read_from.dcm  -o write_into.dcm"
+options="dve:s:y:i:o:"
 semehr_dir="/opt/semehr"
 virtenv=""
 debug=0
@@ -58,14 +58,17 @@ tidy_exit()
 
 # Default executable PATHs and Python libraries
 export PATH=${PATH}:${SMI_ROOT}/bin:${SMI_ROOT}/scripts:${progdir}
-export PYTHONPATH=${SMI_ROOT}/lib/python3:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib/python3.6/site-packages:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib64/python3.6/site-packages
+if [ "$PYTHONPATH" == "" ]; then
+	export PYTHONPATH=${SMI_ROOT}/lib/python3:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib/python3.6/site-packages:${SMI_ROOT}/lib/python3/virtualenvs/semehr/$(hostname -s)/lib64/python3.6/site-packages
+fi
 
 # Command line arguments
 while getopts ${options} var; do
 case $var in
 	d) debug=1;;
 	v) verbose=1;;
 	e) virtenv="$OPTARG";;
+	y) default_yaml0="$OPTARG";;
 	i) input_dcm="$OPTARG";;
 	o) output_dcm="$OPTARG";;
 	s) semehr_dir="$OPTARG";;
@@ -78,7 +81,9 @@ if [ ! -f "$input_dcm" ]; then
 	tidy_exit 2 "ERROR: cannot read input file '${input_dcm}'"
 fi
 if [ ! -f "$output_dcm" ]; then
-	tidy_exit 3 "ERROR: cannot write to ${output_dcm} because it must already exist"
+	#tidy_exit 3 "ERROR: cannot write to ${output_dcm} because it must already exist"
+	cp "$input_dcm" "$output_dcm"
+	chmod +w "$output_dcm"
 fi
 
 # Activate the virtual environment
@@ -91,15 +96,20 @@ if [ "$virtenv" != "" ]; then
 	fi
 fi
 
-# Find the config files
-if [ -d $SMI_ROOT/configs ]; then
-	default_yaml0="$SMI_ROOT/configs/smi_dataLoad_mysql.yaml"
-	default_yaml1="$SMI_ROOT/configs/smi_dataExtract.yaml"
-else
-	default_yaml0="${progdir}/../../../data/microserviceConfigs/default.yaml"
+# Find the config files, if not specified try SMI defaults otherwise in the repo
+if [ "$default_yaml0" == "" ]; then
+	if [ -f "$SMI_ROOT/configs/smi_dataExtract.yaml" ]; then
+		default_yaml0="$SMI_ROOT/configs/smi_dataLoad_mysql.yaml"
+		default_yaml1="$SMI_ROOT/configs/smi_dataExtract.yaml"
+	else
+		default_yaml0="${progdir}/../../../data/microserviceConfigs/default.yaml"
+	fi
+fi
+if [ "$default_yaml1" == "" ]; then
 	default_yaml1="$default_yaml0"
 fi
 
+
 # ---------------------------------------------------------------------
 # Determine the SemEHR filenames - create per-process directories
 semehr_input_dir=$(mktemp  -d -t input_docs.XXXX --tmpdir=${semehr_dir}/data)
@@ -132,7 +142,7 @@ CTP_DicomToText.py  -y $default_yaml0 -y $default_yaml1 \
 #  Reads  $input_doc
 #  Writes $anon_doc, and $anon_xml via the --xml option
 #
-semehr_anon.py -i "${input_doc}" -o "${anon_doc}" --xml || tidy_exit 5 "Error running SemEHR-anon given ${input_doc} from ${input_dcm}"
+semehr_anon.py -s "${semehr_dir}" -i "${input_doc}" -o "${anon_doc}" --xml || tidy_exit 5 "Error running SemEHR-anon given ${input_doc} from ${input_dcm}"
 # If there's still no XML file then exit
 if [ ! -f "$anon_xml" ]; then
 	tidy_exit 6 "ERROR: SemEHR-anon failed to convert $input_doc to $anon_xml"

diff --git a/src/applications/SRAnonTool/CTP_XMLToDicom.py b/src/applications/SRAnonTool/CTP_XMLToDicom.py
@@ -35,6 +35,8 @@
     parser.add_argument('-i', dest='input_dcm', action="store", help='Path to raw DICOM file')
     parser.add_argument('-x', dest='input_xml', action="store", help='Path to annotation XML file')
     parser.add_argument('-o', dest='output_dcm', action="store", help='Path to anonymised DICOM file to have redacted text inserted')
+    parser.add_argument('--replace-html', action="store", help='replace HTML with a character, default is dot (.), or "squash" to eliminate')
+    parser.add_argument('--replace-newlines', action="store", help='replace carriage returns and newlines with a character (e.g. a space) or "squash" to eliminate')
     args = parser.parse_args()
     if not args.input_dcm or not args.input_xml or not args.output_dcm:
         parser.print_help()
@@ -71,9 +73,27 @@
         logging.error('ERROR: no such file named {} (redacted text is written into this so it must exist)'.format(args.output_dcm))
         exit(1)
 
+    # ---------------------------------------------------------------------
+    # If the file is a DICOM then DicomText has options to change the output format.
+    # These are passed to the DicomText and StructuredReport constructors.
+    DicomTextArgs = {
+        #'include_header' : True,
+        #'replace_HTML_entities' : True,
+        'replace_HTML_char' : '.',
+        'replace_newline_char' : '\n'
+    }
+    if args.replace_html:
+        DicomTextArgs['replace_HTML_char'] = args.replace_html
+        if args.replace_html == "squash":
+            DicomTextArgs['replace_HTML_char'] = ''
+    if args.replace_newlines:
+        DicomTextArgs['replace_newline_char'] = args.replace_newlines
+        if args.replace_newlines == "squash":
+            DicomTextArgs['replace_newline_char'] = ''
+
     # ---------------------------------------------------------------------
     # Read the original DICOM file and parse the original text
-    dicomtext = DicomText.DicomText(args.input_dcm)
+    dicomtext = DicomText.DicomText(args.input_dcm, **DicomTextArgs)
     dicomtext.parse()
 
     # Read the annotated XML file

diff --git a/src/applications/SRAnonTool/test/modify_SR.py b/src/applications/SRAnonTool/test/modify_SR.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# Read in report02.dcm and write report02mod.dcm
+# with all instances of "The patient" replaced by one of the actual names
+# found in the DICOM tags.
+
+import random
+import sys
+
+import pydicom
+
+ds = pydicom.dcmread('report02.dcm')
+
+# Collect the distinct values of every element with a VR of PN (person name)
+names = {str(elem.value) for elem in ds.iterall() if elem.VR == 'PN'}
+
+name_parts = set()
+for name in names:
+    # family name complex, given name complex, middle name, name prefix, name suffix
+    for part in name.split('^'):
+        for s in part.split(' '):
+            if len(s) > 3:
+                name_parts.add(s)
+name_parts = list(name_parts)
+print(name_parts)
+
+# Picking a random name from an empty list raises, so stop with a clear message
+if not name_parts:
+    sys.exit('ERROR: no name parts longer than 3 characters found in report02.dcm')
+
+print(random.choice(name_parts))
+
+for elem in ds.iterall():
+    # Only rewrite values which are plain text. A sequence element's str()
+    # includes the text of its children, but those children are visited
+    # (and rewritten) individually by iterall.
+    if not isinstance(elem.value, str):
+        continue
+    for phrase in ('The patient', 'the patient'):
+        if phrase in elem.value:
+            elem.value = elem.value.replace(phrase, random.choice(name_parts))
+
+ds.save_as('report02mod.dcm')
diff --git a/src/common/Smi_Common_Python/SmiServices/Dicom.py b/src/common/Smi_Common_Python/SmiServices/Dicom.py
@@ -35,7 +35,7 @@ def tag_alt(tag):
     alt = '{:0>8X}'.format(pydicom.datadict.tag_for_keyword(tag))
     return(alt)
 
-def tag_val(dicomdict, tagname):
+def tag_val(dicomdict, tagname, atomic = False):
     """ Look up dicomdict['tagname']
       where tagname can be a hex string or a name string
       and the dicomdict can hold either the hex string or the name
@@ -48,7 +48,7 @@ def tag_val(dicomdict, tagname):
         retval = dicomdict[tagname]
     elif alt_tagname in dicomdict:
         retval = dicomdict[alt_tagname]
-    # The dcm2jsom or pydicom style has 'vr' and 'Value' keys
+    # The dcm2json or pydicom style has 'vr' and 'Value' keys
     # so extract the Value (also sometimes has vr but no Value).
     if isinstance(retval, Mapping):
         if 'vr' in retval:
@@ -57,9 +57,9 @@ def tag_val(dicomdict, tagname):
                 val = retval.get('val', '') # but I've also seen val
             retval = val
     # Single element list reduced to just the first element
-    # but doing this breaks the assertions below.
-    #if isinstance(retval, list) and len(retval)==1:
-    #	retval = retval[0]
+    # only if you explicitly request this with atomic=True.
+    if isinstance(retval, list) and len(retval)==1 and atomic:
+    	retval = retval[0]
     return(retval)
 
 def tag_is(tagA, tagB):

diff --git a/src/common/Smi_Common_Python/SmiServices/DicomText.py b/src/common/Smi_Common_Python/SmiServices/DicomText.py
@@ -5,6 +5,7 @@
 import pydicom
 import re
 import random
+from SmiServices import Dicom
 from SmiServices.StructuredReport import sr_keys_to_extract, sr_keys_to_ignore
 from SmiServices.StringUtils import string_match_ignore_linebreak, redact_html_tags_in_string
 
@@ -124,29 +125,54 @@ def tag(self, tagname):
         else:
             return ''
 
-    def _dataset_read_callback(self, dataset, data_element):
-        """ Internal function called during a walk of the dataset.
-        Builds a class-member string _p_text as it goes.
+    def list_of_PNAMEs(self):
+        """ Walk the raw DICOM with iterall() and return the distinct
+        non-empty values of all tags with a VR of PN, as a list of strings.
+        """
+        found = [str(elem.value) for elem in self._dicom_raw.iterall()
+                 if elem.VR == 'PN']
+        # Drop empty names, and use a set to remove duplicates
+        return list({text for text in found if len(text)})
+
+    def _data_element_parser(self, data_element):
+        """ Internal function called by the parse and redact callbacks
+        to consistently convert the data_element into text.
+        Returns the tuple (rc, rc_parsed), both '' if the VR is ignored:
+        rc is the raw text and rc_parsed has any HTML tags redacted.
+        If html redaction is disabled then rc_parsed == rc.
         """
         rc = ''
+        rc_parsed = ''
         if data_element.VR in ['SH', 'CS', 'SQ', 'UI']:
             # "SH" Short String, "CS" Code String, "SQ" Sequence, "UI" UID ignored
             pass
         elif data_element.VR == 'LO':
             # "LO" Long String typically used for headings
-            rc = rc + ('# %s' % str(data_element.value)) + '\n'
+            rc = rc_parsed = rc + ('# %s' % str(data_element.value)) + '\n'
         else:
             rc = rc + ('%s' % (str(data_element.value)))
+            rc += '\n'
             # Replace HTML tags with spaces
             if self._replace_HTML_entities and '<' in rc:
-                rc = redact_html_tags_in_string(rc,
+                rc_parsed = redact_html_tags_in_string(rc,
                     replace_char = self._replace_HTML_char,
                     replace_newline = self._replace_newline_char)
-            rc += '\n'
-        if rc == '':
+            else:
+                rc_parsed = rc
+        return (rc, rc_parsed)
+
+    def _dataset_read_callback(self, dataset, data_element):
+        """ Internal function called during a walk of the dataset.
+        Appends each element's text to _p_text and its offset to _offset_list.
+        """
+        rc, rc_parsed = self._data_element_parser(data_element)
+        if rc_parsed == '':
             return
-        self._offset_list.append( { 'offset':len(self._p_text), 'string': rc} )
-        self._p_text = self._p_text + rc
+        self._offset_list.append( {
+            'offset':len(self._p_text),
+            'string': rc_parsed
+            } )
+        self._p_text = self._p_text + rc_parsed
 
     def parse(self):
         """ Walk the dataset to extract the text which can then be
@@ -157,10 +183,16 @@ def parse(self):
         #  except explicitly do not include TextValue, handled below
         list_of_tagname_desired = [ k['tag'] for k in sr_keys_to_extract ]
         if self._include_header:
+            # Add all the known [[something]] headers
             for srkey in sr_keys_to_extract:
                 if srkey['tag'] in self._dicom_raw and srkey['tag'] != 'TextValue':
                     line = '[[%s]] %s\n' % (srkey['label'], srkey['decode_func'](str(self._dicom_raw[srkey['tag']].value)))
                     self._p_text = self._p_text + line
+            # Collect all names in the whole document and add [[Other Names]] header
+            names_list = self.list_of_PNAMEs()
+            for name in names_list:
+                line = '[[Other Names]] %s\n' % Dicom.sr_decode_PNAME(name)
+                self._p_text = self._p_text + line
         # Now read ALL tags and use a blacklist (and ignore already done in whitelist).
         # Private tags will have tagname='' so ignore those too.
         if self._include_header:
@@ -210,6 +242,10 @@ def redact_string(self, plaintext, offset, rlen, VR):
             redact_char = DicomText._redact_char_digit
         if DicomText._redact_random_length:
             redact_length = random.randint(-int(rlen/2), int(rlen/2))
+        # Replace all dates with 11111111 so that they validate ok
+        if VR in ['DA', 'DT']:
+            redact_length = 8
+            redact_char = '1'
         redacted_part = redact_char.rjust(redact_length, redact_char) if redact_char else ''
         rc = plaintext[0:offset] + redacted_part + plaintext[offset+rlen:]
         return rc
@@ -219,22 +255,17 @@ def _dataset_redact_callback(self, dataset, data_element):
         Builds a class-member string _r_text as it goes.
         Uses the annotation list in self._annotations to redact text.
         """
-
-        rc = ''
-        if data_element.VR in ['SH', 'CS', 'SQ', 'UI']:
-            pass
-        elif data_element.VR == 'LO':
-            rc = rc + ('# %s' % str(data_element.value)) + '\n'
-        else:
-            rc = rc + ('%s' % (str(data_element.value))) + '\n'
-        if rc == '':
+        rc, rc_parsed = self._data_element_parser(data_element)
+        if rc_parsed == '':
             return
+        rc_without_html = rc_parsed
         # The current string is now len(self._r_text) ..to.. +len(rc)
         current_start = len(self._r_text)
         current_end   = current_start + len(rc)
         replacement = rc
         replacedAny = False
         #print('At %d = %s' % (current_start, str(data_element.value)))
+        # Check every annotation to see, if not already done, if it appears in this rc
         for annot in self._annotations:
             # Sometimes it reports text:None so ignore
             if not annot['text'] or (annot['start_char'] == annot['end_char']):
@@ -252,9 +283,6 @@ def _dataset_redact_callback(self, dataset, data_element):
                 # SemEHR may have an extra line at the start so start_char offset need adjusting
                 for offset in [self._redact_offset] + list(range(-32, 32)):
                     # Do the comparison using text without html but replace inside text with html
-                    rc_without_html = redact_html_tags_in_string(rc,
-                        replace_char = self._replace_HTML_char,
-                        replace_newline = self._replace_newline_char) if self._replace_HTML_entities else rc
                     if string_match_ignore_linebreak(rc_without_html[annot_at+offset : annot_end+offset], annot['text']):
                         replacement = self.redact_string(replacement, annot_at+offset, annot_end-annot_at, data_element.VR)
                         replaced = replacedAny = True
@@ -270,8 +298,10 @@ def _dataset_redact_callback(self, dataset, data_element):
             # Always fully redact the content of PersonName and Date tags
             replacement = self.redact_string(rc, 0, len(rc), data_element.VR)
             replacedAny = True
+        # Put this replacement value back into the DICOM
         if replacedAny:
             data_element.value = replacement
+        # _r_text is the original, _redacted_text has been redacted
         self._r_text = self._r_text + rc
         self._redacted_text = self._redacted_text + replacement
         return replacement if replacedAny else None
@@ -401,6 +431,7 @@ def test_DicomText():
 [[Patient Birth Date]] 19781024
 [[Patient Sex]] M
 [[Referring Physician Name]] 
+[[Other Names]] John R Walz
 [[ContentSequence]]
 # Request
 MRI: Knee