SMI · rkm · Mar 3, 2022 · Sep 29, 2021 · Sep 29, 2021 · Sep 29, 2021
diff --git a/news/929-bugfix.md b/news/929-bugfix.md
@@ -0,0 +1,10 @@
+Structured Reports improvements from PR#929
+
+* Updated documentation
+* Simplify SRAnonTool using external program semehr_anon.py
+* Handle ConceptNameCodeSequence which has VR but no Value
+* Ensure 'replaced' flag is not reset
+* Write replacement DICOM whichever content tag is found
+* Extract metadata from Mongo to go alongside anonymised text
+* Redact numeric DICOM tags with all '9' not all 'X'
+* Allow badly-formatted text content which contains HTML but does not escape non-HTML symbols
diff --git a/src/applications/SRAnonTool/CTP_DicomToText.py b/src/applications/SRAnonTool/CTP_DicomToText.py
@@ -8,6 +8,7 @@
 #  -i = input DICOM file, full path or relative to FileSystemRoot,
 #       or if not found then looked up in the MongoDB.
 #  -o = output filename or directory for the plain text file
+#  -m = output filename or directory for metadata json file
 #  --semehr-unique = only extract records from Mongo dicom database
 #    if they are not already in the SemEHR database.
 # Needs both dataLoad and dataExtract yaml files because Mongo is
@@ -26,14 +27,37 @@
 import yaml
 import pydicom
 import re
-from deepmerge import Merger    # for deep merging dictionaries
+from deepmerge import Merger    # for deep merging yaml dictionaries
 from SmiServices import Mongo
 from SmiServices import DicomText
 from SmiServices import StructuredReport as SR
+from SmiServices import IdentifierMapper
+
+# List of DICOM SR tags which we want exported in metadata json files
+metadata_fields = [
+  "SOPClassUID",
+  "SOPInstanceUID",
+  "StudyInstanceUID",
+  "SeriesInstanceUID",
+  "ContentDate",
+  "ModalitiesInStudy",
+  "PatientID", # this one will be mapped from CHI to EUPI
+]
+
 
 # ---------------------------------------------------------------------
+# PatientID mapping from CHI to EUPI
+
+def patientid_map(PatientID):
+    eupi = IdentifierMapper.CHItoEUPI().lookup(PatientID)  # look up the EUPI for the given PatientID (a CHI)
+    if not eupi:
+        return 'UNKNOWN'  # falsy lookup result (presumably no mapping found): return a placeholder
+    return eupi
+
 
-def extract_mongojson(mongojson, output):
+# ---------------------------------------------------------------------
+
+def extract_mongojson(mongojson, output, metadata_output=None):
     """ Called by extract_mongojson_file
     to parse the JSON from Mongo and write to output.
     mongojson - the DICOM in JSON format.
@@ -42,24 +66,33 @@ def extract_mongojson(mongojson, output):
     if os.path.isdir(output):
         filename = mongojson['SOPInstanceUID'] + '.txt'
         output = os.path.join(output, filename)
+    if metadata_output and os.path.isdir(metadata_output):
+        filename = mongojson['SOPInstanceUID'] + '.json'
+        metadata_output = os.path.join(metadata_output, filename)
     logging.info('Parse %s' % mongojson.get('header',{}).get('DicomFilePath','<NoFilePath?>'))
+    if 'PatientID' in mongojson:
+        mongojson['PatientID'] = patientid_map(mongojson['PatientID'])
     with open(output, 'w') as fd:
-        SR.SR_parse(mongojson, filename, fd)    
+        SR.SR_parse(mongojson, filename, fd)
+    if metadata_output:
+        with open(metadata_output, 'w') as fd:
+            print(json.dumps({k:mongojson[k] for k in metadata_fields if k in mongojson}), file=fd)
+        logging.info(f'Wrote {metadata_output}')
     logging.info(f'Wrote {output}')
 
 
-def extract_mongojson_file(input, output):
+def extract_mongojson_file(input, output, metadata_output=None):
     """ Read MongoDB data in JSON format from input file
     convert to output, which can be a filename or directory.
     """
     with open(input, 'r') as fd:
         mongojson = json.load(fd)
-    extract_mongojson(mongojson, output)
+    extract_mongojson(mongojson, output, metadata_output=metadata_output)
 
 
 # ---------------------------------------------------------------------
 
-def extract_dicom_file(input, output):
+def extract_dicom_file(input, output, metadata_output=None):
     """ Extract text from a DICOM file input
     into the output, which can be a filename,
     or a directory in which case the file is named by SOPInstanceUID.
@@ -72,14 +105,23 @@ def extract_dicom_file(input, output):
     if os.path.isdir(output):
         filename = dicomtext.SOPInstanceUID() + '.txt'
         output = os.path.join(output, filename)
+    if metadata_output and os.path.isdir(metadata_output):
+        filename = dicomtext.SOPInstanceUID() + '.json'
+        metadata_output = os.path.join(metadata_output, filename)
     with open(output, 'w') as fd:
         fd.write(dicomtext.text())
+    if metadata_output:
+        with open(metadata_output, 'w') as fd:
+            metadata_json = {k:dicomtext.tag(k) for k in metadata_fields if dicomtext.tag(k)}
+            metadata_json['PatientID'] = patientid_map(metadata_json.get('PatientID',''))
+            print(json.dumps(metadata_json), file=fd)
+        logging.info(f'Wrote {metadata_output}')
     logging.info(f'Wrote {output}')
 
 
 # ---------------------------------------------------------------------
 
-def extract_file(input, output):
+def extract_file(input, output, metadata_output=None):
     """ If it's a readable DICOM file then extract it
     otherwise try to find it in MongoDB.
     """
@@ -90,9 +132,9 @@ def extract_file(input, output):
         is_dcm = False
 
     if is_dcm:
-        extract_dicom_file(input, output)
+        extract_dicom_file(input, output, metadata_output)
     else:
-        extract_mongojson_file(input, output)
+        extract_mongojson_file(input, output, metadata_output)
 
 
 
@@ -104,6 +146,7 @@ def extract_file(input, output):
     parser.add_argument('-y', dest='yamlfile', action="append", help='path to yaml config file (can be used more than once)')
     parser.add_argument('-i', dest='input', action="store", help='SOPInstanceUID or path to raw DICOM file from which text will be redacted')
     parser.add_argument('-o', dest='output_dir', action="store", help='path to directory where extracted text will be written')
+    parser.add_argument('-m', dest='metadata_dir', action="store", help='path to directory where extracted metadata will be written')
     parser.add_argument('--semehr-unique', dest='semehr_unique', action="store_true", help='only extract from MongoDB/dicom if not already in MongoDB/semehr')
     args = parser.parse_args()
     if not args.input:
@@ -120,11 +163,17 @@ def extract_file(input, output):
             # Merge all the yaml dicts into one
             cfg_dict = Merger([(list, ["append"]),(dict, ["merge"])],["override"],["override"]).merge(cfg_dict, yaml.safe_load(fd))
 
+    # Initialise the PatientID mapping by opening a DB connection
+    if cfg_dict:
+        IdentifierMapper.CHItoEUPI(cfg_dict)
+
+    # For reading SRs
     mongo_dicom_host = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('HostName',{})
     mongo_dicom_user = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('UserName',{})
     mongo_dicom_pass = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('Password',{})
     mongo_dicom_db   = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('DatabaseName',{})
 
+    # For writing annotations
     mongo_semehr_host = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('HostName',{})
     mongo_semehr_user = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('UserName',{})
     mongo_semehr_pass = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('Password',{})
@@ -143,15 +192,15 @@ def extract_file(input, output):
     # ---------------------------------------------------------------------
     if os.path.isfile(args.input):
         # actual path to DICOM
-        extract_file(args.input, args.output_dir)
+        extract_file(args.input, args.output_dir, args.metadata_dir)
     elif os.path.isfile(os.path.join(root_dir, args.input)):
         # relative to FileSystemRoot
-        extract_file(os.path.join(root_dir, args.input), args.output_dir)
+        extract_file(os.path.join(root_dir, args.input), args.output_dir, args.metadata_dir)
     elif os.path.isdir(args.input):
         # Recurse directory
         for root, dirs, files in os.walk(args.input, topdown=False):
             for name in files:
-                extract_file(os.path.join(root, name), args.output_dir)
+                extract_file(os.path.join(root, name), args.output_dir, args.metadata_dir)
     elif mongo_dicom_db != {}:
         # Only DicomFilePath and StudyDate are indexed in MongoDB.
         # Passing a SOPInstanceUID would be handy but no point if not indexed.
@@ -164,11 +213,11 @@ def extract_file(input, output):
             for mongojson in mongodb_in.StudyDateToJSONList(args.input):
                 # If it's already in the annotation database then don't bother extracting.
                 if not args.semehr_unique or not mongodb_out.findSOPInstanceUID(mongojson['SOPInstanceUID']):
-                    extract_mongojson(mongojson, args.output_dir)
+                    extract_mongojson(mongojson, args.output_dir, args.metadata_dir)
         # Otherwise assume a DICOM file path which can be retrieved from MongoDB
         else:
             mongojson = mongodb_in.DicomFilePathToJSON(args.input)
-            extract_mongojson(mongojson, args.output_dir)
+            extract_mongojson(mongojson, args.output_dir, args.metadata_dir)
     else:
         logging.error(f'Cannot find {args.input} as file and MongoDB not configured')
         exit(1)
diff --git a/src/applications/SRAnonTool/CTP_SRAnonTool.sh b/src/applications/SRAnonTool/CTP_SRAnonTool.sh
@@ -130,41 +130,9 @@ CTP_DicomToText.py  -y $default_yaml0 -y $default_yaml1 \
 # ---------------------------------------------------------------------
 # Run the SemEHR anonymiser using a set of private directories
 #  Reads  $input_doc
-#  Writes $anon_doc and $anon_xml
+#  Writes $anon_doc, and $anon_xml via the --xml option
 #
-# If the new anonymiser exists then use it
-if [ -f $semehr_dir/CogStack-SemEHR/anonymisation/anonymiser.py ]; then
-	# Create a custom config file in the output directory
-	jq < $semehr_dir/CogStack-SemEHR/anonymisation/conf/anonymisation_task.json > $semehr_output_dir/anonymisation_task.json \
-		'.text_data_path="'${semehr_input_dir}'"|'\
-'.anonymisation_output="'${semehr_output_dir}'"|'\
-'.extracted_phi="'${semehr_output_dir}'/phi"|'\
-'.grouped_phi_output="'${semehr_output_dir}'/phi_grouped"|'\
-'.logging_file="'${semehr_output_dir}'/log"|'\
-'.annotation_mode=true'
-	# Run the new anonymiser
-	if [ $verbose -gt 0 ]; then
-		echo "RUN: ${semehr_dir}/CogStack-SemEHR/anonymisation/anonymiser.py $semehr_output_dir/anonymisation_task.json"
-	fi
-	(cd $semehr_dir/CogStack-SemEHR/anonymisation; python3 ./anonymiser.py $semehr_output_dir/anonymisation_task.json) >> $log 2>&1
-	rc=$?
-else
-	if [ $verbose -gt 0 ]; then
-		echo "RUN: ${semehr_dir}/CogStack-SemEHR/analysis/clinical_doc_wrapper.py -i ${semehr_input_dir} -o ${semehr_output_dir}"
-	fi
-	# NOTE: This requires that SemEHR be modified to accept the -i and -o options.
-	(cd ${semehr_dir}/CogStack-SemEHR/analysis; ./clinical_doc_wrapper.py -i ${semehr_input_dir} -o ${semehr_output_dir}) >> $log 2>&1
-	rc=$?
-fi
-if [ $rc -ne 0 ]; then
-	tidy_exit $rc "Possible failure (exit code $rc) of SemEHR-anon given ${input_doc} from ${input_dcm}"
-fi
-# The new SemEHR anonymiser can be configured to create knowtator.xml files
-# but they aren't as complete as the PHI file. Convert the PHI to XML.
-if [ $verbose -gt 0 ]; then
-	echo "Convert PHI to Knowtator.XML"
-fi
-CTP_PhiToXML.py -p ${semehr_output_dir}/phi
+semehr_anon.py -i "${input_doc}" -o "${anon_doc}" --xml || tidy_exit 5 "Error running SemEHR-anon given ${input_doc} from ${input_dcm}"
 # If there's still no XML file then exit
 if [ ! -f "$anon_xml" ]; then
 	tidy_exit 6 "ERROR: SemEHR-anon failed to convert $input_doc to $anon_xml"

diff --git a/src/applications/SRAnonTool/CTP_XMLToDicom.py b/src/applications/SRAnonTool/CTP_XMLToDicom.py
@@ -79,8 +79,8 @@
     # Read the annotated XML file
     xmlroot = xml.etree.ElementTree.parse(args.input_xml).getroot()
     xmldictlist = Knowtator.annotation_xml_to_dict(xmlroot)
-    if xmldictlist == []:
-        print('WARNING: empty document in {}'.format(args.input_xml))
+    #if xmldictlist == []:
+    #    print('WARNING: empty document in {}'.format(args.input_xml))
     #for annot in xmldictlist:
     #    print('REMOVE {} from DICOM at {}'.format(annot['text'], annot['start_char']))
 

diff --git a/src/applications/SRAnonTool/README.md b/src/applications/SRAnonTool/README.md
@@ -29,7 +29,7 @@ Install the python library `SmiServices` to `$SMI_ROOT/lib/python3/` or a virtua
 
 Ensure the python package dependencies are installed system-wide or in a virtualenv on the host machine.
 
-Modify the `default.yaml` file: in the section `CTPAnonymiserOptions` add `SRAnonTool: /path/to/SRAnonTool.sh`
+Modify the `default.yaml` file: in the section `CTPAnonymiserOptions` add `SRAnonTool: /path/to/CTP_SRAnonTool.sh`
 
 Ensure the `default.yaml` file contains the necessary `FileSystemOptions`, `LoggingOptions>LogsRoot`, `MongoDatabases`, `RabbitOptions`, etc.
 
@@ -46,18 +46,18 @@ If using the test stub then only the data directories are required and Python2 i
 
 ## Usage as part of CTP
 
-Configure CTP to call the script SRAnonTool.sh when it detects a DICOM file with `SR` in the `Modality` tag, by editing `default.yaml` as above. CTP will call the script with two options:
+Configure CTP to call the script CTP_SRAnonTool.sh when it detects a DICOM file with `SR` in the `Modality` tag, by editing `default.yaml` as above. CTP will call the script with two options:
 * `-i input.dcm` - the raw DICOM file before anonymisation
 * `-o output.dcm` - the DICOM file which CTP has already anonymised
 
 The script will extract the text from the `input.dcm` file, anonymise it, and write the redacted text into the `output.dcm` file, which must already exist.
 
 ## Standalone usage
 
-The script `SRAnonTool.sh` calls three components:
+The script `CTP_SRAnonTool.sh` calls three components:
 
 * `CTP_DicomToText.py` - extracts the text from the raw DICOM file into a format suitable for SemEHR-CogStack.
-* `clinical_doc_wrapper.py` - this is the component within SemEHR-CogStack which anonymises the text.
+* `semehr_anon.py` - the external program which anonymises the text; it replaces the direct call to `CogStack-SemEHR/anonymisation/anonymiser.py`.
 * `CTP_XMLToDicom.py` - redacts the text from the raw DICOM file and write the redacted text into the output DICOM file.
 
 Usage: `[-e virtualenv] [-s semehr_dir]  -i read_from.dcm  -o write_into.dcm`
@@ -71,29 +71,42 @@ The SemEHR directory (`/opt/semehr`) can be changed with the `-s` option for tes
 
 This program can be used as part of the SRAnonTool pipeline or it can be used standalone to extract documents in bulk for later SemEHR processing.
 
-Usage: `-y default.yaml -i input.dcm -o outfile [--semehr-unique]`
+Usage: `-y default.yaml -i input.dcm -o output [-m metadata_output] [--semehr-unique]`
 
 `-y default.yaml` - may be specified more than once if the configuration parameters are spread across multiple yaml files.
 
 `-i input.dcm` - full path to the input DICOM file, or a partial path to be extracted from MongoDB, or a StudyDate to extract all records that day from MongoDB.
 
 `-o output` - full path to the output text file, or directory for multiple files.
 
+`-m metadata_output` - full path to the output metadata json file, or directory for multiple files.
+
 `--semehr-unique` - if extracting a StudyDate from MongoDB then ignore any documents which have a SOPInstanceUID that is already in the SemEHR MongoDB database. This is intended to allow reprocessing of any documents that previously failed without having to reprocess the whole day.
 
-The MongoDB configuration read from the yaml files needs to be in `MongoDatabases | DicomStoreOptions` and `SemEHRStoreOptions`. The former is to read DICOM documents from the `dicom.image_SR` database.collection; the latter is to check if the SOPInstanceUID is already in the `semehr.semehr_results` database.collection.
+If metadata output is requested then JSON output files are created containing the values of these tags:
+`SOPClassUID, SOPInstanceUID, StudyInstanceUID, SeriesInstanceUID, ContentDate, ModalitiesInStudy, PatientID`.
+The last of these, `PatientID`, is mapped from CHI to EUPI.
+
+The MongoDB configuration read from the yaml files needs to be in `MongoDatabases | DicomStoreOptions` and `SemEHRStoreOptions`.
+The former is to read DICOM documents from the `dicom.image_SR` database.collection;
+the latter is to check if the SOPInstanceUID is already in the `semehr.semehr_results` database.collection.
+
+The MySQL configuration read from the yaml files needs to be in `IdentifierMapperOptions`
+with keys `MappingConnectionString, MappingTableName, SwapColumnName, ReplacementColumnName`.
+This is used to map PatientID.
 
 Examples:
 
 ```
 * CTP_DicomToText.py -i /path/to/file.dcm -o output.txt
 * CTP_DicomToText.py -i 2015/01/01/AccNum/file.dcm -o output.txt -y smi_dataLoad.yaml
-* CTP_DicomToText.py -i 20150101 -o output_dir -y smi_dataLoad.yaml
+* CTP_DicomToText.py -i 20150101 -o output_dir -m metadata_dir -y smi_dataLoad.yaml
 ```
 
 ### `clinical_doc_wrapper.py`
 
 This script performs the anonymisation.
+It is the old Python-2 version and is no longer used.
 
 Usage: `[-s semehr_dir] [-i input_docs] [-o anonymised]` in the stub version
 
@@ -158,3 +171,10 @@ The defaults are:
 `-p pattern_to_redact` - `Baker` (to suit the example DICOM file)
 
 `-y default.yaml` - `../../../../data/microserviceConfigs/default.yaml`
+
+To run in the test directory:
+```
+mkdir -p ./data/input_docs
+mkdir -p ./data/anonymised
+./CTP_SRAnonTool_test.py -s .
+```
diff --git a/src/common/Smi_Common_Python/README.md b/src/common/Smi_Common_Python/README.md
@@ -11,6 +11,7 @@ pydicom
 pymongo
 PyYAML
 xml.etree (comes with python)
+mysql-connector-python (which requires six, protobuf, dnspython) for IdentifierMapper
 ```
 
 ## Installation
@@ -32,9 +33,9 @@ pytest SmiServices/*.py
 
 Test each module individually, for example:
 ```
-python3 -m pytest SmiService/Dicom.py
-python3 -m pytest SmiService/DicomText.py
-python3 -m pytest SmiService/StructuredReport.py
+python3 -m pytest SmiServices/Dicom.py
+python3 -m pytest SmiServices/DicomText.py
+python3 -m pytest SmiServices/StructuredReport.py
 ```
 
 ## Usage
@@ -49,6 +50,7 @@ from SmiServices import Rabbit
 from SmiServices import Dicom
 from SmiServices import DicomText
 from SmiServices import StructuredReport as SR
+from SmiServices import IdentifierMapper
 ```
 
 ## Dicom.py
@@ -73,6 +75,20 @@ OR
 write_redacted_text_into_dicom_file  # to rewrite a second file with redacted text
 ```
 
+It also contains a `tag` method to return the value of the given named tag.
+
+## IdentifierMapper.py
+
+Provides a class `CHItoEUPI` for mapping from CHI to EUPI.
+Create one instance with the SMI yaml dictionary to open a connection
+to MySQL. Subsequent instances can be created without the yaml and will
+reuse the MySQL connection.
+
+```
+IdentifierMapper.CHItoEUPI(yaml_dict)
+eupi = IdentifierMapper.CHItoEUPI().lookup(chi)
+```
+
 ## Knowtator.py
 
 Provides a function for parsing the XML files containing annotations

diff --git a/src/common/Smi_Common_Python/SmiServices/Dicom.py b/src/common/Smi_Common_Python/SmiServices/Dicom.py
@@ -160,6 +160,8 @@ def test_sr_decode_ReferencedSOPSequence():
 # Decode the ConceptNameCodeSequence by returning the value of CodeMeaning inside
 
 def sr_decode_ConceptNameCodeSequence(cncs):
+    if not cncs:
+        return ''
     assert isinstance(cncs, list)
     for cncs_item in cncs:
         if has_tag(cncs_item, 'CodeMeaning'):