Skip to content

Commit

Permalink
Merge branch 'master' into is-ident-package-use
Browse files Browse the repository at this point in the history
  • Loading branch information
tznind committed Mar 7, 2022
2 parents 57bcfca + 6b66f9b commit a845c2a
Show file tree
Hide file tree
Showing 18 changed files with 277 additions and 95 deletions.
2 changes: 2 additions & 0 deletions news/1088-bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Use DirectConnection instead of obsolete ConnectionMode. Fixes #990

10 changes: 10 additions & 0 deletions news/929-bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Structured Reports improvements from PR#929

* Updated documentation
* Simplify SRAnonTool using external program semehr_anon.py
* Handle ConceptNameCodeSequence which has VR but no Value
* Ensure 'replaced' flag is not reset
* Write the replacement DICOM regardless of which content tag is found
* Extract metadata from Mongo to go alongside anonymised text
* Redact numeric DICOM tags with all '9' not all 'X'
* Allow badly-formatted text content which contains HTML but does not escape non-HTML symbols
77 changes: 63 additions & 14 deletions src/applications/SRAnonTool/CTP_DicomToText.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# -i = input DICOM file, full path or relative to FileSystemRoot,
# or if not found then looked up in the MongoDB.
# -o = output filename or directory for the plain text file
# -m = output filename or directory for metadata json file
# --semehr-unique = only extract records from Mongo dicom database
# if they are not already in the SemEHR database.
# Needs both dataLoad and dataExtract yaml files because Mongo is
Expand All @@ -26,14 +27,37 @@
import yaml
import pydicom
import re
from deepmerge import Merger # for deep merging dictionaries
from deepmerge import Merger # for deep merging yaml dictionaries
from SmiServices import Mongo
from SmiServices import DicomText
from SmiServices import StructuredReport as SR
from SmiServices import IdentifierMapper

# List of DICOM SR tags which we want exported in metadata json files.
# extract_mongojson and extract_dicom_file use this list as a filter when
# building the dictionary that is dumped to the metadata json output.
metadata_fields = [
"SOPClassUID",
"SOPInstanceUID",
"StudyInstanceUID",
"SeriesInstanceUID",
"ContentDate",
"ModalitiesInStudy",
"PatientID", # this one will be mapped from CHI to EUPI (see patientid_map)
]


# ---------------------------------------------------------------------
# PatientID mapping from CHI to EUPI

def patientid_map(PatientID):
    """ Map a PatientID from CHI to EUPI
    using the IdentifierMapper.CHItoEUPI lookup.
    PatientID - the identifier to be looked up.
    Returns the value given by the lookup, or the string 'UNKNOWN'
    if the lookup returned nothing (any falsy result).
    """
    mapped = IdentifierMapper.CHItoEUPI().lookup(PatientID)
    # Fall back to a fixed placeholder when the lookup yields nothing
    return mapped if mapped else 'UNKNOWN'


def extract_mongojson(mongojson, output):
# ---------------------------------------------------------------------

def extract_mongojson(mongojson, output, metadata_output=None):
""" Called by extract_mongojson_file
to parse the JSON from Mongo and write to output.
mongojson - the DICOM in JSON format.
Expand All @@ -42,24 +66,33 @@ def extract_mongojson(mongojson, output):
if os.path.isdir(output):
filename = mongojson['SOPInstanceUID'] + '.txt'
output = os.path.join(output, filename)
if metadata_output and os.path.isdir(metadata_output):
filename = mongojson['SOPInstanceUID'] + '.json'
metadata_output = os.path.join(metadata_output, filename)
logging.info('Parse %s' % mongojson.get('header',{}).get('DicomFilePath','<NoFilePath?>'))
if 'PatientID' in mongojson:
mongojson['PatientID'] = patientid_map(mongojson['PatientID'])
with open(output, 'w') as fd:
SR.SR_parse(mongojson, filename, fd)
SR.SR_parse(mongojson, filename, fd)
if metadata_output:
with open(metadata_output, 'w') as fd:
print(json.dumps({k:mongojson[k] for k in metadata_fields if k in mongojson}), file=fd)
logging.info(f'Wrote {metadata_output}')
logging.info(f'Wrote {output}')


def extract_mongojson_file(input, output):
def extract_mongojson_file(input, output, metadata_output=None):
""" Read MongoDB data in JSON format from input file
convert to output, which can be a filename or directory.
"""
with open(input, 'r') as fd:
mongojson = json.load(fd)
extract_mongojson(mongojson, output)
extract_mongojson(mongojson, output, metadata_output=metadata_output)


# ---------------------------------------------------------------------

def extract_dicom_file(input, output):
def extract_dicom_file(input, output, metadata_output=None):
""" Extract text from a DICOM file input
into the output, which can be a filename,
or a directory in which case the file is named by SOPInstanceUID.
Expand All @@ -72,14 +105,23 @@ def extract_dicom_file(input, output):
if os.path.isdir(output):
filename = dicomtext.SOPInstanceUID() + '.txt'
output = os.path.join(output, filename)
if metadata_output and os.path.isdir(metadata_output):
filename = dicomtext.SOPInstanceUID() + '.json'
metadata_output = os.path.join(metadata_output, filename)
with open(output, 'w') as fd:
fd.write(dicomtext.text())
if metadata_output:
with open(metadata_output, 'w') as fd:
metadata_json = {k:dicomtext.tag(k) for k in metadata_fields if dicomtext.tag(k)}
metadata_json['PatientID'] = patientid_map(metadata_json.get('PatientID',''))
print(json.dumps(metadata_json), file=fd)
logging.info(f'Wrote {metadata_output}')
logging.info(f'Wrote {output}')


# ---------------------------------------------------------------------

def extract_file(input, output):
def extract_file(input, output, metadata_output=None):
""" If it's a readable DICOM file then extract it
otherwise try to find it in MongoDB.
"""
Expand All @@ -90,9 +132,9 @@ def extract_file(input, output):
is_dcm = False

if is_dcm:
extract_dicom_file(input, output)
extract_dicom_file(input, output, metadata_output)
else:
extract_mongojson_file(input, output)
extract_mongojson_file(input, output, metadata_output)



Expand All @@ -104,6 +146,7 @@ def extract_file(input, output):
parser.add_argument('-y', dest='yamlfile', action="append", help='path to yaml config file (can be used more than once)')
parser.add_argument('-i', dest='input', action="store", help='SOPInstanceUID or path to raw DICOM file from which text will be redacted')
parser.add_argument('-o', dest='output_dir', action="store", help='path to directory where extracted text will be written')
parser.add_argument('-m', dest='metadata_dir', action="store", help='path to directory where extracted metadata will be written')
parser.add_argument('--semehr-unique', dest='semehr_unique', action="store_true", help='only extract from MongoDB/dicom if not already in MongoDB/semehr')
args = parser.parse_args()
if not args.input:
Expand All @@ -120,11 +163,17 @@ def extract_file(input, output):
# Merge all the yaml dicts into one
cfg_dict = Merger([(list, ["append"]),(dict, ["merge"])],["override"],["override"]).merge(cfg_dict, yaml.safe_load(fd))

# Initialise the PatientID mapping by opening a DB connection
if cfg_dict:
IdentifierMapper.CHItoEUPI(cfg_dict)

# For reading SRs
mongo_dicom_host = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('HostName',{})
mongo_dicom_user = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('UserName',{})
mongo_dicom_pass = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('Password',{})
mongo_dicom_db = cfg_dict.get('MongoDatabases', {}).get('DicomStoreOptions',{}).get('DatabaseName',{})

# For writing annotations
mongo_semehr_host = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('HostName',{})
mongo_semehr_user = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('UserName',{})
mongo_semehr_pass = cfg_dict.get('MongoDatabases', {}).get('SemEHRStoreOptions',{}).get('Password',{})
Expand All @@ -143,15 +192,15 @@ def extract_file(input, output):
# ---------------------------------------------------------------------
if os.path.isfile(args.input):
# actual path to DICOM
extract_file(args.input, args.output_dir)
extract_file(args.input, args.output_dir, args.metadata_dir)
elif os.path.isfile(os.path.join(root_dir, args.input)):
# relative to FileSystemRoot
extract_file(os.path.join(root_dir, args.input), args.output_dir)
extract_file(os.path.join(root_dir, args.input), args.output_dir, args.metadata_dir)
elif os.path.isdir(args.input):
# Recurse directory
for root, dirs, files in os.walk(args.input, topdown=False):
for name in files:
extract_file(os.path.join(root, name), args.output_dir)
extract_file(os.path.join(root, name), args.output_dir, args.metadata_dir)
elif mongo_dicom_db != {}:
# Only DicomFilePath and StudyDate are indexed in MongoDB.
# Passing a SOPInstanceUID would be handy but no point if not indexed.
Expand All @@ -164,11 +213,11 @@ def extract_file(input, output):
for mongojson in mongodb_in.StudyDateToJSONList(args.input):
# If it's already in the annotation database then don't bother extracting.
if not args.semehr_unique or not mongodb_out.findSOPInstanceUID(mongojson['SOPInstanceUID']):
extract_mongojson(mongojson, args.output_dir)
extract_mongojson(mongojson, args.output_dir, args.metadata_dir)
# Otherwise assume a DICOM file path which can be retrieved from MongoDB
else:
mongojson = mongodb_in.DicomFilePathToJSON(args.input)
extract_mongojson(mongojson, args.output_dir)
extract_mongojson(mongojson, args.output_dir, args.metadata_dir)
else:
logging.error(f'Cannot find {args.input} as file and MongoDB not configured')
exit(1)
36 changes: 2 additions & 34 deletions src/applications/SRAnonTool/CTP_SRAnonTool.sh
Original file line number Diff line number Diff line change
Expand Up @@ -130,41 +130,9 @@ CTP_DicomToText.py -y $default_yaml0 -y $default_yaml1 \
# ---------------------------------------------------------------------
# Run the SemEHR anonymiser using a set of private directories
# Reads $input_doc
# Writes $anon_doc and $anon_xml
# Writes $anon_doc, and $anon_xml via the --xml option
#
# If the new anonymiser exists then use it
if [ -f $semehr_dir/CogStack-SemEHR/anonymisation/anonymiser.py ]; then
# Create a custom config file in the output directory
jq < $semehr_dir/CogStack-SemEHR/anonymisation/conf/anonymisation_task.json > $semehr_output_dir/anonymisation_task.json \
'.text_data_path="'${semehr_input_dir}'"|'\
'.anonymisation_output="'${semehr_output_dir}'"|'\
'.extracted_phi="'${semehr_output_dir}'/phi"|'\
'.grouped_phi_output="'${semehr_output_dir}'/phi_grouped"|'\
'.logging_file="'${semehr_output_dir}'/log"|'\
'.annotation_mode=true'
# Run the new anonymiser
if [ $verbose -gt 0 ]; then
echo "RUN: ${semehr_dir}/CogStack-SemEHR/anonymisation/anonymiser.py $semehr_output_dir/anonymisation_task.json"
fi
(cd $semehr_dir/CogStack-SemEHR/anonymisation; python3 ./anonymiser.py $semehr_output_dir/anonymisation_task.json) >> $log 2>&1
rc=$?
else
if [ $verbose -gt 0 ]; then
echo "RUN: ${semehr_dir}/CogStack-SemEHR/analysis/clinical_doc_wrapper.py -i ${semehr_input_dir} -o ${semehr_output_dir}"
fi
# NOTE: This requires that SemEHR be modified to accept the -i and -o options.
(cd ${semehr_dir}/CogStack-SemEHR/analysis; ./clinical_doc_wrapper.py -i ${semehr_input_dir} -o ${semehr_output_dir}) >> $log 2>&1
rc=$?
fi
if [ $rc -ne 0 ]; then
tidy_exit $rc "Possible failure (exit code $rc) of SemEHR-anon given ${input_doc} from ${input_dcm}"
fi
# The new SemEHR anonymiser can be configured to create knowtator.xml files
# but they aren't as complete as the PHI file. Convert the PHI to XML.
if [ $verbose -gt 0 ]; then
echo "Convert PHI to Knowtator.XML"
fi
CTP_PhiToXML.py -p ${semehr_output_dir}/phi
semehr_anon.py -i "${input_doc}" -o "${anon_doc}" --xml || tidy_exit 5 "Error running SemEHR-anon given ${input_doc} from ${input_dcm}"
# If there's still no XML file then exit
if [ ! -f "$anon_xml" ]; then
tidy_exit 6 "ERROR: SemEHR-anon failed to convert $input_doc to $anon_xml"
Expand Down
4 changes: 2 additions & 2 deletions src/applications/SRAnonTool/CTP_XMLToDicom.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@
# Read the annotated XML file
xmlroot = xml.etree.ElementTree.parse(args.input_xml).getroot()
xmldictlist = Knowtator.annotation_xml_to_dict(xmlroot)
if xmldictlist == []:
print('WARNING: empty document in {}'.format(args.input_xml))
#if xmldictlist == []:
# print('WARNING: empty document in {}'.format(args.input_xml))
#for annot in xmldictlist:
# print('REMOVE {} from DICOM at {}'.format(annot['text'], annot['start_char']))

Expand Down
34 changes: 27 additions & 7 deletions src/applications/SRAnonTool/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Install the python library `SmiServices` to `$SMI_ROOT/lib/python3/` or a virtua

Ensure the python package dependencies are installed system-wide or in a virtualenv on the host machine.

Modify the `default.yaml` file: in the section `CTPAnonymiserOptions` add `SRAnonTool: /path/to/SRAnonTool.sh`
Modify the `default.yaml` file: in the section `CTPAnonymiserOptions` add `SRAnonTool: /path/to/CTP_SRAnonTool.sh`

Ensure the `default.yaml` file contains the necessary `FileSystemOptions`, `LoggingOptions>LogsRoot`, `MongoDatabases`, `RabbitOptions`, etc.

Expand All @@ -46,18 +46,18 @@ If using the test stub then only the data directories are required and Python2 i

## Usage as part of CTP

Configure CTP to call the script SRAnonTool.sh when it detects a DICOM file with `SR` in the `Modality` tag, by editing `default.yaml` as above. CTP will call the script with two options:
Configure CTP to call the script CTP_SRAnonTool.sh when it detects a DICOM file with `SR` in the `Modality` tag, by editing `default.yaml` as above. CTP will call the script with two options:
* `-i input.dcm` - the raw DICOM file before anonymisation
* `-o output.dcm` - the DICOM file which CTP has already anonymised

The script will extract the text from the `input.dcm` file, anonymise it, and write the redacted text into the `output.dcm` file, which must already exist.

## Standalone usage

The script `SRAnonTool.sh` calls three components:
The script `CTP_SRAnonTool.sh` calls three components:

* `CTP_DicomToText.py` - extracts the text from the raw DICOM file into a format suitable for SemEHR-CogStack.
* `clinical_doc_wrapper.py` - this is the component within SemEHR-CogStack which anonymises the text.
* `CogStack-SemEHR/anonymisation/anonymiser.py` - this is the script in SemEHR-CogStack which anonymises the text.
* `CTP_XMLToDicom.py` - redacts the text from the raw DICOM file and writes the redacted text into the output DICOM file.

Usage: `[-e virtualenv] [-s semehr_dir] -i read_from.dcm -o write_into.dcm`
Expand All @@ -71,29 +71,42 @@ The SemEHR directory (`/opt/semehr`) can be changed with the `-s` option for tes

This program can be used as part of the SRAnonTool pipeline or it can be used standalone to extract documents in bulk for later SemEHR processing.

Usage: `-y default.yaml -i input.dcm -o outfile [--semehr-unique]`
Usage: `-y default.yaml -i input.dcm -o output [-m metadata_output] [--semehr-unique]`

`-y default.yaml` - may be specified more than once if the configuration parameters are spread across multiple yaml files.

`-i input.dcm` - full path to the input DICOM file, or a partial path to be extracted from MongoDB, or a StudyDate to extract all records that day from MongoDB.

`-o output` - full path to the output text file, or directory for multiple files.

`-m metadata_output` - full path to the output metadata json file, or directory for multiple files.

`--semehr-unique` - if extracting a StudyDate from MongoDB then ignore any documents which have a SOPInstanceUID that is already in the SemEHR MongoDB database. This is intended to allow reprocessing of any documents that previously failed without having to reprocess the whole day.

The MongoDB configuration read from the yaml files needs to be in `MongoDatabases | DicomStoreOptions` and `SemEHRStoreOptions`. The former is to read DICOM documents from the `dicom.image_SR` database.collection; the latter is to check if the SOPInstanceUID is already in the `semehr.semehr_results` database.collection.
If metadata output is requested then JSON output files are created containing the values of these tags:
`SOPClassUID, SOPInstanceUID, StudyInstanceUID, SeriesInstanceUID, ContentDate, ModalitiesInStudy, PatientID`.
The last of these, `PatientID`, is mapped from CHI to EUPI.

The MongoDB configuration read from the yaml files needs to be in `MongoDatabases | DicomStoreOptions` and `SemEHRStoreOptions`.
The former is to read DICOM documents from the `dicom.image_SR` database.collection;
the latter is to check if the SOPInstanceUID is already in the `semehr.semehr_results` database.collection.

The MySQL configuration read from the yaml files needs to be in `IdentifierMapperOptions`
with keys `MappingConnectionString, MappingTableName, SwapColumnName, ReplacementColumnName`.
This is used to map PatientID.

Examples:

```
* CTP_DicomToText.py -i /path/to/file.dcm -o output.txt
* CTP_DicomToText.py -i 2015/01/01/AccNum/file.dcm -o output.txt -y smi_dataLoad.yaml
* CTP_DicomToText.py -i 20150101 -o output_dir -y smi_dataLoad.yaml
* CTP_DicomToText.py -i 20150101 -o output_dir -m metadata_dir -y smi_dataLoad.yaml
```

### `clinical_doc_wrapper.py`

This script performs the anonymisation.
It is the old Python-2 version and is no longer used.

Usage: `[-s semehr_dir] [-i input_docs] [-o anonymised]` in the stub version

Expand Down Expand Up @@ -158,3 +171,10 @@ The defaults are:
`-p pattern_to_redact` - `Baker` (to suit the example DICOM file)

`-y default.yaml` - `../../../../data/microserviceConfigs/default.yaml`

To run in the test directory:
```
mkdir -p ./data/input_docs
mkdir -p ./data/anonymised
./CTP_SRAnonTool_test.py -s .
```
4 changes: 2 additions & 2 deletions src/common/Smi.Common/Smi.Common.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
<PackageReference Include="NLog" Version="4.7.14" />
<PackageReference Include="RabbitMQ.Client" Version="5.1.2" />
<PackageReference Include="SecurityCodeScan.VS2019" Version="5.6.0">
<PackageReference Include="SecurityCodeScan.VS2019" Version="5.6.2">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="System.IO.Abstractions" Version="16.1.15" />
<PackageReference Include="System.IO.Abstractions" Version="16.1.16" />
<PackageReference Include="YamlDotNet" Version="11.2.1" />
</ItemGroup>
</Project>
Loading

0 comments on commit a845c2a

Please sign in to comment.