In [None]:
## if pydicom and itk have not been intalled, run the cell
!pip install pydicom
!pip install itk
!pip install tqdm

### Description:
This notebook loops through a folder that contains multiple DICOM subjects and changes the tags in each subject. The modified version of each DICOM is then stored in a specified folder.

In [5]:
import itk
import numpy as np
import os
import re
import pydicom
import itk
import zipfile
import gzip
import shutil

In [4]:
# Display the current working directory of the kernel (last expression of the cell is shown as output).
os.getcwd()

'C:\\Users\\chunr\\OneDrive - University of Iowa\\SPIROMICS_DICOM2Nifti'

In [6]:
# Input Folder structure should be
#./folderOfAllDicoms
#       /folder_of_dicoms_subject1
#       /folder_of_dicoms_subject2
#       .......
#OR
#./folderOfAllDicoms
#       /zipfile_of_dicoms_subject1
#       /zipfile_of_dicoms_subject2
#      .......

# Folder that holds one sub-folder (or one zip file) per subject.
folderOfAllDicoms    = "E:\\SPIROMICS_DATA\\SPIROMICS_V1\\TLC"
# Folder that receives one sub-folder of cleaned DICOM files per subject.
# Every backslash is escaped: a lone "\S" is an invalid escape sequence that only
# happens to produce the intended path and triggers a warning on recent Python versions.
folder2WriteDicoms   = "D:\\SPIROMICS_DATA\\SPIROMICS_V1\\TLC"
# Parent folder of the temporary directory used to extract zip files.
TempFolders = "D:\\SPIROMICS_DATA\\Temp"

## The tags below do not contain private patient information besides 'PatientName' and 'PatientID'.
## 'PatientName' is checked to have a format like "WF123000" and 'PatientID' a format like "H-123456"
## in the processing code, otherwise their values are removed.
key2keep = ['AccessionNumber', 'AcquisitionDate', 'AcquisitionNumber', 'AcquisitionTime','BitsAllocated', 'BitsStored', 
            'Columns', 'ContentDate', 'ContentTime', 'ConvolutionKernel', 'DataCollectionDiameter', 'DistanceSourceToDetector', 
            'DistanceSourceToPatient', 'Exposure', 'ExposureTime', 'FilterType', 'FocalSpots', 'FrameOfReferenceUID', 
            'GantryDetectorTilt', 'GeneratorPower', 'HighBit', 'ImageOrientationPatient', 'ImagePositionPatient', 'ImageType', 
            'InstanceCreationDate', 'InstanceCreationTime', 'InstanceNumber','PatientID', 'PatientName', 'PatientPosition', 
            'PixelData', 'PixelPaddingValue', 'PixelRepresentation', 'PixelSpacing', 'PositionReferenceIndicator', 
            'ProtocolName', 'ReconstructionDiameter', 'ReferencedImageSequence', 'RescaleIntercept', 'RescaleSlope',
            'RescaleType', 'RevolutionTime', 'RotationDirection', 'Rows', 'SOPClassUID', 'SOPInstanceUID', 'SamplesPerPixel', 
            'SeriesDate', 'SeriesDescription', 'SeriesInstanceUID', 'SeriesNumber', 'SeriesTime', 'SingleCollimationWidth', 
            'SliceLocation', 'SliceThickness',  'StudyDate', 'StudyDescription', 'StudyInstanceUID', 'StudyTime', 
            'TableFeedPerRotation', 'TableHeight', 'TableSpeed', 'TotalCollimationWidth', 'WindowCenter', 'WindowWidth', 
            'XRayTubeCurrent']

In [None]:
## Loop over all dicom folders and dicom zip files.
## For every subject: locate (or extract) the DICOM slices, blank every tag that is not
## listed in key2keep, validate PatientName / PatientID, and write the cleaned slices
## to folder2WriteDicoms/<subject>.

## PatientName / PatientID values are only kept when they match these patterns.
## Raw strings are used so that "\w" is not parsed as an (invalid) string escape.
PATIENT_NAME_PATTERN = r"^\w{2}[0-9]{6}$"   # e.g. WF123000
PATIENT_ID_PATTERN = r"^\w{1}-[0-9]{6}$"    # e.g. H-123456


def _clean_slice_tags(slicedata, keep):
    """Blank the tags of one DICOM slice that are not explicitly allowed.

    Parameters
    ----------
    slicedata : pydicom Dataset
        The slice as returned by ``pydicom.dcmread``; it is modified in place.
    keep : container of str
        Keywords of the tags whose values must be preserved.

    Private tags are removed, the value of every other top-level tag whose keyword
    is not in ``keep`` is cleared (the tag itself stays in the dataset), and the
    values of PatientName / PatientID are cleared unless they match the expected
    study-code formats.
    """
    # Remove any private tags
    slicedata.remove_private_tags()

    # Dataset.dir() lists the keywords of the top-level elements present in the dataset.
    # NOTE(review): elements without a dictionary keyword and elements nested inside
    # sequences are presumably not listed and therefore left untouched -- confirm
    # that this is acceptable for the data being shared.
    for key in slicedata.dir():
        # Only keep the tags listed in `keep` which do not contain patient's private information
        if key not in keep:
            slicedata[key].clear()  ## remove the tag value

    # Make sure PatientName has format like WF123000 and PatientID has format like H-123456,
    # otherwise delete the respective value.
    for key, pattern in (("PatientName", PATIENT_NAME_PATTERN), ("PatientID", PATIENT_ID_PATTERN)):
        if key in slicedata and not re.match(pattern, str(slicedata[key].value)):
            slicedata[key].clear()


allowedKeys = frozenset(key2keep)  # constant-time membership test inside the slice loop

for fileAndFolder in os.listdir(folderOfAllDicoms):
    sourcePath = os.path.join(folderOfAllDicoms, fileAndFolder)
    inputDicomFolder = ""
    outputDicomFolder = ""
    # Explicit flag instead of matching the folder name, so that a real input folder
    # can never be mistaken for the temporary extraction folder and deleted.
    extractedFromZip = False

    if os.path.isdir(sourcePath):
        ## fileAndFolder is a folder
        inputDicomFolder = sourcePath
        outputDicomFolder = os.path.join(folder2WriteDicoms, fileAndFolder)
    elif os.path.isfile(sourcePath):
        ## fileAndFolder is a file: only zip files can be processed
        if not zipfile.is_zipfile(sourcePath):
            print("Skip "+sourcePath+": not a folder or a zip file")
            continue
        inputDicomFolder = os.path.join(TempFolders, "Temp_Extraction")
        outputDicomFolder = os.path.join(folder2WriteDicoms, os.path.splitext(fileAndFolder)[0])
        extractedFromZip = True
    else:
        ## e.g. a broken link: nothing that can be read
        print("Skip "+sourcePath+": not a folder or a zip file")
        continue

    if os.path.isdir(outputDicomFolder):
        print(outputDicomFolder+" already exist!!")
        continue

    try:
        if extractedFromZip:
            # Start from an empty extraction folder; leftovers of an interrupted run are discarded.
            if os.path.isdir(inputDicomFolder):
                print(inputDicomFolder+" already exist!!")
                shutil.rmtree(inputDicomFolder)
            os.makedirs(inputDicomFolder)  # also creates TempFolders if it is missing
            with zipfile.ZipFile(sourcePath, 'r') as zip_ref:
                zip_ref.extractall(inputDicomFolder)

        ## make a new folder to store new version of dicom (parents are created when missing)
        os.makedirs(outputDicomFolder)

        print("Start processing "+fileAndFolder)
        dicomFN = itk.GDCMSeriesFileNames.New()
        # False: do not use the additional series details when grouping files into a series
        dicomFN.SetUseSeriesDetails(False)
        dicomFN.SetInputDirectory(inputDicomFolder)
        # NOTE(review): presumably only the files of one series are returned and sub-folders
        # are not searched -- confirm for subjects with several series or nested zip layouts.
        inputfnames = dicomFN.GetInputFileNames()
        dicomFN.SetOutputDirectory(outputDicomFolder)
        outputfnames = dicomFN.GetOutputFileNames()

        if len(inputfnames) == 0:
            # The (empty) output folder makes a later run skip this subject, so say it loudly.
            print("WARNING: no DICOM files found for "+fileAndFolder)

        for inputfname, outputfname in zip(inputfnames, outputfnames):
            slicedata = pydicom.dcmread(inputfname)
            _clean_slice_tags(slicedata, allowedKeys)
            slicedata.save_as(outputfname)
    finally:
        # Always remove the temporary extraction, even when processing failed half-way.
        if extractedFromZip and os.path.isdir(inputDicomFolder):
            shutil.rmtree(inputDicomFolder)

    print("Finish cleaning dicom: "+ fileAndFolder)