In [None]:
## if pydicom and itk have not been intalled, run the cell
!pip install pydicom
!pip install itk
!pip install tqdm

In [5]:
import itk
import numpy as np
import os
import re
import pydicom
import itk
import zipfile
import gzip
import shutil

In [4]:
# Display the current working directory of the kernel.
os.getcwd()

'C:\\Users\\chunr\\OneDrive - University of Iowa\\SPIROMICS_DICOM2Nifti'

In [6]:
# Input Folder structure should be
#./folderOfAllDicoms
#       /folder_of_dicoms_subject1
#       /folder_of_dicoms_subject2
#       .......
#OR
#./folderOfAllDicoms
#       /zipfile_of_dicoms_subject1
#       /zipfile_of_dicoms_subject2
#      .......

# folderOfAllDicoms  : folder that is scanned for one sub-folder / zip file per subject.
# folder2WriteDicoms : folder that receives one cleaned output folder per subject.
# TempFolders        : scratch folder; zip files are extracted to "Temp_Extraction" inside it.
# Every backslash is escaped explicitly so that no path relies on Python
# leaving an unknown escape sequence (such as "\S") untouched.
folderOfAllDicoms    = "E:\\SPIROMICS_DATA\\SPIROMICS_V1\\TLC"
folder2WriteDicoms   = "D:\\SPIROMICS_DATA\\SPIROMICS_V1\\TLC"
TempFolders = "D:\\SPIROMICS_DATA\\Temp"

## The tags below do not contain private patient information besides 'PatientID' and 'PatientName'.
## 'PatientName' and 'PatientID' are checked in the code to have formats like "WF123000" and "H-123456"
## respectively; otherwise their values are removed.
key2keep = ['AccessionNumber', 'AcquisitionDate', 'AcquisitionNumber', 'AcquisitionTime','BitsAllocated', 'BitsStored', 
            'Columns', 'ContentDate', 'ContentTime', 'ConvolutionKernel', 'DataCollectionDiameter', 'DistanceSourceToDetector', 
            'DistanceSourceToPatient', 'Exposure', 'ExposureTime', 'FilterType', 'FocalSpots', 'FrameOfReferenceUID', 
            'GantryDetectorTilt', 'GeneratorPower', 'HighBit', 'ImageOrientationPatient', 'ImagePositionPatient', 'ImageType', 
            'InstanceCreationDate', 'InstanceCreationTime', 'InstanceNumber','PatientID', 'PatientName', 'PatientPosition', 
            'PixelData', 'PixelPaddingValue', 'PixelRepresentation', 'PixelSpacing', 'PositionReferenceIndicator', 
            'ProtocolName', 'ReconstructionDiameter', 'ReferencedImageSequence', 'RescaleIntercept', 'RescaleSlope',
            'RescaleType', 'RevolutionTime', 'RotationDirection', 'Rows', 'SOPClassUID', 'SOPInstanceUID', 'SamplesPerPixel', 
            'SeriesDate', 'SeriesDescription', 'SeriesInstanceUID', 'SeriesNumber', 'SeriesTime', 'SingleCollimationWidth', 
            'SliceLocation', 'SliceThickness',  'StudyDate', 'StudyDescription', 'StudyInstanceUID', 'StudyTime', 
            'TableFeedPerRotation', 'TableHeight', 'TableSpeed', 'TotalCollimationWidth', 'WindowCenter', 'WindowWidth', 
            'XRayTubeCurrent']

In [None]:
# Clean every subject found in folderOfAllDicoms and write the result to a
# folder of the same name (zip extension stripped) under folder2WriteDicoms.
# For each slice: private tags are removed, the value of every top-level tag
# not listed in key2keep is cleared, and PatientName / PatientID are cleared
# unless they match the expected study formats.
#
# NOTE(review): a subject whose output folder already exists is skipped, so a
# run that failed half-way leaves a partial folder that must be deleted by
# hand before re-running.

# Raw strings keep "\w" from being read as an (invalid) string escape.
patientNamePattern = re.compile(r"^\w{2}[0-9]{6}$")   # e.g. WF123000
patientIDPattern = re.compile(r"^\w{1}-[0-9]{6}$")    # e.g. H-123456

for fileAndFolder in os.listdir(folderOfAllDicoms):
    inputPath = os.path.join(folderOfAllDicoms, fileAndFolder)

    # Work out where the slices come from and where the cleaned copy goes.
    if os.path.isdir(inputPath):
        inputDicomFolder = inputPath
        outputDicomFolder = os.path.join(folder2WriteDicoms, fileAndFolder)
        extractedToTemp = False
    elif zipfile.is_zipfile(inputPath):
        inputDicomFolder = os.path.join(TempFolders, "Temp_Extraction")
        outputDicomFolder = os.path.join(folder2WriteDicoms, os.path.splitext(fileAndFolder)[0])
        extractedToTemp = True
    else:
        # Neither a folder nor a zip archive: there is nothing to read, so do
        # not create an empty output folder for it.
        print("Skipping " + fileAndFolder + ": not a folder or a zip file")
        continue

    if os.path.isdir(outputDicomFolder):
        print(outputDicomFolder+" already exist!!")
        continue

    try:
        if extractedToTemp:
            # Start from an empty scratch folder so slices of a previous
            # subject can never leak into this one.
            if os.path.isdir(inputDicomFolder):
                print(inputDicomFolder+" already exist!!")
                shutil.rmtree(inputDicomFolder)
            os.makedirs(inputDicomFolder)
            with zipfile.ZipFile(inputPath, 'r') as zip_ref:
                zip_ref.extractall(inputDicomFolder)

        # makedirs also creates folder2WriteDicoms itself when it is missing.
        os.makedirs(outputDicomFolder)

        print("Start processing "+fileAndFolder)
        # Let GDCM list the input slices and the matching output file names.
        dicomFN = itk.GDCMSeriesFileNames.New()
        dicomFN.SetUseSeriesDetails(False)
        dicomFN.SetInputDirectory(inputDicomFolder)
        inputfnames = dicomFN.GetInputFileNames()
        dicomFN.SetOutputDirectory(outputDicomFolder)
        outputfnames = dicomFN.GetOutputFileNames()

        for inputfname, outputfname in zip(inputfnames, outputfnames):
            slicedata = pydicom.dcmread(inputfname)

            # Remove any private tags
            slicedata.remove_private_tags()

            # Only keep the tags listed in key2keep, which do not contain the
            # patient's private information. Dataset.dir() returns the
            # keywords of the elements present at the top level of the dataset.
            for key in slicedata.dir():
                if key not in key2keep:
                    slicedata[key].clear()  ## remove the tag value

            # Make sure PatientName has format like WF123000, otherwise clear it
            if "PatientName" in slicedata:
                if not patientNamePattern.match(str(slicedata["PatientName"].value)):
                    slicedata["PatientName"].clear()

            # Make sure PatientID has format like H-123456, otherwise clear it
            if "PatientID" in slicedata:
                if not patientIDPattern.match(str(slicedata["PatientID"].value)):
                    slicedata["PatientID"].clear()

            slicedata.save_as(outputfname)
    finally:
        # Remove the scratch extraction even when processing failed; an input
        # folder that was read in place is never deleted.
        if extractedToTemp and os.path.isdir(inputDicomFolder):
            shutil.rmtree(inputDicomFolder)

    print("Finish cleaning dicom: "+ fileAndFolder)

Start processing H-14338_WF120922_SPI-WF120922-V1_INSPIRATION_0.625_STANDARD_22718876.zip
Finish cleaning dicom: H-14338_WF120922_SPI-WF120922-V1_INSPIRATION_0.625_STANDARD_22718876.zip
Start processing H-14348_WF120077_SPI-WF120077-V1_INSPIRATION_0.625_STANDARD_22839908.zip
Finish cleaning dicom: H-14348_WF120077_SPI-WF120077-V1_INSPIRATION_0.625_STANDARD_22839908.zip
Start processing H-14349_WF120081_SPI-WF120081-V1_INSPIRATION_0.625_STANDARD_22843485.zip
Finish cleaning dicom: H-14349_WF120081_SPI-WF120081-V1_INSPIRATION_0.625_STANDARD_22843485.zip
Start processing H-14350_WF120045_SPI-WF120045-V1_INSPIRATION_0.625_STANDARD_22845224.zip
Finish cleaning dicom: H-14350_WF120045_SPI-WF120045-V1_INSPIRATION_0.625_STANDARD_22845224.zip
Start processing H-14351_WF120053_SPI-WF120053-V1_INSPIRATION_0.625_STANDARD_22847400.zip
Finish cleaning dicom: H-14351_WF120053_SPI-WF120053-V1_INSPIRATION_0.625_STANDARD_22847400.zip
Start processing H-14352_WF120113_SPI-WF120113-V1_INSPIRATION_0.625_ST

Finish cleaning dicom: H-14391_WF120652_SPI-WF120652-V1_INSPIRATION_0.625_STANDARD_22940093.zip
Start processing H-14392_WF120667_SPI-WF120667-V1_INSPIRATION_0.625_STANDARD_22941669.zip
Finish cleaning dicom: H-14392_WF120667_SPI-WF120667-V1_INSPIRATION_0.625_STANDARD_22941669.zip
Start processing H-14393_WF120678_SPI-WF120678-V1_INSPIRATION_0.625_STANDARD_22945714.zip
Finish cleaning dicom: H-14393_WF120678_SPI-WF120678-V1_INSPIRATION_0.625_STANDARD_22945714.zip
Start processing H-14394_WF120680_SPI-WF120680-V1_INSPIRATION_0.625_STANDARD_22946364.zip
Finish cleaning dicom: H-14394_WF120680_SPI-WF120680-V1_INSPIRATION_0.625_STANDARD_22946364.zip
Start processing H-14395_WF120695_SPI-WF120695-V1_INSPIRATION_0.625_STANDARD_22948748.zip
Finish cleaning dicom: H-14395_WF120695_SPI-WF120695-V1_INSPIRATION_0.625_STANDARD_22948748.zip
Start processing H-14396_WF120764_SPI-WF120764-V1_INSPIRATION_0.625_STANDARD_22950676.zip
Finish cleaning dicom: H-14396_WF120764_SPI-WF120764-V1_INSPIRATION_0.

Start processing H-14436_WF121238_SPI-WF121238-V1_INSPIRATION_0.625_STANDARD_23041934.zip
Finish cleaning dicom: H-14436_WF121238_SPI-WF121238-V1_INSPIRATION_0.625_STANDARD_23041934.zip
Start processing H-14437_WF121266_SPI-WF121266-V1_INSPIRATION_0.625_STANDARD_23044462.zip
Finish cleaning dicom: H-14437_WF121266_SPI-WF121266-V1_INSPIRATION_0.625_STANDARD_23044462.zip
Start processing H-14438_WF121299_SPI-WF121299-V1_INSPIRATION_0.625_STANDARD_23046816.zip
Finish cleaning dicom: H-14438_WF121299_SPI-WF121299-V1_INSPIRATION_0.625_STANDARD_23046816.zip
Start processing H-14439_WF121301_SPI-WF121301-V1_INSPIRATION_0.625_STANDARD_23049484.zip
Finish cleaning dicom: H-14439_WF121301_SPI-WF121301-V1_INSPIRATION_0.625_STANDARD_23049484.zip
Start processing H-14440_WF121314_SPI-WF121314-V1_INSPIRATION_0.625_STANDARD_23051823.zip
Finish cleaning dicom: H-14440_WF121314_SPI-WF121314-V1_INSPIRATION_0.625_STANDARD_23051823.zip
Start processing H-14441_WF121322_SPI-WF121322-V1_INSPIRATION_0.625_ST

Start processing H-14490_JH110224_SPI-JH110224-V1_INSPIRATION_0.75_B35f_23217614.zip
Finish cleaning dicom: H-14490_JH110224_SPI-JH110224-V1_INSPIRATION_0.75_B35f_23217614.zip
Start processing H-14491_JH110267_SPI-JH110267-V1_INSPIRATION_0.75_B35f_23220214.zip
Finish cleaning dicom: H-14491_JH110267_SPI-JH110267-V1_INSPIRATION_0.75_B35f_23220214.zip
Start processing H-14492_JH110398_SPI-JH110398-V1_INSPIRATION_0.75_B35f_23223568.zip
Finish cleaning dicom: H-14492_JH110398_SPI-JH110398-V1_INSPIRATION_0.75_B35f_23223568.zip
Start processing H-14493_JH110156_SPI-JH110156-V1_INSPIRATION_0.75_B35f_23226623.zip
Finish cleaning dicom: H-14493_JH110156_SPI-JH110156-V1_INSPIRATION_0.75_B35f_23226623.zip
Start processing H-14494_JH110321_SPI-JH110321-V1_INSPIRATION_0.75_B35f_23229428.zip
Finish cleaning dicom: H-14494_JH110321_SPI-JH110321-V1_INSPIRATION_0.75_B35f_23229428.zip
Start processing H-14495_JH110410_SPI-JH110410-V1_INSPIRATION_0.75_B35f_23232020.zip
Finish cleaning dicom: H-14495_JH11

Start processing H-14546_JH111039_SPI-JH111039-V1_INSPIRATION_0.75_B35f_23350087.zip
Finish cleaning dicom: H-14546_JH111039_SPI-JH111039-V1_INSPIRATION_0.75_B35f_23350087.zip
Start processing H-14547_JH113755_SPI-JH113755-V1_INSPIRATION_0.75_B35f_23352654.zip
Finish cleaning dicom: H-14547_JH113755_SPI-JH113755-V1_INSPIRATION_0.75_B35f_23352654.zip
Start processing H-14548_JH113764_SPI-JH113764-V1_INSPIRATION_0.75_B35f_23356008.zip
Finish cleaning dicom: H-14548_JH113764_SPI-JH113764-V1_INSPIRATION_0.75_B35f_23356008.zip
Start processing H-14549_JH113731_SPI-JH113731-V1_INSPIRATION_0.75_B35f_23358648.zip
Finish cleaning dicom: H-14549_JH113731_SPI-JH113731-V1_INSPIRATION_0.75_B35f_23358648.zip
Start processing H-14550_JH113678_SPI-JH113678-V1_INSPIRATION_0.75_B35f_23361348.zip
Finish cleaning dicom: H-14550_JH113678_SPI-JH113678-V1_INSPIRATION_0.75_B35f_23361348.zip
Start processing H-14551_JH113680_SPI-JH113680-V1_INSPIRATION_0.75_B35f_23364548.zip
Finish cleaning dicom: H-14551_JH11

In [None]:
# H-14382 and H-14398 are not correct