### Transfer dicom files to pseudonymization destination

<details>
<summary>STEP 1 BIG PICTURE</summary>
We collected data from the centers in folders named by patient ID (e.g. admission). We want to clean these directories so that:
I: Each CT study is placed in its own folder.
II: Cases are stored in an Excel file that lists each case's DICOM files together with all other variables (outcome, clinical, and pathology data). We call this file the master key; it also contains the patient ID (un-anonymized) along with the key used for anonymization.
III: DICOM-only files are transferred to a new destination and these images are anonymized.
</details>
<details>
<summary>PREVIOUS STEP</summary>
We found all file types in our directory (I ran the code for each center separately. With 1.5 terabytes of information and ~1800 cases, it collectively took 30 hours on a laptop with an RTX 3080 Ti, a 12th-gen Core i9, and 32 GB of RAM).
</details>
<details>
<summary>THIS STEP</summary>
In this step we will add unique dicom meta data about patient info, study info, and series info
</details>
<details>
<summary>NEXT STEP</summary>
Finding dicom meta data
</details>


### Libraries & Functions

In [None]:
## Add this to the first block in your note book to show json files in the jupyter output
import uuid
from IPython.core.display import display, HTML
import pydicom as pm
import os

import json

class RenderJSON(object):
    """Render JSON as a collapsible tree in the Jupyter cell output.

    The tree is drawn in the browser by the ``renderjson`` JavaScript
    library, which is loaded through ``require`` when the output is shown.

    Args:
        json_data: A ``dict``, ``list`` or ``tuple`` (serialised with
            ``json.dumps``), or a string that already contains JSON text
            (embedded unchanged).

    NOTE(review): the script is fetched from rawgit.com at display time, so
    rendering needs network access and a front end that provides
    ``require`` -- confirm the URL is still being served.
    """

    def __init__(self, json_data):
        # Serialise every JSON container, not only dicts: interpolating a
        # Python list with ``%s`` would emit Python repr (None/True/False)
        # instead of valid JSON/JavaScript.
        if isinstance(json_data, (dict, list, tuple)):
            self.json_str = json.dumps(json_data)
        else:
            self.json_str = json_data
        # Unique id of the <div> the tree is attached to.
        self.uuid = str(uuid.uuid4())
        # This line is missed out in most of the versions of this script across the web, it is essential for this to work interleaved with print statements
        self._ipython_display_()

    def _ipython_display_(self):
        """Emit the placeholder <div> and the script that fills it."""
        display(HTML('<div id="{}" style="height: auto; width:100%;"></div>'.format(self.uuid)))
        display(HTML("""<script>
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
        renderjson.set_show_to_level(1)
        document.getElementById('%s').appendChild(renderjson(%s))
        });</script>
        """ % (self.uuid, self.json_str)))

# Since this is copy-pasted wrongly(mostly) at a lot of places across the web, i'm putting the fixed, updated version here, mainly for self-reference


## To use this function, call it with a dict or a JSON string; this works even when you have a print statement before or after the RenderJSON call.
## The call is kept as a comment because `dict_to_render` is only a placeholder (it is not defined in the visible cells), so running it raised NameError.
# RenderJSON(dict_to_render)

In [None]:
#E1 that works
import pydicom as pm
import os

def get_dicomdir_give_dicomseriesdic_v1(dicom_dir, dicom_validation=False):
    """
    Map every DICOM series description found under a directory to its series number.

    Walks ``dicom_dir`` recursively and reads the header of each file. Files
    that cannot be read are reported with ``print`` and skipped.

    Args:
        dicom_dir: Root directory to scan.
        dicom_validation: When True, files for which ``pydicom.misc.is_dicom``
            returns False are skipped without attempting to read them.

    Returns:
        dict: ``{series description: series number}``. Files lacking either
        tag are ignored; when several files share a description, the number
        of the last one read is kept.
    """

    series_info = {}
    for root, _dirs, files in os.walk(dicom_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if dicom_validation and not pm.misc.is_dicom(file_path):
                continue
            try:
                # Only header tags are needed, so do not load the pixel data.
                dicom_file = pm.dcmread(file_path, stop_before_pixels=True)
                series_name = dicom_file.get((0x0008, 0x103E), None)  # Series Description Tag
                series_number = dicom_file.get((0x0020, 0x0011), None)  # Series Number Tag
                if series_name and series_number:
                    series_info[series_name.value] = series_number.value
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"Error reading {file}: {e}")
    return series_info

#dicom_dir = r"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Taleghani\Maybe Case Image\243018"
#series = get_dicomdir_give_dicomseriesdic_v1(dicom_dir)
#print(series)



In [None]:
#E2 that works (I solved many problems, such as finding the unique studies)

import pydicom as pm
import os

def get_dicomdir_give_dicomseriesdic_json(dicom_dir, dicom_validation=False):
    """
    Group the DICOM series descriptions found under a directory by Study ID.

    Walks ``dicom_dir`` recursively and reads the header of each file. Files
    that cannot be read are reported with ``print`` and skipped.

    Args:
        dicom_dir: Root directory to scan.
        dicom_validation: When True, files for which ``pydicom.misc.is_dicom``
            returns False are skipped without attempting to read them.

    Returns:
        dict: ``{last component of dicom_dir: {study id: [series description, ...]}}``.
        Each description is listed once per study, in the order first seen.
        Files lacking the Study ID or Series Description tag are ignored.
    """

    last_dir_name = os.path.basename(os.path.normpath(dicom_dir))
    studies = {}
    dicom_data = {last_dir_name: studies}

    for root, _dirs, files in os.walk(dicom_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if dicom_validation and not pm.misc.is_dicom(file_path):
                continue
            try:
                # Only header tags are needed, so do not load the pixel data.
                dicom_file = pm.dcmread(file_path, stop_before_pixels=True)
                study_id = dicom_file.get((0x0020, 0x0010), None)  # Study ID Tag
                series_name = dicom_file.get((0x0008, 0x103E), None)  # Series Description Tag
                if study_id and series_name:
                    series_names = studies.setdefault(study_id.value, [])
                    if series_name.value not in series_names:
                        series_names.append(series_name.value)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"Error reading {file}: {e}")
    return dicom_data

# Example run on one hard-coded local folder; the bare `series_2` expression
# on the last line makes Jupyter display the returned dictionary.
dicom_dir = r"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Taleghani\Maybe Case Image\243018"
series_2=get_dicomdir_give_dicomseriesdic_json(dicom_dir)
series_2



In [None]:
#E3
import pydicom as pm
import os

def get_dicomdir_give_dicomseriesdic(dicom_dir, dicom_validation=False):
    """
    Group DICOM series by (Study ID, sub-directory) for every file under a directory.

    Walks ``dicom_dir`` recursively and reads the header of each file. Files
    that cannot be read are reported with ``print`` and skipped.

    Args:
        dicom_dir: Root directory to scan.
        dicom_validation: When True, files for which ``pydicom.misc.is_dicom``
            returns False are skipped without attempting to read them.

    Returns:
        dict: ``{last component of dicom_dir: {(study id, subdirectory):
        [(series number, series description), ...]}}`` where ``subdirectory``
        is the part of the file's folder path that follows ``dicom_dir``
        (empty for files directly in ``dicom_dir``). Files lacking the Study
        ID, Series Description or Series Number tag are ignored.

    Note:
        The inner keys are tuples, which ``json.dumps`` does not accept as
        dictionary keys.
    """

    last_dir_name = os.path.basename(os.path.normpath(dicom_dir))
    studies = {}
    dicom_data = {last_dir_name: studies}

    # Length of the scanned root without trailing separators. Slicing by this
    # prefix (instead of splitting on the folder name) stays correct when the
    # same folder name occurs again elsewhere in the path.
    trailing_seps = os.sep + (os.altsep or "")
    prefix_len = len(os.fspath(dicom_dir).rstrip(trailing_seps))

    for root, _dirs, files in os.walk(dicom_dir):
        subdirectory = root[prefix_len:]
        for file in files:
            file_path = os.path.join(root, file)
            if dicom_validation and not pm.misc.is_dicom(file_path):
                continue
            try:
                # Only header tags are needed, so do not load the pixel data.
                dicom_file = pm.dcmread(file_path, stop_before_pixels=True)
                study_id = dicom_file.get((0x0020, 0x0010), None)  # Study ID Tag
                series_name = dicom_file.get((0x0008, 0x103E), None)  # Series Description Tag
                series_number = dicom_file.get((0x0020, 0x0011), None)  # Series Number Tag
                if study_id and series_name and series_number:
                    series_info = (series_number.value, series_name.value)
                    study_info = (study_id.value, subdirectory)
                    series_list = studies.setdefault(study_info, [])
                    if series_info not in series_list:
                        series_list.append(series_info)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"Error reading {file}: {e}")
    return dicom_data

# Alternative single-folder input, kept for reference:
#dicom_dir = r"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Taleghani\Maybe Case Image\243018"
# Scan a hard-coded local folder with DICOM validation enabled; the bare
# `series` expression makes Jupyter display the returned dictionary.
dicom_dir=r'D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Dr Radmard\Valid Case'
series = get_dicomdir_give_dicomseriesdic(dicom_dir,dicom_validation=True)
series

In [None]:
#E4
import pydicom as pm
import os

def generate_study_identifier(dicom_file):
    """Build a ``<study id>_<study date>_<patient id>`` identifier for a study.

    Args:
        dicom_file: Object with a ``get(tag, default)`` method returning
            elements that expose ``.value`` (e.g. the result of
            ``pydicom.dcmread``).

    Returns:
        str: The three values joined by underscores. A tag that is absent
        contributes ``"NoID"``, ``"NoDate"`` or ``"NoPatientID"`` respectively.
    """

    def tag_value(tag, fallback):
        # The fallback is a plain string, so it must not be dereferenced with
        # ``.value`` the way a found element is.
        element = dicom_file.get(tag, None)
        return fallback if element is None else element.value

    # Use a combination of tags to create a unique identifier for each study
    study_id = tag_value((0x0020, 0x0010), "NoID")  # Study ID
    study_date = tag_value((0x0008, 0x0020), "NoDate")  # Study Date
    patient_id = tag_value((0x0010, 0x0020), "NoPatientID")  # Patient ID
    return f"{study_id}_{study_date}_{patient_id}"

def _element_value(element):
    """Return a JSON-friendly copy of an element's value (None if the tag is absent)."""
    if element is None:
        return None
    value = element.value
    if value is None or isinstance(value, (str, int, float)):
        return value
    try:
        # Multi-valued elements (e.g. Image Type) become plain lists.
        return [item if isinstance(item, (str, int, float)) else str(item) for item in value]
    except TypeError:
        # Not iterable: fall back to its text form.
        return str(value)


def get_dicomdir_give_dicomseriesdic(dicom_dir, dicom_validation=False):
    """
    Collect study-level and series-level DICOM metadata for a directory tree.

    Walks ``dicom_dir`` recursively and reads the header of each file. Files
    that cannot be read are reported with ``print`` and skipped, as are files
    lacking the Study ID, Series Description, Series Number or Study Date tag.

    Args:
        dicom_dir: Root directory to scan.
        dicom_validation: When True, files for which ``pydicom.misc.is_dicom``
            returns False are skipped without attempting to read them.

    Returns:
        dict: ``{last component of dicom_dir: {study key: study record}}``.
        The study key is ``"<first sub-folder>_<study id>_<study date>"``
        (the sub-folder part is ``None`` for files directly in ``dicom_dir``).
        Each study record is a dict with

        * scalar fields taken from the first file seen for that study:
          ``dir_to_root``, ``study_description``, ``date``, ``age``, ``sex``,
          ``manufacture``, ``protocol`` (``None`` when the tag is absent);
        * parallel lists with one entry per distinct series:
          ``image_series_list`` (``(series number, series description)``),
          ``Image_type``, ``body_part``, ``slice_thickness``,
          ``Image_comment``.
    """

    last_dir_name = os.path.basename(os.path.normpath(dicom_dir))
    studies = {}
    dicom_data = {last_dir_name: studies}

    for root, _dirs, files in os.walk(dicom_dir):
        # First folder level below dicom_dir, independent of the OS separator.
        relative_root = os.path.relpath(root, dicom_dir)
        if relative_root == os.curdir:
            first_subfolder = None
        else:
            first_subfolder = relative_root.split(os.sep)[0]

        for file in files:
            file_path = os.path.join(root, file)
            if dicom_validation and not pm.misc.is_dicom(file_path):
                continue
            try:
                # Only header tags are needed, so do not load the pixel data.
                dicom_file = pm.dcmread(file_path, stop_before_pixels=True)
                study_id = dicom_file.get((0x0020, 0x0010), None)  # Study ID Tag
                series_name = dicom_file.get((0x0008, 0x103E), None)  # Series Description Tag
                series_number = dicom_file.get((0x0020, 0x0011), None)  # Series Number Tag
                study_date = dicom_file.get((0x0008, 0x0020), None)  # Study Date Tag
                required = (study_id, series_name, series_number, study_date)
                if any(element is None for element in required):
                    continue

                study_unique = f'{first_subfolder}_{study_id.value}_{study_date.value}'
                study = studies.get(study_unique)
                if study is None:
                    # NOTE(review): (0008,1090) is stored under 'manufacture';
                    # confirm whether (0008,0070) Manufacturer was intended.
                    study = studies[study_unique] = {
                        'dir_to_root': root,
                        'study_description': _element_value(dicom_file.get((0x0008, 0x1030), None)),
                        'date': _element_value(study_date),
                        'age': _element_value(dicom_file.get((0x0010, 0x1010), None)),
                        'sex': _element_value(dicom_file.get((0x0010, 0x0040), None)),
                        'manufacture': _element_value(dicom_file.get((0x0008, 0x1090), None)),
                        'protocol': _element_value(dicom_file.get((0x0018, 0x1030), None)),
                        'image_series_list': [],
                        'Image_type': [],
                        'body_part': [],
                        'slice_thickness': [],
                        'Image_comment': [],
                    }

                series_info = (_element_value(series_number), _element_value(series_name))
                if series_info not in study['image_series_list']:
                    study['image_series_list'].append(series_info)
                    study['Image_type'].append(_element_value(dicom_file.get((0x0008, 0x0008), None)))
                    study['body_part'].append(_element_value(dicom_file.get((0x0018, 0x0015), None)))
                    study['slice_thickness'].append(_element_value(dicom_file.get((0x0018, 0x0050), None)))
                    study['Image_comment'].append(_element_value(dicom_file.get((0x0020, 0x4000), None)))
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"Error reading {file}: {e}")
    return dicom_data

# Alternative input folder, kept for reference:
#dicom_dir=r'D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Dr Radmard\Valid Case'
# Scan a hard-coded local example folder with DICOM validation enabled; the
# bare `series` expression makes Jupyter display the returned dictionary.
dicom_dir=r"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\Example\example s"
series = get_dicomdir_give_dicomseriesdic(dicom_dir,dicom_validation=True)
series

In [None]:
# Transform the multi-level dictionary to JSON, render it inline, and save it.

# `series` and `dicom_dir` are the module-level names assigned in an earlier cell.
series_json = json.dumps(series, indent=4)
# RenderJSON receives the already-serialised string and embeds it unchanged.
RenderJSON(series_json)

# The JSON text is written to 'dicomseries.json' inside the scanned folder.
json_file_name='dicomseries.json'
saveto_json_dir= os.path.join(dicom_dir, json_file_name)
with open(saveto_json_dir, 'w') as file:
    file.write(series_json)

### ARCHIVED CODES (TRASH)

In [None]:
# Archived, unfinished draft: builds a file path from the `Full_Directory`
# and `File` columns of `data`.
# NOTE(review): the `else:` branch below has no body, so this cell does not
# parse as written.
for i in range(len(data['Full_Directory'])):
    # `i` is reset on every pass and only `.iloc[0]` is read below, so each
    # iteration looks at the same first row.
    i=0

    # NOTE(review): this is an identity test on the object stored under
    # 'If_dicom'; presumably a per-row check was intended -- confirm.
    if data['If_dicom'] is True:
        dcm_dir="{}\{}".format(data["Full_Directory"].iloc[0],data["File"].iloc[0])
        dcm_dir
        
    else:
        

In [None]:
import pydicom as pm

# Build the path of the first listed file and read it. The backslash is
# written as "\\" because "\{" is an invalid escape sequence; the resulting
# string is unchanged.
dcm_dir = "{}\\{}".format(data["Full_Directory"].iloc[0], data["File"].iloc[0])
dicom_file = pm.dcmread(dcm_dir)





In [None]:
Hospital_name= "Guilan"
# Raw f-strings: the Windows backslashes are no longer parsed as (invalid)
# escape sequences, while {Hospital_name} is still substituted. The resulting
# paths are unchanged.
directory_shortlist=rf"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\{Hospital_name}_data_short_just_dcm.xlsx"
directory_longlist=rf"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\{Hospital_name}_data.csv"

# Load the per-file listing and keep only the rows flagged as DICOM.
directory_longlist=pd.read_csv(directory_longlist)
directory_longlist_dcm=directory_longlist[directory_longlist['If_dicom']==True]
directory_longlist_dcm=directory_longlist_dcm.reset_index()

# Preview the path built for the row at position 1. `.iloc[i, col]` selects by
# position directly instead of relying on integer keys being treated as
# positions on the row Series.
i=1
dir_path = os.path.join(directory_longlist_dcm.iloc[i, 3], directory_longlist_dcm.iloc[i, 2])
dir_path



# This code aimed to get all DICOM metadata so I can work with it and get to know it, especially for anonymization and for identifying the phase.
# However, it failed due to different DICOM formats. I will use JSON instead (I should have used it in the first place).
# The JSON approach also produces many errors, so I added try-except inside the loop to handle errors while still finishing the loop.
# NOTE(review): `time` and `rename_duplicate_columns` are not defined in this
# cell; the latter is defined in a later cell, `time` is not imported in the
# visible part of the notebook.

dcminfo_list = []  # List to store the individual DataFrame pieces

print(f'total rows in your dataframe is {len(directory_longlist_dcm)}')
start_time=time.time()

# The range starts at 1, so the row at position 0 is not processed.
for i in range(1, len(directory_longlist_dcm)):
    # Columns are addressed by position (3 and 2) to build the file path.
    dir_path = os.path.join(directory_longlist_dcm.iloc[i][3], directory_longlist_dcm.iloc[i][2])
    
    try:
        ds = pm.dcmread(dir_path)
        ds = pd.DataFrame(ds.values())
        # More than one column: keep the whole frame as text in a single cell.
        if ds.shape[1]>1:
            ds= pd.DataFrame({'WARNING_MORETHAN1ROW_DF2CELL': [ds.to_string()]})
        else: 
            # Convert raw elements, then turn the element list into one wide
            # row whose column headers are the element names.
            ds[0] = ds[0].apply(lambda x: pm.dataelem.DataElement_from_raw(x) if isinstance(x, pm.dataelem.RawDataElement) else x)
            ds['name'] = ds[0].apply(lambda x: x.name)
            ds['value'] = ds[0].apply(lambda x: x.value)
            ds = ds[['name', 'value']]
            ds = ds.T
            new_header = ds.iloc[0]  # First row as header
            ds = ds[1:]  # Taking the rest of the data
            ds.columns = new_header  # Setting the new header

        # Progress report every 1000 rows.
        if i % 1000 == 0:
            percentage = (i / len(directory_longlist_dcm)) * 100
            end_time = time.time() 
            elapsed_time = end_time - start_time
            print(f'Processed {i} rows, which is {percentage:.2f}% of total rows in {elapsed_time} seconds.')

        # Keep the source path and the 'Unnamed: 0' value of the listing row.
        ds['to_directory'] = dir_path
        ds['key2csv']=directory_longlist_dcm['Unnamed: 0'][i]
    
    except Exception as e:
        # Any failure is recorded as a one-row frame instead of stopping the loop.
        error_message = str(e)
        ds = pd.DataFrame({'WARNING_ERROR': [error_message], 'to_directory': dir_path, 'key2csv': directory_longlist_dcm['Unnamed: 0'][i]})

    dcminfo_list.append(ds)

# Make column names unique in each piece before concatenating them.
for df in dcminfo_list:
    rename_duplicate_columns(df)


dcminfo_all=pd.concat(dcminfo_list, ignore_index=True, sort=False)
dcminfo_all


In [None]:

# Read one DICOMDIR file, convert it to JSON text, and parse that text back
# into Python objects for inspection.
# NOTE(review): the hard-coded path appears to contain a person's name --
# confirm it may be kept in a notebook that is part of an anonymization workflow.
# The raw string keeps every doubled backslash as two characters.
ds = pm.dcmread(r'D:\\Data\\Big Pancreas (CT, EUS)\\Raw Data Hospital\\Guilan\\Valid Case\\PG1002-malihe hoseynlo\\DICOMDIR')
js=ds.to_json()
data=json.loads(js)
data

In [None]:
import pydicom as pm
import pandas as pd
import os

def rename_duplicate_columns(df):
    """Suffix repeated column labels of ``df`` in place so they can be told apart.

    The first occurrence of a label is left alone; its n-th repeat becomes
    ``<label>_DUP<n>``. Nothing is returned.
    """
    labels = list(df.columns)

    # Labels occurring more than once, ordered by where their first repeat sits.
    seen = set()
    repeated = []
    for label in labels:
        if label not in seen:
            seen.add(label)
        elif label not in repeated:
            repeated.append(label)

    # Handle one repeated label at a time, always against the current labels.
    for dup in repeated:
        hits = [pos for pos, label in enumerate(labels) if label == dup]
        for nth, pos in enumerate(hits):
            if nth != 0:
                labels[pos] = dup + '_DUP' + str(nth)

    df.columns = pd.Index(labels, dtype=df.columns.dtype, name=df.columns.name)

Hospital_name= "Guilan"
# Raw f-strings: the Windows backslashes are no longer parsed as (invalid)
# escape sequences, while {Hospital_name} is still substituted. The resulting
# paths are unchanged.
directory_shortlist=rf"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\{Hospital_name}_data_short_just_dcm.xlsx"
directory_longlist=rf"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\{Hospital_name}_data.csv"


# Load the listing once (a second read_csv call was being handed the
# DataFrame produced by the first one) and keep only the rows flagged as DICOM.
directory_longlist=pd.read_csv(directory_longlist)
directory_longlist_dcm=directory_longlist[directory_longlist['If_dicom']==True]
directory_longlist_dcm=directory_longlist_dcm.reset_index()





import os
import pandas as pd
import pydicom as pm

# Archived draft: read every listed file and turn its elements into one wide
# DataFrame row per file. There is no error handling here, so the first
# failing file stops the loop.
dcminfo_list = []  # List to store the individual DataFrame pieces

# The range starts at 1, so the row at position 0 is not processed.
for i in range(1, len(directory_longlist_dcm)):
    # Columns are addressed by position (4 and 3) to build the file path.
    dir_path = os.path.join(directory_longlist_dcm.iloc[i][4], directory_longlist_dcm.iloc[i][3])
    ds = pm.dcmread(dir_path)
    ds = pd.DataFrame(ds.values())
    # Convert raw elements, then keep only each element's name and value.
    ds[0] = ds[0].apply(lambda x: pm.dataelem.DataElement_from_raw(x) if isinstance(x, pm.dataelem.RawDataElement) else x)
    ds['name'] = ds[0].apply(lambda x: x.name)
    ds['value'] = ds[0].apply(lambda x: x.value)
    ds = ds[['name', 'value']]
    # Transpose so the names become the header of a single row of values.
    ds = ds.T
    new_header = ds.iloc[0]  # First row as header
    ds = ds[1:]  # Taking the rest of the data
    ds.columns = new_header  # Setting the new header
    # Keep the source path and the 'Unnamed: 0' value of the listing row.
    ds['to_directory'] = dir_path
    ds['key2csv']=directory_longlist_dcm['Unnamed: 0'][i]
    

    dcminfo_list.append(ds)

# Make column names unique in each piece before concatenating them.
for df in dcminfo_list:
    rename_duplicate_columns(df)


dcminfo_all=pd.concat(dcminfo_list, ignore_index=True, sort=False)
dcminfo_all


In [None]:
# From the previous dataframe of directories, read all DICOM files.
# NOTE(review): archived draft. `js` is computed but never used, and `ds` is
# still the object returned by `pm.dcmread` (not a DataFrame) when the two
# extra keys are assigned and when it is passed on to
# `rename_duplicate_columns` / `pd.concat` -- presumably unfinished; verify
# before reuse. `time` is not imported in the visible part of the notebook.
dcminfo_list = []  # List to store the individual DataFrame pieces

print(f'total rows in your dataframe is {len(directory_longlist_dcm)}')
start_time=time.time()

# The range starts at 1, so the row at position 0 is not processed.
for i in range(1, len(directory_longlist_dcm)):
    # Columns are addressed by position (4 and 3) to build the file path.
    dir_path = os.path.join(directory_longlist_dcm.iloc[i][4], directory_longlist_dcm.iloc[i][3])
    ds = pm.dcmread(dir_path)
    js=ds.to_json()
    

    # Progress report every 1000 rows.
    if i % 1000 == 0:
        percentage = (i / len(directory_longlist_dcm)) * 100
        end_time = time.time() 
        elapsed_time = end_time - start_time
        print(f'Processed {i} rows, which is {percentage:.2f}% of total rows in {elapsed_time} seconds.')

    ds['to_directory'] = dir_path
    ds['key2csv']=directory_longlist_dcm['Unnamed: 0'][i]
    

    dcminfo_list.append(ds)

for df in dcminfo_list:
    rename_duplicate_columns(df)


dcminfo_all=pd.concat(dcminfo_list, ignore_index=True, sort=False)
dcminfo_all


In [None]:
# From the previous dataframe of directories, read all DICOM files.
# NOTE(review): this cell repeats the preceding archived cell line for line.
# `js` is computed but never used, and `ds` is still the object returned by
# `pm.dcmread` (not a DataFrame) when the two extra keys are assigned --
# presumably unfinished; verify before reuse.
dcminfo_list = []  # List to store the individual DataFrame pieces

print(f'total rows in your dataframe is {len(directory_longlist_dcm)}')
start_time=time.time()

# The range starts at 1, so the row at position 0 is not processed.
for i in range(1, len(directory_longlist_dcm)):
    # Columns are addressed by position (4 and 3) to build the file path.
    dir_path = os.path.join(directory_longlist_dcm.iloc[i][4], directory_longlist_dcm.iloc[i][3])
    ds = pm.dcmread(dir_path)
    js=ds.to_json()
    

    # Progress report every 1000 rows.
    if i % 1000 == 0:
        percentage = (i / len(directory_longlist_dcm)) * 100
        end_time = time.time() 
        elapsed_time = end_time - start_time
        print(f'Processed {i} rows, which is {percentage:.2f}% of total rows in {elapsed_time} seconds.')

    ds['to_directory'] = dir_path
    ds['key2csv']=directory_longlist_dcm['Unnamed: 0'][i]
    

    dcminfo_list.append(ds)

for df in dcminfo_list:
    rename_duplicate_columns(df)


dcminfo_all=pd.concat(dcminfo_list, ignore_index=True, sort=False)
dcminfo_all


In [None]:

# Read one DICOMDIR file, convert it to JSON text (`js`), and parse that text
# back into Python objects (`data`) for inspection.
# NOTE(review): the hard-coded path appears to contain a person's name --
# confirm it may be kept in a notebook that is part of an anonymization workflow.
ds = pm.dcmread(r'D:\\Data\\Big Pancreas (CT, EUS)\\Raw Data Hospital\\Guilan\\Valid Case\\PG1002-malihe hoseynlo\\DICOMDIR')
js=ds.to_json()
data=json.loads(js)
data

In [None]:
# Function to flatten the JSON recursively
def flatten_json(y):
    """Collapse nested dicts/lists into one flat dict.

    Keys of the result are the path to each leaf, with dict keys and list
    positions joined by ``_`` (e.g. ``{"a": [{"b": 1}]}`` -> ``{"a_0_b": 1}``).
    A top-level value that is neither a dict nor a list ends up under the
    empty key ``""``; empty containers contribute nothing.
    """

    def walk(node, prefix):
        # Yield (flat key, leaf value) pairs in depth-first order.
        kind = type(node)
        if kind is dict:
            for key in node:
                yield from walk(node[key], prefix + key + '_')
        elif kind is list:
            for position, item in enumerate(node):
                yield from walk(item, prefix + str(position) + '_')
        else:
            # Drop the trailing underscore added by the parent level.
            yield prefix[:-1], node

    return dict(walk(y, ''))

# `js` is JSON *text*; flattening the string itself yields a single cell, so
# parse it first and flatten the resulting nested structure into one row.
flat_data = flatten_json(json.loads(js))
dff = pd.DataFrame([flat_data])
dff

In [None]:
# Export the combined metadata table. Raw f-string: the Windows backslashes
# are no longer parsed as (invalid) escape sequences; the path is unchanged.
dcminfo_all.to_excel(rf"D:\Data\Big Pancreas (CT, EUS)\Raw Data Hospital\{Hospital_name}_testdicomdataframe.xlsx")