### Anonymization of dicom

<details>
<summary>STEP 1 BIG PICTURE</summary>
We collected data from the centers in folders named by patient ID (e.g. admission). We want to clean up these directories so that:
I: Each CT study is placed in its own folder.
II: Cases are stored in an Excel file, with their DICOM files listed in the table along with all other variables (outcome, clinical, and pathology data). We call this the master key; it also contains the (un-anonymized) patient ID along with the key used for anonymization.
III: DICOM-only files are transferred to a new destination and these images are anonymized.
</details>

<details>
<summary>PREVIOUS STEP</summary>
We previously transferred all DICOM files of the enrolled cases to a new destination. Now we want to anonymize them.
</details>

<details>
<summary>THIS STEP</summary>
This code removes DICOM metadata that may reveal patient identity, while preserving many useful tags within the DICOM metadata.
</details>

<details>
<summary>NEXT STEP</summary>
Upload to Xnat for labeling and anonymization.
</details>

### Preparing Files: Function and library

In [None]:
#final

# Prepare for anonymization
# First, we want to prepare the DICOMs. In previous steps we stored each DICOM study in one folder.
# However, for many cases, series are stored in different folders 'SR_1', 'SR_2', etc.
# This code adds the folder name at the beginning of the DICOM files within that folder, and then transfers all files to the main DICOM study folder.
# Also, if all DICOMs are stored in one place (1, 2, 3, ..., 1500), we add the series number at the beginning of their names, so we can choose the relevant series for transferring to the server.


import os
from tqdm.notebook import tqdm
from IPython.display import HTML
import pydicom as pm
import shutil

def create_clickable_dir_path(dir_path):
    """Return an IPython ``HTML`` anchor whose text and target are ``dir_path``.

    The path is used verbatim as the ``href`` (no ``file://`` scheme or
    escaping is applied) and the link is set to open in a new tab.
    """
    # The link target is simply the textual form of the path.
    file_url = format(dir_path)
    link_markup = f'<a href="{file_url}" target="_blank">{dir_path}</a>'
    return HTML(link_markup)


def add_series_2beginingoffile(directory, adding_directory_2progresbar=''):
    """Prefix each DICOM file directly inside ``directory`` with its series number.

    Every regular file is read with pydicom and renamed in place from
    ``<name>`` to ``SR<SeriesNumber>_<name>``. Sub-directories are ignored.
    A file that cannot be read, has no ``SeriesNumber`` or cannot be renamed
    is left untouched; it is counted as skipped and listed with the reason
    at the end, so one bad file never stops the run.

    Args:
        directory: Folder whose files are renamed (not recursive).
        adding_directory_2progresbar: Text inserted into the progress-bar
            description.
    """
    renamed_count = 0
    skipped = {}  # file name -> short description of why it was skipped

    for filename in tqdm(os.listdir(directory),desc=f'Adding series name {adding_directory_2progresbar} at the beggining of files', unit='files'):
        file_path = os.path.join(directory, filename)

        if not os.path.isfile(file_path):
            continue

        try:
            # Only the header is needed to get the series number; skipping
            # the pixel data avoids loading every image into memory.
            dicom_file = pm.dcmread(file_path, stop_before_pixels=True)
            series_number = dicom_file.SeriesNumber

            new_filename = f"SR{series_number}_{filename}"
            os.rename(file_path, os.path.join(directory, new_filename))
        except Exception as e:
            # Best effort: keep going, but remember what was skipped and why
            # instead of discarding the error.
            skipped[filename] = f"{type(e).__name__}: {e}"
        else:
            renamed_count += 1

    print(f"Total files skipped: {len(skipped)}")
    for skipped_name, reason in skipped.items():
        print(f"  skipped {skipped_name}: {reason}")
    print(f"Total files renamed: {renamed_count}")

def add_subfolder_name_and_move(main_directory,adding_directory_2progresbar=''):
    """Flatten one level of sub-folders of ``main_directory``.

    Every regular file found directly inside a sub-folder is moved up into
    ``main_directory`` and renamed ``<subfolder>_<file>``. Sub-folders that
    end up empty are then removed. Totals for moved files, removed folders
    and remaining folders are printed.

    A file is NOT moved when its destination name already exists, because
    ``shutil.move`` would otherwise silently replace the existing file; such
    files are reported and stay in their sub-folder.

    Args:
        main_directory: Study folder whose sub-folders are flattened.
        adding_directory_2progresbar: Text inserted into the progress-bar
            description.
    """
    files_moved = 0
    files_not_moved = 0

    for folder in tqdm(os.listdir(main_directory),desc=f'Move and rename data from subfolders of {adding_directory_2progresbar}', unit='folders'):
        folder_path = os.path.join(main_directory, folder)
        if not os.path.isdir(folder_path):
            continue

        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if not os.path.isfile(file_path):
                continue

            # New name carries the sub-folder as a prefix
            new_file_path = os.path.join(main_directory, f"{folder}_{file}")

            if os.path.exists(new_file_path):
                # Never overwrite an existing image on a name collision.
                files_not_moved += 1
                print(f"Skipped (destination already exists): {file_path}")
                continue

            shutil.move(file_path, new_file_path)
            files_moved += 1

    print(f"Total number of files moved: {files_moved}")
    if files_not_moved:
        print(f"Total number of files left in place (name collision): {files_not_moved}")

    # Remove sub-folders that are now empty
    empty_folder_removed = 0
    for folder in os.listdir(main_directory):
        folder_path = os.path.join(main_directory, folder)

        if os.path.isdir(folder_path) and not os.listdir(folder_path):
            os.rmdir(folder_path)
            empty_folder_removed += 1
            print(f"Removed empty folder: {folder_path}")

    remaining_folders = sum(os.path.isdir(os.path.join(main_directory, d)) for d in os.listdir(main_directory))

    print(f"Number of empty folders removed: {empty_folder_removed}")
    print(f"Number of remaining folders: {remaining_folders}")


def folderedstudy_and_addingseriesname_handler(main_directory):
    """Prepare every study folder under ``main_directory``.

    For each entry that is a directory: report whether it contains
    sub-folders (with a clickable link to it), flatten them with
    ``add_subfolder_name_and_move`` when it does, then prefix its files with
    their series number via ``add_series_2beginingoffile``. A separator line
    is printed after every entry, including entries that are not directories.
    """
    separator = '-----------------------------------------'

    for entry in tqdm(os.listdir(main_directory), desc='Reading folders withing directory', unit='files'):
        study_path = os.path.join(main_directory, entry)

        if os.path.isdir(study_path):
            # Count the immediate sub-directories of this study folder
            subfolder_total = sum(
                1
                for child in os.listdir(study_path)
                if os.path.isdir(os.path.join(study_path, child))
            )
            has_subfolders = subfolder_total > 0

            if has_subfolders:
                print(f"#{subfolder_total} Subfolders within: ")
            else:
                print("No subfolder exists in:")
            display(create_clickable_dir_path(study_path))

            # Flattening is only needed when series live in sub-folders;
            # the series-number prefix is applied in both cases.
            if has_subfolders:
                add_subfolder_name_and_move(study_path, adding_directory_2progresbar=study_path)
            add_series_2beginingoffile(study_path, adding_directory_2progresbar=study_path)

        print(separator)


### Anonymize: Function and library

In [None]:
import pydicom as pm
import dicognito.anonymizer
from tqdm.notebook import tqdm
import os


def Anonymize_and_update_dicom_metadata(dicom_from_path, clean_to_path, InstituteName='InstituteName'):
    """Anonymize the DICOM files of every study folder into a new directory tree.

    For each immediate sub-folder of ``dicom_from_path`` a folder with the
    same name is created under ``clean_to_path``. Every DICOM file directly
    inside the sub-folder is read, stripped of private tags and
    ``OtherPatientIDs``, passed through the dicognito anonymizer, and saved
    as ``ancl_<original name>`` in the output folder. ``PatientID`` and
    ``PatientName`` are set to the folder name, and ``InstitutionName`` and
    ``InstitutionAddress`` to ``InstituteName`` (only when those elements are
    present). The source files are not modified.

    A file that fails is reported and recorded; processing continues. The
    collected errors (keyed by input file path) are printed at the end.

    Args:
        dicom_from_path: Root folder containing one sub-folder per case.
        clean_to_path: Root folder that receives the anonymized copies.
        InstituteName: Value written to the institution name/address tags.
    """
    anonymizer = dicognito.anonymizer.Anonymizer()
    errors = {}
    for folder_name in tqdm(os.listdir(dicom_from_path), desc='reading directory'):
        folder_input_path = os.path.join(dicom_from_path, folder_name)
        if not os.path.isdir(folder_input_path):
            continue

        folder_output_path = os.path.join(clean_to_path, folder_name)
        os.makedirs(folder_output_path, exist_ok=True)
        for file in tqdm(os.listdir(folder_input_path), desc=f'anonymizing dicom from {folder_input_path}  to  {folder_output_path}'):
            input_file_path = os.path.join(folder_input_path, file)

            # is_dicom() opens the path, so a leftover sub-folder would raise
            # and abort the whole run; only regular files are considered.
            if not os.path.isfile(input_file_path):
                continue

            try:
                if not pm.misc.is_dicom(input_file_path):
                    continue

                dataset = pm.dcmread(input_file_path)
                dataset.remove_private_tags()

                # Attribute access returns the element's value (not the
                # DataElement), so the tag is removed by keyword directly.
                if 'OtherPatientIDs' in dataset:
                    del dataset.OtherPatientIDs

                anonymizer.anonymize(dataset)

                # Applied after anonymize() so the anonymizer does not
                # replace these project-specific values.
                if 'PatientID' in dataset:
                    dataset.PatientID = folder_name
                if 'PatientName' in dataset:
                    dataset.PatientName = folder_name
                if 'InstitutionName' in dataset:
                    dataset.InstitutionName = InstituteName
                if 'InstitutionAddress' in dataset:
                    dataset.InstitutionAddress = InstituteName

                # Save the modified file
                dataset.save_as(os.path.join(folder_output_path, "ancl_" + file))
            except Exception as e:
                print(f"Error processing {folder_output_path, input_file_path}: {e}")
                # Keyed by full path: the same file name occurs in many
                # case folders and must not overwrite earlier entries.
                errors[input_file_path] = str(e)+'\n'
    print("Errors encountered: \n", errors)


# Run configuration: source tree with one folder per case, destination tree
# for the anonymized copies, and the value written to the institution tags.
dicom_from_path = r"C:\PanCanAID_Valid_Control_20231221"
clean_to_path = r"C:\PanCanAID_Valid_Control_20231221_AnonymizeClean"
InstituteName = 'www.PanCanAID.com'

Anonymize_and_update_dicom_metadata(
    dicom_from_path,
    clean_to_path,
    InstituteName,
)


### Archived code

In [None]:
import pydicom as pm
import dicognito.anonymizer
from tqdm.notebook import tqdm
import os

# Archived variant. The function below relies on these module-level names:
# `anonymizer`, `clean_to_path` (output root) and `errors` (failure log).
anonymizer = dicognito.anonymizer.Anonymizer()
dicom_from_path = r"C:\PanCanAID_Valid_Control_20231221"
clean_to_path = r"C:\PanCanAID_Valid_Control_20231221_AnonymizeClean"

errors = {}
def Anonymize_and_update_dicom_metadata(dicom_from_path, Instititue_tag='InstituteTag'):
    """Anonymize all DICOM files below each case folder (archived variant).

    For every immediate sub-folder of ``dicom_from_path`` the whole tree
    below it is walked; each DICOM file is stripped of private tags and
    ``OtherPatientIDs``, anonymized with the module-level ``anonymizer``,
    given the folder name as ``PatientID``/``PatientName`` and
    ``Instititue_tag`` as institution name/address (when present), and saved
    as ``clean-<file>`` directly in ``<clean_to_path>/<folder>``.
    Failures are printed and stored in the module-level ``errors`` dict,
    keyed by the input file path.

    Args:
        dicom_from_path: Root folder containing one sub-folder per case.
        Instititue_tag: Value written to the institution name/address tags.
    """
    for folder_name in tqdm(os.listdir(dicom_from_path), desc='Total directories'):
        folder_input_path = os.path.join(dicom_from_path, folder_name)
        if not os.path.isdir(folder_input_path):
            continue

        folder_output_path = os.path.join(clean_to_path, folder_name)
        os.makedirs(folder_output_path, exist_ok=True)

        # NOTE(review): files from nested sub-folders are all written into
        # the same output folder, so equal file names would collide.
        for root, dirs, files in os.walk(folder_input_path):
            for file in tqdm(files, desc=f'Anonymizing dicoms from {folder_input_path} to {folder_output_path}'):
                dir_dcm_file = os.path.join(root, file)
                try:
                    if not pm.misc.is_dicom(dir_dcm_file):
                        continue

                    dataset = pm.dcmread(dir_dcm_file)
                    dataset.remove_private_tags()  # removing private tags

                    # The keyword is 'OtherPatientIDs'; the previous spelling
                    # never matched, so the tag was never removed.
                    if 'OtherPatientIDs' in dataset:
                        del dataset.OtherPatientIDs

                    # Anonymize first: running it after the assignments
                    # below would replace the values set there.
                    anonymizer.anonymize(dataset)

                    if 'PatientID' in dataset:
                        dataset.PatientID = folder_name
                    if 'PatientName' in dataset:
                        dataset.PatientName = folder_name
                    if 'InstitutionName' in dataset:
                        dataset.InstitutionName = Instititue_tag
                    if 'InstitutionAddress' in dataset:
                        dataset.InstitutionAddress = Instititue_tag

                    dataset.save_as(os.path.join(folder_output_path, "clean-" + file))
                except Exception as e:
                    print(f"Error reading {file}::: {e} \n")
                    # Full path as key: file names repeat across case folders.
                    errors[dir_dcm_file] = str(e)


# Output the errors at the end
print("Errors encountered:", errors)


In [None]:
clean_to_path = r"C:\PanCanAID_Valid_Case_20231212"
def update_dicom_metadata(directory, institute_name='www.PanCanAID.com'):
    """Rewrite identifying tags of every DICOM file in place (archived variant).

    For each immediate sub-folder of ``directory``, every DICOM file directly
    inside it has its private tags and ``OtherPatientIDs`` removed, its
    ``PatientID``/``PatientName`` set to the sub-folder name and its
    ``InstitutionName``/``InstitutionAddress`` set to ``institute_name``
    (only when those elements are present). The file is then saved over
    itself. Failures are printed and processing continues.

    Args:
        directory: Root folder containing one sub-folder per case.
        institute_name: Value written to the institution name/address tags.
    """
    for folder_name in tqdm(os.listdir(directory), desc='reading direcotry'):
        folder_path = os.path.join(directory, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file in tqdm(os.listdir(folder_path),desc=f'updating dicom meta at {folder_path}'):
            file_path = os.path.join(folder_path, file)

            # is_dicom() opens the path and raises on a directory, which
            # would abort the whole run; only regular files are considered.
            if not os.path.isfile(file_path):
                continue

            try:
                if not pm.misc.is_dicom(file_path):
                    continue

                ds = pm.dcmread(file_path)

                ds.remove_private_tags()
                if 'OtherPatientIDs' in ds:
                    del ds.OtherPatientIDs

                if 'PatientID' in ds:
                    ds.PatientID = folder_name
                if 'PatientName' in ds:
                    ds.PatientName = folder_name

                if 'InstitutionName' in ds:
                    ds.InstitutionName = institute_name
                if 'InstitutionAddress' in ds:
                    ds.InstitutionAddress = institute_name

                # Save the modified file over the original
                ds.save_as(file_path)
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

update_dicom_metadata(clean_to_path)