In [3]:
import json
import os

def search_cancer_trials(main_folder):
    # List to store names of files containing the term 'cancer'
    cancer_files = []

    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(main_folder):
        for file in files:
            if file.endswith('.json'):  # Check for JSON files
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        # Load the JSON data
                        data = json.load(f)

                        # Convert the data to a string and check for the term 'cancer'
                        if 'cancer' in json.dumps(data, ensure_ascii=False).lower():
                            # If found, add the file name to the list
                            cancer_files.append(os.path.relpath(file_path, main_folder))
                
                except Exception as e:
                    print(f"Error processing {file_path}: {str(e)}")

    # Save the names of the files to cancer_trials.txt
    with open("cancer_trials.txt", 'w', encoding='utf-8') as output_file:
        for file_name in cancer_files:
            output_file.write(f"{file_name}\n")

    print(f"Found {len(cancer_files)} files containing the term 'cancer'.")
    print("File names saved to cancer_trials.txt.")

# Example usage
if __name__ == "__main__":
    main_folder = "Outputs_ids_with_results"  # Replace with your main folder path
    search_cancer_trials(main_folder)

Found 18288 files containing the term 'cancer'.
File names saved to cancer_trials.txt.


In [None]:
## Keys to save for gpt
## First keys protocolSection
# protocolSection
# identificationModule, statusModule , descriptionModule, conditionsModule, designModule, armsInterventionsModule, outcomesModule, eligibilityModule
# resultsSection
# 

In [12]:
import json

file_path = "../Original_format/NCT00623831.json"  # original JSON

with open(file_path, "r", encoding='utf-8') as f:
    original_data = json.load(f)
original_data['resultsSection'].keys()

dict_keys(['participantFlowModule', 'baselineCharacteristicsModule', 'outcomeMeasuresModule', 'adverseEventsModule', 'moreInfoModule'])

In [13]:
original_data['resultsSection']['moreInfoModule']

{'certainAgreement': {'piSponsorEmployee': False,
  'restrictionType': 'LTE60',
  'restrictiveAgreement': True},
 'pointOfContact': {'title': 'Jonathan Skipper PhD',
  'organization': 'Ludwig Institute for Cancer Research',
  'email': 'jskipper@lcr.org',
  'phone': '12124501539'}}

In [29]:
a = 'b'
a

'b'

## Filter Original files by keys

In [1]:
import glob
from tqdm import tqdm
import os

def process_json_files():
    # Define input and output directories
    input_dir = "../Original_format"
    output_dir = "Files_for_GPT"
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Define keys to keep
    identification_keys_to_keep = [   
        'identificationModule',
        'statusModule',
        'descriptionModule',
        'conditionsModule',
        'designModule',
        'armsInterventionsModule',
        'outcomesModule' , 
        'eligibilityModule'
    ]

    result_keys_to_keep = [   
        'participantFlowModule',
        'baselineCharacteristicsModule',
        'outcomeMeasuresModule',
        'conditionsModule',
        'adverseEventsModule'
    ]
    
    input_files = []
    try:
        with open("cancer_trials.txt", 'r', encoding='utf-8') as f:         
            # Read filenames
            for line in f:
                line = line.strip()
                path_parts = line.split(os.sep)
                file_id = path_parts[-1]
                path_parts[0] = input_dir
                full_path = os.path.join(*path_parts)
                input_files.append(str(full_path))
    
        print(f"Found {len(input_files)} files to process from cancer_files.txt")
        
    except Exception as e:
        print(f"Error reading cancer_files.txt: {str(e)}")
    
    if not input_files:
        print("No JSON files found in input directory")
    
    print(f"Found {len(input_files)} files to process")
    
    #Process each file
    for input_file in tqdm(input_files, desc="Processing files"):
        try:
            # Read input file
            with open(input_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                identification_data = data['protocolSection']
                results_data = data['resultsSection']
            
            # Create new dictionary with only desired keys
            filtered_data = {key: identification_data[key] for key in identification_keys_to_keep if key in identification_data}
            result = {key: results_data[key] for key in result_keys_to_keep if key in results_data}
            filtered_data.update(result)
            
            # Create output filename
            # Create output filename using only the last part of the input file name
            output_file_name = os.path.basename(input_file)  # Get the full filename
            output_file_name = os.path.splitext(output_file_name)[0]  # Remove the file extension
            output_file = os.path.join(output_dir, output_file_name + ".json")  # Add .json extension
                        
            # Save filtered data
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(filtered_data, f, indent=4, ensure_ascii=False)
                
        except Exception as e:
            print(f"\nError processing {input_file}: {str(e)}")
    
    print("\nProcessing complete!")
    print(f"Output files saved in: {output_dir}")

if __name__ == "__main__":
    process_json_files()

Found 18288 files to process from cancer_files.txt
Found 18288 files to process


Processing files:  15%|███████████████████████████████████████████▏                                                                                                                                                                                                                                                             | 2660/18288 [00:10<00:52, 295.04it/s]


Error processing ../Original_format/.ipynb_checkpoints/NCT00003468-checkpoint.json: [Errno 2] No such file or directory: '../Original_format/.ipynb_checkpoints/NCT00003468-checkpoint.json'


Processing files:  35%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                                                                                | 6465/18288 [00:25<00:44, 268.31it/s]


Error processing ../Original_format/NCT05037656.json: 'resultsSection'


Processing files:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 16073/18288 [01:06<00:07, 310.96it/s]


Error processing ../Original_format/NCT00000479-checkpoint.json: [Errno 2] No such file or directory: '../Original_format/NCT00000479-checkpoint.json'

Error processing ../Original_format/NCT00001941-checkpoint.json: [Errno 2] No such file or directory: '../Original_format/NCT00001941-checkpoint.json'


Processing files:  91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                           | 16592/18288 [01:08<00:05, 311.22it/s]


Error processing ../Original_format/.ipynb_checkpoints/NCT00524745-checkpoint.json: [Errno 2] No such file or directory: '../Original_format/.ipynb_checkpoints/NCT00524745-checkpoint.json'


Processing files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18288/18288 [01:14<00:00, 245.93it/s]


Processing complete!
Output files saved in: Files_for_GPT





## Get files from cancer_trials in Inference_Conclusion

In [2]:
import os
import shutil

def copy_json_files():
    # Define the input file and output directory
    input_file_path = "cancer_trials.txt"  # Path to the input file
    output_dir = "Files_for_evaluation"  # Define your output directory
    input_dir = "Inferences_Conclusion"  # Define the input directory where the JSON files are located

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    try:
        with open(input_file_path, 'r', encoding='utf-8') as f:
            # Read each line in the input file
            for line in f:
                line = line.strip()  # Remove any leading/trailing whitespace
                if line:  # Ensure the line is not empty
                    # Construct the full path to the source file
                    source_file_path = os.path.join(input_dir, line)
                    
                    # Copy the file to the output directory
                    try:
                        shutil.copy2(source_file_path, os.path.join(output_dir, os.path.basename(source_file_path)))
                        print(f"Copied: {source_file_path} to {output_dir}")
                    except Exception as e:
                        print(f"Error copying {source_file_path}: {str(e)}")
    except FileNotFoundError:
        print(f"Error: The file {input_file_path} does not exist.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    copy_json_files()

Copied: Inferences_Conclusion/folder_15/NCT00633750.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00642746.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00623545.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00634244.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00649389.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00651625.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00626106.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00624052.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00630565.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00626561.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00632931.json to Files_for_evaluation
Copied: Inferences_Conclusion/folder_15/NCT00651482.json to Files_for_evaluation
Copied: Inferences_Conclusio