In [17]:
%%capture
! pip install pandas

In [18]:
import os
import ast
import json
import pandas as pd

## Find the experiment_logs and metrics files in each run directory and extract every hyperparameter and metric into its own column

In [19]:
def find_raw_content(root_dir):
    """Collect the experiment-log and metrics file paths of every run directory.

    Walks ``root_dir`` recursively. A directory is reported only when it holds
    both a file named ``experiment_logs*.txt`` and a file named ``metrics*.txt``.
    If a directory holds several files matching one pattern, the
    lexicographically last one is used.

    Args:
        root_dir: Path of the directory tree to search.

    Returns:
        A list of ``(ckpt_name, experiment_file, metrics_file)`` tuples, where
        ``ckpt_name`` is the name of the directory containing both files.
        Entries are ordered by sorted directory traversal. The list is empty
        when nothing matches (``os.walk`` yields nothing for a missing path).
    """
    raw_data = []

    for subdir, dirs, files in os.walk(root_dir):
        # os.walk lists entries in arbitrary, filesystem-dependent order.
        # Sorting ``dirs`` in place makes the traversal (and therefore the
        # order of the returned records) reproducible across platforms.
        dirs.sort()

        experiment_file = None
        metrics_file = None

        # Sorted so the choice among multiple matching files is deterministic.
        for file in sorted(files):
            if file.startswith('experiment_logs') and file.endswith('.txt'):
                experiment_file = os.path.join(subdir, file)
            elif file.startswith('metrics') and file.endswith('.txt'):
                metrics_file = os.path.join(subdir, file)

        if experiment_file and metrics_file:
            # normpath strips a trailing separator; without it basename()
            # returns '' when root_dir itself is a run directory given as "path/".
            ckpt_name = os.path.basename(os.path.normpath(subdir))
            raw_data.append((ckpt_name, experiment_file, metrics_file))

    return raw_data
    

In [20]:
def extract_dict(file_path):
    """Load the dictionary stored in an experiment-log or metrics text file.

    The file type is decided from the file *name* (not the whole path, so a
    parent directory called e.g. ``experiment_logs_archive`` cannot misroute a
    metrics file):

    * name contains ``experiment_logs``: the text between the first ``{`` and
      the last ``}`` is parsed as JSON;
    * name contains ``metrics``: the whole file is evaluated as a Python
      literal with ``ast.literal_eval``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The parsed object (a dict for experiment logs; whatever literal the
        metrics file contains). An empty dict is returned, after printing a
        message, when the content cannot be parsed or the file name matches
        neither pattern.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Default result so the function always has something to return, even
    # after a parse error or for an unrecognised file name.
    dict_data = {}
    file_name = os.path.basename(file_path)

    try:
        if 'experiment_logs' in file_name:
            with open(file_path, 'r') as file:
                content = file.read()

            # The hyperparameter configuration is a JSON object embedded in
            # free-form log text: take everything from the first '{' to the
            # last '}' (the +1 keeps that closing brace in the slice).
            start = content.find('{')
            end = content.rfind('}')
            dict_data = json.loads(content[start:end + 1])
        elif 'metrics' in file_name:
            with open(file_path, 'r') as file:
                content = file.read()

            # Metrics are stored as a Python literal; literal_eval parses it
            # without executing arbitrary code.
            dict_data = ast.literal_eval(content)
        else:
            print(f"Unrecognized file type, nothing extracted from file {file_path}")
    except (json.JSONDecodeError, ValueError, SyntaxError) as e:
        print(f"Error decoding JSON or evaluating string from file {file_path}: {e}")

    return dict_data

In [21]:
def flatten_dict(d, parent_key='', sep='.'):
    """Collapse a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to the keys of their ancestors
    with ``sep`` (e.g. ``{'a': {'b': 1}}`` becomes ``{'a.b': 1}``).

    Args:
        d: Dictionary to flatten.
        parent_key: Prefix prepended to every key of ``d``; empty at the top level.
        sep: Separator placed between a prefix and a nested key.

    Returns:
        A new dictionary with no dict values. An empty nested dictionary
        contributes no entries.
    """
    flattened = {}
    for name, entry in d.items():
        # Top-level keys are kept as they are; nested ones get the prefix.
        if parent_key:
            path = f"{parent_key}{sep}{name}"
        else:
            path = name

        if not isinstance(entry, dict):
            flattened[path] = entry
            continue

        # Recurse into the nested dictionary, carrying the accumulated prefix.
        flattened.update(flatten_dict(entry, path, sep=sep))
    return flattened

In [22]:
def create_dataframe(raw_data):
    """Build one table row per run from its experiment log and metrics file.

    For every ``(ckpt_name, experiment_file, metrics_file)`` entry, both files
    are loaded with ``extract_dict``, flattened with ``flatten_dict`` and
    merged into a single record. When a flattened key appears in both files
    the metrics value wins, and ``ckpt_name`` always holds the run name.

    Args:
        raw_data: Iterable of ``(ckpt_name, experiment_file, metrics_file)``
            tuples.

    Returns:
        A ``pandas.DataFrame`` with ``ckpt_name`` as the first column followed
        by one column per flattened key. For empty ``raw_data`` an empty
        DataFrame with only the ``ckpt_name`` column is returned.
    """
    records = []

    for ckpt_name, experiment_file, metrics_file in raw_data:
        config_data = extract_dict(experiment_file)
        metrics_data = extract_dict(metrics_file)

        flat_config_data = flatten_dict(config_data)
        flat_metrics_data = flatten_dict(metrics_data)

        # Later unpacking wins on duplicate keys, so metrics override config.
        combined_data = {**flat_config_data, **flat_metrics_data}
        combined_data['ckpt_name'] = ckpt_name

        records.append(combined_data)

    # With no records the frame would have no columns at all, and selecting
    # 'ckpt_name' below would raise KeyError; return an empty, well-formed table.
    if not records:
        return pd.DataFrame(columns=['ckpt_name'])

    df = pd.DataFrame(records)

    # Ensure 'ckpt_name' is the first column
    columns = ['ckpt_name'] + [col for col in df.columns if col != 'ckpt_name']
    return df[columns]

In [15]:
# Root of the directory tree to search; replace the placeholder with a real path before running.
root_dir = "PATH/TO/DIRECTORY"
# Collect the (ckpt_name, experiment log path, metrics path) tuples found under root_dir.
raw_data = find_raw_content(root_dir)
print (raw_data)
# Turn the collected files into a table and preview its first rows.
df = create_dataframe(raw_data)
df.head()

[('RUN_20240612_100027_359642', 'F:/Bases/Cancer/Mama/Mamografias/BRAHMA_DETECCION/ckpt_associated\\RUN_20240612_100027_359642\\experiment_logs_Jun12_10_00_27.txt', 'F:/Bases/Cancer/Mama/Mamografias/BRAHMA_DETECCION/ckpt_associated\\RUN_20240612_100027_359642\\metrics_20240612_100027.txt'), ('RUN_20240617_163510_293224', 'F:/Bases/Cancer/Mama/Mamografias/BRAHMA_DETECCION/ckpt_associated\\RUN_20240617_163510_293224\\experiment_logs_Jun17_16_35_10.txt', 'F:/Bases/Cancer/Mama/Mamografias/BRAHMA_DETECCION/ckpt_associated\\RUN_20240617_163510_293224\\metrics_20240617_163510.txt'), ('RUN_20240620_104658_182467', 'F:/Bases/Cancer/Mama/Mamografias/BRAHMA_DETECCION/ckpt_associated\\RUN_20240620_104658_182467\\experiment_logs_Jun20_10_46_58.txt', 'F:/Bases/Cancer/Mama/Mamografias/BRAHMA_DETECCION/ckpt_associated\\RUN_20240620_104658_182467\\metrics_20240620_104658.txt'), ('RUN_20240624_175224_278149', 'F:/Bases/Cancer/Mama/Mamografias/BRAHMA_DETECCION/ckpt_associated\\RUN_20240624_175224_278149\

Unnamed: 0,ckpt_name,checkpoint_params.load_checkpoint,checkpoint_params.schema,training_hyperparams.lr_warmup_epochs,training_hyperparams.lr_warmup_steps,training_hyperparams.lr_cooldown_epochs,training_hyperparams.warmup_initial_lr,training_hyperparams.cosine_final_lr_ratio,training_hyperparams.optimizer,training_hyperparams.optimizer_params.weight_decay,...,Recall@0.50:0.95,mAP@0.50:0.95,F1@0.50:0.95,AP@0.50:0.95_Architectural distortion,AP@0.50:0.95_Mass,AP@0.50:0.95_Calcification,Best_score_threshold,Best_score_threshold_Architectural distortion,Best_score_threshold_Mass,Best_score_threshold_Calcification
0,RUN_20240612_100027_359642,False,,0,100,0,1e-06,0.1,Adam,1e-05,...,0.005259,7e-06,0.000182,0.0,1e-05,4e-06,0.11,0.0,0.11,0.22
1,RUN_20240617_163510_293224,False,,0,100,0,1e-06,0.1,Adam,1e-05,...,0.011994,7e-06,0.000235,0.0,6e-06,9e-06,0.11,0.0,0.11,0.22
2,RUN_20240620_104658_182467,False,,0,100,0,1e-06,0.1,AdamW,1e-05,...,0.076085,0.005795,0.001406,0.0,8e-05,0.011509,0.25,0.0,0.1,0.25
3,RUN_20240624_175224_278149,False,,0,100,0,1e-06,0.1,AdamW,1e-05,...,0.119091,0.011001,0.002431,0.0,0.00034,0.021661,0.25,0.0,0.18,0.25
4,RUN_20240625_113055_125920,False,,0,100,0,1e-06,0.1,AdamW,1e-05,...,0.147835,0.015902,0.003215,0.0,0.000385,0.031419,0.25,0.0,0.18,0.25


In [16]:
# Write the table to 'raw.csv' (relative to the current working directory), omitting the row index.
df.to_csv('raw.csv', index=False)