# Raw data extraction from model-training-generated files

In [1]:
%%capture
! pip install pandas

In [2]:
import os
import ast
import json
import pandas as pd

### Find the experiment_logs and metrics files within the directory and extract each hyperparameter into a column

In [3]:
def find_raw_content(root_dir):
    """Collect the experiment-log / metrics file pair of every run folder.

    Walks ``root_dir`` recursively. A folder contributes one entry only when
    it holds both a ``experiment_logs*.txt`` file and a ``metrics*.txt`` file.
    If several files in a folder match the same prefix, the last one listed
    by ``os.walk`` is the one kept.

    Args:
        root_dir: Directory to search.

    Returns:
        A list of ``(folder_name, experiment_log_path, metrics_path)`` tuples,
        in the order the folders are visited.
    """
    prefixes = ('experiment_logs', 'metrics')
    found_runs = []

    for folder, _subfolders, names in os.walk(root_dir):
        # Path of the most recently seen file for each prefix (None = not seen).
        latest = dict.fromkeys(prefixes)

        for name in names:
            if not name.endswith('.txt'):
                continue
            for prefix in prefixes:
                if name.startswith(prefix):
                    latest[prefix] = os.path.join(folder, name)
                    break

        log_path = latest['experiment_logs']
        metrics_path = latest['metrics']
        if log_path and metrics_path:
            found_runs.append((os.path.basename(folder), log_path, metrics_path))

    return found_runs
    

In [4]:
def extract_dict(file_path):
    """Parse the dictionary stored in an experiment-log or metrics text file.

    * Experiment logs: the text between the first ``{`` and the last ``}`` is
      parsed as JSON.
    * Metrics files: the whole content is evaluated as a Python literal with
      ``ast.literal_eval``.

    The file kind is decided from the file name; the full path is consulted
    only when the name itself matches neither kind, so a parent directory
    whose name contains ``experiment_logs`` cannot misroute a metrics file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The parsed object (a dict for well-formed files). An empty dict is
        returned, after printing a message, when the content cannot be parsed
        or the file kind is not recognised.

    Raises:
        OSError: If the file cannot be opened or read (not caught here).
    """
    # Default result, so a parse failure no longer ends in UnboundLocalError.
    dict_data = {}

    file_name = os.path.basename(file_path)
    if 'experiment_logs' in file_name or 'metrics' in file_name:
        kind_source = file_name
    else:
        kind_source = file_path

    try:
        if 'experiment_logs' in kind_source:
            with open(file_path, 'r') as file:
                content = file.read()

            # The hyperparameter configuration is the JSON object spanning
            # from the first '{' to the last '}' of the log.
            start = content.find('{')
            end = content.rfind('}')
            if start == -1 or end < start:
                raise ValueError("no '{...}' block found")

            # end + 1 so the closing '}' is part of the slice.
            dict_data = json.loads(content[start:end + 1])
        elif 'metrics' in kind_source:
            with open(file_path, 'r') as file:
                content = file.read()
            dict_data = ast.literal_eval(content)
        else:
            print(f"Unrecognized file type, nothing extracted from {file_path}")

    except (json.JSONDecodeError, ValueError, SyntaxError) as e:
        print(f"Error decoding JSON or evaluating string from file {file_path}: {e}")

    return dict_data

In [5]:
def flatten_dict(d, parent_key='', sep='.'):
    """Collapse a nested dictionary into a single level.

    Keys of nested dictionaries are joined to their parent's key with ``sep``
    (``{'a': {'b': 1}}`` becomes ``{'a.b': 1}``). An empty nested dictionary
    contributes no entry.

    Args:
        d: Dictionary to flatten.
        parent_key: Prefix prepended to every key of ``d``; left empty for
            the outermost call.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat dictionary.
    """
    flat = {}
    for name, val in d.items():
        path = name if not parent_key else f"{parent_key}{sep}{name}"
        if isinstance(val, dict):
            # Recurse with the accumulated path as the new prefix.
            flat.update(flatten_dict(val, path, sep=sep))
        else:
            flat[path] = val
    return flat

### Save the information to a dataframe

In [6]:
def create_dataframe(raw_data):
    """Build one DataFrame row per run from its config and metrics files.

    For every ``(ckpt_name, experiment_file, metrics_file)`` entry, both files
    are parsed with ``extract_dict`` and flattened with ``flatten_dict``. The
    two flat dictionaries are merged (metrics keys win on a clash) and the
    run name is stored under ``'ckpt_name'``.

    Args:
        raw_data: Iterable of ``(ckpt_name, experiment_file, metrics_file)``
            tuples.

    Returns:
        A DataFrame with ``'ckpt_name'`` as the first column. When
        ``raw_data`` is empty, the result has zero rows and only the
        ``'ckpt_name'`` column.
    """
    records = []

    for ckpt_name, experiment_file, metrics_file in raw_data:
        flat_config_data = flatten_dict(extract_dict(experiment_file))
        flat_metrics_data = flatten_dict(extract_dict(metrics_file))

        combined_data = {**flat_config_data, **flat_metrics_data}
        combined_data['ckpt_name'] = ckpt_name
        records.append(combined_data)

    # No runs found: a column-less frame would make the reordering below
    # fail with KeyError, so return an empty frame with the key column.
    if not records:
        return pd.DataFrame(columns=['ckpt_name'])

    df = pd.DataFrame(records)

    # Ensure 'ckpt_name' is the first column
    columns = ['ckpt_name'] + [col for col in df.columns if col != 'ckpt_name']
    return df[columns]

In [7]:
# NOTE: machine-specific absolute path; change it to the directory that
# holds the run folders before re-running this notebook elsewhere.
root_dir = "F:/Bases/Cancer/Mama/Mamografias/BRAHMA_DETECCION/ckpt_associated"
# One (folder name, experiment log path, metrics path) tuple per run folder.
raw_data = find_raw_content(root_dir)
# One row per run, with 'ckpt_name' as the first column.
df = create_dataframe(raw_data)
df.head()

Unnamed: 0,ckpt_name,checkpoint_params.load_checkpoint,checkpoint_params.schema,training_hyperparams.lr_warmup_epochs,training_hyperparams.lr_warmup_steps,training_hyperparams.lr_cooldown_epochs,training_hyperparams.warmup_initial_lr,training_hyperparams.cosine_final_lr_ratio,training_hyperparams.optimizer,training_hyperparams.optimizer_params.weight_decay,...,mAP@0.50:0.95,F1@0.50:0.95,AP@0.50:0.95_Architectural distortion,AP@0.50:0.95_Mass,AP@0.50:0.95_Calcification,Best_score_threshold,Best_score_threshold_Architectural distortion,Best_score_threshold_Mass,Best_score_threshold_Calcification,training_hyperparams.lr_decay_factor
0,RUN_20240612_100027_359642,False,,0,100,0,1e-06,0.1,Adam,1e-05,...,7e-06,0.000182,0.0,1e-05,4e-06,0.11,0.0,0.11,0.22,
1,RUN_20240617_163510_293224,False,,0,100,0,1e-06,0.1,Adam,1e-05,...,7e-06,0.000235,0.0,6e-06,9e-06,0.11,0.0,0.11,0.22,
2,RUN_20240620_104658_182467,False,,0,100,0,1e-06,0.1,AdamW,1e-05,...,0.005795,0.001406,0.0,8e-05,0.011509,0.25,0.0,0.1,0.25,
3,RUN_20240624_175224_278149,False,,0,100,0,1e-06,0.1,AdamW,1e-05,...,0.011001,0.002431,0.0,0.00034,0.021661,0.25,0.0,0.18,0.25,
4,RUN_20240625_113055_125920,False,,0,100,0,1e-06,0.1,AdamW,1e-05,...,0.015902,0.003215,0.0,0.000385,0.031419,0.25,0.0,0.18,0.25,


### Assign target values

In [8]:
def target_values(df, thresh):
    """Add a binary ``'Target'`` column derived from ``'Best_score_threshold'``.

    A row gets 1 when its ``'Best_score_threshold'`` is strictly greater than
    ``thresh`` and 0 otherwise. The column is written onto ``df`` itself.

    Args:
        df: DataFrame holding a ``'Best_score_threshold'`` column.
        thresh: Cut-off value for the comparison.

    Returns:
        The same ``df`` object, now carrying the ``'Target'`` column.
    """
    above_cutoff = df['Best_score_threshold'].gt(thresh)
    df['Target'] = above_cutoff.astype(int)
    return df

In [9]:
# Runs whose 'Best_score_threshold' is strictly above 0.25 get Target = 1, the rest 0.
df = target_values(df, 0.25)
df.tail()

Unnamed: 0,ckpt_name,checkpoint_params.load_checkpoint,checkpoint_params.schema,training_hyperparams.lr_warmup_epochs,training_hyperparams.lr_warmup_steps,training_hyperparams.lr_cooldown_epochs,training_hyperparams.warmup_initial_lr,training_hyperparams.cosine_final_lr_ratio,training_hyperparams.optimizer,training_hyperparams.optimizer_params.weight_decay,...,F1@0.50:0.95,AP@0.50:0.95_Architectural distortion,AP@0.50:0.95_Mass,AP@0.50:0.95_Calcification,Best_score_threshold,Best_score_threshold_Architectural distortion,Best_score_threshold_Mass,Best_score_threshold_Calcification,training_hyperparams.lr_decay_factor,Target
8,RUN_20240709_162535_315125,False,,0,100,0,1e-06,0.01,AdamW,1e-05,...,0.005772,0.0,0.000449,0.036148,0.23,0.0,0.09,0.23,,0
9,RUN_20240710_105031_001793,False,,0,100,0,1e-06,0.01,AdamW,1e-05,...,0.009697,0.0,0.001963,0.074611,0.27,0.0,0.16,0.27,,1
10,RUN_20240715_102716_778750,False,,0,100,0,1e-06,0.01,AdamW,1e-05,...,0.002209,0.0,2e-06,0.026816,0.26,0.0,0.05,0.26,,1
11,RUN_20240716_115801_530107,False,,0,100,0,1e-06,0.01,AdamW,1e-05,...,0.00537,0.0,0.000284,0.012181,0.22,0.0,0.13,0.22,,0
12,RUN_20240729_122413_289327,False,,0,100,0,1e-06,0.01,AdamW,1e-05,...,0.007618,0.0,0.000759,0.047631,0.37,0.0,0.11,0.37,0.9,1


In [10]:
# Save the labelled table before any clean-up; index=False leaves the row index out of the CSV.
df.to_csv('../data/raw.csv', index=False)

In [16]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

ckpt_name                                         object
checkpoint_params.load_checkpoint                   bool
checkpoint_params.schema                          object
training_hyperparams.lr_warmup_epochs              int64
training_hyperparams.lr_warmup_steps               int64
                                                  ...   
Best_score_threshold_Architectural distortion    float64
Best_score_threshold_Mass                        float64
Best_score_threshold_Calcification               float64
training_hyperparams.lr_decay_factor             float64
Target                                             int32
Length: 298, dtype: object

## Data clean-up

### Drop NaN and empty list columns

In [17]:
def drop_empty_columns(df):
    """Drop columns that carry no information.

    A column is removed when every value is NaN, or when every value is an
    empty list (either a real ``[]`` or the string ``'[]'``).

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame without the empty columns.
    """
    def _is_empty_list(value):
        # Type checks first, so values such as arrays are never compared to [].
        return (isinstance(value, list) and not value) or (
            isinstance(value, str) and value == '[]'
        )

    # First drop columns where all values are NaN
    df_cleaned = df.dropna(axis=1, how='all')

    # Then drop columns where all values are empty lists. Only the surviving
    # columns are scanned, so a column already removed above is never passed
    # to .drop() again (which raised KeyError on zero-row frames).
    empty_list_cols = [
        col for col in df_cleaned.columns
        if df_cleaned[col].apply(_is_empty_list).all()
    ]
    return df_cleaned.drop(columns=empty_list_cols)

# Rebind df to the frame without all-NaN and all-empty-list columns.
df = drop_empty_columns(df)
df.head()

Unnamed: 0,ckpt_name,checkpoint_params.load_checkpoint,training_hyperparams.lr_warmup_epochs,training_hyperparams.lr_warmup_steps,training_hyperparams.lr_cooldown_epochs,training_hyperparams.warmup_initial_lr,training_hyperparams.cosine_final_lr_ratio,training_hyperparams.optimizer,training_hyperparams.optimizer_params.weight_decay,training_hyperparams.ema,...,F1@0.50:0.95,AP@0.50:0.95_Architectural distortion,AP@0.50:0.95_Mass,AP@0.50:0.95_Calcification,Best_score_threshold,Best_score_threshold_Architectural distortion,Best_score_threshold_Mass,Best_score_threshold_Calcification,training_hyperparams.lr_decay_factor,Target
0,RUN_20240612_100027_359642,False,0,100,0,1e-06,0.1,Adam,1e-05,True,...,0.000182,0.0,1e-05,4e-06,0.11,0.0,0.11,0.22,,0
1,RUN_20240617_163510_293224,False,0,100,0,1e-06,0.1,Adam,1e-05,True,...,0.000235,0.0,6e-06,9e-06,0.11,0.0,0.11,0.22,,0
2,RUN_20240620_104658_182467,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,...,0.001406,0.0,8e-05,0.011509,0.25,0.0,0.1,0.25,,0
3,RUN_20240624_175224_278149,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,...,0.002431,0.0,0.00034,0.021661,0.25,0.0,0.18,0.25,,0
4,RUN_20240625_113055_125920,False,0,100,0,1e-06,0.1,AdamW,1e-05,True,...,0.003215,0.0,0.000385,0.031419,0.25,0.0,0.18,0.25,,0


### Replace NaN values

In [18]:
def replace_nan(df, numeric, non_numeric):
    """Fill missing values column by column, choosing the filler by dtype.

    Columns with a numeric dtype (any integer, float or complex width, but
    not boolean) are filled with ``numeric``; all other columns are filled
    with ``non_numeric``.

    Args:
        df: DataFrame to fill. Its columns are reassigned in place.
        numeric: Replacement for missing values in numeric columns.
        non_numeric: Replacement for missing values in every other column.

    Returns:
        The same ``df`` object with missing values filled.
    """
    for col in df.columns:
        dtype = df[col].dtype
        # is_numeric_dtype covers every numeric width (float32, int16, ...),
        # not only int64/float64/int32. Booleans are excluded so they keep
        # receiving the non-numeric filler.
        is_number = (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )
        df[col] = df[col].fillna(numeric if is_number else non_numeric)
    return df
         

# Fill the remaining NaNs: 0 in numeric columns, 'Empty' in the other columns.
df = replace_nan(df, 0, 'Empty')
df.tail()

Unnamed: 0,ckpt_name,checkpoint_params.load_checkpoint,training_hyperparams.lr_warmup_epochs,training_hyperparams.lr_warmup_steps,training_hyperparams.lr_cooldown_epochs,training_hyperparams.warmup_initial_lr,training_hyperparams.cosine_final_lr_ratio,training_hyperparams.optimizer,training_hyperparams.optimizer_params.weight_decay,training_hyperparams.ema,...,F1@0.50:0.95,AP@0.50:0.95_Architectural distortion,AP@0.50:0.95_Mass,AP@0.50:0.95_Calcification,Best_score_threshold,Best_score_threshold_Architectural distortion,Best_score_threshold_Mass,Best_score_threshold_Calcification,training_hyperparams.lr_decay_factor,Target
8,RUN_20240709_162535_315125,False,0,100,0,1e-06,0.01,AdamW,1e-05,True,...,0.005772,0.0,0.000449,0.036148,0.23,0.0,0.09,0.23,0.0,0
9,RUN_20240710_105031_001793,False,0,100,0,1e-06,0.01,AdamW,1e-05,True,...,0.009697,0.0,0.001963,0.074611,0.27,0.0,0.16,0.27,0.0,1
10,RUN_20240715_102716_778750,False,0,100,0,1e-06,0.01,AdamW,1e-05,True,...,0.002209,0.0,2e-06,0.026816,0.26,0.0,0.05,0.26,0.0,1
11,RUN_20240716_115801_530107,False,0,100,0,1e-06,0.01,AdamW,1e-05,True,...,0.00537,0.0,0.000284,0.012181,0.22,0.0,0.13,0.22,0.0,0
12,RUN_20240729_122413_289327,False,0,100,0,1e-06,0.01,AdamW,1e-05,True,...,0.007618,0.0,0.000759,0.047631,0.37,0.0,0.11,0.37,0.9,1


### Drop columns that contain constants

In [19]:
def drop_constants(df):
    """Keep only the columns whose value is not the same in every row.

    Each cell is compared with the first row's value for its column. A
    column of nothing but missing values counts as constant, even though
    ``NaN != NaN``. A frame with a single row therefore loses all columns,
    and a frame with no rows is handled the same way instead of raising.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A DataFrame restricted to the non-constant columns.
    """
    # Without rows there is no first row to compare against.
    if len(df) == 0:
        return df.iloc[:, :0]

    first_row = df.iloc[0]
    differs = df != first_row
    # Missing in both the cell and the first row means "same value".
    both_missing = df.isna() & first_row.isna()

    # Select only the columns that do not have the same value in all rows
    varies = (differs & ~both_missing).any()
    return df.loc[:, varies]

# Keep only the columns whose values differ between runs.
df = drop_constants(df)
df.head()

Unnamed: 0,ckpt_name,training_hyperparams.cosine_final_lr_ratio,training_hyperparams.optimizer,training_hyperparams.batch_accumulate,training_hyperparams.lr_mode,training_hyperparams.phase_callbacks,training_hyperparams.initial_lr.backbone,training_hyperparams.initial_lr.default,training_hyperparams.max_epochs,dataset_params.train_dataset_params,...,Recall@0.50:0.95,mAP@0.50:0.95,F1@0.50:0.95,AP@0.50:0.95_Mass,AP@0.50:0.95_Calcification,Best_score_threshold,Best_score_threshold_Mass,Best_score_threshold_Calcification,training_hyperparams.lr_decay_factor,Target
0,RUN_20240612_100027_359642,0.1,Adam,1,CosineLRScheduler,[<super_gradients.training.utils.callbacks.cal...,0.1,0.1,50,"{'data_dir': '../data/', 'images_dir': 'train/...",...,0.005259,7e-06,0.000182,1e-05,4e-06,0.11,0.11,0.22,0.0,0
1,RUN_20240617_163510_293224,0.1,Adam,1,CosineLRScheduler,[<super_gradients.training.utils.callbacks.cal...,0.1,0.1,50,"{'data_dir': '../data/', 'images_dir': 'train/...",...,0.011994,7e-06,0.000235,6e-06,9e-06,0.11,0.11,0.22,0.0,0
2,RUN_20240620_104658_182467,0.1,AdamW,1,CosineLRScheduler,[<super_gradients.training.utils.callbacks.cal...,0.1,0.1,75,"{'data_dir': '../data/', 'images_dir': 'train/...",...,0.076085,0.005795,0.001406,8e-05,0.011509,0.25,0.1,0.25,0.0,0
3,RUN_20240624_175224_278149,0.1,AdamW,1,CosineLRScheduler,[<super_gradients.training.utils.callbacks.cal...,0.01,0.01,75,"{'data_dir': '../data/', 'images_dir': 'train/...",...,0.119091,0.011001,0.002431,0.00034,0.021661,0.25,0.18,0.25,0.0,0
4,RUN_20240625_113055_125920,0.1,AdamW,1,CosineLRScheduler,[<super_gradients.training.utils.callbacks.cal...,0.01,0.01,100,"{'data_dir': '../data/', 'images_dir': 'train/...",...,0.147835,0.015902,0.003215,0.000385,0.031419,0.25,0.18,0.25,0.0,0


In [20]:
# Save the cleaned table; index=False leaves the row index out of the CSV.
df.to_csv("../data/processed.csv", index=False)