# Data Configuration Autogeneration

## Setup

In [None]:
from copy import deepcopy
from pathlib import Path
from json import dump

## Template

In [None]:
# Skeleton shared by every generated configuration; it is deep-copied per
# input file and the placeholders below are filled in before being written.
template = {
    # Config label; set per input file during generation
    "label": None,
    "format": "tabular",
    # Path to the data file; set per input file during generation
    "data_source": None,
    # Tab-separated input
    "separator": "\t",
    # Hook lists start empty and are populated by the builder functions
    "pre_split_hooks": [],
    "post_split_hooks": [],
}

## Utility Functions 

Build the baseline pre-split hooks

In [None]:
def build_presplit(file_name: str, json_content: dict):
    """Append the baseline pre-split hooks to ``json_content`` in place.

    Three hooks are added, in order, to ``json_content['pre_split_hooks']``:
    an explicit feature drop, a null-based feature drop, and a null-based
    sample drop (both with a threshold of 0.5).

    The explicit drop list always contains ``'GRP'``. Two independent checks
    on the file name extend it:

    * names containing ``"full_"`` or ``"clinical_"`` add the clinical
      columns;
    * names containing ``"full_"`` or ``"img_"`` add the ``"acq"`` and
      ``"weight"`` metadata columns.

    A ``"full_"`` file therefore receives both extensions.

    :param file_name: Name of the dataset file the config is built for.
    :param json_content: Config dict; must already hold a list under
        ``'pre_split_hooks'``.
    :return: None; ``json_content`` is modified in place.
    """
    has_clinical = any(tag in file_name for tag in ("full_", "clinical_"))
    has_metadata = any(tag in file_name for tag in ("full_", "img_"))

    # Features that are always dropped, regardless of dataset type
    dropped = ['GRP']

    # Clinical datasets carry a number of columns which must be dropped
    if has_clinical:
        dropped += [
            "Site",
            "Surgical",
            "Number of Surgeries",
            "Treatment Plan",
            "Followup: 6-18 weeks",
            "Followup: 12 month",
            "Followup: 24 month",
            "Followup: 60 month",
            "Date of Assessment",
            "CSM Duration",
            "Work Status",
            "mJOA 12 months",
            "HRR",
        ]

    # Checked independently of the clinical case, so both can apply
    if has_metadata:
        dropped += ["acq", "weight"]

    # Register the explicit drop first, followed by the nullity checks
    hooks = json_content['pre_split_hooks']
    hooks.extend([
        {"type": "drop_features_explicit", "features": dropped},
        {"type": "feature_drop_null", "threshold": 0.5},
        {"type": "sample_drop_null", "threshold": 0.5},
    ])

    

Build the baseline post-split hooks

In [None]:
def build_postsplit(file_name: str, json_content: dict):
    """Append the baseline post-split hooks to ``json_content`` in place.

    When the file name contains ``"full_"`` or ``"clinical_"``, two hooks
    for the categorical clinical features are added first: a
    ``most_frequent`` simple imputation, then a one-hot encoding. The
    one-hot hook covers the same features as the imputation hook except
    ``"EQ5D: Total"``.

    In every case a mean imputation hook and a standard-scaling hook are
    then appended.

    :param file_name: Name of the dataset file the config is built for.
    :param json_content: Config dict; must already hold a list under
        ``'post_split_hooks'``.
    :return: None; ``json_content`` is modified in place.
    """
    is_clinical = any(tag in file_name for tag in ("full_", "clinical_"))
    hooks = json_content['post_split_hooks']

    if is_clinical:
        # Categorical features, imputed with their most frequent value
        imputed = [
            "EQ5D: Anxiety/Depression",
            "EQ5D: Mobility",
            "EQ5D: Pain/Discomfort",
            "EQ5D: Self-Care",
            "EQ5D: Total",
            "EQ5D: Usual Activities",
            "Sex",
            "Symptom Duration",
            "Work Status (Category)",
            "Comorbidities: Nicotine (Smoking)",
            "Comorbidities: Nicotine (Smokeless)",
            "Comorbidities: Nicotine (Patches)",
            "Comorbidities: Nicotine (Recent Quit)",
        ]
        # Same features in the same order, minus the EQ5D total
        encoded = [name for name in imputed if name != "EQ5D: Total"]

        hooks.append({
            "type": "imputation_simple",
            "strategy": "most_frequent",
            "features": imputed,
        })
        hooks.append({
            "type": "one_hot_encode",
            "features": encoded,
            "max_unique_vals": 5,
            "handle_unknown": "ignore",
        })

    # Common imputation and standardization, applied to every dataset
    hooks.append({"type": "imputation_simple", "strategy": "mean"})
    hooks.append({"type": "standard_scaling", "run_per_cross": True})

Feature Selection/Transformation

In [None]:
def add_rfe(json_content):
    """Tag the config as using RFE and append the corresponding hook.

    The suffix ``'_rfe'`` is added to ``json_content['label']`` and a
    ``recursive_feature_elimination`` hook is appended to
    ``json_content['post_split_hooks']``. The hook's ``proportion`` is given
    as a float parameter spec labelled ``rfe_feature_proportion`` with
    bounds 0.1 and 0.9.

    :param json_content: Config dict with a string ``'label'`` and a list
        under ``'post_split_hooks'``; modified in place.
    :return: None
    """
    # Update the label first, so a bad label fails before any hook is added
    json_content['label'] = json_content['label'] + '_rfe'

    proportion_spec = {
        "label": "rfe_feature_proportion",
        "type": "float",
        "low": 0.1,
        "high": 0.9,
    }
    rfe_hook = {
        "type":  "recursive_feature_elimination",
        "proportion": proportion_spec,
    }
    json_content['post_split_hooks'].append(rfe_hook)

def add_pca(json_content):
    """Tag the config as using PCA and append the corresponding hook.

    The suffix ``'_pca'`` is added to ``json_content['label']`` and a
    ``principal_component_analysis`` hook is appended to
    ``json_content['post_split_hooks']``. The hook's ``proportion`` is given
    as a float parameter spec labelled ``pca_component_proportion`` with
    bounds 0.1 and 0.9.

    :param json_content: Config dict with a string ``'label'`` and a list
        under ``'post_split_hooks'``; modified in place.
    :return: None
    """
    # Update the label first, so a bad label fails before any hook is added
    json_content['label'] = json_content['label'] + '_pca'

    proportion_spec = {
        "label": "pca_component_proportion",
        "type": "float",
        "low": 0.1,
        "high": 0.9,
    }
    pca_hook = {
        "type":  "principal_component_analysis",
        "proportion": proportion_spec,
    }
    json_content['post_split_hooks'].append(pca_hook)

## Configuration Generation

In [None]:
root_path = Path("/home/kalum.ost/classic_ml_reloaded/sct_processing/softseg_data/")

# One entry per generated config: the ordered feature selection/transformation
# steps to apply. The empty entry is the basic config with neither RFE nor PCA.
variant_steps = (
    (),                  # no RFE/PCA (basic)
    (add_rfe,),          # RFE only
    (add_pca,),          # PCA only
    (add_rfe, add_pca),  # RFE into PCA
    (add_pca, add_rfe),  # PCA into RFE
)

# NOTE: the full metrics are CSV, not TSV, so globbing for TSV files means we
# don't need to check for them within the loop at all!
for p in Path('.').glob('*.tsv'):
    # Start from a fresh copy of the template with the baseline hooks attached
    base_config = deepcopy(template)
    build_presplit(p.name, base_config)
    build_postsplit(p.name, base_config)

    # Point the config at the data file under the root path
    base_config['data_source'] = str(root_path / p.name)

    # The label is the file name up to (not including) its first '.'
    base_config['label'] = p.name.partition('.')[0]

    # Write 5 configs per input file, each named after its final label
    for steps in variant_steps:
        variant = deepcopy(base_config)
        if steps:
            # Each step appends its own suffix to the label
            for step in steps:
                step(variant)
        else:
            variant['label'] += '_noprep'

        with open(f"{variant['label']}.json", 'w') as fp:
            dump(variant, fp, indent=2)
