# Set global parameters

In [178]:
# Launcher settings shared by every generated command line.
use_accelerate = False
WRITE_MODE = 'w'  # 'w' for sweeps, 'a' for tests
num_devices = 1 if not use_accelerate else 2
DEVICE = 1  # value written into CUDA_VISIBLE_DEVICES by compose_instruction
base_dir = '/home/lange/fantastic-umbrella/runs'  # root under which per-run output dirs are placed

# Flag-style CLI switches (emitted as a bare `--name`, without a value).
boolean_args = dict(with_tracking=[True])


# Set run(s) parameters

In [179]:
# vanilla n_epochs
# Identity of this sweep: feeds file names, run names and tags further down.
GENERATION = "28_sweeps"
SWEEP_ID = '1'
RUN_NAME = "28_S" + SWEEP_ID + "_ID_{run_id}"  # {run_id} is substituted per run
TAGS = ['SWP' + SWEEP_ID]
# Fraction of the total number of evaluations used as early-stopping patience
# (None skips the automatic patience calculation).
EARLY_STOP_FRAC = 1

# Sweep grid: each key is a CLI option, each list holds the values to try.
# Key order is significant -- it is the order of options in the generated command.
value_args = {
    'task_name': ['qnli'],  # alternatives: "cola","sst2","mrpc","stsb","qqp","mnli","rte"
    'model_name_or_path': ['bert-base-uncased'],

    # Alternative size/evaluation pairings:
    # 'training_size' : [64],
    # 'evaluation_steps': [4],
    'training_size': [256],
    'evaluation_steps': [16],
    # 'training_size' : [1024],
    # 'evaluation_steps': [32],

    'num_train_epochs': [25, 50, 100, 200, 400, 800],
    # 'max_train_steps' : [800],

    'learning_rate': [3e-5],
    'insert_dropout': [0.1],
    'dataset_seed': [1],
    # 'catch_dropout' : [0],

    'per_device_train_batch_size': [32],
    'per_device_eval_batch_size': [128],
    'weight_decay': [0.01],
    'lr_scheduler_type': ['linear'],
    'warmup_steps_fraction': [0.1],
    'report_to': ['all'],
    'beta1': [0.9],
    'beta2': [0.999],
    # 'early_stopping_patience' : [20],
}

# Every parameter constellation is repeated once per seed.
seeds = [1, 2, 3]



# Util functions

In [180]:
def _flag_enabled(switch):
    """Return True when a boolean-argument entry means "emit this flag".

    Accepts either a plain truthy/falsy value or a list/tuple of them (the
    notebook stores switches as e.g. ``[True]``); a sequence counts as enabled
    when any element is truthy, so ``[False]`` no longer turns the flag on.
    """
    if isinstance(switch, (list, tuple)):
        return any(switch)
    return bool(switch)


def compose_instruction(value_args, boolean_args, run_name, use_accelerate=False, device=0, tags=None):
    """Build the shell command line that launches one training run.

    Args:
        value_args: mapping of CLI option name -> value. Each entry is emitted
            as ``--name value`` in iteration order; the mapping is also used to
            fill ``{placeholders}`` in ``run_name``.
        boolean_args: mapping of flag name -> switch (a bool, or a list/tuple
            of bools). Enabled switches are emitted as a bare ``--name``.
        run_name: run-name template; formatted with ``value_args`` and, when
            tags are given, suffixed with ``_`` plus the upper-cased first
            three characters of the last tag.
        use_accelerate: when true the command starts with ``accelerate launch``
            and no environment-variable prefix is written (``device``, ``tags``
            and the run name do not appear in the command).
        device: value written into ``CUDA_VISIBLE_DEVICES``.
        tags: optional sequence of tag strings; ``None`` or empty means no tags.

    Returns:
        The complete command line as a single string.
    """
    tags = list(tags) if tags else []
    # Previously an empty tag list crashed on tags[-1]; now it simply adds no suffix.
    suffix = f'_{str(tags[-1][0:3]).upper()}' if tags else ''
    run_name = run_name.format(**value_args) + suffix

    if use_accelerate:
        instruction = 'accelerate launch run_glue_no_trainer_modded.py'
    else:
        # Tags are serialised as a single-quoted list of double-quoted names.
        tag_string = "'[" + ','.join(f'"{s}"' for s in tags) + "]'"
        # NOTE(review): W&B's own variables are spelled WANDB_TAGS / WANDB_NAME.
        # The names below are kept byte-for-byte because the training script
        # presumably reads these exact spellings -- confirm before renaming.
        env_variables = f'CUDA_VISIBLE_DEVICES={device} WANBD_TAGS={tag_string} WANDB_RUNNAME="{run_name}" '
        instruction = env_variables + 'python run_glue_no_trainer_modded.py'

    for k, v in value_args.items():
        instruction += f' --{k} {v}'

    for k, v in boolean_args.items():
        if _flag_enabled(v):
            instruction += f' --{k}'

    return instruction

In [181]:
import pandas as pd
INDEX_FILE = 'run_index.csv'  # CSV registry of parameter configurations, indexed by 'id'
zfill = 4  # zero-pad width of the numeric part of the returned id
def get_param_config_id(param_dict, insert_new=True):
    """Return a stable ``pcid_XXXX`` identifier for a parameter configuration.

    The configuration is looked up in the CSV registry ``INDEX_FILE``. If an
    identical row already exists its id is reused; otherwise the next free id
    (max existing id + 1, or 0 for an empty registry) is assigned.

    Args:
        param_dict: mapping of parameter name -> scalar value describing one
            configuration (becomes one row of the registry).
        insert_new: when true the (de-duplicated) registry is written back to
            ``INDEX_FILE``; when false the file is left untouched.

    Returns:
        ``'pcid_'`` followed by the id zero-padded to ``zfill`` digits.
    """
    try:
        # round_trip parsing makes floats read from the CSV compare equal to
        # the in-memory values they were written from, which the duplicate
        # detection below relies on.
        pc_index = pd.read_csv(INDEX_FILE, index_col='id', float_precision='round_trip')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing or completely empty registry file both mean "start fresh".
        pc_index = pd.DataFrame()
        pc_index.index.name = 'id'

    registry_is_empty = len(pc_index) == 0
    next_id = 0 if registry_is_empty else pc_index.index.max() + 1
    new_row = pd.DataFrame(param_dict, index=[next_id])
    new_row.index.name = 'id'

    if registry_is_empty:
        # Avoid concatenating onto an empty frame (deprecated in newer pandas
        # and a source of accidental dtype changes).
        pc_index = new_row
    else:
        pc_index = pd.concat([pc_index, new_row], join='outer')

    # keep=False marks both the pre-existing row and the freshly appended copy,
    # so idxmax() yields the id of the first (i.e. already registered) one.
    duplicate_rows = pc_index.duplicated(keep=False)
    if duplicate_rows.any():
        pcid = duplicate_rows.idxmax()
        pc_index = pc_index.drop_duplicates()
    else:
        pcid = new_row.index[0]

    if insert_new:
        pc_index.to_csv(INDEX_FILE)

    return f'pcid_{str(pcid).zfill(zfill)}'

In [182]:
def determine_run_type(list_of_value_args):
    """Classify a sweep by inspecting its first parameter constellation.

    Returns "modded" when the first dict contains a 'catch_dropout' key and
    "vanilla" otherwise; later entries are not looked at.
    """
    first_constellation = list_of_value_args[0]
    return "modded" if 'catch_dropout' in first_constellation else "vanilla"

In [183]:
import math
def calculate_early_stopping_patience(value_args, early_stop_frac):
    """Derive the early-stopping patience as a fraction of all evaluations.

    The total number of training steps is taken from 'num_train_epochs'
    (together with 'training_size' and 'per_device_train_batch_size') when
    present, otherwise from 'max_train_steps'. Dividing by 'evaluation_steps'
    gives the number of evaluations; the patience is that number scaled by
    ``early_stop_frac`` and rounded up.

    Args:
        value_args: mapping holding the scalar run parameters named above.
        early_stop_frac: fraction of the evaluations to use as patience.

    Returns:
        The patience as an int (``math.ceil`` of the scaled evaluation count).

    Raises:
        ValueError: if neither 'num_train_epochs' nor 'max_train_steps' is set.
    """
    if 'num_train_epochs' in value_args:
        # NOTE(review): exact division, no rounding up for a partial last
        # batch -- confirm this matches how the training script counts steps.
        steps_per_epoch = value_args['training_size'] / value_args["per_device_train_batch_size"]
        total_steps = steps_per_epoch * value_args['num_train_epochs']
    elif 'max_train_steps' in value_args:
        total_steps = value_args['max_train_steps']
    else:
        # The original `raise('ERROR')` raised a str, which itself fails with
        # an unrelated TypeError; raise a real, descriptive exception instead.
        raise ValueError(
            "value_args must contain either 'num_train_epochs' or 'max_train_steps' "
            "to derive the early stopping patience"
        )
    n_evaluations = total_steps / value_args['evaluation_steps']
    return math.ceil(early_stop_frac * n_evaluations)

# Create combinations

In [184]:
from itertools import product
# Split the sweep grid into parallel tuples: option names and their candidate lists.
keys, values = zip(*value_args.items())

# Cartesian product of all candidate lists -> one dict per parameter constellation.
list_of_value_args = list(
    map(lambda combination: dict(zip(keys, combination)), product(*values))
)

print(len(list_of_value_args))
list_of_value_args[0]

6


{'task_name': 'qnli',
 'model_name_or_path': 'bert-base-uncased',
 'training_size': 256,
 'evaluation_steps': 16,
 'num_train_epochs': 25,
 'learning_rate': 3e-05,
 'insert_dropout': 0.1,
 'dataset_seed': 1,
 'per_device_train_batch_size': 32,
 'per_device_eval_batch_size': 128,
 'weight_decay': 0.01,
 'lr_scheduler_type': 'linear',
 'warmup_steps_fraction': 0.1,
 'report_to': 'all',
 'beta1': 0.9,
 'beta2': 0.999}

# Generate instructions

In [185]:
from math import log, ceil
import json
# Write one launch command per (parameter constellation, seed) pair, plus a
# JSON snapshot of the sweep grid that produced them.
sweep_type = determine_run_type(list_of_value_args)
print(f'Detected sweep type: {sweep_type}')
n_runs = len(list_of_value_args)*len(seeds)
# Also used as the zero-pad width of the run ids below.
magnitude = ceil(log(n_runs,10))
print(f'Number of experiments in this run in the range of 10^{magnitude} ({n_runs})')
sweep_identifier = f'{GENERATION}_{str(SWEEP_ID).zfill(2)}'

# Snapshot of the grid (lists of candidate values) and the seeds.
# NOTE(review): with WRITE_MODE == 'a' this appends a second JSON document to
# the same file, which plain json.load cannot read back -- confirm intended.
with open(f'instructions_{sweep_identifier}_config.json',WRITE_MODE) as f:
    json.dump(value_args | {"seeds":seeds},f)

with open(f'instructions_{sweep_identifier}.txt',WRITE_MODE, newline='\n') as f:
    # From here on the identifier also carries the sweep type; it names the
    # output directory (the instruction file name above does not include it).
    sweep_identifier += f'_{sweep_type}'
    for idx, constellation in enumerate(list_of_value_args): # Iterate over param constellations
        # Work on a copy under a distinct name: the loop used to rebind the
        # global `value_args` grid and mutate the shared constellation dicts,
        # which broke re-running this or the combinations cell.
        run_args = dict(constellation)
        run_args["run_generation"] = GENERATION # set generation arg
        # param_config_id = get_param_config_id(run_args) # get pcid
        # run_args["param_config_id"] = param_config_id # set pcid to args

        if EARLY_STOP_FRAC is not None:
            run_args['early_stopping_patience'] = calculate_early_stopping_patience(run_args,EARLY_STOP_FRAC)

        for idy, seed in enumerate(seeds):
            run_args["seed"] = seed # update current seed to args

            run_id = str(idx*len(seeds)+idy).zfill(magnitude)
            output_dir = f'{base_dir}/{sweep_identifier}/run_{run_id}'
            run_args["output_dir"] = output_dir
            try:
                run_name = RUN_NAME.format(run_id=run_id)
            except (KeyError, IndexError):
                # Template uses placeholders other than {run_id}; pass it on
                # unformatted so compose_instruction can fill them from the
                # run arguments.
                run_name = RUN_NAME

            instruction = compose_instruction(
                value_args = run_args,
                boolean_args = boolean_args,
                run_name = run_name,
                use_accelerate = use_accelerate,
                device=DEVICE,
                tags=TAGS+[sweep_type]
            )
            f.write(instruction + '\n')
            print(instruction)

Detected sweep type: vanilla
Number of experiments in this run in the range of 10^2 (18)
CUDA_VISIBLE_DEVICES=1 WANBD_TAGS='["SWP1","vanilla"]' WANDB_RUNNAME="28_S1_ID_00_VAN" python run_glue_no_trainer_modded.py --task_name qnli --model_name_or_path bert-base-uncased --training_size 256 --evaluation_steps 16 --num_train_epochs 25 --learning_rate 3e-05 --insert_dropout 0.1 --dataset_seed 1 --per_device_train_batch_size 32 --per_device_eval_batch_size 128 --weight_decay 0.01 --lr_scheduler_type linear --warmup_steps_fraction 0.1 --report_to all --beta1 0.9 --beta2 0.999 --run_generation 28_sweeps --early_stopping_patience 13 --seed 1 --output_dir /home/lange/fantastic-umbrella/runs/28_sweeps_01_vanilla/run_00 --with_tracking
CUDA_VISIBLE_DEVICES=1 WANBD_TAGS='["SWP1","vanilla"]' WANDB_RUNNAME="28_S1_ID_01_VAN" python run_glue_no_trainer_modded.py --task_name qnli --model_name_or_path bert-base-uncased --training_size 256 --evaluation_steps 16 --num_train_epochs 25 --learning_rate 3e-05 

In [186]:
# # vanilla n_epochs
# GENERATION = "28_sweeps"
# SWEEP_ID = '1'
# RUN_NAME = f"28_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']
# EARLY_STOP_FRAC = 1
# value_args = {
#     'task_name' : ['qnli'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     # 'training_size' : [64],
#     # 'evaluation_steps': [4],

#     'training_size' : [256],
#     'evaluation_steps': [16],

#     # 'training_size' : [1024],
#     # 'evaluation_steps': [32],

#     'num_train_epochs' : [6,
# 		12,
# 		25,
# 		50,
# 		100,
# 		200], 
#     # 'max_train_steps' : [800],

#     'learning_rate': [3e-5],
#     'insert_dropout' : [0.1],
#     'dataset_seed' : [1],
#     # 'catch_dropout' : [0],

#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     # 'early_stopping_patience' : [20],
# }
# seeds = [1,2,3]



In [187]:
# # dataset investigation
# GENERATION = "27_sweeps"
# SWEEP_ID = '3'
# RUN_NAME = f"27_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']
# EARLY_STOP_FRAC = 1
# value_args = {
#     'task_name' : ['qqp','sst2'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     # 'training_size' : [64],
#     # 'evaluation_steps': [4],

#     # 'training_size' : [256],
#     # 'evaluation_steps': [16],

#     'training_size' : [1024],
#     'evaluation_steps': [64],

#     # 'num_train_epochs' : [25], 
#     'max_train_steps' : [800],

#     'learning_rate': [5e-5,1e-5],
#     'insert_dropout' : [0.2],
#     'dataset_seed' : [1,2,3],
#     # 'catch_dropout' : [0],

#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     # 'early_stopping_patience' : [20],
# }
# seeds = [1,2,3]



In [188]:
# # lr_dropout investigation part 2
# GENERATION = "26_sweeps"
# SWEEP_ID = '5'
# RUN_NAME = f"26_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']

# value_args = {
#     'task_name' : ['qnli'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     'training_size' : [64],
#     'evaluation_steps': [4],

#     # 'training_size' : [256],
#     # 'evaluation_steps': [16],

#     # 'training_size' : [1024],
#     # 'evaluation_steps': [64],

#     # 'num_train_epochs' : [25], 
#     'max_train_steps' : [800],

#     'learning_rate': [3e-5],
#     'insert_dropout' : [0.1,0.2,0.3],
#     'dataset_seed' : [1,2,3],
#     # 'catch_dropout' : [0],

#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     # 'early_stopping_patience' : [20],
# }
# seeds = [1,2,3]



In [189]:
# # lr_dropout investigation part 2
# GENERATION = "25_sweeps"
# SWEEP_ID = '5'
# RUN_NAME = f"25_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']

# value_args = {
#     'task_name' : ['qnli'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     # 'training_size' : [64],
#     # 'evaluation_steps': [4],

#     # 'training_size' : [256],
#     # 'evaluation_steps': [16],

#     'training_size' : [1024],
#     'evaluation_steps': [64],

#     # 'num_train_epochs' : [25], 
#     'max_train_steps' : [800],

#     'learning_rate': [5e-5,1e-5,5e-6,1e-6],
#     'insert_dropout' : [0.1],
#     'dataset_seed' : [1,2,3],
#     # 'catch_dropout' : [0],

#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     # 'early_stopping_patience' : [20],
# }
# seeds = [1,2,3]



In [190]:
# # lr_dropout investigation part 2
# GENERATION = "24_sweeps"
# SWEEP_ID = '5'
# RUN_NAME = f"24_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']

# value_args = {
#     'task_name' : ['qnli'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     'training_size' : [1024],
#     # 'num_train_epochs' : [25], 
#     'max_train_steps' : [800],
#     'evaluation_steps': [64],
#     'learning_rate': [1e-5,2e-5,3e-5,4e-5,5e-5],
#     'insert_dropout' : [0.1,0.14,0.18,0.22,0.26,0.3],
#     'dataset_seed' : [1,2,3,4,5],
#     # 'catch_dropout' : [0],

#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     # 'early_stopping_patience' : [20],
# }
# seeds = [1,2,3,4,5]



In [191]:
# # tests 256 base
# GENERATION = "23_sweeps"
# SWEEP_ID = '1'
# RUN_NAME = f"23_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']

# value_args = {
#     'task_name' : ['rte'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['/home/lange/fantastic-umbrella/it_reference'],
#     'training_size' : [8,16,32,64],
#     # 'num_train_epochs' : [25], 
#     'max_train_steps' : [800],
#     'evaluation_steps': [2],
#     'learning_rate': [1e-5,2e-5],
#     'insert_dropout' : [0.24,0.28],
#     'dataset_seed' : [1,2,3,4,5],
#     # 'catch_dropout' : [0],

#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     # 'early_stopping_patience' : [20],
# }
# seeds = [1,2,3]


In [192]:
# # tests 256 base
# GENERATION = "22_sweeps"
# SWEEP_ID = '5'
# RUN_NAME = f"22_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']

# value_args = {
#     'task_name' : ['qnli','qqp','sst2'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-large-uncased'],
#     'training_size' : [1024],
#     'num_train_epochs' : [25], 
#     'evaluation_steps': [32],
#     'learning_rate': [3e-5],
#     'insert_dropout' : [0.12,0.16],
#     'dataset_seed' : [1,2,3],
#     # 'catch_dropout' : [0],

#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     # 'early_stopping_patience' : [20],
# }
# seeds = [1,2,3]



In [193]:
# # tests 256 base
# GENERATION = "21_sweeps"
# SWEEP_ID = '14'
# RUN_NAME = f"21_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']

# value_args = {
#     'task_name' : ['qnli','qqp','sst2'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     'training_size' : [2048],
#     'num_train_epochs' : [12], 
#     'learning_rate': [6e-5],
#     'evaluation_steps': [32],
#     'dataset_seed' : [1,2,3],
#     'insert_dropout' : [0.1, 0.12],
#     # 'catch_dropout' : [0],

#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     # 'early_stopping_patience' : [20],
# }
# seeds = [1,2,3]



In [194]:
# # tests 256 base
# GENERATION = "20_sweeps"
# SWEEP_ID = '1'
# RUN_NAME = f"20_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}']

# value_args = {
#     'task_name' : ['qnli'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'insert_dropout' : [0.1,0.2],
#     # 'catch_dropout' : [0],
#     'training_size' : [256],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     'early_stopping_patience' : [20],
#     'num_train_epochs' : [25,100,400,800], 
#     'learning_rate': [2e-5],
#     'evaluation_steps': [16],
#     'dataset_seed' : [1]
# }
# seeds = [1,2,3,4,5]



In [195]:
# 1024 sweeps
# GENERATION = "19_sweeps"
# SWEEP_ID = '1'
# RUN_NAME = f"19_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}','NEW_GRAD_OUTPUT']
# value_args = {
#     'task_name' : ['qnli'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'insert_dropout' : [0.04, 0.08, 0.1, 0.12, 0.16, 0.2, 0.24, 0.28, 0.32],
#     # 'catch_dropout' : [0],
#     'training_size' : [1024],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     'early_stopping_patience' : [5],
#     'num_train_epochs' : [25], 
#     'learning_rate': [6e-5, 4e-5, 2e-5, 1e-5, 8e-6],
#     'evaluation_steps': [32],
#     'dataset_seed' : [1]
# }
# seeds = [1,2,3]

In [196]:
# # tests 256 base
# GENERATION = "16_sweeps"
# SWEEP_ID = '0'
# RUN_NAME = f"16_S{SWEEP_ID}" + "_ID_{run_id}"
# TAGS = [f'SWP{SWEEP_ID}','NEW_GRAD_OUTPUT']
# value_args = {
#     'task_name' : ['qnli'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'insert_dropout' : [0.04, 0.08, 0.1, 0.12, 0.16, 0.2, 0.24, 0.28, 0.32],
#     'catch_dropout' : [0],
#     'training_size' : [256],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     'early_stopping_patience' : [10],
#     'num_train_epochs' : [100], 
#     'learning_rate': [6e-5, 4e-5, 2e-5, 1e-5, 8e-6],
#     'evaluation_steps': [16],
#     'dataset_seed' : [1]
# }
# seeds = [1,2,3]

In [197]:
# # tests 1024 base
# GENERATION = "19_experiments"
# SWEEP_ID = '0'
# RUN_NAME = "19_E0_EP_{num_train_epochs}"
# TAGS = [f'EXPT{SWEEP_ID}','NEW_GRAD_OUTPUT']
# value_args = {
#     'task_name' : ['qnli'],#["qnli"], #["cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'insert_dropout' : [0.1],
#     'catch_dropout' : [0],
#     'training_size' : [1024],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     'early_stopping_patience' : [200],
#     'num_train_epochs' : [6,12,25, 50, 100, 200], 
#     'learning_rate': [3e-5],
#     'evaluation_steps': [32],
#     'dataset_seed' : [1]
# }
# seeds = [1,2,3]

In [198]:
# # sweep 64 base
# GENERATION = "18_experiments"
# SWEEP_ID = '2C'
# RUN_NAME = "18_E2C_EP_{num_train_epochs}_LR_{learning_rate}"
# TAGS = [f'EXPT{SWEEP_ID}','NEW_GRAD_OUTPUT'] # ,'alternateGrad'
# value_args = {
#     'task_name' : ['qnli'], #["qnli","cola","sst2","mrpc","stsb","qqp","mnli","rte"],
#     'model_name_or_path' : ['bert-base-uncased'],
#     'per_device_train_batch_size' : [32],
#     'per_device_eval_batch_size' : [128],    
#     'weight_decay' : [0.01], 
#     'lr_scheduler_type' : ['linear'],
#     'warmup_steps_fraction' : [0.1],
#     'report_to' : ['all'],
#     'insert_dropout' : [0.1],
#     'catch_dropout' : [0],
#     'training_size' : [64],
#     'beta1' : [0.9],
#     'beta2' : [0.999],
#     'early_stopping_patience' : [6400],
#     'num_train_epochs' : [100,200,400,800,1600,3200], 
#     'learning_rate': [3e-5],
#     'evaluation_steps': [4],
#     "dataset_seed" : [1]
# }
# seeds = [2,3]