PufferAI · jsuarez5341 · May 12, 2025 · May 2, 2025 · May 2, 2025 · May 2, 2025
diff --git a/clean_pufferl.py b/clean_pufferl.py
diff --git a/config/default.ini b/config/default.ini
@@ -1,124 +1,100 @@
 [base]
 package = None
 env_name = None
-vec = native
 policy_name = Policy
 rnn_name = None
 max_suggestion_cost = 3600
 
-[workspace]
-name = pufferai 
-project = ablations
+[vec]
+backend = Multiprocessing
+num_envs = 2
+num_workers = auto
+batch_size = auto
+zero_copy = True
+seed = 42
 
 [env]
 [policy]
 [rnn]
 
 [train]
-seed = 0
+name = pufferai 
+project = ablations
+run_id = None
+run_tag = None
+
+seed = 42
 torch_deterministic = True
 cpu_offload = False
 device = cuda
 optimizer = muon
-scheduler = cosine
 anneal_lr = True
 precision = float32
 total_timesteps = 10_000_000
 learning_rate = 0.025
 gamma = 0.995
 gae_lambda = 0.85
 update_epochs = 1
-norm_adv = True
 # Consider raising clip coef to 0.2
 clip_coef = 0.1
-clip_vloss = True
 vf_coef = 2.0
 vf_clip_coef = 0.1
 max_grad_norm = 0.5
 ent_coef = 0.01
-target_kl = None
 adam_beta1 = 0.9
 adam_beta2 = 0.999
 adam_eps = 1e-12
 
-num_envs = 2
-num_workers = 2
-env_batch_size = 1
-zero_copy = True
 data_dir = experiments
 checkpoint_interval = 200
-batch_size = 524288
+batch_size = auto
 minibatch_size = 8192
-replay_factor = 0.0
+
 # Accumulate gradients above this size
 max_minibatch_size = 32768
 bptt_horizon = 64
 compile = False
-compile_mode = reduce-overhead
+compile_mode = max-autotune-no-cudagraphs
 compile_fullgraph = True
 
-use_diayn = False
-diayn_archive = 256
-diayn_loss_coef = 0.000
-diayn_coef = 0.0
-
-use_p3o = False
-p3o_horizon = 128
-puf = 0.0
-
-use_vtrace = False
 vtrace_rho_clip = 1.0
 vtrace_c_clip = 1.0
 
-use_puff_advantage = True
-
 prio_alpha = 0.6
 prio_beta0 = 0.4
 
 [sweep]
-method = protein 
-name = sweep
-
-[sweep.metric]
+method = Protein 
+metric = score
 goal = maximize
-name = score 
-min = 0
-max = 1
 
-[sweep.env.num_envs]
+[sweep.vec.num_envs]
 distribution = uniform_pow2
-min = 64
-max = 4096
-mean = 1024
+min = 1
+max = 8
+mean = 2
 scale = auto
-#scale = 0.5
-
-#[sweep.policy.hidden_size]
-#distribution = uniform_pow2
-#min = 32
-#max = 1024
-#mean = 128
-#scale = auto
 
+# TODO: Eliminate from base
 [sweep.train.total_timesteps]
 distribution = log_normal
 min = 5e7
 max = 1e10
 mean = 1e8
 scale = time
 
-[sweep.train.batch_size]
-distribution = uniform_pow2
-min = 32768
-max = 1048576
-mean = 262144
+[sweep.train.bptt_horizon]
+distribution = int_uniform
+min = 16
+max = 64
+mean = 64
 scale = auto
 
 [sweep.train.minibatch_size]
 distribution = uniform_pow2
-min = 1024
-max = 32768
-mean = 8192
+min = 8192
+max = 131072
+mean = 32768
 scale = auto
 
 [sweep.train.learning_rate]
@@ -141,15 +117,13 @@ min = 0.8
 mean = 0.98
 max = 0.9999
 scale = auto
-#scale = 0.5
 
 [sweep.train.gae_lambda]
 distribution = logit_normal
 min = 0.6
 mean = 0.95
 max = 0.995
 scale = auto
-#scale = 0.5
 
 [sweep.train.update_epochs]
 distribution = int_uniform
@@ -158,6 +132,20 @@ max = 4
 mean = 1
 scale = 1.0
 
+[sweep.train.clip_coef]
+distribution = uniform
+min = 0.01
+max = 1.0
+mean = 0.1
+scale = auto
+
+[sweep.train.vf_clip_coef]
+distribution = uniform
+min = 0.01
+max = 5.0
+mean = 0.1
+scale = auto
+
 [sweep.train.vf_coef]
 distribution = uniform
 min = 0.0
@@ -172,20 +160,6 @@ mean = 1.0
 max = 5.0
 scale = auto
 
-[sweep.train.bptt_horizon]
-distribution = uniform_pow2
-min = 4
-max = 128
-mean = 16
-scale = auto
-
-#[sweep.train.puf]
-#distribution = logit_normal
-#min = 0.01
-#mean = 0.5
-#max = 0.99
-#scale = auto
-
 [sweep.train.adam_beta1]
 distribution = logit_normal
 min = 0.5
@@ -201,36 +175,22 @@ max = 0.99999
 scale = auto
 
 [sweep.train.adam_eps]
-distribution = uniform
-min = 0.00000000000001
-mean = 0.00000001
-max = 0.001
+distribution = log_normal
+min = 1e-14
+mean = 1e-8
+max = 1e-4
 scale = auto
 
-#[sweep.train.horizon]
-#distribution = uniform_pow2
-#min = 4
-#max = 128
-#mean = 32
-#scale = 0.25
-
-#[sweep.train.diayn_archive]
-#distribution = uniform_pow2
-#min = 2
-#max = 64
-#mean = 8
-#scale = auto
-
-#[sweep.train.diayn_loss_coef]
-#distribution = uniform
-#min = 0.0
-#max = 2.0
-#mean = 1.0
-#scale = auto
-
-#[sweep.train.diayn_coef]
-#distribution = log_normal
-#min = 0.0001
-#mean = 0.1
-#max = 0.99
-#scale = auto
+[sweep.train.prio_alpha]
+distribution = logit_normal
+min = 0.1
+mean = 0.6
+max = 0.99
+scale = auto
+
+[sweep.train.prio_beta0]
+distribution = logit_normal
+min = 0.1
+mean = 0.4
+max = 0.99
+scale = auto
diff --git a/config/metta.ini b/config/metta.ini
@@ -3,17 +3,17 @@ package = metta
 env_name = metta 
 policy_name = Policy
 rnn_name = Recurrent
-vec = multiprocessing
+
+[vec]
+num_envs = 128
+num_workers = 16
+batch_size = 64
 
 [env]
 render_mode = auto
-#num_envs = 128 
 
 [train]
-total_timesteps = 5_000_000_000
-num_envs = 128
-num_workers = 16
-env_batch_size = 64
+total_timesteps = 100_000_000
 learning_rate = 0.0013848535655657842
 gamma = 0.9959746852829785
 gae_lambda = 0.9283720217357007
@@ -45,17 +45,6 @@ adam_eps = 0.000249501214984291
 #minibatch_size = 32768
 #compile = False
 
-[sweep]
-method = protein
-name = sweep
-
-[sweep.metric]
-goal = maximize
-name = score 
-min = 0
-max = 10
-scale = auto
-
 #[sweep.train.total_timesteps]
 #distribution = log_normal
 #min = 2e7

diff --git a/config/ocean/blastar.ini b/config/ocean/blastar.ini
@@ -14,8 +14,8 @@ gamma = 0.95
 learning_rate = 0.05
 minibatch_size = 32768
 
-[sweep.metric]
-name = environment/enemy_crossed_screen
+[sweep]
+metric = environment/enemy_crossed_screen
 goal = minimize
 
 [sweep.parameters.train.parameters.batch_size]

diff --git a/config/ocean/breakout.ini b/config/ocean/breakout.ini
@@ -3,7 +3,9 @@ package = ocean
 env_name = puffer_breakout
 policy_name = Policy
 rnn_name = Recurrent
-vec = multiprocessing
+
+[vec]
+num_envs = 2
 
 [env]
 num_envs = 4096
@@ -16,23 +18,35 @@ input_size = 128
 hidden_size = 128
 
 [train]
-total_timesteps = 80_000_000
-learning_rate = 0.05
-minibatch_size = 32768
-
-[sweep]
-method = protein
-name = sweep
-
-[sweep.metric]
-goal = maximize
-name = score 
-min = 0
-max = 864
-
-#[sweep.train.total_timesteps]
-#distribution = log_normal
-#min = 2e7
-#max = 1e8
-#mean = 5e7
-#scale = auto
+total_timesteps = 75_000_000 
+
+# adam_beta1: highly sensitive
+adam_beta1 = 0.99
+
+adam_beta2 = 0.9999
+adam_eps = 1e-14
+ent_coef = 0.025
+gae_lambda = 0.85
+
+# gamma: highly sensitive
+gamma = 0.975
+
+learning_rate = 0.01
+max_grad_norm = 1.5
+minibatch_size = 16384
+
+prio_alpha = 0.0
+# prio_beta0: doesn't matter
+prio_beta0 = 1.0
+
+# vf_coef: just can't be low
+vf_coef = 1.3
+
+# TODO: Try tuning the clip coefficients (clip_coef, vf_clip_coef)
+
+[sweep.train.total_timesteps]
+distribution = log_normal
+min = 2e7
+max = 5e8
+mean = 8e7
+scale = auto