PufferAI · PLAZMAMA · May 24, 2025 · May 24, 2025 · May 24, 2025 · May 24, 2025
diff --git a/pufferlib/config/ocean/boids.ini b/pufferlib/config/ocean/boids.ini
@@ -1,19 +1,16 @@
 [base]
 package = ocean
 env_name = puffer_boids 
-policy_name = Boids
+policy_name = Policy
 rnn_name = Recurrent
-; rnn_name = None
 
 [env]
 num_envs = 64
-num_boids = 64
-; num_envs = 1
-; num_boids = 1
-margin_turn_factor = 0.0
-centering_factor = 0.00
-avoid_factor = 1.00
-matching_factor = 1.00
+num_boids = 16
+margin_turn_factor = 1.0
+cohesion_factor = 0.0048
+separation_factor = 0.0128
+alignment_factor = 0.2
 
 [vec]
 num_workers = 2
@@ -22,51 +19,7 @@ batch_size = auto
 
 [train]
 total_timesteps = 100_000_000
+; total_timesteps = 80_000_000
 gamma = 0.95
 learning_rate = 0.025
-minibatch_size = 16384
-; minibatch_size = 1
-
-; [sweep]
-; method = protein
-; metric = episode_length
-
-; [sweep.train.total_timesteps]
-; distribution = log_normal
-; min = 1e6
-; max = 1e7
-; mean = 5e6
-; scale = 0.5
-
-; [sweep.train.gamma]
-; distribution = log_normal
-; min = 0.9
-; max = 0.999
-; mean = 0.97
-
-; [sweep.train.gae_lambda]
-; distribution = log_normal
-; min = 0.7
-; max = 0.999
-; mean = 0.95
-
-; [sweep.train.learning_rate]
-; distribution = log_normal
-; min = 0.0001
-; max = 0.001
-; mean = 0.00025
-; scale = 0.5
-
-; [sweep.train.batch_size]
-; min = 32768
-; max = 131072
-; mean = 65536
-; scale = 0.5
-
-; [sweep.train.minibatch_size]
-; min = 512
-; max = 2048
-; mean = 1024
-; scale = 0.5
-
-
+minibatch_size = 16384
diff --git a/pufferlib/ocean/boids/binding.c b/pufferlib/ocean/boids/binding.c
@@ -7,18 +7,15 @@ static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
     env->num_boids = unpack(kwargs, "num_boids");
     env->report_interval = unpack(kwargs, "report_interval");
     env->margin_turn_factor = unpack(kwargs, "margin_turn_factor");
-    env->centering_factor = unpack(kwargs, "centering_factor");
-    env->avoid_factor = unpack(kwargs, "avoid_factor");
-    env->matching_factor = unpack(kwargs, "matching_factor");
+    env->cohesion_factor = unpack(kwargs, "cohesion_factor");
+    env->separation_factor = unpack(kwargs, "separation_factor");
+    env->alignment_factor = unpack(kwargs, "alignment_factor");
     init(env);
     return 0;
 }
 
 static int my_log(PyObject* dict, Log* log) {
-    assign_to_dict(dict, "perf", log->perf);
     assign_to_dict(dict, "score", log->score);
-    assign_to_dict(dict, "episode_return", log->episode_return);
-    assign_to_dict(dict, "episode_length", log->episode_length);
     assign_to_dict(dict, "n", log->n);
     return 0;
 }
diff --git a/pufferlib/ocean/boids/boids.c b/pufferlib/ocean/boids/boids.c
@@ -7,8 +7,13 @@
 
 // --- Demo Configuration ---
 #define NUM_BOIDS_DEMO 20   // Number of boids for the standalone demo
-#define MAX_STEPS_DEMO 500 // Max steps per episode in the demo
+#define REPORT_INTERVAL_DEMO 1000 // Report interval for the demo
+#define MAX_STEPS_DEMO 10000 // Max steps per episode in the demo
 #define ACTION_SCALE 3.0f   // Corresponds to action space [-3.0, 3.0]
+#define MARGIN_TURN_FACTOR 1.0
+#define COHESION_FACTOR 0.0
+#define SEPARATION_FACTOR 0.0
+#define ALIGNMENT_FACTOR 0.0
 
 // Dummy action generation: random velocity changes for each boid
 void generate_dummy_actions(Boids* env) {
@@ -27,11 +32,14 @@ void demo() {
     // Initialize Boids environment struct
     Boids env = {0}; 
     env.num_boids = NUM_BOIDS_DEMO;
+    env.report_interval = REPORT_INTERVAL_DEMO;
+    env.margin_turn_factor = MARGIN_TURN_FACTOR;
+    env.cohesion_factor = COHESION_FACTOR;
+    env.separation_factor = SEPARATION_FACTOR;
+    env.alignment_factor = ALIGNMENT_FACTOR;
 
-    // In the Python binding, these pointers are assigned from NumPy arrays.
-    // Here, we need to allocate them explicitly.
-    size_t obs_size = env.num_boids * 4; // num_boids * (x, y, vx, vy)
-    size_t act_size = env.num_boids * 2; // num_boids * (dvx, dvy)
+    size_t obs_size = env.num_boids * env.num_boids * 9; // 9 = (x, y, vx, vy, dx, dy, dist, dvx, dvy)
+    size_t act_size = env.num_boids * 2; // the 2 = (dvx, dvy)
     env.observations = (float*)calloc(obs_size, sizeof(float));
     env.actions = (float*)calloc(act_size, sizeof(float));
     env.rewards = (float*)calloc(env.num_boids, sizeof(float)); // Env-level reward

diff --git a/pufferlib/ocean/boids/boids.h b/pufferlib/ocean/boids/boids.h
@@ -13,19 +13,17 @@
 #define LEFT_MARGIN 50
 #define RIGHT_MARGIN 50
 #define VELOCITY_CAP 5
-#define VISUAL_RANGE 20
-#define PROTECTED_RANGE 100
+#define VISUAL_RANGE 400
+#define PROTECTED_RANGE 60
 #define WIDTH 1080
 #define HEIGHT 720
 #define BOID_WIDTH 32
 #define BOID_HEIGHT 32
-#define BOID_TEXTURE_PATH "./resources/puffers_128.png"
+#define BOID_TEXTURE_PATH "./resources/shared/puffers_128.png"
+#define MAX_DIST 2000
 
 typedef struct {
-    float perf;
     float score;
-    float episode_return;
-    float episode_length;
     float n;
 } Log;
 
@@ -42,25 +40,26 @@ typedef struct {
 
 typedef struct Client Client;
 typedef struct {
-    // an array of shape (num_boids, 4) with the 4 values correspoinding to (x, y, velocity x, velocity y)
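+    // Flat array of num_boids * num_boids * 9 values, laid out as one row of num_boids entries per observing boid:
+    // - Each entry has 9 values corresponding to (x, y, vx, vy, dx, dy, dist, dvx, dvy)
+    // - The first entry of a row is the observing boid itself (its 5 relative values are always 0)
+    // - The remaining num_boids - 1 entries describe the other boids relative to the observing boid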
     float* observations;
-    // an array of shape (num_boids, 2) with the 2 values correspoinding to (velocity x, velocity y)
+    // an array of shape (num_boids, 2) with the 2 values corresponding to (dvx, dvy)
     float* actions;
     // an array of shape (1) with the summed up reward for all boids
     float* rewards;
     unsigned char* terminals; // Not being used but is required by env_binding.h
     Boid* boids;
     unsigned int num_boids;
     float margin_turn_factor;
-    float centering_factor;
-    float avoid_factor;
-    float matching_factor;
+    float cohesion_factor;
+    float separation_factor;
+    float alignment_factor;
     unsigned tick;
     Log log;
-    Log* boid_logs;
     unsigned report_interval;
     Client* client;
-
 } Boids;
 
 static inline float flmax(float a, float b) { return a > b ? a : b; }
@@ -73,12 +72,18 @@ static void respawn_boid(Boids *env, unsigned int i) {
     env->boids[i].y = rndf(BOTTOM_MARGIN, HEIGHT - TOP_MARGIN);
     env->boids[i].velocity.x = 0;
     env->boids[i].velocity.y = 0;
-    env->boid_logs[i]       = (Log){0};
 }
 
 void init(Boids *env) {
+    if(env->num_boids < 1) {
+        printf("ERROR: num_boids must be bigger than 0\n");
+        exit(1);
+    }
+    if (env->report_interval < 1) {
+        printf("ERROR: report_interval must be bigger than 0\n");
+        exit(1);
+    }
     env->boids = (Boid*)calloc(env->num_boids, sizeof(Boid));
-    env->boid_logs = (Log*)calloc(env->num_boids, sizeof(Log));
     env->log = (Log){0};
     env->tick = 0;
 
@@ -92,15 +97,33 @@ void init(Boids *env) {
 
 
 static void compute_observations(Boids *env) {
-    unsigned base_indx;
-
     int idx = 0;
+    float diff_x, diff_y, dist;
     for (unsigned i=0; i<env->num_boids; i++) {
+        // observations for the current boid
+        env->observations[idx++] = env->boids[i].x / WIDTH;
+        env->observations[idx++] = env->boids[i].y / HEIGHT;
+        env->observations[idx++] = env->boids[i].velocity.x / VELOCITY_CAP;
+        env->observations[idx++] = env->boids[i].velocity.y / VELOCITY_CAP;
+        // zeros for relative observations since comparing to itself will always be 0
+        for (unsigned j=0; j<5; j++) { env->observations[idx++] = 0; }
+
+        // observations for the other boids compared to the current boid
         for (unsigned j=0; j<env->num_boids; j++) {
-            env->observations[idx++] = (env->boids[j].x - env->boids[i].x) / WIDTH;
-            env->observations[idx++] = (env->boids[j].y - env->boids[i].y) / HEIGHT;
-            env->observations[idx++] = (env->boids[j].velocity.x - env->boids[i].velocity.x) / VELOCITY_CAP;
-            env->observations[idx++] = (env->boids[j].velocity.y - env->boids[i].velocity.y) / VELOCITY_CAP;
+            if (i == j) continue;
+            diff_x = env->boids[i].x - env->boids[j].x;
+            diff_y = env->boids[i].y - env->boids[j].y;
+            dist = sqrtf(diff_x*diff_x + diff_y*diff_y);
+
+            env->observations[idx++] = env->boids[j].x / WIDTH;
+            env->observations[idx++] = env->boids[j].y / HEIGHT;
+            env->observations[idx++] = env->boids[j].velocity.x / VELOCITY_CAP;
+            env->observations[idx++] = env->boids[j].velocity.y / VELOCITY_CAP;
+            env->observations[idx++] = diff_x / WIDTH;
+            env->observations[idx++] = diff_y / HEIGHT;
+            env->observations[idx++] = dist / MAX_DIST;
+            env->observations[idx++] = (env->boids[i].velocity.x - env->boids[j].velocity.x) / VELOCITY_CAP;
+            env->observations[idx++] = (env->boids[i].velocity.y - env->boids[j].velocity.y) / VELOCITY_CAP;
         }
     }
 }
@@ -118,7 +141,7 @@ void c_step(Boids *env) {
     Boid* current_boid;
     Boid observed_boid;
     float vis_vx_sum, vis_vy_sum, vis_x_sum, vis_y_sum, vis_x_avg, vis_y_avg, vis_vx_avg, vis_vy_avg;
-    float diff_x, diff_y, dist, protected_dist_sum, current_boid_reward;
+    float diff_x, diff_y, dist, current_boid_reward;
     unsigned visual_count, protected_count;
     bool manual_control = IsKeyDown(KEY_LEFT_SHIFT);
     float mouse_x = (float)GetMouseX();
@@ -127,21 +150,22 @@ void c_step(Boids *env) {
     env->tick++;
     env->rewards[0] = 0;
     env->log.score = 0;
+    env->log.n = 0;
     for (unsigned current_indx = 0; current_indx < env->num_boids; current_indx++) {
         // apply action
         current_boid = &env->boids[current_indx];
         if (manual_control) {
             current_boid->velocity.x = flclip(current_boid->velocity.x + (mouse_x - current_boid->x), -VELOCITY_CAP, VELOCITY_CAP);
             current_boid->velocity.y = flclip(current_boid->velocity.y + (mouse_y - current_boid->y), -VELOCITY_CAP, VELOCITY_CAP);
         } else {
-            current_boid->velocity.x = flclip(current_boid->velocity.x + 2*env->actions[current_indx * 2 + 0], -VELOCITY_CAP, VELOCITY_CAP);
-            current_boid->velocity.y = flclip(current_boid->velocity.y + 2*env->actions[current_indx * 2 + 1], -VELOCITY_CAP, VELOCITY_CAP);
+            current_boid->velocity.x = flclip(current_boid->velocity.x + env->actions[current_indx * 2 + 0], -VELOCITY_CAP, VELOCITY_CAP);
+            current_boid->velocity.y = flclip(current_boid->velocity.y + env->actions[current_indx * 2 + 1], -VELOCITY_CAP, VELOCITY_CAP);
         }
         current_boid->x = flclip(current_boid->x + current_boid->velocity.x, 0, WIDTH  - BOID_WIDTH);
         current_boid->y = flclip(current_boid->y + current_boid->velocity.y, 0, HEIGHT - BOID_HEIGHT);
 
         // reward calculation
-        current_boid_reward = 0.0f, protected_dist_sum = 0.0f, protected_count = 0.0f;
+        current_boid_reward = 0.0f, protected_count = 0.0f;
         visual_count = 0.0f, vis_vx_sum = 0.0f, vis_vy_sum = 0.0f, vis_x_sum = 0.0f, vis_y_sum = 0.0f;
         for (unsigned observed_indx = 0; observed_indx < env->num_boids; observed_indx++) {
             if (current_indx == observed_indx) continue;
@@ -150,7 +174,6 @@ void c_step(Boids *env) {
             diff_y = current_boid->y - observed_boid.y;
             dist = sqrtf(diff_x*diff_x + diff_y*diff_y);
             if (dist < PROTECTED_RANGE) {
-                protected_dist_sum += (PROTECTED_RANGE - dist);
                 protected_count++;
             } else if (dist < VISUAL_RANGE) {
                 vis_x_sum += observed_boid.x;
@@ -161,46 +184,42 @@ void c_step(Boids *env) {
             }
         }
         if (protected_count > 0) {
-            //current_boid_reward -= fabsf(protected_dist_sum / protected_count) * env->avoid_factor;
-            current_boid_reward -= flclip(protected_count/5.0, 0.0f, 1.0f) * env->avoid_factor;
+            // protected_range_diff = (float)(env->num_boids - protected_count) - protected_count;
+            // current_boid_reward += protected_range_diff * env->separation_factor;
+
+            current_boid_reward -= protected_count * env->separation_factor;
         }
         if (visual_count) {
             vis_x_avg  = vis_x_sum  / visual_count;
             vis_y_avg  = vis_y_sum  / visual_count;
             vis_vx_avg = vis_vx_sum / visual_count;
             vis_vy_avg = vis_vy_sum / visual_count;
 
-            current_boid_reward -= fabsf(vis_vx_avg - current_boid->velocity.x) * env->matching_factor;
-            current_boid_reward -= fabsf(vis_vy_avg - current_boid->velocity.y) * env->matching_factor;
-            current_boid_reward -= fabsf(vis_x_avg  - current_boid->x) * env->centering_factor;
-            current_boid_reward -= fabsf(vis_y_avg  - current_boid->y) * env->centering_factor;
+            current_boid_reward -= fabsf(vis_vx_avg - current_boid->velocity.x) * env->alignment_factor;
+            current_boid_reward -= fabsf(vis_vy_avg - current_boid->velocity.y) * env->alignment_factor;
+            current_boid_reward -= fabsf(vis_x_avg  - current_boid->x) * env->cohesion_factor;
+            current_boid_reward -= fabsf(vis_y_avg  - current_boid->y) * env->cohesion_factor;
         }
-        if (current_boid->y < TOP_MARGIN || current_boid->y > HEIGHT - BOTTOM_MARGIN) {
+        if (current_boid->y < TOP_MARGIN || current_boid->y + BOID_HEIGHT > HEIGHT - BOTTOM_MARGIN) {
             current_boid_reward -= env->margin_turn_factor;
-        } else {
-            current_boid_reward += env->margin_turn_factor;
         }
-        if (current_boid->x < LEFT_MARGIN || current_boid->x > WIDTH  - RIGHT_MARGIN) {
+        if (current_boid->x < LEFT_MARGIN || current_boid->x + BOID_WIDTH > WIDTH - RIGHT_MARGIN) {
             current_boid_reward -= env->margin_turn_factor;
-        } else {
-            current_boid_reward += env->margin_turn_factor;
         }
+
         // Normalization
-        // env->rewards[current_indx] = current_boid_reward / 15.0f;
-        // printf("current_boid_reward: %f\n", current_boid_reward);
-        env->rewards[current_indx] = current_boid_reward / 2.0f;
+        // env->rewards[current_indx] = current_boid_reward;
+        env->rewards[current_indx] = current_boid_reward / 6.0f;
+        // env->rewards[current_indx] = current_boid_reward / 205.0f;
+        // env->rewards[current_indx] = current_boid_reward / 10.0f;
 
         //log updates
         if (env->tick == env->report_interval) {
             env->log.score          += env->rewards[current_indx];
             env->log.n              += 1.0f;
-
-            /* clear per-boid log for next episode */
-            // env->boid_logs[boid_indx] = (Log){0};
             env->tick = 0;
         }
     }
-    //env->log.score /= env->num_boids;
 
     compute_observations(env);
 }
@@ -220,7 +239,6 @@ void c_close_client(Client* client) {
 
 void c_close(Boids* env) {
     free(env->boids);
-    free(env->boid_logs);
     if (env->client != NULL) {
         c_close_client(env->client);
     }