VoX · VoX · Apr 4, 2026 · Apr 4, 2026 · Copilot · Apr 4, 2026
diff --git a/src/main/java/com/bobrust/generator/BorstCore.java b/src/main/java/com/bobrust/generator/BorstCore.java
@@ -1,5 +1,7 @@
 package com.bobrust.generator;
 
+import com.bobrust.util.data.AppConstants;
+
 class BorstCore {
 	static BorstColor computeColor(BorstImage target, BorstImage current, int alpha, int size, int x_offset, int y_offset) {
 		long rsum_1 = 0;
@@ -191,67 +193,221 @@ static float differencePartial(BorstImage target, BorstImage before, BorstImage
 	}
 
 	static float differencePartialThread(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) {
+		// Dispatch on the AppConstants.USE_BATCH_PARALLEL flag: the combined
+		// implementation when it is set, the classic one otherwise. Both
+		// receive the arguments unchanged and return the updated score.
+		if (AppConstants.USE_BATCH_PARALLEL) {
+			return differencePartialThreadCombined(target, before, score, alpha, size, x_offset, y_offset);
+		}
+		return differencePartialThreadClassic(target, before, score, alpha, size, x_offset, y_offset);
+	}
+
+	/**
+	 * Classic two-pass implementation: computeColor then energy calculation.
+	 * Used as fallback when USE_BATCH_PARALLEL is false.
+	 *
+	 * The first pass is the call to computeColor. The second pass walks the
+	 * cached circle scanlines for {@code size}, clipped to the image bounds,
+	 * and for every covered pixel swaps its current squared error against
+	 * {@code target} for the error it would have after blending the chosen
+	 * color in.
+	 *
+	 * @param target   image being approximated; its width and height define the bounds
+	 * @param before   current image, read at the same pixel indices as target
+	 * @param score    current score, converted back to a summed squared error as the baseline
+	 * @param alpha    blend weight of the new color (the existing pixel is weighted 255 - alpha)
+	 * @param size     index into CircleCache.CIRCLE_CACHE selecting the scanline set
+	 * @param x_offset horizontal offset added to each scanline's x1/x2
+	 * @param y_offset vertical offset added to each scanline's y
+	 * @return the updated score, sqrt(total / (w * h * 4)) / 255
+	 */
+	static float differencePartialThreadClassic(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) {
 		BorstColor color = BorstCore.computeColor(target, before, alpha, size, x_offset, y_offset);
-		
+
 		final int h = target.height;
 		final int w = target.width;
-		
+
+		// Invert the score formula used at the return statement to recover the
+		// summed squared error that the score represents.
 		final double denom = (w * h * 4.0);
 		long total = (long)(Math.pow(score * 255, 2) * denom);
-		
+
+		// Color channels premultiplied by alpha, hoisted out of the pixel loop.
 		final int cr = color.r * alpha;
 		final int cg = color.g * alpha;
 		final int cb = color.b * alpha;
 		final int pa = 255 - alpha;
-		
+
 		final Scanline[] lines = CircleCache.CIRCLE_CACHE[size];
 		final int len = lines.length;
-		
+
 		for (int i = 0; i < len; i++) {
 			Scanline line = lines[i];
 			int y = line.y + y_offset;
+			// Skip scanlines that fall above or below the image.
 			if (y < 0 || y >= h) {
 				continue;
 			}
-			
+
+			// Clip the scanline horizontally to [0, w - 1].
 			int xs = Math.max(line.x1 + x_offset, 0);
 			int xe = Math.min(line.x2 + x_offset, w - 1);
 			int idx = y * w;
-			
+
 			for (int x = xs; x <= xe; x++) {
 				int tt = target.pixels[idx + x];
 				int bb = before.pixels[idx + x];
-				
+
+				// Unpack the current pixel: alpha in bits 24-31, then r, g, b.
 				int bb_a = (bb >>> 24) & 0xff;
 				int bb_r = (bb >>> 16) & 0xff;
 				int bb_g = (bb >>>  8) & 0xff;
 				int bb_b = (bb       ) & 0xff;
-				
+
+				// Pixel value after blending the new color over the current one.
 				int aa_r = (cr + (bb_r * pa)) >>> 8;
 				int aa_g = (cg + (bb_g * pa)) >>> 8;
 				int aa_b = (cb + (bb_b * pa)) >>> 8;
 				int aa_a = 255 - (((255 - bb_a) * pa) >>> 8);
-				
+
 				int tt_a = (tt >>> 24) & 0xff;
 				int tt_r = (tt >>> 16) & 0xff;
 				int tt_g = (tt >>>  8) & 0xff;
 				int tt_b = (tt       ) & 0xff;
-				
+
+				// Per-channel error of the current pixel against the target.
 				int da1 = tt_a - bb_a;
 				int dr1 = tt_r - bb_r;
 				int dg1 = tt_g - bb_g;
 				int db1 = tt_b - bb_b;
-				
+
+				// Per-channel error of the blended pixel against the target.
 				int da2 = tt_a - aa_a;
 				int dr2 = tt_r - aa_r;
 				int dg2 = tt_g - aa_g;
 				int db2 = tt_b - aa_b;
-				
+
+				// Remove this pixel's old contribution and add its new one.
 				total -= (long)(dr1*dr1 + dg1*dg1 + db1*db1 + da1*da1);
 				total += (long)(dr2*dr2 + dg2*dg2 + db2*db2 + da2*da2);
 			}
 		}
-
+
+		return (float)(Math.sqrt(total / denom) / 255.0);
+	}
+
+	/**
+	 * Combined implementation that folds computeColor into the energy
+	 * calculation. Pass 1 accumulates color sums AND before-error in one scan
+	 * over the circle pixels. Pass 2 then computes only the after-error for
+	 * the color chosen from those sums.
+	 *
+	 * NOTE(review): pass 2 still reads both target and before for every
+	 * pixel, so a reduction in memory reads relative to the classic path is
+	 * not evident from this code; the previously stated "~33%" saving should
+	 * be confirmed by measurement.
+	 *
+	 * Alpha blending in this path is computed directly during the per-pixel
+	 * scan rather than via precomputed lookup tables.
+	 *
+	 * @param target   image being approximated; its width and height define the bounds
+	 * @param before   current image, read at the same pixel indices as target
+	 * @param score    current score, converted back to a summed squared error as the baseline
+	 * @param alpha    blend weight of the new color (the existing pixel is weighted 255 - alpha)
+	 * @param size     index into CircleCache.CIRCLE_CACHE selecting the scanline set
+	 * @param x_offset horizontal offset added to each scanline's x1/x2
+	 * @param y_offset vertical offset added to each scanline's y
+	 * @return {@code score} unchanged when the pixel count is zero, otherwise
+	 *         sqrt(total / (w * h * 4)) / 255
+	 */
+	static float differencePartialThreadCombined(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) {
+		final int h = target.height;
+		final int w = target.width;
+		final int pa = 255 - alpha;
+
+		final Scanline[] lines = CircleCache.CIRCLE_CACHE[size];
+		final int len = lines.length;
+
+		// --- Pass 1: accumulate color sums AND before-error simultaneously ---
+		long rsum_1 = 0, gsum_1 = 0, bsum_1 = 0;
+		long rsum_2 = 0, gsum_2 = 0, bsum_2 = 0;
+		long beforeError = 0;
+		int count = 0;
+
+		for (int i = 0; i < len; i++) {
+			Scanline line = lines[i];
+			int y = line.y + y_offset;
+			if (y < 0 || y >= h) {
+				continue;
+			}
+
+			int xs = Math.max(line.x1 + x_offset, 0);
+			int xe = Math.min(line.x2 + x_offset, w - 1);
+			int idx = y * w;
+
+			for (int x = xs; x <= xe; x++) {
+				int tt = target.pixels[idx + x];
+				int cc = before.pixels[idx + x];
+
+				int tt_a = (tt >>> 24) & 0xff;
+				int tt_r = (tt >>> 16) & 0xff;
+				int tt_g = (tt >>>  8) & 0xff;
+				int tt_b = (tt       ) & 0xff;
+
+				int cc_a = (cc >>> 24) & 0xff;
+				int cc_r = (cc >>> 16) & 0xff;
+				int cc_g = (cc >>>  8) & 0xff;
+				int cc_b = (cc       ) & 0xff;
+
+				// Accumulate color sums (same as computeColor)
+				rsum_1 += tt_r;
+				gsum_1 += tt_g;
+				bsum_1 += tt_b;
+
+				rsum_2 += cc_r;
+				gsum_2 += cc_g;
+				bsum_2 += cc_b;
+
+				// Accumulate before-error (target vs current)
+				int da1 = tt_a - cc_a;
+				int dr1 = tt_r - cc_r;
+				int dg1 = tt_g - cc_g;
+				int db1 = tt_b - cc_b;
+				beforeError += (long)(dr1*dr1 + dg1*dg1 + db1*db1 + da1*da1);
+			}
+
+			// NOTE(review): when a scanline lies entirely left or right of the
+			// image, xe < xs and this term is zero or negative although the
+			// loop above visited no pixels. Presumably callers keep the circle
+			// centre inside the image so this cannot occur - confirm.
+			count += (xe - xs + 1);
+		}
+
+		// No pixels were counted (e.g. the circle is entirely out of bounds):
+		// leave the score unchanged instead of dividing by a zero count below.
+		if (count == 0) {
+			return score;
+		}
+
+		// Compute optimal color from sums (same math as computeColor)
+		// NOTE(review): integer division - throws ArithmeticException if alpha
+		// is 0; presumably callers only pass non-zero alpha values - confirm.
+		int pd = 65280 / alpha;
+		long rsum = (rsum_1 - rsum_2) * pd + (rsum_2 << 8);
+		long gsum = (gsum_1 - gsum_2) * pd + (gsum_2 << 8);
+		long bsum = (bsum_1 - bsum_2) * pd + (bsum_2 << 8);
+
+		int r = (int)(rsum / (double)count) >> 8;
+		int g = (int)(gsum / (double)count) >> 8;
+		int b = (int)(bsum / (double)count) >> 8;
+		r = BorstUtils.clampInt(r, 0, 255);
+		g = BorstUtils.clampInt(g, 0, 255);
+		b = BorstUtils.clampInt(b, 0, 255);
+
+		BorstColor color = BorstUtils.getClosestColor((alpha << 24) | (r << 16) | (g << 8) | (b));
+
+		// Premultiply the chosen color by alpha once, outside the pixel loop
+		final int cr = color.r * alpha;
+		final int cg = color.g * alpha;
+		final int cb = color.b * alpha;
+
+		// --- Pass 2: compute after-error only (we already have before-error) ---
+		long afterError = 0;
+
+		for (int i = 0; i < len; i++) {
+			Scanline line = lines[i];
+			int y = line.y + y_offset;
+			if (y < 0 || y >= h) {
+				continue;
+			}
+
+			int xs = Math.max(line.x1 + x_offset, 0);
+			int xe = Math.min(line.x2 + x_offset, w - 1);
+			int idx = y * w;
+
+			for (int x = xs; x <= xe; x++) {
+				int tt = target.pixels[idx + x];
+				int bb = before.pixels[idx + x];
+
+				int bb_a = (bb >>> 24) & 0xff;
+				int bb_r = (bb >>> 16) & 0xff;
+				int bb_g = (bb >>>  8) & 0xff;
+				int bb_b = (bb       ) & 0xff;
+
+				// Alpha-blend using precomputed color*alpha values
+				int aa_r = (cr + (bb_r * pa)) >>> 8;
+				int aa_g = (cg + (bb_g * pa)) >>> 8;
+				int aa_b = (cb + (bb_b * pa)) >>> 8;
+				int aa_a = 255 - (((255 - bb_a) * pa) >>> 8);
+
+				int tt_a = (tt >>> 24) & 0xff;
+				int tt_r = (tt >>> 16) & 0xff;
+				int tt_g = (tt >>>  8) & 0xff;
+				int tt_b = (tt       ) & 0xff;
+
+				int da2 = tt_a - aa_a;
+				int dr2 = tt_r - aa_r;
+				int dg2 = tt_g - aa_g;
+				int db2 = tt_b - aa_b;
+				afterError += (long)(dr2*dr2 + dg2*dg2 + db2*db2 + da2*da2);
+			}
+		}
+
+		// Combine: total = baseTotal - beforeError + afterError
+		// baseTotal inverts the score formula used at the return statement.
+		// NOTE(review): baseTotal is rebuilt from a float score, so total could
+		// come out slightly negative, in which case Math.sqrt returns NaN -
+		// confirm callers tolerate a NaN score.
+		final double denom = (w * h * 4.0);
+		long baseTotal = (long)(Math.pow(score * 255, 2) * denom);
+		long total = baseTotal - beforeError + afterError;
+
 		return (float)(Math.sqrt(total / denom) / 255.0);
 	}
 }
diff --git a/src/main/java/com/bobrust/generator/HillClimbGenerator.java b/src/main/java/com/bobrust/generator/HillClimbGenerator.java
@@ -2,8 +2,10 @@
 
 import com.bobrust.util.data.AppConstants;
 
+import java.util.Comparator;
 import java.util.List;
 import java.util.concurrent.ThreadLocalRandom;
+import java.util.stream.IntStream;
 
 class HillClimbGenerator {
 	private static State getBestRandomState(List<State> random_states, ErrorMap errorMap) {
@@ -13,7 +15,21 @@ private static State getBestRandomState(List<State> random_states, ErrorMap erro
 			state.score = -1;
 			state.shape.randomize(errorMap);
 		}
-		random_states.parallelStream().forEach(State::getEnergy);
+
+		if (AppConstants.USE_BATCH_PARALLEL) {
+			// Spatial batching: sort by Y coordinate for cache locality,
+			// then process in batches so nearby circles share L2 cache lines
+			random_states.sort(Comparator.comparingInt(s -> s.shape.y));
+			final int batchSize = 50;
+			for (int batch = 0; batch < len; batch += batchSize) {
+				final int start = batch;
+				final int end = Math.min(batch + batchSize, len);
+				// Process each batch in parallel but batches share Y-locality
+				IntStream.range(start, end).parallel().forEach(i -> random_states.get(i).getEnergy());
+			}
+		} else {
+			random_states.parallelStream().forEach(State::getEnergy);
+		}
 
 		float bestEnergy = 0;
 		State bestState = null;

diff --git a/src/main/java/com/bobrust/util/data/AppConstants.java b/src/main/java/com/bobrust/util/data/AppConstants.java
@@ -34,6 +34,10 @@ public interface AppConstants {
 	// When true, use local gradient magnitude to bias circle size selection:
 	// small circles near edges/detail, large circles in smooth areas
 	boolean USE_ADAPTIVE_SIZE = true;
+
+	// When true, use batch-parallel energy evaluation with a combined color+energy pass
+	// and spatial batching for cache locality
+	boolean USE_BATCH_PARALLEL = true;
 
 	// Average canvas colors. Used as default colors
 	Color CANVAS_AVERAGE = new Color(0xb3aba0);