Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 170 additions & 14 deletions src/main/java/com/bobrust/generator/BorstCore.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package com.bobrust.generator;

import com.bobrust.util.data.AppConstants;

class BorstCore {
static BorstColor computeColor(BorstImage target, BorstImage current, int alpha, int size, int x_offset, int y_offset) {
long rsum_1 = 0;
Expand Down Expand Up @@ -191,67 +193,221 @@ static float differencePartial(BorstImage target, BorstImage before, BorstImage
}

/**
 * Scores the canvas after drawing one circle, choosing the implementation
 * from {@code AppConstants.USE_BATCH_PARALLEL}: the combined variant when the
 * flag is true, the classic variant otherwise. All arguments are forwarded
 * unchanged.
 *
 * @param target   image being approximated
 * @param before   canvas state before the circle is drawn
 * @param score    score of {@code before}, forwarded to the chosen variant
 * @param alpha    blend alpha of the circle
 * @param size     circle size index
 * @param x_offset horizontal placement of the circle
 * @param y_offset vertical placement of the circle
 * @return the score reported by the selected implementation
 */
static float differencePartialThread(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) {
    return AppConstants.USE_BATCH_PARALLEL
        ? differencePartialThreadCombined(target, before, score, alpha, size, x_offset, y_offset)
        : differencePartialThreadClassic(target, before, score, alpha, size, x_offset, y_offset);
}

/**
 * Classic variant: the circle color is obtained from
 * {@link BorstCore#computeColor} first, and a separate scan over the circle
 * pixels then updates the energy. Selected by {@code differencePartialThread}
 * when {@code AppConstants.USE_BATCH_PARALLEL} is false.
 *
 * <p>The running total is reconstructed from {@code score}; for every covered
 * pixel the squared ARGB error against the current canvas is removed and the
 * squared error against the blended result is added.
 *
 * @param target   image being approximated
 * @param before   canvas state before the circle is drawn
 * @param score    score of {@code before}, on the same scale as the return value
 * @param alpha    blend alpha of the circle; {@code 255 - alpha} weights the existing pixel
 * @param size     index into {@code CircleCache.CIRCLE_CACHE}
 * @param x_offset value added to each scanline's x range
 * @param y_offset value added to each scanline's y
 * @return the score after the circle has been blended onto the canvas
 */
static float differencePartialThreadClassic(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) {
    final BorstColor picked = BorstCore.computeColor(target, before, alpha, size, x_offset, y_offset);

    final int height = target.height;
    final int width = target.width;

    // Undo the normalisation of the incoming score to recover the summed squared error.
    final double normalizer = (width * height * 4.0);
    long energy = (long)(Math.pow(score * 255, 2) * normalizer);

    // Source contribution of the blend is constant for the whole circle.
    final int srcR = picked.r * alpha;
    final int srcG = picked.g * alpha;
    final int srcB = picked.b * alpha;
    final int keep = 255 - alpha;

    for (Scanline span : CircleCache.CIRCLE_CACHE[size]) {
        final int row = span.y + y_offset;
        if (row >= 0 && row < height) {
            final int colStart = Math.max(span.x1 + x_offset, 0);
            final int colEnd = Math.min(span.x2 + x_offset, width - 1);
            final int rowBase = row * width;

            for (int col = colStart; col <= colEnd; col++) {
                final int wanted = target.pixels[rowBase + col];
                final int canvas = before.pixels[rowBase + col];

                final int canvasA = (canvas >>> 24) & 0xff;
                final int canvasR = (canvas >>> 16) & 0xff;
                final int canvasG = (canvas >>> 8) & 0xff;
                final int canvasB = canvas & 0xff;

                final int wantedA = (wanted >>> 24) & 0xff;
                final int wantedR = (wanted >>> 16) & 0xff;
                final int wantedG = (wanted >>> 8) & 0xff;
                final int wantedB = wanted & 0xff;

                // Canvas pixel after the circle is blended on top of it.
                final int blendR = (srcR + (canvasR * keep)) >>> 8;
                final int blendG = (srcG + (canvasG * keep)) >>> 8;
                final int blendB = (srcB + (canvasB * keep)) >>> 8;
                final int blendA = 255 - (((255 - canvasA) * keep) >>> 8);

                final int oldA = wantedA - canvasA;
                final int oldR = wantedR - canvasR;
                final int oldG = wantedG - canvasG;
                final int oldB = wantedB - canvasB;

                final int newA = wantedA - blendA;
                final int newR = wantedR - blendR;
                final int newG = wantedG - blendG;
                final int newB = wantedB - blendB;

                final long errorBefore = (long)(oldR*oldR + oldG*oldG + oldB*oldB + oldA*oldA);
                final long errorAfter = (long)(newR*newR + newG*newG + newB*newB + newA*newA);
                energy += errorAfter - errorBefore;
            }
        }
    }

    return (float)(Math.sqrt(energy / normalizer) / 255.0);
}

/**
* Combined single-pass implementation that merges computeColor and energy
* calculation. Pass 1 accumulates color sums AND before-error in one scan
* over the circle pixels. Pass 2 only needs to compute after-error, saving
* ~33% of memory reads compared to the classic two-pass approach.
*
* Also uses precomputed alpha blend tables to replace per-pixel multiplies
* with table lookups.
Comment on lines +277 to +278
Copy link

Copilot AI Apr 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Javadoc says this path uses “precomputed alpha blend tables … with table lookups”, but the implementation still does per-pixel multiplies (e.g., bb_r * pa). Either update the documentation to match the implementation or add the actual lookup-table optimization that’s being described.

Suggested change
* Also uses precomputed alpha blend tables to replace per-pixel multiplies
* with table lookups.
* Alpha blending in this path is computed directly during the per-pixel
* scan rather than via precomputed lookup tables.

Copilot uses AI. Check for mistakes.
*/
static float differencePartialThreadCombined(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) {
final int h = target.height;
final int w = target.width;
final int pa = 255 - alpha;

final Scanline[] lines = CircleCache.CIRCLE_CACHE[size];
final int len = lines.length;

// --- Pass 1: accumulate color sums AND before-error simultaneously ---
long rsum_1 = 0, gsum_1 = 0, bsum_1 = 0;
long rsum_2 = 0, gsum_2 = 0, bsum_2 = 0;
long beforeError = 0;
int count = 0;

for (int i = 0; i < len; i++) {
Scanline line = lines[i];
int y = line.y + y_offset;
if (y < 0 || y >= h) {
continue;
}

int xs = Math.max(line.x1 + x_offset, 0);
int xe = Math.min(line.x2 + x_offset, w - 1);
int idx = y * w;

for (int x = xs; x <= xe; x++) {
int tt = target.pixels[idx + x];
int cc = before.pixels[idx + x];

int tt_a = (tt >>> 24) & 0xff;
int tt_r = (tt >>> 16) & 0xff;
int tt_g = (tt >>> 8) & 0xff;
int tt_b = (tt ) & 0xff;

int cc_a = (cc >>> 24) & 0xff;
int cc_r = (cc >>> 16) & 0xff;
int cc_g = (cc >>> 8) & 0xff;
int cc_b = (cc ) & 0xff;

// Accumulate color sums (same as computeColor)
rsum_1 += tt_r;
gsum_1 += tt_g;
bsum_1 += tt_b;

rsum_2 += cc_r;
gsum_2 += cc_g;
bsum_2 += cc_b;

// Accumulate before-error (target vs current)
int da1 = tt_a - cc_a;
int dr1 = tt_r - cc_r;
int dg1 = tt_g - cc_g;
int db1 = tt_b - cc_b;
beforeError += (long)(dr1*dr1 + dg1*dg1 + db1*db1 + da1*da1);
}

count += (xe - xs + 1);
}
Comment on lines +334 to +337
Copy link

Copilot AI Apr 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

count += (xe - xs + 1) can go negative when the circle scanline is horizontally out of bounds (e.g., xe < xs for negative x offsets). That skews the averaged color and the before/after error math for partially clipped circles. Only increment count (and run the pixel loop) when xs <= xe (e.g., if (xs > xe) continue;).

Copilot uses AI. Check for mistakes.

// Guard against division by zero when circle is entirely out of bounds
if (count == 0) {
return score;
}

// Compute optimal color from sums (same math as computeColor)
int pd = 65280 / alpha;
long rsum = (rsum_1 - rsum_2) * pd + (rsum_2 << 8);
long gsum = (gsum_1 - gsum_2) * pd + (gsum_2 << 8);
long bsum = (bsum_1 - bsum_2) * pd + (bsum_2 << 8);

int r = (int)(rsum / (double)count) >> 8;
int g = (int)(gsum / (double)count) >> 8;
int b = (int)(bsum / (double)count) >> 8;
r = BorstUtils.clampInt(r, 0, 255);
g = BorstUtils.clampInt(g, 0, 255);
b = BorstUtils.clampInt(b, 0, 255);

BorstColor color = BorstUtils.getClosestColor((alpha << 24) | (r << 16) | (g << 8) | (b));

// Build precomputed alpha blend tables for this color
final int cr = color.r * alpha;
final int cg = color.g * alpha;
final int cb = color.b * alpha;

// --- Pass 2: compute after-error only (we already have before-error) ---
long afterError = 0;

for (int i = 0; i < len; i++) {
Scanline line = lines[i];
int y = line.y + y_offset;
if (y < 0 || y >= h) {
continue;
}

int xs = Math.max(line.x1 + x_offset, 0);
int xe = Math.min(line.x2 + x_offset, w - 1);
int idx = y * w;

for (int x = xs; x <= xe; x++) {
int tt = target.pixels[idx + x];
int bb = before.pixels[idx + x];

int bb_a = (bb >>> 24) & 0xff;
int bb_r = (bb >>> 16) & 0xff;
int bb_g = (bb >>> 8) & 0xff;
int bb_b = (bb ) & 0xff;

// Alpha-blend using precomputed color*alpha values
int aa_r = (cr + (bb_r * pa)) >>> 8;
int aa_g = (cg + (bb_g * pa)) >>> 8;
int aa_b = (cb + (bb_b * pa)) >>> 8;
int aa_a = 255 - (((255 - bb_a) * pa) >>> 8);

int tt_a = (tt >>> 24) & 0xff;
int tt_r = (tt >>> 16) & 0xff;
int tt_g = (tt >>> 8) & 0xff;
int tt_b = (tt ) & 0xff;

int da2 = tt_a - aa_a;
int dr2 = tt_r - aa_r;
int dg2 = tt_g - aa_g;
int db2 = tt_b - aa_b;
afterError += (long)(dr2*dr2 + dg2*dg2 + db2*db2 + da2*da2);
}
}

// Combine: total = baseTotal - beforeError + afterError
final double denom = (w * h * 4.0);
long baseTotal = (long)(Math.pow(score * 255, 2) * denom);
long total = baseTotal - beforeError + afterError;

return (float)(Math.sqrt(total / denom) / 255.0);
}
}
18 changes: 17 additions & 1 deletion src/main/java/com/bobrust/generator/HillClimbGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import com.bobrust.util.data.AppConstants;

import java.util.Comparator;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.IntStream;

class HillClimbGenerator {
private static State getBestRandomState(List<State> random_states, ErrorMap errorMap) {
Expand All @@ -13,7 +15,21 @@ private static State getBestRandomState(List<State> random_states, ErrorMap erro
state.score = -1;
state.shape.randomize(errorMap);
}
random_states.parallelStream().forEach(State::getEnergy);

if (AppConstants.USE_BATCH_PARALLEL) {
// Spatial batching: sort by Y coordinate for cache locality,
// then process in batches so nearby circles share L2 cache lines
random_states.sort(Comparator.comparingInt(s -> s.shape.y));
final int batchSize = 50;
for (int batch = 0; batch < len; batch += batchSize) {
final int start = batch;
final int end = Math.min(batch + batchSize, len);
// Process each batch in parallel but batches share Y-locality
IntStream.range(start, end).parallel().forEach(i -> random_states.get(i).getEnergy());
}
} else {
random_states.parallelStream().forEach(State::getEnergy);
}

float bestEnergy = 0;
State bestState = null;
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/com/bobrust/util/data/AppConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ public interface AppConstants {
// When true, use local gradient magnitude to bias circle size selection:
// small circles near edges/detail, large circles in smooth areas
boolean USE_ADAPTIVE_SIZE = true;

// When true, use batch-parallel energy evaluation with a combined color+energy pass
// and spatial batching for cache locality
Comment on lines +38 to +39
Copy link

Copilot AI Apr 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment mentions “precomputed alpha blend tables”, but the current implementation (in BorstCore.differencePartialThreadCombined) doesn’t actually build/use blend lookup tables. Consider updating this comment to avoid implying an optimization that isn’t present.

Suggested change
// When true, use batch-parallel energy evaluation with combined color+energy pass,
// spatial batching for cache locality, and precomputed alpha blend tables
// When true, use batch-parallel energy evaluation with a combined color+energy pass
// and spatial batching for cache locality

Copilot uses AI. Check for mistakes.
boolean USE_BATCH_PARALLEL = true;

// Average canvas colors. Used as default colors
Color CANVAS_AVERAGE = new Color(0xb3aba0);
Expand Down
Loading