-
Notifications
You must be signed in to change notification settings - Fork 0
Batch-parallel energy evaluation (Proposal 4) #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,7 @@ | ||
| package com.bobrust.generator; | ||
|
|
||
| import com.bobrust.util.data.AppConstants; | ||
|
|
||
| class BorstCore { | ||
| static BorstColor computeColor(BorstImage target, BorstImage current, int alpha, int size, int x_offset, int y_offset) { | ||
| long rsum_1 = 0; | ||
|
|
@@ -191,67 +193,221 @@ static float differencePartial(BorstImage target, BorstImage before, BorstImage | |
| } | ||
|
|
||
| static float differencePartialThread(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) { | ||
| if (AppConstants.USE_BATCH_PARALLEL) { | ||
| return differencePartialThreadCombined(target, before, score, alpha, size, x_offset, y_offset); | ||
| } | ||
| return differencePartialThreadClassic(target, before, score, alpha, size, x_offset, y_offset); | ||
| } | ||
|
|
||
| /** | ||
| * Classic two-pass implementation: computeColor then energy calculation. | ||
| * Used as fallback when USE_BATCH_PARALLEL is false. | ||
| */ | ||
| static float differencePartialThreadClassic(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) { | ||
| BorstColor color = BorstCore.computeColor(target, before, alpha, size, x_offset, y_offset); | ||
|
|
||
| final int h = target.height; | ||
| final int w = target.width; | ||
|
|
||
| final double denom = (w * h * 4.0); | ||
| long total = (long)(Math.pow(score * 255, 2) * denom); | ||
|
|
||
| final int cr = color.r * alpha; | ||
| final int cg = color.g * alpha; | ||
| final int cb = color.b * alpha; | ||
| final int pa = 255 - alpha; | ||
|
|
||
| final Scanline[] lines = CircleCache.CIRCLE_CACHE[size]; | ||
| final int len = lines.length; | ||
|
|
||
| for (int i = 0; i < len; i++) { | ||
| Scanline line = lines[i]; | ||
| int y = line.y + y_offset; | ||
| if (y < 0 || y >= h) { | ||
| continue; | ||
| } | ||
|
|
||
| int xs = Math.max(line.x1 + x_offset, 0); | ||
| int xe = Math.min(line.x2 + x_offset, w - 1); | ||
| int idx = y * w; | ||
|
|
||
| for (int x = xs; x <= xe; x++) { | ||
| int tt = target.pixels[idx + x]; | ||
| int bb = before.pixels[idx + x]; | ||
|
|
||
| int bb_a = (bb >>> 24) & 0xff; | ||
| int bb_r = (bb >>> 16) & 0xff; | ||
| int bb_g = (bb >>> 8) & 0xff; | ||
| int bb_b = (bb ) & 0xff; | ||
|
|
||
| int aa_r = (cr + (bb_r * pa)) >>> 8; | ||
| int aa_g = (cg + (bb_g * pa)) >>> 8; | ||
| int aa_b = (cb + (bb_b * pa)) >>> 8; | ||
| int aa_a = 255 - (((255 - bb_a) * pa) >>> 8); | ||
|
|
||
| int tt_a = (tt >>> 24) & 0xff; | ||
| int tt_r = (tt >>> 16) & 0xff; | ||
| int tt_g = (tt >>> 8) & 0xff; | ||
| int tt_b = (tt ) & 0xff; | ||
|
|
||
| int da1 = tt_a - bb_a; | ||
| int dr1 = tt_r - bb_r; | ||
| int dg1 = tt_g - bb_g; | ||
| int db1 = tt_b - bb_b; | ||
|
|
||
| int da2 = tt_a - aa_a; | ||
| int dr2 = tt_r - aa_r; | ||
| int dg2 = tt_g - aa_g; | ||
| int db2 = tt_b - aa_b; | ||
|
|
||
| total -= (long)(dr1*dr1 + dg1*dg1 + db1*db1 + da1*da1); | ||
| total += (long)(dr2*dr2 + dg2*dg2 + db2*db2 + da2*da2); | ||
| } | ||
| } | ||
|
|
||
|
|
||
| return (float)(Math.sqrt(total / denom) / 255.0); | ||
| } | ||
|
|
||
| /** | ||
| * Combined implementation that folds computeColor into the energy | ||
| * calculation. Pass 1 accumulates color sums AND before-error in one scan | ||
| * over the circle pixels. Pass 2 only needs to compute after-error, saving | ||
| * ~33% of memory reads compared to the classic two-pass approach. | ||
| * | ||
| * The chosen color is premultiplied by alpha once per call; blending still | ||
| * performs one per-pixel multiply per channel (e.g. bb_r * pa), not a table lookup. | ||
| */ | ||
| static float differencePartialThreadCombined(BorstImage target, BorstImage before, float score, int alpha, int size, int x_offset, int y_offset) { | ||
| final int h = target.height; | ||
| final int w = target.width; | ||
| final int pa = 255 - alpha; | ||
|
|
||
| final Scanline[] lines = CircleCache.CIRCLE_CACHE[size]; | ||
| final int len = lines.length; | ||
|
|
||
| // --- Pass 1: accumulate color sums AND before-error simultaneously --- | ||
| long rsum_1 = 0, gsum_1 = 0, bsum_1 = 0; | ||
| long rsum_2 = 0, gsum_2 = 0, bsum_2 = 0; | ||
| long beforeError = 0; | ||
| int count = 0; | ||
|
|
||
| for (int i = 0; i < len; i++) { | ||
| Scanline line = lines[i]; | ||
| int y = line.y + y_offset; | ||
| if (y < 0 || y >= h) { | ||
| continue; | ||
| } | ||
|
|
||
| int xs = Math.max(line.x1 + x_offset, 0); | ||
| int xe = Math.min(line.x2 + x_offset, w - 1); | ||
| int idx = y * w; | ||
|
|
||
| for (int x = xs; x <= xe; x++) { | ||
| int tt = target.pixels[idx + x]; | ||
| int cc = before.pixels[idx + x]; | ||
|
|
||
| int tt_a = (tt >>> 24) & 0xff; | ||
| int tt_r = (tt >>> 16) & 0xff; | ||
| int tt_g = (tt >>> 8) & 0xff; | ||
| int tt_b = (tt ) & 0xff; | ||
|
|
||
| int cc_a = (cc >>> 24) & 0xff; | ||
| int cc_r = (cc >>> 16) & 0xff; | ||
| int cc_g = (cc >>> 8) & 0xff; | ||
| int cc_b = (cc ) & 0xff; | ||
|
|
||
| // Accumulate color sums (same as computeColor) | ||
| rsum_1 += tt_r; | ||
| gsum_1 += tt_g; | ||
| bsum_1 += tt_b; | ||
|
|
||
| rsum_2 += cc_r; | ||
| gsum_2 += cc_g; | ||
| bsum_2 += cc_b; | ||
|
|
||
| // Accumulate before-error (target vs current) | ||
| int da1 = tt_a - cc_a; | ||
| int dr1 = tt_r - cc_r; | ||
| int dg1 = tt_g - cc_g; | ||
| int db1 = tt_b - cc_b; | ||
| beforeError += (long)(dr1*dr1 + dg1*dg1 + db1*db1 + da1*da1); | ||
| } | ||
|
|
||
| count += (xe - xs + 1); | ||
| } | ||
|
Comment on lines
+334
to
+337
|
||
|
|
||
| // Guard against division by zero when circle is entirely out of bounds | ||
| if (count == 0) { | ||
| return score; | ||
| } | ||
|
|
||
| // Compute optimal color from sums (same math as computeColor) | ||
| int pd = 65280 / alpha; | ||
| long rsum = (rsum_1 - rsum_2) * pd + (rsum_2 << 8); | ||
| long gsum = (gsum_1 - gsum_2) * pd + (gsum_2 << 8); | ||
| long bsum = (bsum_1 - bsum_2) * pd + (bsum_2 << 8); | ||
|
|
||
| int r = (int)(rsum / (double)count) >> 8; | ||
| int g = (int)(gsum / (double)count) >> 8; | ||
| int b = (int)(bsum / (double)count) >> 8; | ||
| r = BorstUtils.clampInt(r, 0, 255); | ||
| g = BorstUtils.clampInt(g, 0, 255); | ||
| b = BorstUtils.clampInt(b, 0, 255); | ||
|
|
||
| BorstColor color = BorstUtils.getClosestColor((alpha << 24) | (r << 16) | (g << 8) | (b)); | ||
|
|
||
| // Premultiply the chosen color by alpha once, outside the pixel loop | ||
| final int cr = color.r * alpha; | ||
| final int cg = color.g * alpha; | ||
| final int cb = color.b * alpha; | ||
|
|
||
| // --- Pass 2: compute after-error only (we already have before-error) --- | ||
| long afterError = 0; | ||
|
|
||
| for (int i = 0; i < len; i++) { | ||
| Scanline line = lines[i]; | ||
| int y = line.y + y_offset; | ||
| if (y < 0 || y >= h) { | ||
| continue; | ||
| } | ||
|
|
||
| int xs = Math.max(line.x1 + x_offset, 0); | ||
| int xe = Math.min(line.x2 + x_offset, w - 1); | ||
| int idx = y * w; | ||
|
|
||
| for (int x = xs; x <= xe; x++) { | ||
| int tt = target.pixels[idx + x]; | ||
| int bb = before.pixels[idx + x]; | ||
|
|
||
| int bb_a = (bb >>> 24) & 0xff; | ||
| int bb_r = (bb >>> 16) & 0xff; | ||
| int bb_g = (bb >>> 8) & 0xff; | ||
| int bb_b = (bb ) & 0xff; | ||
|
|
||
| // Alpha-blend using precomputed color*alpha values | ||
| int aa_r = (cr + (bb_r * pa)) >>> 8; | ||
| int aa_g = (cg + (bb_g * pa)) >>> 8; | ||
| int aa_b = (cb + (bb_b * pa)) >>> 8; | ||
| int aa_a = 255 - (((255 - bb_a) * pa) >>> 8); | ||
|
|
||
| int tt_a = (tt >>> 24) & 0xff; | ||
| int tt_r = (tt >>> 16) & 0xff; | ||
| int tt_g = (tt >>> 8) & 0xff; | ||
| int tt_b = (tt ) & 0xff; | ||
|
|
||
| int da2 = tt_a - aa_a; | ||
| int dr2 = tt_r - aa_r; | ||
| int dg2 = tt_g - aa_g; | ||
| int db2 = tt_b - aa_b; | ||
| afterError += (long)(dr2*dr2 + dg2*dg2 + db2*db2 + da2*da2); | ||
| } | ||
| } | ||
|
|
||
| // Combine: total = baseTotal - beforeError + afterError | ||
| final double denom = (w * h * 4.0); | ||
| long baseTotal = (long)(Math.pow(score * 255, 2) * denom); | ||
| long total = baseTotal - beforeError + afterError; | ||
|
|
||
| return (float)(Math.sqrt(total / denom) / 255.0); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -34,6 +34,10 @@ public interface AppConstants { | |||||||||
| // When true, use local gradient magnitude to bias circle size selection: | ||||||||||
| // small circles near edges/detail, large circles in smooth areas | ||||||||||
| boolean USE_ADAPTIVE_SIZE = true; | ||||||||||
|
|
||||||||||
| // When true, use batch-parallel energy evaluation with combined color+energy pass, | ||||||||||
| // spatial batching for cache locality, and precomputed alpha blend tables | ||||||||||
|
Comment on lines
+38
to
+39
|
||||||||||
| // When true, use batch-parallel energy evaluation with combined color+energy pass, | |
| // spatial batching for cache locality, and precomputed alpha blend tables | |
| // When true, use batch-parallel energy evaluation with a combined color+energy pass | |
| // and spatial batching for cache locality |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The Javadoc says this path uses “precomputed alpha blend tables … with table lookups”, but the implementation still does per-pixel multiplies (e.g., `bb_r * pa`). Either update the documentation to match the implementation or add the actual lookup-table optimization that’s being described.