Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Subscribe via RSS: <https://www.freightutils.com/changelog.xml>
## 2026-05-14

- **Performance**: `/hs/code/*` and `/hs/heading/*` now served from Vercel edge cache with `Cache-Control: public, max-age=300, s-maxage=86400, stale-while-revalidate=604800`. Cold-serve response unchanged; warm-cache response served from the edge without re-rendering the page. ISR (`export const revalidate = 86400`) was already in place on both routes — this change makes the cache strategy explicit and tunable in `next.config.ts`, and surfaces the `Cache-Control` header in the response so cache hits are observable via `curl -I`. Sourced by the 2026-05-14 scraper-signature audit (`docs/audit/scraper-signature-2026-05-14.md`) which confirmed an active 216.* scraper hitting these paths at sustained ~5-second intervals. The application-layer ScrapeGuard rate limiter still runs on every request and continues to 429 the scraper as designed; Phase 1.6 will measure whether edge-cache adoption reduces overall Redis-INCR volume enough to skip Phase 2 (static generation).
- **Internal**: ScrapeGuard now logs sanitised User-Agent + full source IP on block decisions (429s only — never on the success / cache-hit path). Supports evidence-based firewall rule additions; preserves existing structured 429 body and headers. UA sanitisation strips control characters (log-injection guard), replaces internal quotes with apostrophes, truncates to 200 chars, and falls back to `ua=empty` for null/whitespace UAs. Log line format converted to space-separated `key=value` pairs for grep/awk parsing. IP resolution unchanged (`x-real-ip` first, per existing Vercel-trust comment).

## 2026-05-13

Expand Down
5 changes: 5 additions & 0 deletions lib/changelog-data.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ export interface ChangelogEntry {
}

export const entries: ChangelogEntry[] = [
{
isoDate: '2026-05-14', date: 'May 14', tag: 'Security',
title: 'ScrapeGuard logs sanitised UA + full source IP on block decisions',
desc: 'Middleware now emits the User-Agent and full client IP on every 429 — never on the success/cache-hit path. Supports evidence-based firewall rule additions. UA is sanitised (control chars stripped to prevent log injection, internal quotes replaced with apostrophes, truncated to 200 chars, falls back to `ua=empty` for null/whitespace). Log line format converted to space-separated `key=value` pairs for grep/awk parsing. IP resolution unchanged.',
},
{
isoDate: '2026-05-14', date: 'May 14', tag: 'Bug Fix',
title: 'Edge cache on /hs/code/* and /hs/heading/*',
Expand Down
21 changes: 19 additions & 2 deletions middleware.ts
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,19 @@ function getClientIp(req: NextRequest): string {
?? 'unknown';
}

/**
 * Produces a log-safe rendering of the request's User-Agent header. Used
 * only when logging block decisions.
 *
 * - ASCII control characters (0x00–0x1f plus DEL, which covers \r, \n and
 *   \t) are removed so a hostile UA cannot split or forge log lines.
 * - Double quotes become apostrophes so a quoted ua="..." field built from
 *   the result stays parseable by grep/awk.
 * - The result is capped at 200 characters.
 *
 * Returns the literal string 'empty' when the header is missing, blank, or
 * blank once cleaned, so the log field never degrades to `ua=` or `ua=null`.
 */
function getSanitisedUa(req: NextRequest): string {
  const placeholder = 'empty';
  const onlyWhitespace = /^\s*$/;

  const header = req.headers.get('user-agent');
  if (!header || onlyWhitespace.test(header)) {
    return placeholder;
  }

  // Strip control characters first, then neutralise quotes.
  const withoutControls = header.replace(/[\x00-\x1f\x7f]/g, '');
  const quoteSafe = withoutControls.replace(/"/g, "'");

  // A UA made purely of control characters / whitespace ends up blank here.
  if (quoteSafe.length === 0 || onlyWhitespace.test(quoteSafe)) {
    return placeholder;
  }

  return quoteSafe.substring(0, 200);
}

// ─────────────────────────────────────────────────────────────────
// Structured 429 — single source of truth for the rate-limit body
// shape and headers. Every 429 emitted by this middleware goes
Expand Down Expand Up @@ -301,8 +314,10 @@ async function tryBulkRefScrape(req: NextRequest): Promise<NextResponse | null>
try {
const { success, remaining, reset } = await rl.limit(ip);
if (!success) {
const ua = getSanitisedUa(req);
const uaField = ua === 'empty' ? 'ua=empty' : `ua="${ua}"`;
console.warn(
`[ScrapeGuard] 429 — IP: ${ip}, path: ${req.nextUrl.pathname}, group: bulkref, limit: 10, resets: ${new Date(reset).toISOString()}`,
`[ScrapeGuard] 429 path=${req.nextUrl.pathname} ip=${ip} ${uaField} group=bulkref limit=10 resets=${new Date(reset).toISOString()}`,
);
return buildConversionRateLimitResponse(req, {
retryAfterSeconds: 300,
Expand Down Expand Up @@ -339,8 +354,10 @@ async function handleScrapeProtection(req: NextRequest): Promise<NextResponse> {
const { success, remaining, reset } = await rl.limit(ip);

if (!success) {
const ua = getSanitisedUa(req);
const uaField = ua === 'empty' ? 'ua=empty' : `ua="${ua}"`;
console.warn(
`[ScrapeGuard] 429 — IP: ${ip}, path: ${pathname}, group: ${group}, limit: ${limit}, resets: ${new Date(reset).toISOString()}`
`[ScrapeGuard] 429 path=${pathname} ip=${ip} ${uaField} group=${group} limit=${limit} resets=${new Date(reset).toISOString()}`
);
return buildConversionRateLimitResponse(req, {
retryAfterSeconds: retryAfter,
Expand Down
111 changes: 111 additions & 0 deletions scripts/test-scrapeguard-ua-sanitiser.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/**
* Standalone smoke test for the middleware ScrapeGuard UA sanitiser.
*
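* Carries a hand-copied duplicate of getSanitisedUa() from middleware.ts,
* adapted to take the raw header value instead of a request. The logic is
* a pure function, so it can be tested outside the Edge runtime without a
* live Upstash connection. Keep the copy below in sync with middleware.ts.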
*
* Run: node scripts/test-scrapeguard-ua-sanitiser.mjs
*/

/**
 * Sanitises a raw User-Agent header value for logging.
 *
 * Removes ASCII control characters (0x00–0x1f and DEL), swaps double quotes
 * for apostrophes and caps the result at 200 characters. Missing, blank, or
 * blank-after-cleaning values all yield the literal string 'empty'.
 *
 * @param headerValue raw header value; may be null or undefined
 * @returns the sanitised value, or 'empty'
 */
function getSanitisedUa(headerValue) {
  const placeholder = 'empty';
  const onlyWhitespace = /^\s*$/;

  if (!headerValue || onlyWhitespace.test(headerValue)) {
    return placeholder;
  }

  const quoteSafe = headerValue
    .replace(/[\x00-\x1f\x7f]/g, '')
    .replace(/"/g, "'");

  const blankAfterCleaning = !quoteSafe || onlyWhitespace.test(quoteSafe);
  return blankAfterCleaning ? placeholder : quoteSafe.slice(0, 200);
}

// Sanitiser cases. Each entry names the scenario, gives the raw header value
// passed to getSanitisedUa(), and the exact string it must return.
const tests = [
  { name: 'null UA → empty', input: null, expected: 'empty' },
  { name: 'undefined UA → empty', input: undefined, expected: 'empty' },
  { name: 'empty string → empty', input: '', expected: 'empty' },
  { name: 'whitespace only → empty', input: ' \t ', expected: 'empty' },
  { name: 'plain curl UA', input: 'curl/8.0.0', expected: 'curl/8.0.0' },
  { name: 'plain python UA', input: 'python-requests/2.31.0', expected: 'python-requests/2.31.0' },
  {
    name: 'mozilla long UA preserved',
    input: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    expected: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
  },
  { name: 'CR injection stripped', input: 'curl/8.0\r\n[ScrapeGuard] fake', expected: 'curl/8.0[ScrapeGuard] fake' },
  { name: 'LF injection stripped', input: 'curl/8.0\n[ScrapeGuard] fake', expected: 'curl/8.0[ScrapeGuard] fake' },
  { name: 'tab stripped', input: 'curl/8.0\tbar', expected: 'curl/8.0bar' },
  { name: 'NUL byte stripped', input: 'curl/8.0\x00bar', expected: 'curl/8.0bar' },
  { name: 'DEL stripped', input: 'curl/8.0\x7fbar', expected: 'curl/8.0bar' },
  { name: 'internal quote escaped to apostrophe', input: 'Browser "Foo" 1.0', expected: "Browser 'Foo' 1.0" },
  { name: 'only control chars → empty', input: '\r\n\t\x00', expected: 'empty' },
  { name: '200-char truncation', input: 'a'.repeat(300), expected: 'a'.repeat(200) },
  { name: '200-char boundary preserved', input: 'a'.repeat(200), expected: 'a'.repeat(200) },
];

// Running totals for the whole script; later sections keep adding to them.
let pass = 0;
let fail = 0;
tests.forEach(({ name, input, expected }) => {
  const actual = getSanitisedUa(input);
  if (actual !== expected) {
    fail += 1;
    console.error(`FAIL ${name}`);
    console.error(` expected: ${JSON.stringify(expected)}`);
    console.error(` actual: ${JSON.stringify(actual)}`);
    return;
  }
  pass += 1;
  console.log(`PASS ${name}`);
});

// Log-line format check — assembles the 429 line from fixed path / IP / reset
// fixtures and compares the complete string, so any change to field order,
// quoting or separators shows up as a failure.
const fakePath = '/hs/code/0101';
const fakeIp = '216.244.66.231';
const fakeReset = '2026-05-14T12:34:56.789Z';

/**
 * Builds a ScrapeGuard 429 log line for the given raw UA, group and limit,
 * using the fixed fixtures above for path, IP and reset time.
 */
function buildLogLine(uaRaw, group, limit) {
  const ua = getSanitisedUa(uaRaw);
  // The 'empty' placeholder is emitted bare; real UAs are wrapped in quotes.
  let uaField = 'ua=empty';
  if (ua !== 'empty') {
    uaField = `ua="${ua}"`;
  }
  return `[ScrapeGuard] 429 path=${fakePath} ip=${fakeIp} ${uaField} group=${group} limit=${limit} resets=${fakeReset}`;
}

// Whole-line expectations: scenario name, the line actually built, and the
// exact line required.
const lineTests = [
  {
    name: 'normal UA produces quoted field',
    actual: buildLogLine('python-requests/2.31.0', 'hs', 10),
    expected:
      '[ScrapeGuard] 429 path=/hs/code/0101 ip=216.244.66.231 ua="python-requests/2.31.0" group=hs limit=10 resets=2026-05-14T12:34:56.789Z',
  },
  {
    name: 'null UA produces ua=empty',
    actual: buildLogLine(null, 'bulkref', 10),
    expected:
      '[ScrapeGuard] 429 path=/hs/code/0101 ip=216.244.66.231 ua=empty group=bulkref limit=10 resets=2026-05-14T12:34:56.789Z',
  },
  {
    name: 'injection attempt does not split log line',
    actual: buildLogLine('evil\r\n[ScrapeGuard] FAKE 429', 'hs', 10),
    expected: `[ScrapeGuard] 429 path=/hs/code/0101 ip=216.244.66.231 ua="evil[ScrapeGuard] FAKE 429" group=hs limit=10 resets=2026-05-14T12:34:56.789Z`,
  },
];

lineTests.forEach(({ name, actual, expected }) => {
  if (actual !== expected) {
    fail += 1;
    console.error(`FAIL ${name}`);
    console.error(` expected: ${JSON.stringify(expected)}`);
    console.error(` actual: ${JSON.stringify(actual)}`);
    return;
  }
  pass += 1;
  console.log(`PASS ${name}`);
});

// Log-injection regression guard: whatever the UA contains, the assembled
// line must not carry a raw CR or LF, otherwise a single request could forge
// additional log entries.
const injectionLine = buildLogLine('attacker\r\nFAKE', 'hs', 10);
if (!injectionLine.includes('\r') && !injectionLine.includes('\n')) {
  pass++;
  console.log('PASS assembled line free of \\r and \\n after injection input');
} else {
  fail++;
  console.error('FAIL assembled line still contains \\r or \\n');
}

console.log(`\n${pass} pass / ${fail} fail (${pass + fail} total)`);
// Set the exit code rather than calling process.exit(): process.exit() can
// end the process before pending stdout/stderr writes are flushed (e.g. when
// output is piped), which could truncate the summary or FAIL details. The
// script has nothing else pending, so it still exits right away — 0 when
// every check passed, 1 otherwise.
process.exitCode = fail === 0 ? 0 : 1;