diff --git a/app.js b/app.js index df4ea2d..2ec41f7 100644 --- a/app.js +++ b/app.js @@ -15,6 +15,10 @@ var cache = apicache.options({ respectCacheControl: false, // Add origin to cache key to avoid CORS issues with cached responses appendKey: (req, res) => req.headers.origin || "", + // Only cache successful responses. Excludes error statuses (a transient + // upstream failure would otherwise poison the 1-month TTL) and redirects + // from /api/download (their tokenized target URL expires in minutes). + statusCodes: { include: [200] }, }).middleware; // Define routes @@ -25,9 +29,16 @@ var comPapersRouter = require("./routes/papers_gceguide_com"); var ppcoPapersRouter = require("./routes/pastpapers_co"); var ppcaPapersRouter = require("./routes/papacambridge_com"); var yearsRouter = require("./routes/years"); +var downloadRouter = require("./routes/download"); var app = express(); +// Trust Fly.io's edge proxy so req.protocol reflects X-Forwarded-Proto. +// Without this the paper-list endpoint builds http:// download URLs even +// when the original request was https, triggering mixed-content blocks +// when the frontend (https) renders the link. +app.set("trust proxy", true); + // Setup views app.set("views", path.join(__dirname, "views")); app.set("view engine", "jade"); @@ -61,7 +72,9 @@ app.use( }) ); app.use(cookieParser()); -app.use(cache("1 month")); +// Skip cache for /api/download — it streams multi-MB binaries and uses a +// per-request tokenized upstream URL, neither of which belongs in apicache. 
+app.use(cache("1 month", (req) => !req.originalUrl.startsWith("/api/download"))); // Define routes app.use("/api", indexRouter); @@ -71,6 +84,7 @@ app.use("/api/papers/com", comPapersRouter); app.use("/api/papers/ppco", ppcoPapersRouter); app.use("/api/papers/ppca", ppcaPapersRouter); app.use("/api/years", yearsRouter); +app.use("/api/download", downloadRouter); app.get("/api/cache/clear", (_req, res) => { res.json(apicache.clear()); diff --git a/config/cors.config.js b/config/cors.config.js index ead6d6a..e323a52 100644 --- a/config/cors.config.js +++ b/config/cors.config.js @@ -83,11 +83,16 @@ const corsOptions = { "X-Request-Id", ], - // Headers exposed to the client + // Headers exposed to the client. Content-Disposition and Content-Length + // matter for /api/download — frontends use them to read the filename + // (for `<a download>` or File System Access) and show progress. exposedHeaders: [ "X-Total-Count", "X-Page-Count", + "Content-Disposition", + "Content-Length", "Content-Range", + "Accept-Ranges", "X-Request-Id", "X-Response-Time", "X-Rate-Limit-Remaining", diff --git a/package.json b/package.json index 3f956e3..df6fc52 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,7 @@ "http-errors": "~2.0.0", "jade": "~1.11.0", "morgan": "~1.10.0", + "pdf-lib": "^1.17.1", "redis": "^4.6.13", "socks5-https-client": "^1.2.1" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b52ce99..e458b7d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -41,6 +41,9 @@ importers: morgan: specifier: ~1.10.0 version: 1.10.0 + pdf-lib: + specifier: ^1.17.1 + version: 1.17.1 redis: specifier: ^4.6.13 version: 4.6.15 @@ -50,6 +53,12 @@ importers: packages: + '@pdf-lib/standard-fonts@1.0.0': + resolution: {integrity: sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==} + + '@pdf-lib/upng@1.0.1': + resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==} + '@redis/bloom@1.2.0': 
resolution: {integrity: sha512-HG2DFjYKbpNmVXsa0keLHp/3leGJz1mjh09f2RLGGLQZzSHpkmZWuwJbAvo3QcRY8p80m5+ZdXZdYOSBLlp7Cg==} peerDependencies: @@ -624,6 +633,9 @@ packages: optimist@0.3.7: resolution: {integrity: sha512-TCx0dXQzVtSCg2OgY/bO9hjM9cV4XYx09TVK+s3+FhkjT6LovsLe+pPMzpWf+6yXK/hUizs2gUoTw3jHM0VaTQ==} + pako@1.0.11: + resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} + parseurl@1.3.3: resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==} engines: {node: '>= 0.8'} @@ -631,6 +643,9 @@ packages: path-to-regexp@0.1.7: resolution: {integrity: sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ==} + pdf-lib@1.17.1: + resolution: {integrity: sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==} + performance-now@2.1.0: resolution: {integrity: sha512-7EAHlyLHI56VEIdK57uwHdHKIaAGbnXPiw0yWbarQZOKaKpvUIgW0jWRVLiatnM+XXlSwsanIBH/hzGMJulMow==} @@ -764,6 +779,9 @@ packages: resolution: {integrity: sha512-zJf5m2EIOngmBbDe2fhTPpCjzM2qkZVqrFJZc2jaln+KBeEaYKhS2QMOIkfVrNUyoOwqgbTwOHATzr3jZRQDyg==} deprecated: Deprecated, use jstransformer + tslib@1.14.1: + resolution: {integrity: sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==} + tunnel-agent@0.6.0: resolution: {integrity: sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==} @@ -841,6 +859,14 @@ packages: snapshots: + '@pdf-lib/standard-fonts@1.0.0': + dependencies: + pako: 1.0.11 + + '@pdf-lib/upng@1.0.1': + dependencies: + pako: 1.0.11 + '@redis/bloom@1.2.0(@redis/client@1.5.17)': dependencies: '@redis/client': 1.5.17 @@ -1425,10 +1451,19 @@ snapshots: dependencies: wordwrap: 0.0.3 + pako@1.0.11: {} + parseurl@1.3.3: {} path-to-regexp@0.1.7: {} + pdf-lib@1.17.1: + dependencies: + '@pdf-lib/standard-fonts': 1.0.0 + 
'@pdf-lib/upng': 1.0.1 + pako: 1.0.11 + tslib: 1.14.1 + performance-now@2.1.0: {} promise@2.0.0: @@ -1610,6 +1645,8 @@ snapshots: promise: 2.0.0 uglify-js: 2.2.5 + tslib@1.14.1: {} + tunnel-agent@0.6.0: dependencies: safe-buffer: 5.2.1 diff --git a/routes/cates.js b/routes/cates.js index c7c9621..d43c77f 100644 --- a/routes/cates.js +++ b/routes/cates.js @@ -1,6 +1,7 @@ var Crawler = require("crawler"); var express = require("express"); var router = express.Router(); +var { fetchCates } = require("../utils/papersdaddy_wrapper"); /* const Agent = require('socks5-https-client/lib/Agent'); */ @@ -61,44 +62,14 @@ router.get("/ppco/:cate", function (req, res, next) { ]); }); -// PapaCambridge -// as-and-a-level -// igcse -router.get("/ppca/:cate", function (req, res, next) { - const server = "https://pastpapers.papacambridge.com/papers/caie/"; - c.queue([ - { - uri: `${server}${req.params.cate}`.toLowerCase(), - callback: function (error, resC, done) { - if (error) { - console.log(error); - } else { - let $ = resC.$; - let returnArray = { - cates: new Array(), - count: 0, - }; - $("#datafile > div.files-list-main > div").each(function () { - const subject = $(this).text().trim(); - - if (subject.includes("-") && !subject.includes("No Content Available")) { - const subjectName = subject - .substring(0, subject.lastIndexOf("-")) - .replaceAll("-", " "); - const subjectCode = subject.substring(subject.lastIndexOf("-") + 2); - returnArray.cates.push({ - name: `${subjectName} (${subjectCode})`, - }); - } - }); - returnArray.count = returnArray.cates.length; - console.log(server + req.params.cate); - res.send(JSON.stringify(returnArray)); - } - done(); - }, - }, - ]); +router.get("/ppca/:cate", function (req, res, _next) { + fetchCates(req.params.cate, function (err, cates) { + if (err) { + console.log(err); + return res.status(502).json({ cates: [], count: 0, error: err.message }); + } + res.json({ cates, count: cates.length }); + }); }); // GCE Guide diff --git 
a/routes/download.js b/routes/download.js new file mode 100644 index 0000000..de8be07 --- /dev/null +++ b/routes/download.js @@ -0,0 +1,136 @@ +var express = require("express"); +var https = require("https"); +var router = express.Router(); +var { resolveDownload } = require("../utils/papersdaddy_wrapper"); +var { stripWatermark } = require("../utils/watermark_stripper"); + +const SAFE_PATH = /^\/cambridge\/[a-z0-9-]+\/[a-z0-9-]+\/[0-9]{4}-[a-z-]+\/[A-Za-z0-9_.-]+\.(pdf|mp3|docx?)$/; + +const STREAM_HEADERS = { + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", + Accept: "*/*", +}; + +const CONTENT_TYPE_BY_EXT = { + pdf: "application/pdf", + mp3: "audio/mpeg", + doc: "application/msword", + docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", +}; + +// Returns {start, end} clamped to [0, total-1] or null if the header is +// malformed/unsupported. Only handles single-range "bytes=start-end". +function parseRange(header, total) { + const m = /^bytes=(\d*)-(\d*)$/.exec(header || ""); + if (!m) return null; + const hasStart = m[1] !== ""; + const hasEnd = m[2] !== ""; + if (!hasStart && !hasEnd) return null; + + let start, end; + if (!hasStart) { + // Suffix range "bytes=-500" → last 500 bytes + const suffix = parseInt(m[2], 10); + if (suffix <= 0) return null; + start = Math.max(0, total - suffix); + end = total - 1; + } else { + start = parseInt(m[1], 10); + end = hasEnd ? parseInt(m[2], 10) : total - 1; + } + if (start >= total || start < 0 || end < start) return null; + end = Math.min(end, total - 1); + return { start, end }; +} + +function sendBuffer(req, res, body, contentType, filename, asAttachment) { + const total = body.length; + res.setHeader("Content-Type", contentType); + res.setHeader( + "Content-Disposition", + `${asAttachment ? 
"attachment" : "inline"}; filename="${filename}"` + ); + res.setHeader("Accept-Ranges", "bytes"); + res.setHeader("Cache-Control", "public, max-age=86400"); + + const range = parseRange(req.headers.range, total); + if (range) { + const slice = body.subarray(range.start, range.end + 1); + res.status(206); + res.setHeader("Content-Range", `bytes ${range.start}-${range.end}/${total}`); + res.setHeader("Content-Length", slice.length); + if (req.method === "HEAD") return res.end(); + return res.end(slice); + } + + res.status(200); + res.setHeader("Content-Length", total); + if (req.method === "HEAD") return res.end(); + return res.end(body); +} + +function handle(req, res) { + const subpath = req.path; + if (!SAFE_PATH.test(subpath)) { + return res.status(400).json({ error: "invalid path", path: subpath }); + } + const asAttachment = req.query.download === "1" || req.query.download === "true"; + + resolveDownload(subpath, function (err, url) { + if (err) { + console.log(err); + return res.status(502).json({ error: err.message }); + } + + const filename = subpath.split("/").pop(); + const ext = filename.split(".").pop().toLowerCase(); + const contentType = CONTENT_TYPE_BY_EXT[ext] || "application/octet-stream"; + + const upstream = https.get(url, { headers: STREAM_HEADERS }, function (up) { + if (up.statusCode !== 200) { + res.status(502).json({ error: `upstream ${up.statusCode}` }); + up.resume(); + return; + } + + const chunks = []; + up.on("data", (c) => chunks.push(c)); + up.on("end", async () => { + const raw = Buffer.concat(chunks); + let body = raw; + if (ext === "pdf") { + try { + body = await stripWatermark(raw); + } catch (e) { + console.log("watermark strip failed, serving original:", e.message); + } + } + if (res.writableEnded || res.destroyed) return; + sendBuffer(req, res, body, contentType, filename, asAttachment); + }); + up.on("error", function (e) { + console.log(e); + if (!res.headersSent) res.status(502).json({ error: e.message }); + }); + }); + + 
upstream.on("error", function (e) { + console.log(e); + if (!res.headersSent) { + res.status(502).json({ error: e.message }); + } else { + res.destroy(); + } + }); + + req.on("close", function () { + upstream.destroy(); + }); + }); +} + +router.get(/.*/, handle); +router.head(/.*/, handle); + +module.exports = router; diff --git a/routes/papacambridge_com.js b/routes/papacambridge_com.js index 8facc48..acc80df 100644 --- a/routes/papacambridge_com.js +++ b/routes/papacambridge_com.js @@ -1,88 +1,17 @@ -var Crawler = require("crawler"); var express = require("express"); var router = express.Router(); - -var c = new Crawler({ - skipEventRequest: false, - maxConnections: 30, - method: "GET", -}); - -router.get("/:cate/:sub/:year", function (req, res, next) { - let sub = req.params.sub.replace("(", "").replace(")", "").replaceAll(" ", "-"); - - var server = "https://pastpapers.papacambridge.com/papers/caie/"; - var uri = `${server}${req.params.cate}-${sub}-${req.params.year}`; - - console.log(uri); - c.queue([ - { - uri: uri.toLowerCase(), - callback: function (error, resC, done) { - if (error) { - console.log(error); - } else { - var $ = resC.$; - var key = 0; - var returnArray = { - papers: new Array(), - count: 0, - }; - $("#datafile > div.files-list-main > div").each(function () { - const paper_uri = $(this) - .find("span.kt-widget2__number.kt-font-danger.cursor > div > a") - .attr("href") - .replace("download_file.php?files=", ""); - const name = paper_uri.split("/").pop(); - - // key 字段 - key += 1; - - // info 字段 - if (name.indexOf("qp") > -1) { - var info = "Question Paper"; - } else if (name.indexOf("ms") > -1) { - var info = "Mark Scheme"; - } else if (name.indexOf("er") > -1) { - var info = "Examiner Report"; - } else if (name.indexOf("ir") > -1 || name.indexOf("ci") > -1) { - var info = "Confidential Instruction"; - } else if (name.indexOf("gt") > -1) { - var info = "Grade thresholds"; - } else if (name.indexOf("Data_Booklet") > -1) { - var info = "Data 
Booklet"; - } else if (name.indexOf("sci") > -1) { - var info = "Specimen Confidential Instruction"; - } else if (name.indexOf("sp") > -1) { - var info = "Specimen Paper"; - } else if (name.indexOf("sm") > -1) { - var info = "Specimen Mark Scheme"; - } else if (name.indexOf("in") > -1) { - var info = "Inert"; - } else { - var info = "Unknown"; - } - - if (name.indexOf(".pdf") > -1) { - returnArray.papers.push({ - name, - url: paper_uri, - key: key, - info: [info], - type: "PDF", - year: name.split("_")[1].substr(1), - }); - } - }); - - returnArray.count = returnArray.papers.length; - - res.send(JSON.stringify(returnArray)); - } - done(); - }, - }, - ]); +var { fetchPapers } = require("../utils/papersdaddy_wrapper"); + +router.get("/:cate/:sub/:year", function (req, res, _next) { + fetchPapers(req.params.cate, req.params.sub, req.params.year, function (err, papers) { + if (err) { + console.log(err); + return res.status(502).json({ papers: [], count: 0, error: err.message }); + } + const base = `${req.protocol}://${req.get("host")}`; + const out = papers.map((p) => ({ ...p, url: `${base}${p.url}` })); + res.json({ papers: out, count: out.length }); + }); }); module.exports = router; diff --git a/routes/years.js b/routes/years.js index e654c76..4a6af61 100644 --- a/routes/years.js +++ b/routes/years.js @@ -1,6 +1,7 @@ var Crawler = require("crawler"); var express = require("express"); var router = express.Router(); +var { fetchYears } = require("../utils/papersdaddy_wrapper"); var crawler = new Crawler({ skipEventRequest: false, @@ -55,45 +56,14 @@ router.get("/ppco/:cate/:sub", function (req, res, _next) { }); router.get("/ppca/:cate/:sub", function (req, res, _next) { - let sub = req.params.sub.replace("(", "").replace(")", "").replaceAll(" ", "-"); - - if (no_dash_subjects[`${req.params.sub}`]) { - sub = no_dash_subjects[`${req.params.sub}`]; - } - - const server = "https://pastpapers.papacambridge.com/papers/caie/"; - const uri = 
`${server}${req.params.cate}-${sub}`; - - crawler.queue([ - { - uri: uri.toLowerCase(), - callback: function (error, resC, done) { - if (error) { - console.log(error); - } else { - let $ = resC.$; - let returnArray = { - years: new Array(), - count: 0, - }; - $("#datafile > div.files-list-main > div > a > div").each(function () { - const date = $(this).text().trim().replaceAll(" ", "-"); - - if (date.includes("Topical") || date.includes("Solved")) { - return; - } - - returnArray.years.push({ - name: date, - }); - }); - returnArray.count = returnArray.years.length; - res.send(JSON.stringify(returnArray)); - } - done(); - }, - }, - ]); + fetchYears(req.params.cate, req.params.sub, function (err, names) { + if (err) { + console.log(err); + return res.status(502).json({ years: [], count: 0, error: err.message }); + } + const years = names.map((name) => ({ name })); + res.json({ years, count: years.length }); + }); }); /* GET paper years (GCEGuide.com Only) */ diff --git a/utils/papersdaddy_wrapper.js b/utils/papersdaddy_wrapper.js new file mode 100644 index 0000000..0d1fee7 --- /dev/null +++ b/utils/papersdaddy_wrapper.js @@ -0,0 +1,265 @@ +var Crawler = require("crawler"); + +var crawler = new Crawler({ + skipEventRequest: false, + maxConnections: 30, + method: "GET", + userAgent: + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", + headers: { + Accept: + "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + }, +}); + +const SERVER = "https://www.papersdaddy.com"; + +const LEVEL_MAP = { + "as-and-a-level": "a-level", + "a-level": "a-level", + "as-level": "a-level", + "igcse": "igcse", + "o-level": "o-level", +}; + +function toSubjectSlug(sub) { + return sub + .replace(/&/g, "and") + .replace(/[()]/g, "") + .replace(/-+/g, "-") + .replace(/^-|-$/g, "") + .toLowerCase(); +} + +function buildSubjectUrl(cate, sub) { + const level = 
LEVEL_MAP[cate.toLowerCase()] || cate.toLowerCase(); + return `${SERVER}/cambridge/${level}/${toSubjectSlug(sub)}`; +} + +function inputYearToSlug(year) { + const lower = year.toLowerCase(); + const m = /(\d{4})/.exec(lower); + if (!m) return null; + const yyyy = m[1]; + if (/march|mar(?![a-z])/.test(lower)) return `${yyyy}-march`; + if (/may|jun|summer/.test(lower)) return `${yyyy}-may-june`; + if (/oct|nov|winter/.test(lower)) return `${yyyy}-oct-nov`; + return null; +} + +function yearSlugToDisplay(slug) { + if (slug.endsWith("-may-june")) return slug.replace("-may-june", "-May-Jun"); + if (slug.endsWith("-oct-nov")) return slug.replace("-oct-nov", "-Oct-Nov"); + if (slug.endsWith("-march")) return slug.replace("-march", "-March"); + return slug; +} + +const SEASON_ORDER = { March: 0, "May-Jun": 1, "Oct-Nov": 2 }; + +function yearSortKey(name) { + const m = /^(\d{4})-(March|May-Jun|Oct-Nov)$/.exec(name); + if (!m) return [0, 0]; + return [parseInt(m[1], 10), SEASON_ORDER[m[2]]]; +} + +function fileInfo(name) { + if (name.indexOf("Data_Booklet") > -1) return "Data Booklet"; + if (name.indexOf("sci") > -1) return "Specimen Confidential Instruction"; + if (name.indexOf("sp") > -1) return "Specimen Paper"; + if (name.indexOf("sm") > -1) return "Specimen Mark Scheme"; + if (name.indexOf("qp") > -1) return "Question Paper"; + if (name.indexOf("ms") > -1) return "Mark Scheme"; + if (name.indexOf("er") > -1) return "Examiner Report"; + if (name.indexOf("ir") > -1 || name.indexOf("ci") > -1) return "Confidential Instruction"; + if (name.indexOf("gt") > -1) return "Grade thresholds"; + if (name.indexOf("in") > -1) return "Inert"; + return "Unknown"; +} + +function fileType(name) { + if (name.endsWith(".pdf")) return "PDF"; + if (name.endsWith(".mp3")) return "MP3"; + if (name.endsWith(".docx") || name.endsWith(".doc")) return "DOC"; + return "Unknown"; +} + +function escapeRe(s) { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + +function fetchYears(cate, sub, 
callback) { + const uri = buildSubjectUrl(cate, sub); + console.log(uri); + const subjectPath = new URL(uri).pathname; + const yearLinkRe = new RegExp(`^${escapeRe(subjectPath)}/(\\d{4}-(?:march|may-june|oct-nov))$`); + + crawler.queue([ + { + uri, + callback: function (error, resC, done) { + if (error || !resC || resC.statusCode !== 200) { + callback(error || new Error(`upstream ${resC && resC.statusCode}`), null); + return done(); + } + const $ = resC.$; + const slugs = new Set(); + $("a[href]").each(function () { + const href = $(this).attr("href") || ""; + const m = yearLinkRe.exec(href); + if (m) slugs.add(m[1]); + }); + + const years = [...slugs] + .map(yearSlugToDisplay) + .sort((a, b) => { + const [ay, as] = yearSortKey(a); + const [by, bs] = yearSortKey(b); + return by - ay || bs - as; + }); + callback(null, years); + done(); + }, + }, + ]); +} + +function fetchPapers(cate, sub, year, callback) { + const yearSlug = inputYearToSlug(year); + if (!yearSlug) { + callback(new Error(`unrecognized year: ${year}`), null); + return; + } + const subjectUrl = buildSubjectUrl(cate, sub); + const uri = `${subjectUrl}/${yearSlug}`; + console.log(uri); + const yearPath = new URL(uri).pathname; + const fileLinkRe = new RegExp(`^${escapeRe(yearPath)}/([^/]+\\.(?:pdf|mp3|docx?))$`); + + crawler.queue([ + { + uri, + callback: function (error, resC, done) { + if (error || !resC || resC.statusCode !== 200) { + callback(error || new Error(`upstream ${resC && resC.statusCode}`), null); + return done(); + } + const $ = resC.$; + const seen = new Map(); + $("a[href]").each(function () { + const href = $(this).attr("href") || ""; + const m = fileLinkRe.exec(href); + if (m && !seen.has(m[1])) { + seen.set(m[1], href); + } + }); + + const yearCode = (() => { + const ymatch = /(\d{4})/.exec(yearSlug); + return ymatch ? 
ymatch[1].slice(-2) : ""; + })(); + + const papers = []; + let key = 0; + for (const [filename, href] of seen) { + key += 1; + papers.push({ + name: filename, + // Relative path — the route handler prepends the API base so + // clients get an absolute URL pointing back to /api/download. + // We can't return papersdaddy's URL directly because it serves + // an HTML viewer, and the tokenized PDF URL expires in <5 min, + // so a permanent proxy URL is the only correct contract here. + url: `/api/download${href}`, + key, + info: [fileInfo(filename)], + type: fileType(filename), + year: yearCode, + }); + } + callback(null, papers); + done(); + }, + }, + ]); +} + +const TOKEN_RE = /pdfUrl\\":\\"([^\\]+(?:\\u0026[^\\]+)*)/; + +function resolveDownload(viewerPath, callback) { + const uri = `${SERVER}${viewerPath.startsWith("/") ? "" : "/"}${viewerPath}`; + crawler.queue([ + { + uri, + callback: function (error, resC, done) { + if (error || !resC || resC.statusCode !== 200) { + callback(error || new Error(`upstream ${resC && resC.statusCode}`), null); + return done(); + } + const body = resC.body ? 
resC.body.toString() : ""; + const m = TOKEN_RE.exec(body); + if (!m) { + callback(new Error("download token not found in viewer page"), null); + return done(); + } + const tokenPath = m[1].replace(/\\u0026/g, "&"); + callback(null, `${SERVER}${tokenPath}`); + done(); + }, + }, + ]); +} + +const LOWERCASE_WORDS = new Set([ + "and", "of", "the", "for", "in", "on", "to", "with", "a", "an", +]); + +function slugToDisplayName(slug) { + const lastDash = slug.lastIndexOf("-"); + if (lastDash < 0) return slug; + const code = slug.slice(lastDash + 1); + const name = slug + .slice(0, lastDash) + .split("-") + .map((w, i) => { + if (!w.length) return w; + if (i > 0 && LOWERCASE_WORDS.has(w)) return w; + return w[0].toUpperCase() + w.slice(1); + }) + .join(" "); + return `${name} (${code})`; +} + +function fetchCates(cate, callback) { + const level = LEVEL_MAP[cate.toLowerCase()] || cate.toLowerCase(); + const uri = `${SERVER}/cambridge/${level}`; + console.log(uri); + const subjectLinkRe = new RegExp(`^/cambridge/${escapeRe(level)}/([a-z0-9][a-z0-9-]*[a-z0-9]-\\d{4,5})$`); + + crawler.queue([ + { + uri, + callback: function (error, resC, done) { + if (error || !resC || resC.statusCode !== 200) { + callback(error || new Error(`upstream ${resC && resC.statusCode}`), null); + return done(); + } + const $ = resC.$; + const slugs = new Set(); + $("a[href]").each(function () { + const href = $(this).attr("href") || ""; + const m = subjectLinkRe.exec(href); + if (m) slugs.add(m[1]); + }); + + const cates = [...slugs] + .sort() + .map((slug) => ({ name: slugToDisplayName(slug) })); + callback(null, cates); + done(); + }, + }, + ]); +} + +module.exports = { fetchYears, fetchPapers, fetchCates, resolveDownload }; diff --git a/utils/redis_wrapper.js b/utils/redis_wrapper.js index 656770f..21f2077 100644 --- a/utils/redis_wrapper.js +++ b/utils/redis_wrapper.js @@ -7,9 +7,14 @@ var redisClient = redis }) .on("connect", function () { console.log("Redis connected!"); + }) + .on("error", 
function (err) { + console.warn("Redis error (caching disabled):", err.message); }); -redisClient.connect(); +redisClient.connect().catch(function (err) { + console.warn("Redis connect failed (caching disabled):", err.message); +}); var redisWrapper = { connected: redisClient.isOpen, diff --git a/utils/watermark_stripper.js b/utils/watermark_stripper.js new file mode 100644 index 0000000..c35cbc9 --- /dev/null +++ b/utils/watermark_stripper.js @@ -0,0 +1,41 @@ +var { + PDFDocument, + PDFRawStream, + PDFName, + PDFNumber, + decodePDFRawStream, +} = require("pdf-lib"); + +// Each papersdaddy watermark lives in a tiny standalone content stream of the +// form q BT … [<hex>]TJ ET Q where <hex> encodes either "PapersDaddy" or +// "Downloaded for free from www.papersdaddy.com". Matching the literal hex of +// "PapersDaddy" or "papersdaddy" picks up both header and footer reliably; the +// sequences are 22 bytes long and don't collide with anything in a CAIE paper. +const WATERMARK_RE = + /5061706572734461646479|706170657273646164647[0-9a-f]|PapersDaddy|papersdaddy/i; + +async function stripWatermark(pdfBytes) { + const doc = await PDFDocument.load(pdfBytes, { + updateMetadata: false, + ignoreEncryption: true, + }); + + for (const [ref, obj] of doc.context.enumerateIndirectObjects()) { + if (!(obj instanceof PDFRawStream)) continue; + let decoded; + try { + decoded = Buffer.from(decodePDFRawStream(obj).decode()).toString("latin1"); + } catch (_e) { + continue; + } + if (!WATERMARK_RE.test(decoded)) continue; + const empty = PDFRawStream.of(obj.dict, new Uint8Array(0)); + empty.dict.set(PDFName.of("Length"), PDFNumber.of(0)); + empty.dict.delete(PDFName.of("Filter")); + doc.context.assign(ref, empty); + } + + return Buffer.from(await doc.save({ useObjectStreams: false })); +} + +module.exports = { stripWatermark }; diff --git a/yarn.lock b/yarn.lock index 75b28ad..7fbfea4 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2,6 +2,20 @@ # yarn lockfile v1 +"@pdf-lib/standard-fonts@^1.0.0": + 
version "1.0.0" + resolved "https://registry.yarnpkg.com/@pdf-lib/standard-fonts/-/standard-fonts-1.0.0.tgz#8ba691c4421f71662ed07c9a0294b44528af2d7f" + integrity sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA== + dependencies: + pako "^1.0.6" + +"@pdf-lib/upng@^1.0.1": + version "1.0.1" + resolved "https://registry.yarnpkg.com/@pdf-lib/upng/-/upng-1.0.1.tgz#7dc9c636271aca007a9df4deaf2dd7e7960280cb" + integrity sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ== + dependencies: + pako "^1.0.10" + "@redis/bloom@1.2.0": version "1.2.0" resolved "https://registry.yarnpkg.com/@redis/bloom/-/bloom-1.2.0.tgz#d3fd6d3c0af3ef92f26767b56414a370c7b63b71" @@ -1075,6 +1089,11 @@ optimist@~0.3.5: dependencies: wordwrap "~0.0.2" +pako@^1.0.10, pako@^1.0.11, pako@^1.0.6: + version "1.0.11" + resolved "https://registry.yarnpkg.com/pako/-/pako-1.0.11.tgz#6c9599d340d54dfd3946380252a35705a6b992bf" + integrity sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw== + parseurl@~1.3.3: version "1.3.3" resolved "https://registry.yarnpkg.com/parseurl/-/parseurl-1.3.3.tgz#9da19e7bee8d12dff0513ed5b76957793bc2e8d4" @@ -1085,6 +1104,16 @@ path-to-regexp@0.1.7: resolved "https://registry.yarnpkg.com/path-to-regexp/-/path-to-regexp-0.1.7.tgz#df604178005f522f15eb4490e7247a1bfaa67f8c" integrity sha512-5DFkuoqlv1uYQKxy8omFBeJPQcdoE07Kv2sferDCrAq1ohOU+MSDswDIbnx3YAM60qIOnYa53wBhXW0EbMonrQ== +pdf-lib@^1.17.1: + version "1.17.1" + resolved "https://registry.yarnpkg.com/pdf-lib/-/pdf-lib-1.17.1.tgz#9e7dd21261a0c1fb17992580885b39e7d08f451f" + integrity sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw== + dependencies: + "@pdf-lib/standard-fonts" "^1.0.0" + "@pdf-lib/upng" "^1.0.1" + pako "^1.0.11" + tslib "^1.11.1" + performance-now@^2.1.0: version "2.1.0" resolved 
"https://registry.yarnpkg.com/performance-now/-/performance-now-2.1.0.tgz#6309f4e0e5fa913ec1c69307ae364b4b377c9e7b" @@ -1373,6 +1402,11 @@ transformers@2.1.0: promise "~2.0" uglify-js "~2.2.5" +tslib@^1.11.1: + version "1.14.1" + resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.14.1.tgz#cf2d38bdc34a134bcaf1091c41f6619e2f672d00" + integrity sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg== + tunnel-agent@^0.6.0: version "0.6.0" resolved "https://registry.yarnpkg.com/tunnel-agent/-/tunnel-agent-0.6.0.tgz#27a5dea06b36b04a0a9966774b290868f0fc40fd"