Consider extraction errors as expected operational errors without cra…

…shing the engine (#1069)
OpenTermsArchive · May 8, 2024 · 572511c · 572511c
2 parents 0b59189 + 244a789
commit 572511c
Show file tree

Hide file tree

Showing 18 changed files with 569 additions and 434 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,26 @@
 
 All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased [minor]
+
+> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.
+
+### Changed
+
+- Consider extraction errors as expected operational errors instead of crashing the engine
+
+### Added
+
+- Add label `empty content` to reported issues on GitHub when server returns empty content or when PDF content is unextractable
+- Add label `invalid selector` to reported issues on GitHub when CSS selector is invalid
+
+### Fixed
+
+- Do not crash when source documents links point to invalid URLs
+- Take into account the `--types` CLI option 
+- Fix counts of terms when they are specified via the CLI option `--types`
+- Fix display issues for service ID and terms type associated with errors
+
 ## 1.1.3 - 2024-05-02
 
 _Full changeset and discussions: [#1068](https://github.com/OpenTermsArchive/engine/pull/1068)._

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -172,7 +172,7 @@ There are five ways to handle operational errors:
 - **Log the error — and do nothing else**. If it's a minor error and there’s nothing you can do about, and there is no reason to stop the whole process.
 - **Crash immediately**. If the error cannot be handled and can affect data integrity.
 
-In our case, we consider all `fetch`-related errors as expected, so as operational errors and we handle them by logging but we do not stop the whole process. We handle errors related to the `notifier` in the same way.
+In our case, we consider all `fetch`-related and `extract`-related errors as expected, so as operational errors and we handle them by logging but we do not stop the whole process. We handle errors related to the `notifier` in the same way.
 In contrast, we consider errors from the `recorder` module as fatal, and we crash immediately.
 
 #### Handling programmer errors

diff --git a/src/archivist/extract/errors.js b/src/archivist/extract/errors.js
@@ -0,0 +1,6 @@
+export class ExtractDocumentError extends Error {
+  constructor(message) {
+    super(`Extract failed: ${message}`);
+    this.name = 'ExtractDocumentError';
+  }
+}
diff --git a/src/archivist/extract/index.js b/src/archivist/extract/index.js
@@ -1,13 +1,13 @@
-import url from 'url';
-
 import ciceroMark from '@accordproject/markdown-cicero';
 import mardownPdf from '@accordproject/markdown-pdf';
 import TurndownService from '@opentermsarchive/turndown';
 import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
 import jsdom from 'jsdom';
 import mime from 'mime';
 
-import { InaccessibleContentError } from '../errors.js';
+import { ExtractDocumentError } from './errors.js';
+
+export { ExtractDocumentError } from './errors.js';
 
 const { JSDOM } = jsdom;
 const turndownService = new TurndownService();
@@ -29,11 +29,15 @@ const ciceroMarkTransformer = new CiceroMarkTransformer();
  * @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
 */
 export default async function extract(sourceDocument) {
-  if (sourceDocument.mimeType == mime.getType('pdf')) {
-    return extractFromPDF(sourceDocument);
-  }
+  try {
+    if (sourceDocument.mimeType == mime.getType('pdf')) {
+      return await extractFromPDF(sourceDocument);
+    }
 
-  return extractFromHTML(sourceDocument);
+    return await extractFromHTML(sourceDocument);
+  } catch (error) {
+    throw new ExtractDocumentError(error.message);
+  }
 }
 
 export async function extractFromHTML(sourceDocument) {
@@ -63,7 +67,7 @@ export async function extractFromHTML(sourceDocument) {
       });
       /* eslint-enable no-await-in-loop */
     } catch (error) {
-      throw new InaccessibleContentError(`The filter function "${filterFunction.name}" failed: ${error}`);
+      throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
     }
   }
 
@@ -72,7 +76,7 @@ export async function extractFromHTML(sourceDocument) {
   const domFragment = select(webPageDOM, contentSelectors);
 
   if (!domFragment.children.length) {
-    throw new InaccessibleContentError(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
+    throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
   }
 
   convertRelativeURLsToAbsolute(domFragment, location);
@@ -92,24 +96,32 @@ export async function extractFromHTML(sourceDocument) {
   const markdownContent = transform(domFragment);
 
   if (!markdownContent) {
-    throw new InaccessibleContentError(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
+    throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
   }
 
   return markdownContent;
 }
 
-export async function extractFromPDF({ content: pdfBuffer }) {
+export async function extractFromPDF({ location, content: pdfBuffer }) {
+  let markdownContent;
+
   try {
     const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);
 
-    return ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
+    markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
   } catch (error) {
     if (error.parserError) {
-      throw new InaccessibleContentError("Can't parse PDF file");
+      throw new Error("Can't parse PDF file");
     }
 
     throw error;
   }
+
+  if (!markdownContent) {
+    throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
+  }
+
+  return markdownContent;
 }
 
 function selectRange(webPageDOM, rangeSelector) {
@@ -120,11 +132,11 @@ function selectRange(webPageDOM, rangeSelector) {
   const endNode = webPageDOM.querySelector(endBefore || endAfter);
 
   if (!startNode) {
-    throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
+    throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
   }
 
   if (!endNode) {
-    throw new InaccessibleContentError(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
+    throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
   }
 
   selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
@@ -135,7 +147,11 @@ function selectRange(webPageDOM, rangeSelector) {
 
 export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
   Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
-    link.href = url.resolve(baseURL, link.href);
+    try {
+      link.href = new URL(link.href, baseURL).href;
+    } catch (error) {
+      // Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
+    }
   });
 }