Skip to content

Commit

Permalink
Consider extraction errors as expected operational errors without cra…
Browse files Browse the repository at this point in the history
…shing the engine (#1069)
  • Loading branch information
Ndpnt committed May 8, 2024
2 parents 0b59189 + 244a789 commit 572511c
Show file tree
Hide file tree
Showing 18 changed files with 569 additions and 434 deletions.
20 changes: 20 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,26 @@

All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased [minor]

> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.
### Changed

- Consider extraction errors as expected operational errors instead of crashing the engine

### Added

- Add label `empty content` to reported issues on GitHub when server returns empty content or when PDF content is unextractable
- Add label `invalid selector` to reported issues on GitHub when CSS selector is invalid

### Fixed

- Do not crash when source documents links point to invalid URLs
- Take into account the `--types` CLI option
- Fix counts of terms when they are specified via the CLI option `--types`
- Fix display issues for service ID and terms type associated with errors

## 1.1.3 - 2024-05-02

_Full changeset and discussions: [#1068](https://github.com/OpenTermsArchive/engine/pull/1068)._
Expand Down
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ There are five ways to handle operational errors:
- **Log the error — and do nothing else**. If it's a minor error and there’s nothing you can do about, and there is no reason to stop the whole process.
- **Crash immediately**. If the error cannot be handled and can affect data integrity.

In our case, we consider all `fetch`-related errors as expected, so as operational errors and we handle them by logging but we do not stop the whole process. We handle errors related to the `notifier` in the same way.
In our case, we consider all `fetch`-related and `extract`-related errors as expected, so as operational errors and we handle them by logging but we do not stop the whole process. We handle errors related to the `notifier` in the same way.
In contrast, we consider errors from the `recorder` module as fatal, and we crash immediately.

#### Handling programmer errors
Expand Down
6 changes: 6 additions & 0 deletions src/archivist/extract/errors.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
export class ExtractDocumentError extends Error {
constructor(message) {
super(`Extract failed: ${message}`);
this.name = 'ExtractDocumentError';
}
}
48 changes: 32 additions & 16 deletions src/archivist/extract/index.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import url from 'url';

import ciceroMark from '@accordproject/markdown-cicero';
import mardownPdf from '@accordproject/markdown-pdf';
import TurndownService from '@opentermsarchive/turndown';
import turndownPluginGithubFlavouredMarkdown from 'joplin-turndown-plugin-gfm';
import jsdom from 'jsdom';
import mime from 'mime';

import { InaccessibleContentError } from '../errors.js';
import { ExtractDocumentError } from './errors.js';

export { ExtractDocumentError } from './errors.js';

const { JSDOM } = jsdom;
const turndownService = new TurndownService();
Expand All @@ -29,11 +29,15 @@ const ciceroMarkTransformer = new CiceroMarkTransformer();
* @returns {Promise<string>} Promise which is fulfilled once the content is extracted and converted in Markdown. The promise will resolve into a string containing the extracted content in Markdown format
*/
export default async function extract(sourceDocument) {
if (sourceDocument.mimeType == mime.getType('pdf')) {
return extractFromPDF(sourceDocument);
}
try {
if (sourceDocument.mimeType == mime.getType('pdf')) {
return await extractFromPDF(sourceDocument);
}

return extractFromHTML(sourceDocument);
return await extractFromHTML(sourceDocument);
} catch (error) {
throw new ExtractDocumentError(error.message);
}
}

export async function extractFromHTML(sourceDocument) {
Expand Down Expand Up @@ -63,7 +67,7 @@ export async function extractFromHTML(sourceDocument) {
});
/* eslint-enable no-await-in-loop */
} catch (error) {
throw new InaccessibleContentError(`The filter function "${filterFunction.name}" failed: ${error}`);
throw new Error(`The filter function "${filterFunction.name}" failed: ${error}`);
}
}

Expand All @@ -72,7 +76,7 @@ export async function extractFromHTML(sourceDocument) {
const domFragment = select(webPageDOM, contentSelectors);

if (!domFragment.children.length) {
throw new InaccessibleContentError(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
throw new Error(`The provided selector "${contentSelectors}" has no match in the web page at '${location}'`);
}

convertRelativeURLsToAbsolute(domFragment, location);
Expand All @@ -92,24 +96,32 @@ export async function extractFromHTML(sourceDocument) {
const markdownContent = transform(domFragment);

if (!markdownContent) {
throw new InaccessibleContentError(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
throw new Error(`The provided selector "${contentSelectors}" matches an empty content in the web page at '${location}'`);
}

return markdownContent;
}

export async function extractFromPDF({ content: pdfBuffer }) {
export async function extractFromPDF({ location, content: pdfBuffer }) {
let markdownContent;

try {
const ciceroMarkdown = await PdfTransformer.toCiceroMark(pdfBuffer);

return ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
markdownContent = ciceroMarkTransformer.toMarkdown(ciceroMarkdown);
} catch (error) {
if (error.parserError) {
throw new InaccessibleContentError("Can't parse PDF file");
throw new Error("Can't parse PDF file");
}

throw error;
}

if (!markdownContent) {
throw new Error(`The PDF file at '${location}' contains no text, it might contain scanned images of text instead of actual text`);
}

return markdownContent;
}

function selectRange(webPageDOM, rangeSelector) {
Expand All @@ -120,11 +132,11 @@ function selectRange(webPageDOM, rangeSelector) {
const endNode = webPageDOM.querySelector(endBefore || endAfter);

if (!startNode) {
throw new InaccessibleContentError(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
throw new Error(`The "start" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
}

if (!endNode) {
throw new InaccessibleContentError(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
throw new Error(`The "end" selector has no match in document in: ${JSON.stringify(rangeSelector)}`);
}

selection[startBefore ? 'setStartBefore' : 'setStartAfter'](startNode);
Expand All @@ -135,7 +147,11 @@ function selectRange(webPageDOM, rangeSelector) {

export function convertRelativeURLsToAbsolute(webPageDOM, baseURL) {
Array.from(webPageDOM.querySelectorAll(LINKS_TO_CONVERT_SELECTOR)).forEach(link => {
link.href = url.resolve(baseURL, link.href);
try {
link.href = new URL(link.href, baseURL).href;
} catch (error) {
// Leave the URL as is if it's invalid in the source document and can't be converted to an absolute URL
}
});
}

Expand Down

0 comments on commit 572511c

Please sign in to comment.