From 0afb29f8404fbe09949db82e04174a978bc2723e Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Wed, 3 Sep 2025 17:57:20 +0200 Subject: [PATCH 01/17] Improve anchor in title and refItem --- .../assets/css/components/refItem.css | 28 +++++++++++++++++++ .../assets/css/elements/syntax.css | 8 +++--- .../assets/css/elements/titles.css | 19 +++++++++---- themes/opentermsarchive/assets/js/icons.js | 3 +- .../_default/_markup/render-heading.html | 8 +++++- .../layouts/shortcodes/refItem.html | 18 ++++++------ 6 files changed, 65 insertions(+), 19 deletions(-) diff --git a/themes/opentermsarchive/assets/css/components/refItem.css b/themes/opentermsarchive/assets/css/components/refItem.css index 5c84ee57..df6f53cb 100644 --- a/themes/opentermsarchive/assets/css/components/refItem.css +++ b/themes/opentermsarchive/assets/css/components/refItem.css @@ -6,6 +6,14 @@ box-shadow: inset 0 1px var(--colorBlack200); } +.refItem-name { + position: relative; + display: flex; + align-items: center; + gap: 0.5rem; + color:inherit +} + .refItem-name code { font-weight: 600; overflow-y: auto; @@ -27,6 +35,26 @@ font-weight: 400; } +.refItem-anchor-icon { + opacity: 0; + color: var(--colorBlack600); + text-decoration: none; + transition: opacity 0.1s ease; +} + +.refItem-name:hover .refItem-anchor-icon { + opacity: 1; +} + +.refItem-anchor-icon:hover { + color: var(--colorBlack800); +} + +.refItem-anchor-icon { + width: 1em; + height: 1em; +} + .refItem-details { display: flex; } diff --git a/themes/opentermsarchive/assets/css/elements/syntax.css b/themes/opentermsarchive/assets/css/elements/syntax.css index f40c741e..c47fdd4c 100644 --- a/themes/opentermsarchive/assets/css/elements/syntax.css +++ b/themes/opentermsarchive/assets/css/elements/syntax.css @@ -1,5 +1,5 @@ -/* Background */ .bg { background-color: var(--colorBlack200); } -/* PreWrapper */ .chroma { background-color: var(--colorBlack200); font-size: 1.4rem; } +/* Background */ .bg { background-color: var(--colorBlack100); 
border-radius: 0.2em; } +/* PreWrapper */ .chroma { background-color: var(--colorBlack100); font-size: 1.4rem; border-radius: 0.2em;} /* Other */ .chroma .x { } /* Error */ .chroma .err { color: var(--colorError); } /* CodeLine */ .chroma .cl { } @@ -72,11 +72,11 @@ /* CommentPreproc */ .chroma .cp { color: #67707b; font-style: italic } /* CommentPreprocFile */ .chroma .cpf { color: #67707b; font-style: italic } /* Generic */ .chroma .g { } -/* GenericDeleted */ .chroma .gd { } +/* GenericDeleted */ .chroma .gd { background-color: rgb(255, 206, 203); } /* GenericEmph */ .chroma .ge { } /* GenericError */ .chroma .gr { } /* GenericHeading */ .chroma .gh { } -/* GenericInserted */ .chroma .gi { } +/* GenericInserted */ .chroma .gi { background-color: rgb(172, 238, 187); } /* GenericOutput */ .chroma .go { } /* GenericPrompt */ .chroma .gp { } /* GenericStrong */ .chroma .gs { } diff --git a/themes/opentermsarchive/assets/css/elements/titles.css b/themes/opentermsarchive/assets/css/elements/titles.css index 031f60c1..1d9a4c3c 100644 --- a/themes/opentermsarchive/assets/css/elements/titles.css +++ b/themes/opentermsarchive/assets/css/elements/titles.css @@ -120,17 +120,26 @@ h6, line-height: 1.25; } +.title-link { + color: inherit; + text-decoration: none; +} + +.title-link:hover { + color: inherit; +} + .title-anchor { - color: var(--colorBlack400); font-size: 0.8em; font-weight: normal; - display: none; + opacity: 0; + transition: opacity 0.1s ease; } h2, h3, h4, h5, h6 { - &:hover { - a.title-anchor { - display: inline; + .title-link:hover { + .title-anchor { + opacity: 1; } } } diff --git a/themes/opentermsarchive/assets/js/icons.js b/themes/opentermsarchive/assets/js/icons.js index ff135d18..d67b247f 100644 --- a/themes/opentermsarchive/assets/js/icons.js +++ b/themes/opentermsarchive/assets/js/icons.js @@ -1,7 +1,7 @@ import { ChevronDown, X, - + Link, createIcons, } from 'lucide'; @@ -9,6 +9,7 @@ createIcons({ icons: { X, ChevronDown, + Link, }, attrs: { 
'aria-hidden': true }, }); diff --git a/themes/opentermsarchive/layouts/_default/_markup/render-heading.html b/themes/opentermsarchive/layouts/_default/_markup/render-heading.html index 106e65a5..2b7737c1 100644 --- a/themes/opentermsarchive/layouts/_default/_markup/render-heading.html +++ b/themes/opentermsarchive/layouts/_default/_markup/render-heading.html @@ -1,6 +1,12 @@ + {{ if ne .Level 1 }} + + {{ end }} {{ .Text | safeHTML }} {{ if ne .Level 1 }} - 🔗 + + {{ end }} + + diff --git a/themes/opentermsarchive/layouts/shortcodes/refItem.html b/themes/opentermsarchive/layouts/shortcodes/refItem.html index 8bd96902..c0804292 100644 --- a/themes/opentermsarchive/layouts/shortcodes/refItem.html +++ b/themes/opentermsarchive/layouts/shortcodes/refItem.html @@ -7,15 +7,17 @@ {{/* Get description either from attribute or nested content */}} {{ $description := .Get "description" }} {{ $example := .Get "example" }} +{{ $anchorID := $name | lower | replaceRE "[^a-z0-9-]" "-" | replaceRE "-+" "-" | replaceRE "^-" "" | replaceRE "-$" "" }} -
-
- {{ $name }} - {{ $type }} - {{ with $required }} - {{ if eq . true }}required{{ else }}{{ . | markdownify }}{{ end }} - {{ end }} -
+
+ + {{ $name }} + {{ $type }} + {{ with $required }} + {{ if eq . true }}required{{ else }}{{ . | markdownify }}{{ end }} + {{ end }} + +
{{ $description | markdownify | safeHTML }}
{{ if $default }} From a0beaa7543f366b676256e19fd556200a24f72b8 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Wed, 3 Sep 2025 17:57:55 +0200 Subject: [PATCH 02/17] Improve code block style --- themes/opentermsarchive/assets/css/components/textContent.css | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/themes/opentermsarchive/assets/css/components/textContent.css b/themes/opentermsarchive/assets/css/components/textContent.css index 58cc7ab7..05ad1ff2 100644 --- a/themes/opentermsarchive/assets/css/components/textContent.css +++ b/themes/opentermsarchive/assets/css/components/textContent.css @@ -214,8 +214,10 @@ } & code { - background-color: var(--colorBlack200); + background-color: var(--colorBlack100); font-size: 0.9em; + white-space: pre-wrap; + border-radius: 0.2em; } & button { From 6a1fb282c6ec6a75f74c0b689c98012248e1184d Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Wed, 3 Sep 2025 17:58:28 +0200 Subject: [PATCH 03/17] Add reference for built in filters --- content/terms/reference/built-in-filters.md | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 content/terms/reference/built-in-filters.md diff --git a/content/terms/reference/built-in-filters.md b/content/terms/reference/built-in-filters.md new file mode 100644 index 00000000..3660600b --- /dev/null +++ b/content/terms/reference/built-in-filters.md @@ -0,0 +1,31 @@ +--- +title: "Built-in filters" +--- + +# Built-in filters + +This reference documentation details all available built-in filters that can be used to avoid noise in the terms content. + +## Filters + +{{< refItem + name="removeQueryParams" + description="Removes specified query parameters from URLs in links and images within the terms content" +>}} + +```json +"filter": [ + { + "removeQueryParams": ["utm_source", "utm_medium"] + } +] +``` + +Result: + +```diff +-

Read the list of our affiliates.

++

Read the list of our affiliates.

+``` + +{{< /refItem >}} From 65d3606aef265a8adc5492c0aafbf61cabfee14b Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Wed, 3 Sep 2025 17:58:51 +0200 Subject: [PATCH 04/17] Update declaration ref --- content/terms/reference/declaration.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/content/terms/reference/declaration.md b/content/terms/reference/declaration.md index 454d895d..1aa1985f 100644 --- a/content/terms/reference/declaration.md +++ b/content/terms/reference/declaration.md @@ -139,10 +139,18 @@ As an array of those: {{< refItem name="filter" - type="array of strings" - description="Array of filter function names to apply. Function will be executed in the order of the array. See the [Filters]({{< relref \"terms/reference/filters\" >}}) section for more information." - example="[\"filterName1\", \"filterName2\"]" -/>}} + type="array of strings or objects" + description="Array of filter functions to apply. Each item can be either a string (function name) or an object (function name as key, parameters as value). Functions will be executed in the order of the array. See the [Filters]({{< relref \"terms/reference/~~filters\" >}}) section for more information." 
+>}} +```json +"filter": [ + "filterName1", + { + "filterName2": "params" + } +] +``` +{{< /refItem >}} {{< refItem name="combine" From 796a4e7eece19fe14930c6364308ca0ff8479799 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Wed, 3 Sep 2025 17:59:13 +0200 Subject: [PATCH 05/17] Add how to guide to apply filters --- content/terms/how-to/apply-filters.md | 160 ++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 content/terms/how-to/apply-filters.md diff --git a/content/terms/how-to/apply-filters.md b/content/terms/how-to/apply-filters.md new file mode 100644 index 00000000..82f6abd7 --- /dev/null +++ b/content/terms/how-to/apply-filters.md @@ -0,0 +1,160 @@ +--- +title: Apply filters +weight: 7 +--- + +# Apply filters + +This guide explains how to apply filters to existing declarations to remove meaningless content that changes on each page load or that cannot be removed with CSS selectors to avoid noise in the terms changes history. + +## Prerequisites + +- An existing terms declaration file +- Identified the noise you want to remove and ensure it cannot be removed with CSS selectors with the [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) property. + +## Step 1: Check for built-in filters + +[Built-in filters]({{< relref "/terms/reference/built-in-filters" >}}) are pre-defined functions that handle common noise patterns. They're the easiest way to clean up content without writing custom code. + +Review the available built-in filters in the [filters reference]({{< relref "/terms/reference/built-in-filters" >}}) to find one that matches your needs. + +If you find a suitable built-in filter, proceed to [Step 2](#step-2-declare-the-filter), otherwise you will need to create a custom filter. + +### Create a custom filter (optional) + +If no built-in filter matches your needs, you'll need to create a custom filter. This requires JavaScript knowledge and familiarity with DOM manipulation. 
+ +#### Create the filter file + +Create a JavaScript file with the same name as your service declaration but with `.filters.js` extension. For example, if your declaration is `declarations/MyService.json`, create `declarations/MyService.filters.js`. + +#### Write the filter function + +Define your filter function following this signature: + +```js +export function myCustomFilter(document, parameters, documentDeclaration) { + // Your filter logic here +} +``` + +**Parameters:** + +- `document`: JSDOM document instance representing the web page +- `parameters`: Values passed from the declaration (optional) +- `documentDeclaration`: The complete declaration object (optional) + +**Example: Remove session IDs from text content** + +For example, let's say you want to remove session IDs from text content: + +```html +

We collect your data for the following purposes:

+
    +
  • To provide our services
  • +
  • To improve user experience
  • +
+

Last updated on 2023-12-07 (Session: abc123def456)

+``` + +You can implement this filter as follows: + +```js +export function removeSessionIds(document) { + // Find all paragraphs that might contain session IDs + const paragraphs = document.querySelectorAll('p.session-id'); + + paragraphs.forEach(paragraph => { + let text = paragraph.textContent; + // Remove session ID patterns like "Session: abc123" or "(Session: def456)" + text = text.replace(/\s*\(?Session:\s*[a-zA-Z0-9]+\)?/g, ''); + paragraph.textContent = text.trim(); + }); +} +``` + +Result after applying the filter: + +```diff +

We collect your data for the following purposes:

+
    +
  • To provide our services
  • +
  • To improve user experience
  • +
+-

Last updated on 2023-12-07 (Session: abc123def456)

++

Last updated on 2023-12-07

+``` + +## Step 2: Declare the filter + +Open your service declaration file (e.g., `declarations/MyService.json`) and locate the `filter` property of the specific terms you want to apply the filter to. If it doesn't exist, add it as an array. + +### Filter without parameters + +For filters that don't require parameters, add the filter name as a string: + +```json +{ + "name": "MyService", + "terms": { + "Privacy Policy": { + "fetch": "https://my.service.com/en/privacy-policy", + "select": ".textcontent", + "filter": [ + "removeSessionIds" + ] + } + } +} +``` + +### Parameterized filter + +For filters that require parameters, use an object format, for example with the built-in filter `removeQueryParams` to remove query parameters from URLs: + +```json +{ + "name": "MyService", + "terms": { + "Privacy Policy": { + "fetch": "https://my.service.com/en/privacy-policy", + "select": ".textcontent", + "filter": [ + { + "removeQueryParams": ["utm_source", "utm_medium", "utm_campaign"] + } + ] + } + } +} +``` + +### Multiple filters + +You can combine multiple filters in the same declaration: + +```json +{ + "name": "MyService", + "terms": { + "Privacy Policy": { + "fetch": "https://my.service.com/en/privacy-policy", + "select": ".textcontent", + "filter": [ + { + "removeQueryParams": ["utm_source", "utm_medium"] + }, + "removeSessionIds" + ] + } + } +} +``` + +## Step 3: Test the filter + +After adding the filter, test your declaration to ensure it works correctly: + +1. Start the terms tracking process +2. Check that the noise has been removed +3. 
Verify that important content is preserved From 1664c0d7be74a70b7b23a37c5b0c1f610dbabef9 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 4 Sep 2025 10:17:57 +0200 Subject: [PATCH 06/17] Remove obsolete reference --- content/terms/reference/filters.md | 54 ------------------------------ 1 file changed, 54 deletions(-) delete mode 100644 content/terms/reference/filters.md diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md deleted file mode 100644 index 6006c8a0..00000000 --- a/content/terms/reference/filters.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Filters" ---- - -# Filters - -Some documents require more complex filtering beyond basic element selection and removal. For example, web pages often contain dynamically generated content like tracking IDs in URLs that change on each page load. While these elements are part of the page, they are not meaningful to the terms content itself. If such dynamic content is included in the archived versions, it creates a lot of insignificant versions and pollutes the archive with noise that makes it harder to identify actual changes to the terms. - -Filters address this need by providing a way to programmatically clean up and normalize the content before archiving. They are implemented as JavaScript functions that can manipulate the downloaded web page using the [DOM API](https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model), allowing for sophisticated content transformations beyond what's possible with simple CSS selectors. - -Filters take the document DOM and the terms declaration as parameters and are: - -- **in-place**: they modify the document structure and content directly; -- **idempotent**: they return the same document structure and content even if run repeatedly on their own result. - -Filters are loaded automatically from files named after the service they operate on. 
For example, filters for the Meetup service, which is declared in `declarations/Meetup.json`, are loaded from `declarations/Meetup.filters.js`. - -The generic function signature for a filter is: - -```js -export [async] function filterName(document, documentDeclaration) -``` - -Each filter is exposed as a named function export that takes a `document` parameter and behaves like the `document` object in a browser DOM. These functions can be `async`, but they will still run sequentially. The whole document declaration is passed as second parameter. - -> The `document` parameter is actually a [JSDOM](https://github.com/jsdom/jsdom) document instance. - -You can learn more about usual noise and ways to handle it [in the guidelines]({{< relref "/terms/guideline/declaring#usual-noise" >}}). - -### Example - -Let's assume a service adds a unique `clickId` parameter in the query string of all link destinations. These parameters change on each page load, leading to recording noise in versions. Since links should still be recorded, it is not appropriate to use `remove` to remove the links entirely. Instead, a filter will manipulate the links destinations to remove the always-changing parameter. Concretely, the goal is to apply the following filter: - -```diff -- Read the list of our affiliates. -+ Read the list of our affiliates. 
-``` - -The code below implements this filter: - -```js -function removeTrackingIdsQueryParam(document) { - const QUERY_PARAM_TO_REMOVE = 'clickId'; - - document.querySelectorAll('a').forEach(link => { // iterate over every link in the page - const url = new URL(link.getAttribute('href'), document.location); // URL is part of the DOM API, see https://developer.mozilla.org/en-US/docs/Web/API/URL - const params = new URLSearchParams(url.search); // URLSearchParams is part of the DOM API, see https://developer.mozilla.org/en-US/docs/Web/API/URLSearchParams - - params.delete(QUERY_PARAM_TO_REMOVE); // we use the DOM API instead of RegExp because we can't know in advance in which order parameters will be written - url.search = params.toString(); // store the query string without the parameter - link.setAttribute('href', url.toString()); // write the destination URL without the parameter - }); -} -``` From 829ce0f9e968fe7b168fc78e4764e0163fef2f2d Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 4 Sep 2025 11:13:58 +0200 Subject: [PATCH 07/17] Improve filters doc --- content/terms/explanation/filters.md | 39 +++++++++++++ content/terms/reference/declaration.md | 2 +- content/terms/reference/filters.md | 81 ++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 content/terms/explanation/filters.md create mode 100644 content/terms/reference/filters.md diff --git a/content/terms/explanation/filters.md b/content/terms/explanation/filters.md new file mode 100644 index 00000000..5ff59a72 --- /dev/null +++ b/content/terms/explanation/filters.md @@ -0,0 +1,39 @@ +--- +title: "Filters" +weight: 3 +--- + +# Filters + +Filters solve noise issues in terms versions that cannot be addressed with direct selection or removal of content using CSS selectors or range selectors. 
+ +## Why filters are needed + +Web pages often contain dynamically generated content or content that cannot be targeted with CSS selectors that creates noise in the archive: + +- Tracking parameters in URLs, for example `utm_source`, `utm_medium`, … +- Content that are date based and can change between visits, for example "Updated X days ago" can be converted to a "Last updated on YYYY-MM-DD". +- Dynamic elements with changing classes or IDs + +Without filters, this dynamic content creates changes that are not meaningful to the terms. + +## How filters work + +Filters are JavaScript functions that receive a JSDOM document instance and can manipulate the DOM structure directly. They modify the document structure and content in-place and they run sequentially in the order specified in the declaration. + +## Filter design principles + +When designing filters, follow these core principles: + +- **Be specific**: Target only the noise you want to remove. Avoid broad selectors that might accidentally remove important content. +- **Be safe**: Ensure your filter doesn't accidentally remove important content. Always check that the generated version still contains the whole terms content. +- **Be idempotent**: Your filter should produce the same result even if run multiple times on its own output. This ensures consistency and prevents unexpected behavior. +- **Be efficient**: Use efficient DOM queries and avoid unnecessary operations. Process only the elements you need to modify. + +## When to use filters + +Use filters when: + +- **CSS selectors are insufficient**: When noise appears within content that can't be targeted with selectors or [range selectors]({{< relref "terms/explanation/range-selectors" >}}) with the [`select`]({{< relref "terms/reference/declaration/#ref-select" >}}) and [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) properties. 
+- **Meaningful content is dynamic**: When elements change on each page load, for example "Updated X days ago" can be converted to a "Last updated on YYYY-MM-DD". +- **Patterns are complex**: When simple removal isn't possible, for example removing all the tracking parameters in URLs. diff --git a/content/terms/reference/declaration.md b/content/terms/reference/declaration.md index 1aa1985f..e71fe066 100644 --- a/content/terms/reference/declaration.md +++ b/content/terms/reference/declaration.md @@ -140,7 +140,7 @@ As an array of those: {{< refItem name="filter" type="array of strings or objects" - description="Array of filter functions to apply. Each item can be either a string (function name) or an object (function name as key, parameters as value). Functions will be executed in the order of the array. See the [Filters]({{< relref \"terms/reference/~~filters\" >}}) section for more information." + description="Array of filter functions to apply. Each item can be either a string (function name) or an object (function name as key, parameters as value). Functions will be executed in the order of the array. See the [Filters]({{< relref \"terms/reference/filters\" >}}) section for more information." >}} ```json "filter": [ diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md new file mode 100644 index 00000000..cbe93a7b --- /dev/null +++ b/content/terms/reference/filters.md @@ -0,0 +1,81 @@ +--- +title: "Filters" +--- + +# Filters + +Filters are JavaScript functions that take the document DOM as parameter and are: + +- **in-place**: they modify the document structure and content directly; +- **idempotent**: they return the same document structure and content even if run repeatedly on their own result. 
+ +The generic function signature for a filter is: + +```js +export [async] function filterName(document, [parameters]) +``` + +Each filter is exposed as a named function export that takes a `document` parameter and behaves like the `document` object in a browser DOM. +> The `document` parameter is actually a [JSDOM](https://github.com/jsdom/jsdom) document instance. + +Filters can have parameters that are passed as second parameter. + +These functions can be `async`, but they will still run sequentially. + +## Usage + +### Simple filter + +```js +// .filters.js +export function customFilter(document) { + // filter logic here +} +``` + +Can be used as follows in the declaration: + +```json +// .json +{ + "name": "", + "terms": { + "": { + "fetch": "", + "select": "", + "filter": [ + { + "customParameterizedFilter": "params" + } + ] + } + } +} +``` From 05292b190c9ddb3edc4a013e529883741ca7125b Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Thu, 4 Sep 2025 11:16:34 +0200 Subject: [PATCH 08/17] Minor improvement --- content/terms/reference/filters.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md index cbe93a7b..1b6d5769 100644 --- a/content/terms/reference/filters.md +++ b/content/terms/reference/filters.md @@ -41,8 +41,8 @@ Can be used as follows in the declaration: "name": "", "terms": { "": { - "fetch": "", - "select": "", + "fetch": "", + "select": "", "filter": [ { - "customParameterizedFilter": "params" + "customParameterizedFilter": ["param1", "param2"] } ] } From 1e346d8fec73e4977487e01b464f6ce8f0199412 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Mon, 8 Sep 2025 14:24:49 +0200 Subject: [PATCH 09/17] Improve filter docs --- content/terms/how-to/apply-filters.md | 7 +++---- content/terms/reference/filters.md | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/content/terms/how-to/apply-filters.md 
b/content/terms/how-to/apply-filters.md index 82f6abd7..e626c379 100644 --- a/content/terms/how-to/apply-filters.md +++ b/content/terms/how-to/apply-filters.md @@ -14,9 +14,9 @@ This guide explains how to apply filters to existing declarations to remove mean ## Step 1: Check for built-in filters -[Built-in filters]({{< relref "/terms/reference/built-in-filters" >}}) are pre-defined functions that handle common noise patterns. They're the easiest way to clean up content without writing custom code. +Built-in filters are pre-defined functions that handle common noise patterns. They're the easiest way to clean up content without writing custom code. -Review the available built-in filters in the [filters reference]({{< relref "/terms/reference/built-in-filters" >}}) to find one that matches your needs. +Review the available [built-in filters]({{< relref "/terms/reference/built-in-filters" >}}) to find if one matches your needs. If you find a suitable built-in filter, proceed to [Step 2](#step-2-declare-the-filter), otherwise you will need to create a custom filter. 
@@ -33,7 +33,7 @@ Create a JavaScript file with the same name as your service declaration but with Define your filter function following this signature: ```js -export function myCustomFilter(document, parameters, documentDeclaration) { +export function myCustomFilter(document, [parameters]) { // Your filter logic here } ``` @@ -42,7 +42,6 @@ export function myCustomFilter(document, parameters, documentDeclaration) { - `document`: JSDOM document instance representing the web page - `parameters`: Values passed from the declaration (optional) -- `documentDeclaration`: The complete declaration object (optional) **Example: Remove session IDs from text content** diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md index 1b6d5769..15bd3022 100644 --- a/content/terms/reference/filters.md +++ b/content/terms/reference/filters.md @@ -18,8 +18,6 @@ export [async] function filterName(document, [parameters]) Each filter is exposed as a named function export that takes a `document` parameter and behaves like the `document` object in a browser DOM. > The `document` parameter is actually a [JSDOM](https://github.com/jsdom/jsdom) document instance. -Filters can have parameters that are passed as second parameter. - These functions can be `async`, but they will still run sequentially. 
## Usage From be7bb36e1bc2654d421a3fec8b60bf5513f8360e Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Mon, 8 Sep 2025 15:02:20 +0200 Subject: [PATCH 10/17] Improve writing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Biron --- content/terms/explanation/filters.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/terms/explanation/filters.md b/content/terms/explanation/filters.md index 5ff59a72..65e6c345 100644 --- a/content/terms/explanation/filters.md +++ b/content/terms/explanation/filters.md @@ -5,11 +5,11 @@ weight: 3 # Filters -Filters solve noise issues in terms versions that cannot be addressed with direct selection or removal of content using CSS selectors or range selectors. +Filters solve [noise]({{< relref "/terms/guideline/declaring/#usual-noise" >}}) issues in terms versions that cannot be addressed with direct selection or removal of content using CSS selectors or range selectors. ## Why filters are needed -Web pages often contain dynamically generated content or content that cannot be targeted with CSS selectors that creates noise in the archive: +Web pages often contain dynamically generated content or content that cannot be targeted with CSS selectors that creates noise in the recorded version, for example: - Tracking parameters in URLs, for example `utm_source`, `utm_medium`, … - Content that are date based and can change between visits, for example "Updated X days ago" can be converted to a "Last updated on YYYY-MM-DD". 
From d19bad998788881b5efba6484c89f3955e9bf24d Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Mon, 8 Sep 2025 15:52:23 +0200 Subject: [PATCH 11/17] Improve filter docs --- content/terms/reference/declaration.md | 2 +- content/terms/reference/filters.md | 83 ++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/content/terms/reference/declaration.md b/content/terms/reference/declaration.md index e71fe066..3a8b5c5a 100644 --- a/content/terms/reference/declaration.md +++ b/content/terms/reference/declaration.md @@ -146,7 +146,7 @@ As an array of those: "filter": [ "filterName1", { - "filterName2": "params" + "filterName2": "param" } ] ``` diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md index 15bd3022..47f8bd2a 100644 --- a/content/terms/reference/filters.md +++ b/content/terms/reference/filters.md @@ -49,6 +49,42 @@ Can be used as follows in the declaration: } ``` +Example: + +```js +export function convertTimeAgoToDate(document) { + const timeElements = document.querySelectorAll('time'); + + timeElements.forEach(timeElement => { + const dateTimeValue = timeElement.getAttribute('datetime'); + const textNode = document.createTextNode(dateTimeValue); + timeElement.parentNode.replaceChild(textNode, timeElement); + }); +} +``` + +```json +{ + "name": "MyService", + "terms": { + "Privacy Policy": { + "fetch": "https://example.com/privacy", + "select": ".content", + "filter": [ + "convertTimeAgoToDate" + ] + } + } +} +``` + +Result: + +```diff +- ++ +``` + ### Filter with parameters ```js @@ -77,3 +113,50 @@ Can be used as follows in the declaration: } } ``` + +Example: + +```js +export function removeLinksWithText(document, text) { + const links = document.querySelectorAll('a'); + links.forEach(link => { + if (link.textContent.trim() === text) { + link.remove(); + } + }); +} +``` + +```json +{ + "name": "MyService", + "terms": { + "Privacy Policy": { + "fetch": "https://example.com/privacy", + 
"select": ".content", + "filter": [ + { "removeLinksWithText": "Return to previous section" } + { "removeLinksWithText": "Go to next section" } + ] + } + } +} +``` + +Result: + +```diff +
+- Go to next section +

...

+
+ + +``` From f97d959b93feb666b803763093cadcdb77fd43a4 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Mon, 8 Sep 2025 16:30:32 +0200 Subject: [PATCH 12/17] Improve examples --- content/terms/reference/filters.md | 61 +++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md index 47f8bd2a..a04329ae 100644 --- a/content/terms/reference/filters.md +++ b/content/terms/reference/filters.md @@ -49,7 +49,7 @@ Can be used as follows in the declaration: } ``` -Example: +#### Example ```js export function convertTimeAgoToDate(document) { @@ -114,13 +114,15 @@ Can be used as follows in the declaration: } ``` -Example: +#### Example 1 ```js -export function removeLinksWithText(document, text) { +export function removeLinksWithText(document, textArray) { const links = document.querySelectorAll('a'); + const textsToRemove = Array.isArray(textArray) ? textArray : [textArray]; + links.forEach(link => { - if (link.textContent.trim() === text) { + if (textsToRemove.includes(link.textContent.trim())) { link.remove(); } }); @@ -135,8 +137,7 @@ export function removeLinksWithText(document, text) { "fetch": "https://example.com/privacy", "select": ".content", "filter": [ - { "removeLinksWithText": "Return to previous section" } - { "removeLinksWithText": "Go to next section" } + { "removeLinksWithText": ["Return to previous section", "Go to next section"] } ] } } @@ -160,3 +161,51 @@ Result:

...

``` + +#### Example 2 + +```js +import fetch from 'isomorphic-fetch'; + +export async function convertImagesToBase64(document, selector, documentDeclaration) { + const images = Array.from(document.querySelectorAll(`selector`)); + + return Promise.all(images.map(async ({ src }, index) => { + if (src.startsWith('data:')) { + return; // Already a data-URI, skip + } + + const imageUrl = new URL(src, documentDeclaration.fetch).href; // Ensure url is absolute + const response = await fetch(imageUrl); + const mimeType = response.headers.get('content-type'); + const content = await response.arrayBuffer(); + + const base64Content = btoa(String.fromCharCode(...new Uint8Array(content))); + + images[index].src = `data:${mimeType};base64,${base64Content}`; + })); + +} +``` + +```json +{ + "name": "MyService", + "terms": { + "Privacy Policy": { + "fetch": "https://example.com/privacy", + "select": ".content", + "filter": [ + { "convertImagesToBase64": ".meaningful-illustration" } + ] + } + } +} +``` + +Result: + +```diff +- ++ +``` From 75acc97169b4be63c510a5135fc42196b933d0c3 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Mon, 8 Sep 2025 17:14:30 +0200 Subject: [PATCH 13/17] Improve filter doc --- content/terms/explanation/filters.md | 35 ++++++++++++++++------------ content/terms/reference/filters.md | 19 ++++++++++----- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/content/terms/explanation/filters.md b/content/terms/explanation/filters.md index 65e6c345..11184f73 100644 --- a/content/terms/explanation/filters.md +++ b/content/terms/explanation/filters.md @@ -7,15 +7,19 @@ weight: 3 Filters solve [noise]({{< relref "/terms/guideline/declaring/#usual-noise" >}}) issues in terms versions that cannot be addressed with direct selection or removal of content using CSS selectors or range selectors. 
-## Why filters are needed +## When filters are needed -Web pages often contain dynamically generated content or content that cannot be targeted with CSS selectors that creates noise in the recorded version, for example: +Filters are necessary when standard CSS selectors and range selectors cannot adequately address noise in terms versions. They provide a solution for complex content manipulation that goes beyond simple selection and removal. -- Tracking parameters in URLs, for example `utm_source`, `utm_medium`, … -- Content that are date based and can change between visits, for example "Updated X days ago" can be converted to a "Last updated on YYYY-MM-DD". -- Dynamic elements with changing classes or IDs +Use filters when: -Without filters, this dynamic content creates changes that are not meaningful to the terms. +- **CSS selectors are insufficient**: When noise appears within content that can't be targeted with selectors or [range selectors]({{< relref "terms/explanation/range-selectors" >}}) with the [`select`]({{< relref "terms/reference/declaration/#ref-select" >}}) and [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) properties. +- **Content is dynamically generated**: When elements change on each page load, such as: + - Tracking parameters in URLs (e.g., `utm_source`, `utm_medium`) + - Dynamic elements with changing classes or IDs +- **Complex tasks are needed**: When content transformation is needed such as: + - Converting images to base64 to store them in the terms version. + - Converting date-based content to a more stable format (e.g., "Updated X days ago" to "Last updated on YYYY-MM-DD") ## How filters work @@ -25,15 +29,16 @@ Filters are JavaScript functions that receive a JSDOM document instance and can When designing filters, follow these core principles: -- **Be specific**: Target only the noise you want to remove. Avoid broad selectors that might accidentally remove important content. 
-- **Be safe**: Ensure your filter doesn't accidentally remove important content. Always check that the generated version still contains the whole terms content. -- **Be idempotent**: Your filter should produce the same result even if run multiple times on its own output. This ensures consistency and prevents unexpected behavior. -- **Be efficient**: Use efficient DOM queries and avoid unnecessary operations. Process only the elements you need to modify. +- **Be specific**: Target only the noise you want to remove. Avoid broad selectors that might accidentally remove important content. -## When to use filters + > For example, if your filter converts relative dates to absolute dates, use `.metadata time` not `time` which might also affect important effective dates within the terms content. -Use filters when: +- **Be idempotent**: Filters should produce the same result even if run multiple times on their own output. This ensures consistency and prevents unexpected behavior. -- **CSS selectors are insufficient**: When noise appears within content that can't be targeted with selectors or [range selectors]({{< relref "terms/explanation/range-selectors" >}}) with the [`select`]({{< relref "terms/reference/declaration/#ref-select" >}}) and [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) properties. -- **Meaningful content is dynamic**: When elements change on each page load, for example "Updated X days ago" can be converted to a "Last updated on YYYY-MM-DD". -- **Patterns are complex**: When simple removal isn't possible, for example removing all the tracking parameters in URLs. + > For example, if your filter adds section numbers like "1." to headings, check if numbers already exist to prevent "1. Privacy Policy" from becoming "1. 1. Privacy Policy" on repeated runs. + +- **Be efficient**: Use efficient DOM queries and avoid unnecessary operations. Process only the elements you need to modify. 
+ + > For example, if your filter updates timestamp elements with a specific class, use `document.querySelector('.timestamp')` instead of `document.querySelectorAll('*')` followed by filtering for timestamp elements. + +- **Be safe**: Filters should not accidentally remove important content. The generated version should always be checked after adding a filter to ensure it still contains the whole terms content. diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md index a04329ae..5acd570d 100644 --- a/content/terms/reference/filters.md +++ b/content/terms/reference/filters.md @@ -11,12 +11,19 @@ Filters are JavaScript functions that take the document DOM as parameter and are The generic function signature for a filter is: +- For simple filters: + +```js +export [async] function filterName(document, [documentDeclaration]) +``` + +- For filters with parameters: + ```js -export [async] function filterName(document, [parameters]) +export [async] function filterName(document, parameters, [documentDeclaration]) ``` -Each filter is exposed as a named function export that takes a `document` parameter and behaves like the `document` object in a browser DOM. -> The `document` parameter is actually a [JSDOM](https://github.com/jsdom/jsdom) document instance. +Each filter is exposed as a named function export that takes a `document` parameter and behaves like the `document` object in a browser DOM. The `document` parameter is actually a [JSDOM](https://github.com/jsdom/jsdom) document instance. These functions can be `async`, but they will still run sequentially. 
@@ -53,7 +60,7 @@ Can be used as follows in the declaration: ```js export function convertTimeAgoToDate(document) { - const timeElements = document.querySelectorAll('time'); + const timeElements = document.querySelectorAll('.metadata time'); timeElements.forEach(timeElement => { const dateTimeValue = timeElement.getAttribute('datetime'); @@ -81,8 +88,8 @@ export function convertTimeAgoToDate(document) { Result: ```diff -- -+ +- ++ ``` ### Filter with parameters From 1ef4725ea58010c32fe63e47fda91bc63d6c8ae9 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Mon, 8 Sep 2025 17:45:20 +0200 Subject: [PATCH 14/17] Improve doc --- content/terms/explanation/filters.md | 18 +++++++----------- content/terms/reference/filters.md | 2 +- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/content/terms/explanation/filters.md b/content/terms/explanation/filters.md index 11184f73..03f01f27 100644 --- a/content/terms/explanation/filters.md +++ b/content/terms/explanation/filters.md @@ -13,13 +13,9 @@ Filters are necessary when standard CSS selectors and range selectors cannot ade Use filters when: -- **CSS selectors are insufficient**: When noise appears within content that can't be targeted with selectors or [range selectors]({{< relref "terms/explanation/range-selectors" >}}) with the [`select`]({{< relref "terms/reference/declaration/#ref-select" >}}) and [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) properties. -- **Content is dynamically generated**: When elements change on each page load, such as: - - Tracking parameters in URLs (e.g., `utm_source`, `utm_medium`) - - Dynamic elements with changing classes or IDs -- **Complex tasks are needed**: When content transformation is needed such as: - - Converting images to base64 to store them in the terms version. 
- - Converting date-based content to a more stable format (e.g., "Updated X days ago" to "Last updated on YYYY-MM-DD") +- **CSS selectors are insufficient**, for example when noise appears within content that can't be targeted with selectors or [range selectors]({{< relref "terms/explanation/range-selectors" >}}) with the [`select`]({{< relref "terms/reference/declaration/#ref-select" >}}) and [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) properties. +- **Content is dynamically generated**, for example when elements change on each page load with tracking parameters in URLs (like `utm_source`, `utm_medium`) or dynamic elements with changing classes or IDs. +- **Complex tasks are needed**, for example when content transformation is required such as converting images to base64 to store them in the terms version or converting date-based content to a more stable format (like "Updated X days ago" to "Last updated on YYYY-MM-DD"). ## How filters work @@ -29,16 +25,16 @@ Filters are JavaScript functions that receive a JSDOM document instance and can When designing filters, follow these core principles: -- **Be specific**: Target only the noise you want to remove. Avoid broad selectors that might accidentally remove important content. +- **Be specific**: target only the noise you want to remove. Avoid broad selectors that might accidentally remove important content. > For example, if your filter converts relative dates to absolute dates, use `.metadata time` not `time` which might also affect important effective dates within the terms content. -- **Be idempotent**: Filters should produce the same result even if run multiple times on their own output. This ensures consistency and prevents unexpected behavior. +- **Be idempotent**: filters should produce the same result even if run multiple times on their own output. This ensures consistency and prevents unexpected behavior. > For example, if your filter adds section numbers like "1." 
to headings, check if numbers already exist to prevent "1. Privacy Policy" from becoming "1. 1. Privacy Policy" on repeated runs. -- **Be efficient**: Use efficient DOM queries and avoid unnecessary operations. Process only the elements you need to modify. +- **Be efficient**: use efficient DOM queries and avoid unnecessary operations. Process only the elements you need to modify. > For example, if your filter updates timestamp elements with a specific class, use `document.querySelector('.timestamp')` instead of `document.querySelectorAll('*')` followed by filtering for timestamp elements. -- **Be safe**: Filters should not accidentally remove important content. The generated version should always be checked after adding a filter to ensure it still contains the whole terms content. +- **Be safe**: filters should not accidentally remove important content. The generated version should always be checked after adding a filter to ensure it still contains the whole terms content. diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md index 5acd570d..68faa4f1 100644 --- a/content/terms/reference/filters.md +++ b/content/terms/reference/filters.md @@ -175,7 +175,7 @@ Result: import fetch from 'isomorphic-fetch'; export async function convertImagesToBase64(document, selector, documentDeclaration) { - const images = Array.from(document.querySelectorAll(`selector`)); + const images = Array.from(document.querySelectorAll(selector)); return Promise.all(images.map(async ({ src }, index) => { if (src.startsWith('data:')) { From 4bd20bca8b350f94696d71a7a3c4c51ee0eb1dd7 Mon Sep 17 00:00:00 2001 From: Nicolas Dupont Date: Mon, 8 Sep 2025 17:48:39 +0200 Subject: [PATCH 15/17] Fix title --- content/terms/how-to/apply-filters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/terms/how-to/apply-filters.md b/content/terms/how-to/apply-filters.md index e626c379..4a7304dc 100644 --- a/content/terms/how-to/apply-filters.md +++ 
b/content/terms/how-to/apply-filters.md @@ -3,7 +3,7 @@ title: Apply filters weight: 7 --- -# Apply filters +# How to apply filters This guide explains how to apply filters to existing declarations to remove meaningless content that changes on each page load or that cannot be removed with CSS selectors to avoid noise in the terms changes history. From f825bf88f60aa39e70be663dc6bca3e30e5b0ff8 Mon Sep 17 00:00:00 2001 From: Matti Schneider Date: Wed, 10 Sep 2025 15:51:43 +0200 Subject: [PATCH 16/17] Improve filters documentation --- content/terms/explanation/filters.md | 30 ++++++------- content/terms/how-to/apply-filters.md | 50 +++++++++++---------- content/terms/reference/built-in-filters.md | 8 +--- content/terms/reference/filters.md | 19 +++++--- 4 files changed, 54 insertions(+), 53 deletions(-) diff --git a/content/terms/explanation/filters.md b/content/terms/explanation/filters.md index 03f01f27..5e574e9e 100644 --- a/content/terms/explanation/filters.md +++ b/content/terms/explanation/filters.md @@ -5,36 +5,34 @@ weight: 3 # Filters -Filters solve [noise]({{< relref "/terms/guideline/declaring/#usual-noise" >}}) issues in terms versions that cannot be addressed with direct selection or removal of content using CSS selectors or range selectors. +Filters enable solving [noise]({{< relref "/terms/guideline/declaring/#usual-noise" >}}) issues in versions that cannot be addressed with direct selection or removal of content using selectors. ## When filters are needed -Filters are necessary when standard CSS selectors and range selectors cannot adequately address noise in terms versions. They provide a solution for complex content manipulation that goes beyond simple selection and removal. 
- Use filters when: -- **CSS selectors are insufficient**, for example when noise appears within content that can't be targeted with selectors or [range selectors]({{< relref "terms/explanation/range-selectors" >}}) with the [`select`]({{< relref "terms/reference/declaration/#ref-select" >}}) and [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) properties. -- **Content is dynamically generated**, for example when elements change on each page load with tracking parameters in URLs (like `utm_source`, `utm_medium`) or dynamic elements with changing classes or IDs. -- **Complex tasks are needed**, for example when content transformation is required such as converting images to base64 to store them in the terms version or converting date-based content to a more stable format (like "Updated X days ago" to "Last updated on YYYY-MM-DD"). +- **Content selectors are insufficient**, for example when noise appears within content that can't be targeted with CSS selectors or [range selectors]({{< relref "terms/explanation/range-selectors" >}}) with the [`select`]({{< relref "terms/reference/declaration/#ref-select" >}}) and [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) properties. +- **Content is dynamically generated**, for example when elements change on each page load with changing classes or IDs that cannot be targeted with [attribute selectors](https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors). +- **Complex tasks are needed**, for example when content transformation is required such as converting images to base64 to store them in the terms version or converting date-based content to a stable format (like “Updated X days ago” to “Last updated on YYYY-MM-DD”). ## How filters work -Filters are JavaScript functions that receive a JSDOM document instance and can manipulate the DOM structure directly. 
They modify the document structure and content in-place and they run sequentially in the order specified in the declaration. +Filters are JavaScript functions that can manipulate the DOM structure directly. They modify the document structure and content in-place. ## Filter design principles -When designing filters, follow these core principles: +Filters should follow these core principles: + +- **Specific**: target only the noise to remove. Avoid broad selectors that might accidentally remove important content. -- **Be specific**: target only the noise you want to remove. Avoid broad selectors that might accidentally remove important content. + > For example, if a filter converts relative dates to absolute dates, make sure to scope the targeted dates. This might translate to selecting with `.metadata time`, not `time`, which might also affect important effective dates within the terms content. - > For example, if your filter converts relative dates to absolute dates, use `.metadata time` not `time` which might also affect important effective dates within the terms content. +- **Idempotent**: filters should produce the same result even if run multiple times on their own output. This ensures consistency. -- **Be idempotent**: filters should produce the same result even if run multiple times on their own output. This ensures consistency and prevents unexpected behavior. + > For example, if a filter adds section numbers like "1." to headings, it should check if the numbers already exist, to prevent "1. Privacy Policy" from becoming "1. 1. Privacy Policy" on repeated runs. - > For example, if your filter adds section numbers like "1." to headings, check if numbers already exist to prevent "1. Privacy Policy" from becoming "1. 1. Privacy Policy" on repeated runs. +- **Efficient**: DOM queries should be optimised and filters should avoid unnecessary operations, processing only the elements needed. 
-- **Be efficient**: use efficient DOM queries and avoid unnecessary operations. Process only the elements you need to modify. - - > For example, if your filter updates timestamp elements with a specific class, use `document.querySelector('.timestamp')` instead of `document.querySelectorAll('*')` followed by filtering for timestamp elements. + > For example, if a filter updates timestamp elements with a specific class, using `document.querySelectorAll('.timestamp')` is more efficient than `document.querySelectorAll('*')` followed by filtering for timestamp elements. -- **Be safe**: filters should not accidentally remove important content. The generated version should always be checked after adding a filter to ensure it still contains the whole terms content. +- **Safe**: filters must not accidentally remove important content. The generated version should always be checked after adding a filter to ensure it still contains the whole terms content. diff --git a/content/terms/how-to/apply-filters.md b/content/terms/how-to/apply-filters.md index 4a7304dc..b250e8a6 100644 --- a/content/terms/how-to/apply-filters.md +++ b/content/terms/how-to/apply-filters.md @@ -5,32 +5,34 @@ weight: 7 # How to apply filters -This guide explains how to apply filters to existing declarations to remove meaningless content that changes on each page load or that cannot be removed with CSS selectors to avoid noise in the terms changes history. +This guide explains how to add filters to existing declarations to remove meaningless content that cannot be removed with CSS selectors, to prevent noise in the versions. ## Prerequisites -- An existing terms declaration file -- Identified the noise you want to remove and ensure it cannot be removed with CSS selectors with the [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) property. +- An existing terms declaration file. 
+- Having already identified the noise to remove and having double-checked it cannot be removed with CSS selectors with the [`remove`]({{< relref "terms/reference/declaration/#ref-remove" >}}) property. ## Step 1: Check for built-in filters -Built-in filters are pre-defined functions that handle common noise patterns. They're the easiest way to clean up content without writing custom code. +Built-in filters are pre-defined functions that handle common noise patterns. They are the easiest way to clean up content. Review the available [built-in filters]({{< relref "/terms/reference/built-in-filters" >}}) to find if one matches your needs. -If you find a suitable built-in filter, proceed to [Step 2](#step-2-declare-the-filter), otherwise you will need to create a custom filter. +If you find a suitable built-in filter, proceed to [Step 3](#step-3-declare-the-filter), otherwise you will need to create a custom filter. -### Create a custom filter (optional) +## Step 2: Create a custom filter _(optional)_ -If no built-in filter matches your needs, you'll need to create a custom filter. This requires JavaScript knowledge and familiarity with DOM manipulation. +If no built-in filter matches your needs, you will need to create a custom filter. This requires JavaScript knowledge and familiarity with DOM manipulation. -#### Create the filter file +### Create the filter file -Create a JavaScript file with the same name as your service declaration but with `.filters.js` extension. For example, if your declaration is `declarations/MyService.json`, create `declarations/MyService.filters.js`. +Create a JavaScript file in the same folder and with the same name as your service declaration, but with the `.filters.js` extension. -#### Write the filter function +> For example, if your declaration is `declarations/MyService.json`, create `declarations/MyService.filters.js`.
-Define your filter function following this signature: +### Write the filter function + +Define your filter function with the following signature: ```js export function myCustomFilter(document, [parameters]) { @@ -38,12 +40,12 @@ export function myCustomFilter(document, [parameters]) { } ``` -**Parameters:** +#### Parameters - `document`: JSDOM document instance representing the web page -- `parameters`: Values passed from the declaration (optional) +- `parameters`: values passed from the declaration _(optional)_ -**Example: Remove session IDs from text content** +#### Example: Remove session IDs from text content For example, let's say you want to remove session IDs from text content: @@ -61,7 +63,7 @@ You can implement this filter as follows: ```js export function removeSessionIds(document) { // Find all paragraphs that might contain session IDs - const paragraphs = document.querySelectorAll('p.session-id'); + const paragraphs = document.querySelectorAll('.session-id'); paragraphs.forEach(paragraph => { let text = paragraph.textContent; @@ -84,20 +86,20 @@ Result after applying the filter: +

Last updated on 2023-12-07

``` -## Step 2: Declare the filter +## Step 3: Declare the filter -Open your service declaration file (e.g., `declarations/MyService.json`) and locate the `filter` property of the specific terms you want to apply the filter to. If it doesn't exist, add it as an array. +Open your service declaration file (e.g. `declarations/MyService.json`) and locate the `filter` property of the specific terms you want to apply the filter to. If it doesn't exist, add it as an array. ### Filter without parameters -For filters that don't require parameters, add the filter name as a string: +For filters that don’t require parameters, add the filter name as a string: ```json { "name": "MyService", "terms": { "Privacy Policy": { - "fetch": "https://my.service.com/en/privacy-policy", + "fetch": "https://my.service.example/en/privacy-policy", "select": ".textcontent", "filter": [ "removeSessionIds" @@ -107,16 +109,16 @@ For filters that don't require parameters, add the filter name as a string: } ``` -### Parameterized filter +### Filter with parameters -For filters that require parameters, use an object format, for example with the built-in filter `removeQueryParams` to remove query parameters from URLs: +For filters that take parameters, use an object format, for example with the built-in filter `removeQueryParams` to remove query parameters from URLs: ```json { "name": "MyService", "terms": { "Privacy Policy": { - "fetch": "https://my.service.com/en/privacy-policy", + "fetch": "https://my.service.example/en/privacy-policy", "select": ".textcontent", "filter": [ { @@ -137,7 +139,7 @@ You can combine multiple filters in the same declaration: "name": "MyService", "terms": { "Privacy Policy": { - "fetch": "https://my.service.com/en/privacy-policy", + "fetch": "https://my.service.example/en/privacy-policy", "select": ".textcontent", "filter": [ { @@ -150,7 +152,7 @@ You can combine multiple filters in the same declaration: } ``` -## Step 3: Test the filter +## Step 4: Test the filter After 
adding the filter, test your declaration to ensure it works correctly: diff --git a/content/terms/reference/built-in-filters.md b/content/terms/reference/built-in-filters.md index 3660600b..c3075330 100644 --- a/content/terms/reference/built-in-filters.md +++ b/content/terms/reference/built-in-filters.md @@ -4,13 +4,11 @@ title: "Built-in filters" # Built-in filters -This reference documentation details all available built-in filters that can be used to avoid noise in the terms content. - -## Filters +This reference details all available built-in [filters]({{< relref "terms/explanation/filters" >}}) that can be applied to avoid noise in versions. {{< refItem name="removeQueryParams" - description="Removes specified query parameters from URLs in links and images within the terms content" + description="Removes specified query parameters from URLs in links and images." >}} ```json @@ -21,8 +19,6 @@ This reference documentation details all available built-in filters that can be ] ``` -Result: - ```diff -

Read the list of our affiliates.

+

Read the list of our affiliates.

diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md index 68faa4f1..e5a5deb7 100644 --- a/content/terms/reference/filters.md +++ b/content/terms/reference/filters.md @@ -8,16 +8,21 @@ Filters are JavaScript functions that take the document DOM as parameter and are - **in-place**: they modify the document structure and content directly; - **idempotent**: they return the same document structure and content even if run repeatedly on their own result. +- **ordered**: they are run sequentially in the order specified in the declaration. + +Learn more about the concept and constraints on the [filters explanation]({{< relref "terms/explanation/filters" >}}). + +## Signature The generic function signature for a filter is: -- For simple filters: +- For filters that take no parameter: ```js export [async] function filterName(document, [documentDeclaration]) ``` -- For filters with parameters: +- For filters that take parameters: ```js export [async] function filterName(document, parameters, [documentDeclaration]) @@ -29,7 +34,7 @@ These functions can be `async`, but they will still run sequentially. 
## Usage -### Simple filter +### Filters that take no parameter ```js // .filters.js @@ -75,7 +80,7 @@ export function convertTimeAgoToDate(document) { "name": "MyService", "terms": { "Privacy Policy": { - "fetch": "https://example.com/privacy", + "fetch": "https://my.service.example/privacy", "select": ".content", "filter": [ "convertTimeAgoToDate" @@ -141,7 +146,7 @@ export function removeLinksWithText(document, textArray) { "name": "MyService", "terms": { "Privacy Policy": { - "fetch": "https://example.com/privacy", + "fetch": "https://my.service.example/privacy", "select": ".content", "filter": [ { "removeLinksWithText": ["Return to previous section", "Go to next section"] } @@ -200,7 +205,7 @@ export async function convertImagesToBase64(document, selector, documentDeclarat "name": "MyService", "terms": { "Privacy Policy": { - "fetch": "https://example.com/privacy", + "fetch": "https://my.service.example/privacy", "select": ".content", "filter": [ { "convertImagesToBase64": ".meaningful-illustration" } @@ -213,6 +218,6 @@ export async function convertImagesToBase64(document, selector, documentDeclarat Result: ```diff -- +- + ``` From f5afdf75a07ac3625828e25a01a30bd17c371918 Mon Sep 17 00:00:00 2001 From: Matti Schneider Date: Wed, 10 Sep 2025 15:51:52 +0200 Subject: [PATCH 17/17] Document how to add third-party libs in filters --- content/terms/reference/filters.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/content/terms/reference/filters.md b/content/terms/reference/filters.md index e5a5deb7..6031c2c6 100644 --- a/content/terms/reference/filters.md +++ b/content/terms/reference/filters.md @@ -221,3 +221,7 @@ Result: - + ``` + +## Third-party libraries + +As the last example shows, filters can import third-party libraries. To be available, these libraries must be declared in the `package.json` of the collection.