From 826369ce524a7af68743f07d36496dbb7c840afa Mon Sep 17 00:00:00 2001 From: hbhalodia Date: Fri, 21 Nov 2025 10:39:15 +0530 Subject: [PATCH 1/5] Use HTML parsing for sanitization instead of regex --- src/js/_enqueues/wp/sanitize.js | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/js/_enqueues/wp/sanitize.js b/src/js/_enqueues/wp/sanitize.js index 4252d0a014f7a..85a10c259589c 100644 --- a/src/js/_enqueues/wp/sanitize.js +++ b/src/js/_enqueues/wp/sanitize.js @@ -25,17 +25,9 @@ stripTags: function( text ) { let _text = text || ''; - // Do the search-replace until there is nothing to be replaced. - do { - // Keep pre-replace text for comparison. - text = _text; - - // Do the replacement. - _text = text - .replace( /<!--[\s\S]*?(-->|$)/g, '' ) - .replace( /<(script|style)[^>]*>[\s\S]*?(<\/\1>|$)/ig, '' ) - .replace( /<\/?[a-z][\s\S]*?(>|$)/ig, '' ); - } while ( _text !== text ); + const htmlElement = document.createElement( 'div' ); + htmlElement.innerHTML = _text; + _text = htmlElement.textContent || htmlElement.innerText || ''; // Return the text with stripped tags. 
return _text; From 0612d03c4773f5708553cdc5ecb9b3a8f20d7947 Mon Sep 17 00:00:00 2001 From: hbhalodia Date: Wed, 26 Nov 2025 12:01:34 +0530 Subject: [PATCH 2/5] Update the stripTags to use DOMParser --- src/js/_enqueues/wp/sanitize.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/js/_enqueues/wp/sanitize.js b/src/js/_enqueues/wp/sanitize.js index 85a10c259589c..6bd17085e1a83 100644 --- a/src/js/_enqueues/wp/sanitize.js +++ b/src/js/_enqueues/wp/sanitize.js @@ -25,12 +25,12 @@ stripTags: function( text ) { let _text = text || ''; - const htmlElement = document.createElement( 'div' ); - htmlElement.innerHTML = _text; - _text = htmlElement.textContent || htmlElement.innerText || ''; + const domParser = new DOMParser(); + const htmlDocument = domParser.parseFromString( _text, 'text/html' ); + htmlDocument.body.innerText = htmlDocument.body.innerText || ''; // Return the text with stripped tags. - return _text; + return htmlDocument.body.innerHTML; }, /** From 84c2bd930e9a657d2e4237b6942a563724e7c567 Mon Sep 17 00:00:00 2001 From: hbhalodia Date: Wed, 26 Nov 2025 12:03:19 +0530 Subject: [PATCH 3/5] Add proper comments for the change --- src/js/_enqueues/wp/sanitize.js | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/js/_enqueues/wp/sanitize.js b/src/js/_enqueues/wp/sanitize.js index 6bd17085e1a83..52df494722532 100644 --- a/src/js/_enqueues/wp/sanitize.js +++ b/src/js/_enqueues/wp/sanitize.js @@ -26,7 +26,20 @@ let _text = text || ''; const domParser = new DOMParser(); - const htmlDocument = domParser.parseFromString( _text, 'text/html' ); + const htmlDocument = domParser.parseFromString( + _text, + 'text/html' + ); + + /* + * This looks funny and appears to be a no-op, but it + * enforces the escaping. How? when _read_ the `innerText` + * property decodes character references, returning a raw + * string. 
When _written_, however, it re-encodes to ensure + * that the rendered text replicates what it’s given. + * + * See: https://github.com/WordPress/wordpress-develop/pull/10536#discussion_r2550615378 + */ htmlDocument.body.innerText = htmlDocument.body.innerText || ''; // Return the text with stripped tags. From 32c72ceeeb1d0c881d5b935564ce017202d04797 Mon Sep 17 00:00:00 2001 From: hbhalodia Date: Wed, 26 Nov 2025 13:32:23 +0530 Subject: [PATCH 4/5] Address feedbacks --- src/js/_enqueues/wp/sanitize.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/js/_enqueues/wp/sanitize.js b/src/js/_enqueues/wp/sanitize.js index 52df494722532..e76a1d7ec9870 100644 --- a/src/js/_enqueues/wp/sanitize.js +++ b/src/js/_enqueues/wp/sanitize.js @@ -23,11 +23,10 @@ * @return {string} Stripped text. */ stripTags: function( text ) { - let _text = text || ''; const domParser = new DOMParser(); const htmlDocument = domParser.parseFromString( - _text, + text, 'text/html' ); @@ -40,7 +39,7 @@ * * See: https://github.com/WordPress/wordpress-develop/pull/10536#discussion_r2550615378 */ - htmlDocument.body.innerText = htmlDocument.body.innerText || ''; + htmlDocument.body.innerText = htmlDocument.body.innerText; // Return the text with stripped tags. return htmlDocument.body.innerHTML; From 36c9856f2fd9728c26644bb581edd22364655180 Mon Sep 17 00:00:00 2001 From: hbhalodia Date: Thu, 27 Nov 2025 11:39:41 +0530 Subject: [PATCH 5/5] Addressed feedbacks --- src/js/_enqueues/wp/sanitize.js | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/js/_enqueues/wp/sanitize.js b/src/js/_enqueues/wp/sanitize.js index e76a1d7ec9870..4fec26ab30683 100644 --- a/src/js/_enqueues/wp/sanitize.js +++ b/src/js/_enqueues/wp/sanitize.js @@ -23,7 +23,6 @@ * @return {string} Stripped text. 
*/ stripTags: function( text ) { - const domParser = new DOMParser(); const htmlDocument = domParser.parseFromString( text, @@ -31,13 +30,13 @@ ); /* - * This looks funny and appears to be a no-op, but it - * enforces the escaping. How? when _read_ the `innerText` - * property decodes character references, returning a raw - * string. When _written_, however, it re-encodes to ensure - * that the rendered text replicates what it’s given. + * The following self-assignment appears to be a no-op, but it isn't. + * It enforces the escaping. Reading the `innerText` property decodes + * character references, returning a raw string. When written, however, + * the text is re-escaped to ensure that the rendered text replicates + * what it's given. * - * See: https://github.com/WordPress/wordpress-develop/pull/10536#discussion_r2550615378 + * See <https://github.com/WordPress/wordpress-develop/pull/10536#discussion_r2550615378>. */ htmlDocument.body.innerText = htmlDocument.body.innerText;