From 0413162cf97e07ce879939f75310e138bf7bf250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ella=20van=C2=A0Durpe?= Date: Mon, 25 Nov 2019 17:26:08 +0100 Subject: [PATCH] Paste: Remove HTML Formatting Space (#17470) * Paste: Remove HTML Formatting Space * Simplify * Add unit tests * Ignore pre * Address feedback * Add extra test * Add extra plain text test --- .../raw-handling/html-formatting-remover.js | 92 +++++++++++++++ .../src/api/raw-handling/paste-handler.js | 2 + .../test/html-formatting-remover.js | 110 ++++++++++++++++++ test/integration/fixtures/apple-out.html | 22 +--- test/integration/fixtures/classic-out.html | 2 +- test/integration/fixtures/evernote-out.html | 12 +- test/integration/fixtures/markdown-out.html | 3 +- test/integration/fixtures/ms-word-out.html | 26 +---- .../fixtures/ms-word-styled-out.html | 12 +- 9 files changed, 215 insertions(+), 66 deletions(-) create mode 100644 packages/blocks/src/api/raw-handling/html-formatting-remover.js create mode 100644 packages/blocks/src/api/raw-handling/test/html-formatting-remover.js diff --git a/packages/blocks/src/api/raw-handling/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/html-formatting-remover.js new file mode 100644 index 0000000000000..615feb80508a9 --- /dev/null +++ b/packages/blocks/src/api/raw-handling/html-formatting-remover.js @@ -0,0 +1,92 @@ +/** + * Internal dependencies + */ +import { isPhrasingContent } from './phrasing-content'; + +function getSibling( node, which ) { + const sibling = node[ `${ which }Sibling` ]; + + if ( sibling && isPhrasingContent( sibling ) ) { + return sibling; + } + + const { parentNode } = node; + + if ( ! parentNode || ! isPhrasingContent( parentNode ) ) { + return; + } + + return getSibling( parentNode, which ); +} + +function isFormattingSpace( character ) { + return ( + character === ' ' || + character === '\r' || + character === '\n' || + character === '\t' + ); +} + +/** + * Removes spacing that formats HTML. + * + * @see https://www.w3.org/TR/css-text-3/#white-space-processing + * + * @param {Node} node The node to be processed. + * @return {void} + */ +export default function( node ) { + if ( node.nodeType !== node.TEXT_NODE ) { + return; + } + + // Ignore pre content. + if ( node.parentElement.closest( 'pre' ) ) { + return; + } + + // First, replace any sequence of HTML formatting space with a single space. + let newData = node.data.replace( /[ \r\n\t]+/g, ' ' ); + + // Remove the leading space if the text element is at the start of a block, + // is preceded by a line break element, or has a space in the previous + // node. + if ( newData[ 0 ] === ' ' ) { + const previousSibling = getSibling( node, 'previous' ); + + if ( + ! previousSibling || + previousSibling.nodeName === 'BR' || + previousSibling.textContent.slice( -1 ) === ' ' + ) { + newData = newData.slice( 1 ); + } + } + + // Remove the trailing space if the text element is at the end of a block, + // is succeded by a line break element, or has a space in the next text + // node. + if ( newData[ newData.length - 1 ] === ' ' ) { + const nextSibling = getSibling( node, 'next' ); + + if ( + ! nextSibling || + nextSibling.nodeName === 'BR' || + ( + nextSibling.nodeType === nextSibling.TEXT_NODE && + isFormattingSpace( nextSibling.textContent[ 0 ] ) + ) + ) { + newData = newData.slice( 0, -1 ); + } + } + + // If there's no data left, remove the node, so `previousSibling` stays + // accurate. Otherwise, update the node data. + if ( ! newData ) { + node.parentNode.removeChild( node ); + } else { + node.data = newData; + } +} diff --git a/packages/blocks/src/api/raw-handling/paste-handler.js b/packages/blocks/src/api/raw-handling/paste-handler.js index fdea17bd0eba2..4468eb87c51db 100644 --- a/packages/blocks/src/api/raw-handling/paste-handler.js +++ b/packages/blocks/src/api/raw-handling/paste-handler.js @@ -24,6 +24,7 @@ import shortcodeConverter from './shortcode-converter'; import markdownConverter from './markdown-converter'; import iframeRemover from './iframe-remover'; import googleDocsUIDRemover from './google-docs-uid-remover'; +import htmlFormattingRemover from './html-formatting-remover'; import { getPhrasingContentSchema } from './phrasing-content'; import { deepFilterHTML, @@ -224,6 +225,7 @@ export function pasteHandler( { HTML = '', plainText = '', mode = 'AUTO', tagNam piece = deepFilterHTML( piece, filters, blockContentSchema ); piece = removeInvalidHTML( piece, schema ); + piece = deepFilterHTML( piece, [ htmlFormattingRemover ], blockContentSchema ); piece = normaliseBlocks( piece ); // Allows us to ask for this information when we get a report. diff --git a/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js new file mode 100644 index 0000000000000..d5efd50f4db85 --- /dev/null +++ b/packages/blocks/src/api/raw-handling/test/html-formatting-remover.js @@ -0,0 +1,110 @@ +/** + * Internal dependencies + */ +import filter from '../html-formatting-remover'; +import { deepFilterHTML } from '../utils'; + +describe( 'HTMLFormattingRemover', () => { + it( 'should trim text node without parent', () => { + const input = 'a'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( input ); + } ); + + it( 'should remove formatting space', () => { + const input = ` +
+ a + b +
+ `; + const output = '
a b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove nested formatting space', () => { + const input = ` +
+ + a + b + +
+ `; + const output = '
a b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should not remove leading or trailing space if previous or next element has no space', () => { + const input = ` +
+ a + b + c +
+ `; + const output = '
a b c
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove formatting space (empty)', () => { + const input = ` +
+
+ `; + const output = '
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove block level formatting space', () => { + const input = ` +
+
+ a +
+
+ b +
+
+ `; + const output = '
a
b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove formatting space around br', () => { + const input = ` +
+ a +
+ b +
+ `; + const output = '
a
b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should remove formatting space around phasing content elements', () => { + const input = ` +
+ + a + + + b + +
+ `; + const output = '
a b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); + + it( 'should ignore pre', () => { + const input = `
 a\n b\n
`; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( input ); + } ); + + it( 'should not remove white space if next elemnt has none', () => { + const input = `
a b
`; + const output = '
a b
'; + expect( deepFilterHTML( input, [ filter ] ) ).toEqual( output ); + } ); +} ); diff --git a/test/integration/fixtures/apple-out.html b/test/integration/fixtures/apple-out.html index 40d4f1a8d133e..f040d38235291 100644 --- a/test/integration/fixtures/apple-out.html +++ b/test/integration/fixtures/apple-out.html @@ -19,27 +19,9 @@ -
-One - -Two - -Three -
-1 - -2 - -3 -
-I - -II - -III -
+
OneTwoThree
123
IIIIII
-

An image:

+

An image:

diff --git a/test/integration/fixtures/classic-out.html b/test/integration/fixtures/classic-out.html index dbc94528db7b2..1298053376dfa 100644 --- a/test/integration/fixtures/classic-out.html +++ b/test/integration/fixtures/classic-out.html @@ -15,7 +15,7 @@ -

Fourth paragraph

+

Fourth paragraph

diff --git a/test/integration/fixtures/evernote-out.html b/test/integration/fixtures/evernote-out.html index c984df312f0be..32d2deb764897 100644 --- a/test/integration/fixtures/evernote-out.html +++ b/test/integration/fixtures/evernote-out.html @@ -1,7 +1,5 @@ -

This is a paragraph. -
This is a link. -

+

This is a paragraph.
This is a link.

@@ -17,13 +15,7 @@ -
One -Two -Three -
Four -Five -Six -
+
OneTwoThree
FourFiveSix
diff --git a/test/integration/fixtures/markdown-out.html b/test/integration/fixtures/markdown-out.html index 85a105b640923..dc064b3c12f00 100644 --- a/test/integration/fixtures/markdown-out.html +++ b/test/integration/fixtures/markdown-out.html @@ -7,8 +7,7 @@

This is a heading with italic

-

Preserve
-line breaks please.

+

Preserve
line breaks please.

diff --git a/test/integration/fixtures/ms-word-out.html b/test/integration/fixtures/ms-word-out.html index 47f7cab83ae73..79871a73e6a3f 100644 --- a/test/integration/fixtures/ms-word-out.html +++ b/test/integration/fixtures/ms-word-out.html @@ -1,11 +1,9 @@ -

This is a -title

+

This is a title

-

This is a -subtitle

+

This is a subtitle

@@ -29,25 +27,7 @@

This is a heading level 2

-
- One - - Two - - Three -
- 1 - - 2 - - 3 -
- I - - II - - III -
+
OneTwoThree
123
IIIIII
diff --git a/test/integration/fixtures/ms-word-styled-out.html b/test/integration/fixtures/ms-word-styled-out.html index 2eef7365f9668..277d8d37bbc63 100644 --- a/test/integration/fixtures/ms-word-styled-out.html +++ b/test/integration/fixtures/ms-word-styled-out.html @@ -1,15 +1,7 @@ -

-Lorem -ipsum dolor sit amet, consectetur adipiscing elit  -

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit 

-

-Lorem -ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque -aliquet hendrerit auctor. Nam lobortis, est vel lacinia tincidunt, -purus tellus vehicula ex, nec pharetra justo dui sed lorem. Nam -congue laoreet massa, quis varius est tincidunt ut.

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Pellentesque aliquet hendrerit auctor. Nam lobortis, est vel lacinia tincidunt, purus tellus vehicula ex, nec pharetra justo dui sed lorem. Nam congue laoreet massa, quis varius est tincidunt ut.