Skip to content

Commit

Permalink
[Text Extraction] Ignore transparent (or nearly-transparent) elements…
Browse files Browse the repository at this point in the history
… when extracting text

https://bugs.webkit.org/show_bug.cgi?id=270598
rdar://124102506

Reviewed by Megan Gardner and Abrar Rahman Protyasha.

When extracting visible text, ignore subtrees where the renderer is transparent (or nearly
transparent). To do this, we adjust `extractItemData` so that the "nothing extracted" alternative of
its returned variant is an enum (`SkipExtraction`, replacing `std::monostate`) indicating whether we
should skip text extraction for just the current node, or for the entire subtree; we then use this to
skip subtrees where there is either no renderer (e.g. `display: none;`) or the opacity is below
`minOpacityToConsiderVisible` (0.05).

* LayoutTests/fast/text-extraction/basic-text-extraction.html:
* Source/WebCore/page/text-extraction/TextExtraction.cpp:
(WebCore::TextExtraction::extractItemData):
(WebCore::TextExtraction::extractRecursive):
(WebCore::TextExtraction::extractRenderedText):

Canonical link: https://commits.webkit.org/275769@main
  • Loading branch information
whsieh committed Mar 7, 2024
1 parent 97e744c commit e23f17a
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 10 deletions.
5 changes: 5 additions & 0 deletions LayoutTests/fast/text-extraction/basic-text-extraction.html
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
body {
white-space: pre-wrap;
}

.transparent {
opacity: 0;
}
</style>
<script src="../../resources/ui-helper.js"></script>
</head>
Expand All @@ -17,6 +21,7 @@
<li><a href="https://example.com">Link in list item</a></li>
<li><img src="../images/resources/green-256x256.jpg" alt="Green square" /></li>
</ul>
<div class="transparent"><p>This transparent text should not be extracted</p></div>
<div contenteditable="true">This is an editable area: <a href="https://webkit.org">WebKit</a> <a href="https://webkit.org/downloads">downloads</a>.</div>
<script>
addEventListener("load", async () => {
Expand Down
38 changes: 28 additions & 10 deletions Source/WebCore/page/text-extraction/TextExtraction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@
namespace WebCore {
namespace TextExtraction {

static constexpr auto minOpacityToConsiderVisible = 0.05;

using TextNodesAndText = Vector<std::pair<Ref<Text>, String>>;
using TextAndSelectedRange = std::pair<String, std::optional<CharacterRange>>;
using TextAndSelectedRangeMap = HashMap<RefPtr<Text>, TextAndSelectedRange>;
Expand Down Expand Up @@ -242,26 +244,31 @@ static inline String labelText(HTMLElement& element)
return { };
}

static inline std::variant<std::monostate, ItemData, URL, Editable> extractItemData(Node& node, TraversalContext& context)
// Result returned by extractItemData when a node produces no item, URL, or editable
// of its own. It tells extractRecursive how much of the traversal to skip.
enum class SkipExtraction : bool {
// Skip only this node: extractRecursive leaves shouldSkipSubtree unset and does not
// take its early return.
Self,
// Skip this node and everything beneath it: extractRecursive returns early. Returned
// when the node has no renderer or its opacity is below minOpacityToConsiderVisible.
SelfAndSubtree
};

static inline std::variant<SkipExtraction, ItemData, URL, Editable> extractItemData(Node& node, TraversalContext& context)
{
CheckedPtr renderer = node.renderer();
if (!renderer)
return { };
if (!renderer || renderer->style().opacity() < minOpacityToConsiderVisible)
return { SkipExtraction::SelfAndSubtree };

if (renderer->style().visibility() == Visibility::Hidden)
return { };
return { SkipExtraction::Self };

if (RefPtr textNode = dynamicDowncast<Text>(node)) {
if (auto iterator = context.visibleText.find(textNode); iterator != context.visibleText.end()) {
auto& [textContent, selectedRange] = iterator->value;
return { TextItemData { { }, selectedRange, textContent, { } } };
}
return { };
return { SkipExtraction::Self };
}

RefPtr element = dynamicDowncast<Element>(node);
if (!element)
return { };
return { SkipExtraction::Self };

if (element->isLink()) {
if (auto href = element->attributeWithoutSynchronization(HTMLNames::hrefAttr); !href.isEmpty()) {
Expand All @@ -274,7 +281,7 @@ static inline std::variant<std::monostate, ItemData, URL, Editable> extractItemD
// FIXME: This isn't quite right in the case where a richly contenteditable element
// contains more nested editable containers underneath it (for instance, a textarea
// element inside of a Mail compose draft).
return { };
return { SkipExtraction::Self };
}

if (!element->isInUserAgentShadowTree() && element->isRootEditableElement())
Expand Down Expand Up @@ -327,17 +334,26 @@ static inline std::variant<std::monostate, ItemData, URL, Editable> extractItemD
if (renderer->style().hasViewportConstrainedPosition())
return { ItemData { ContainerType::ViewportConstrained } };

return { };
return { SkipExtraction::Self };
}

static inline void extractRecursive(Node& node, Item& parentItem, TraversalContext& context)
{
std::optional<Item> item;
std::optional<Editable> editable;
std::optional<URL> linkURL;
bool shouldSkipSubtree = false;

WTF::switchOn(extractItemData(node, context),
[&](std::monostate) { },
[&](SkipExtraction skipExtraction) {
switch (skipExtraction) {
case SkipExtraction::Self:
return;
case SkipExtraction::SelfAndSubtree:
shouldSkipSubtree = true;
return;
}
},
[&](URL&& result) { linkURL = WTFMove(result); },
[&](Editable&& result) { editable = WTFMove(result); },
[&](ItemData&& result) {
Expand All @@ -346,6 +362,9 @@ static inline void extractRecursive(Node& node, Item& parentItem, TraversalConte
item = { { WTFMove(result), WTFMove(bounds), { } } };
});

if (shouldSkipSubtree)
return;

bool onlyCollectTextAndLinks = linkURL || editable;
if (onlyCollectTextAndLinks) {
if (auto bounds = rootViewBounds(node); context.shouldIncludeNodeWithRect(bounds)) {
Expand Down Expand Up @@ -487,7 +506,6 @@ static void extractRenderedText(Vector<StringsAndBlockOffset>& stringsAndOffsets
if (descendant.style().visibility() == Visibility::Hidden)
continue;

static constexpr auto minOpacityToConsiderVisible = 0.05;
if (descendant.style().opacity() < minOpacityToConsiderVisible)
continue;

Expand Down

0 comments on commit e23f17a

Please sign in to comment.