Improve text-matching algorithm of amp-viewer-integration/findtext.js. (ampproject#23853)

This PR improves the algorithm that matches the sentences to highlight in featured snippets. With this change, we highlight sentences over smaller ranges than before: when an earlier sentence occurs multiple times on the page, we now pick the occurrence closest to the following sentence's match instead of the earliest one.
This change affects roughly 5% of pages shown in featured snippets.

Details of the diffs produced by this change:
https://docs.google.com/document/d/1M_QO1zdqa8IH9-fxF6b6eWseEXZe2mb-M1MYgQIB_Cg/edit?usp=sharing
yunabe authored and joshuarrrr committed Oct 22, 2019
1 parent 9b7531e commit 155d6a6
Showing 2 changed files with 282 additions and 59 deletions.
228 changes: 170 additions & 58 deletions extensions/amp-viewer-integration/0.1/findtext.js
@@ -15,10 +15,12 @@
*/

import {computedStyle} from '../../../src/style';
import {devAssert} from '../../../src/log';

/**
* Simple implementation of CircularBuffer.
* Exported for test only.
* @template T
*/
export class CircularBuffer {
/**
@@ -32,25 +34,32 @@ export class CircularBuffer {
}

/**
* Adds item to buffer.
*
* @param {*} item
* Add one element to the end.
* @param {T} item
*/
add(item) {
push(item) {
this.buff_[this.next_] = item;
this.next_ = (this.next_ + 1) % this.max_;
}

/**
* @param {number} index The index of an element to get.
* @return {!TextPosDef}
* @return {T}
*/
get(index) {
if (this.buff_.length >= this.max_) {
index = (this.next_ + index) % this.max_;
}
return this.buff_[index];
}

/**
* The current buffer size.
* @return {number}
*/
size() {
return this.buff_.length;
}
}
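
A minimal usage sketch of the new CircularBuffer, assuming the constructor (collapsed in this hunk) takes the maximum capacity, as in the `new CircularBuffer(...)` calls further down in this diff:

```js
const buf = new CircularBuffer(3);
buf.push('a');
buf.push('b');
buf.push('c');
buf.push('d'); // overwrites 'a' once the capacity of 3 is exceeded
buf.size();    // 3
buf.get(0);    // 'b' -- get(0) is the oldest retained element
buf.get(2);    // 'd' -- get(size() - 1) is the most recently pushed element
```

Matcher below relies on exactly this: `buf.get(bufSize - 1 - j)` walks backward from the most recently pushed character.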

/**
@@ -74,6 +83,18 @@ export function textPosChar(pos) {
*/
let TextRangeDef;

/**
* TextPosAndIdxDef represents a pair of TextPosDef and index.
* @typedef {{pos: !TextPosDef, idx: number}}
*/
let TextPosAndIdxDef;

/**
* TextRangeWithIdxDef represents a text range with TextPosAndIdxDef.
* @typedef {{start: !TextPosAndIdxDef, end: !TextPosAndIdxDef}}
*/
let TextRangeWithIdxDef;
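
For orientation, a small sketch of the record shapes these typedefs describe (the node and values below are made up):

```js
const someTextNode = document.createTextNode('some example text');
// TextPosDef: a position inside a Text node.
const pos = {node: someTextNode, offset: 3};
// TextPosAndIdxDef: the same position paired with its index in the
// character stream scanned by TextScanner (posIdx in findSentences below).
const posAndIdx = {pos, idx: 42};
// TextRangeWithIdxDef: a range between two such position/index pairs.
const range = {
  start: posAndIdx,
  end: {pos: {node: someTextNode, offset: 8}, idx: 47},
};
```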

const skipCharRe = /[,.\s\u2022()]/;

/**
@@ -110,6 +131,83 @@ export function canonicalizeString(s) {
return buf.join('');
}

/**
* Canonicalizes strings in sentences and removes empty strings.
* @param {!Array<string>} sentences
* @return {!Array<string>}
*/
function canonicalizeSentences(sentences) {
const ret = [];
for (let i = 0; i < sentences.length; i++) {
const sen = canonicalizeString(sentences[i]);
if (sen) {
ret.push(sen);
}
}
return ret;
}
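
A hedged example of what canonicalizeSentences returns, assuming canonicalizeString lowercases characters and drops those matching skipCharRe (commas, periods, whitespace, bullets, and parentheses):

```js
canonicalizeSentences(['Hello, world.', ' ,. ', 'Foo (bar)']);
// => ['helloworld', 'foobar']
// The second entry canonicalizes to an empty string and is dropped.
```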

/**
* Matcher finds occurrences of a sentence using the bad-character rule of
* the Boyer-Moore algorithm.
*/
class Matcher {
/**
* @param {string} sen
* @param {!CircularBuffer<!TextPosAndIdxDef>} buf
*/
constructor(sen, buf) {
this.sen_ = sen;
this.buf_ = buf;
/** @type {!Array<!TextRangeWithIdxDef>} */
this.matches = [];
// Precomputed table for the bad character rule.
this.skipTable_ = {};
this.skip_ = sen.length - 1;
for (let i = 0; i < sen.length; i++) {
const c = sen[i];
this.skipTable_[c] = sen.length - 1 - i;
}
}

/**
* Peeks at the current char and updates internal matches.
*/
update() {
if (this.skip_ > 0) {
this.skip_--;
return;
}
const {sen_: sen, buf_: buf} = this;
const bufSize = buf.size();
for (let j = 0; j < sen.length; j++) {
const c = canonicalizeChar(textPosChar(buf.get(bufSize - 1 - j).pos));
if (sen[sen.length - 1 - j] == c) {
continue;
}
let skip = this.skipTable_[c];
if (skip == null) {
skip = sen.length;
}
skip -= j;
if (skip < 1) {
skip = 1;
}
this.skip_ = skip - 1;
return;
}
const endPosIdx = buf.get(bufSize - 1);
const endPos = endPosIdx.pos;
this.matches.push({
start: buf.get(bufSize - sen.length),
end: {
pos: {node: endPos.node, offset: endPos.offset + 1},
idx: endPosIdx.idx + 1,
},
});
}
}
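
The table built in the Matcher constructor is the standard Boyer-Moore bad-character precomputation. A standalone sketch, using a hypothetical canonicalized sentence:

```js
const sen = 'abcab';
const skipTable = {};
for (let i = 0; i < sen.length; i++) {
  skipTable[sen[i]] = sen.length - 1 - i;
}
// skipTable is {a: 1, b: 0, c: 2}: for each character, the distance from its
// last occurrence to the end of the sentence. On a mismatch at offset j from
// the end, update() shifts the window by max(1, skipTable[c] - j) when c
// occurs in sen, and by max(1, sen.length - j) otherwise.
```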

/**
* findSentences finds sentences in node and returns a list of TextRangeDef.
* @param {!Window} win
@@ -118,65 +216,79 @@ export function canonicalizeString(s) {
* @return {?Array<!TextRangeDef>}
*/
export function findSentences(win, node, sentences) {
sentences = canonicalizeSentences(sentences);
if (sentences.length <= 0) {
return null;
}
const scanner = new TextScanner(win, node);
const matches = [];
for (let senIdx = 0; senIdx < sentences.length; senIdx++) {
const sen = canonicalizeString(sentences[senIdx]);
if (!sen) {
// Creates a circular buffer whose capacity is the maximum sentence length.
// Don't pass Math.max to reduce directly because reduce passes idx and src
// to the callback.
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/reduce
/** @type {!CircularBuffer<!TextPosAndIdxDef>} */
const buf = new CircularBuffer(
sentences.map(sen => sen.length).reduce((x, y) => Math.max(x, y))
);

// Start with a matcher for the first sentence only; a matcher for the
// next sentence is added each time the previous one finds a match.
/** @type {!Array<!Matcher>} */
const matchers = [new Matcher(sentences[0], buf)];
let posIdx = -1;
while (true) {
posIdx++;
const pos = scanner.next();
if (pos == null) {
// Reached the end of the doc: mismatch.
return null;
}
if (skipCharRe.test(textPosChar(pos))) {
continue;
}
// BM-algorithm with bad-character rules.
const skipTable = {};
for (let i = 0; i < sen.length; i++) {
const c = sen[i];
skipTable[c] = sen.length - 1 - i;
buf.push({pos, idx: posIdx});
for (let i = 0; i < matchers.length; i++) {
matchers[i].update();
}
const buf = new CircularBuffer(sen.length);
let index = -1;
let nextIndex = sen.length - 1;
while (true) {
const pos = scanner.next();
if (pos == null) {
// mismatch
return null;
}
if (skipCharRe.test(textPosChar(pos))) {
continue;
}
buf.add(pos);
index++;
if (index < nextIndex) {
continue;
}
let ok = true;
for (let j = 0; j < sen.length; j++) {
const c = canonicalizeChar(textPosChar(buf.get(sen.length - j - 1)));
if (sen[sen.length - 1 - j] == c) {
continue;
}
ok = false;
let skip = skipTable[c];
if (skip == null) {
skip = sen.length;
}
skip -= j;
if (skip < 1) {
skip = 1;
}
nextIndex += skip;
break;
}
if (ok) {
const endPos = buf.get(sen.length - 1);
matches.push({
start: buf.get(0),
end: {node: endPos.node, offset: endPos.offset + 1},
});
break;
const lastMatcher = matchers[matchers.length - 1];
if (lastMatcher.matches.length == 0) {
// Continues to find a match for the last matcher.
continue;
}
if (matchers.length == sentences.length) {
// Found matches for all sentences.
break;
}
// Found a match for the last matcher. Starts to find the next sentence.
matchers.push(new Matcher(sentences[matchers.length], buf));
}
const matches = /** @type {!Array<!TextRangeWithIdxDef>} */ ([]);
// Walk the matches backward from the last sentence, picking for each
// sentence the latest match that ends at or before the start of the next
// chosen match, so the overall highlighted range is as small as possible.
outerLoop: for (let i = sentences.length - 1; i >= 0; i--) {
const mm = matchers[i].matches;
if (matches.length == 0) {
matches.push(mm[mm.length - 1]);
continue;
}
const prev = matches[matches.length - 1];
for (let j = mm.length - 1; j >= 0; j--) {
const match = mm[j];
if (prev.start.idx >= match.end.idx) {
matches.push(match);
continue outerLoop;
}
}
// This must not happen.
devAssert(false, 'missing valid match');
}
return matches.length > 0 ? matches : null;
const ret = /** @type {!Array<!TextRangeDef>} */ ([]);
for (let i = matches.length - 1; i >= 0; i--) {
const match = matches[i];
ret.push({
start: match.start.pos,
end: match.end.pos,
});
}
return ret;
}
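
A hypothetical usage sketch of the exported entry point (the root element and sentence text are made up):

```js
const ranges = findSentences(
  window,
  document.querySelector('article'), // hypothetical root node
  ['first sentence to highlight', 'second sentence to highlight']
);
// Returns null when any sentence cannot be found. Otherwise each entry is a
// TextRangeDef, {start: {node, offset}, end: {node, offset}}, one per
// non-empty canonicalized input sentence, in document order, ready for the
// caller to highlight.
if (ranges) {
  ranges.forEach(({start, end}) => console.log(start, end));
}
```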

/**
@@ -418,7 +530,7 @@ export class TextScanner {
*/
nextTextPos_() {
const text = this.node_.wholeText;
while (this.textIdx_ < text.length) {
if (this.textIdx_ < text.length) {
const idx = this.textIdx_;
this.textIdx_++;
return {node: /**@type {!Text}*/ (this.node_), offset: idx};
