Improve text-matching algorithm of amp-viewer-integration/findtext.js. (ampproject#23853)

This PR improves the algorithm that matches the sentences to highlight in featured snippets. With this change, we highlight sentences over smaller ranges than before: when an earlier sentence occurs multiple times on the page, we now pick the occurrence closest to the following sentence's match instead of the earliest one.
This change affects roughly 5% of pages shown in featured snippets.

Details of the diffs produced by this change:
https://docs.google.com/document/d/1M_QO1zdqa8IH9-fxF6b6eWseEXZe2mb-M1MYgQIB_Cg/edit?usp=sharing
yunabe authored and joshuarrrr committed Oct 22, 2019
1 parent 9b7531e commit 155d6a6
Showing 2 changed files with 282 additions and 59 deletions.
228 changes: 170 additions & 58 deletions extensions/amp-viewer-integration/0.1/findtext.js
@@ -15,10 +15,12 @@
*/

import {computedStyle} from '../../../src/style';
import {devAssert} from '../../../src/log';

/**
* Simple implementation of CircularBuffer.
* Exported for test only.
* @template T
*/
export class CircularBuffer {
/**
@@ -32,25 +34,32 @@ export class CircularBuffer {
}

/**
* Adds item to buffer.
*
* @param {*} item
* Add one element to the end.
* @param {T} item
*/
add(item) {
push(item) {
this.buff_[this.next_] = item;
this.next_ = (this.next_ + 1) % this.max_;
}

/**
* @param {number} index The index of an element to get.
* @return {!TextPosDef}
* @return {T}
*/
get(index) {
if (this.buff_.length >= this.max_) {
index = (this.next_ + index) % this.max_;
}
return this.buff_[index];
}

/**
* The current buffer size.
* @return {number}
*/
size() {
return this.buff_.length;
}
}
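
A minimal usage sketch of the new CircularBuffer, assuming the constructor (collapsed in this hunk) takes the maximum capacity, as in the `new CircularBuffer(...)` calls further down in this diff:

```js
const buf = new CircularBuffer(3);
buf.push('a');
buf.push('b');
buf.push('c');
buf.push('d'); // overwrites 'a' once the capacity of 3 is exceeded
buf.size();    // 3
buf.get(0);    // 'b' -- get(0) is the oldest retained element
buf.get(2);    // 'd' -- get(size() - 1) is the most recently pushed element
```

Matcher below relies on exactly this: `buf.get(bufSize - 1 - j)` walks backward from the most recently pushed character.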

/**
@@ -74,6 +83,18 @@ export function textPosChar(pos) {
*/
let TextRangeDef;

/**
* TextPosAndIdxDef represents a pair of TextPosDef and index.
* @typedef {{pos: !TextPosDef, idx: number}}
*/
let TextPosAndIdxDef;

/**
* TextRangeWithIdxDef represents a text range with TextPosAndIdxDef.
* @typedef {{start: !TextPosAndIdxDef, end: !TextPosAndIdxDef}}
*/
let TextRangeWithIdxDef;
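
For orientation, a small sketch of the record shapes these typedefs describe (the node and values below are made up):

```js
const someTextNode = document.createTextNode('some example text');
// TextPosDef: a position inside a Text node.
const pos = {node: someTextNode, offset: 3};
// TextPosAndIdxDef: the same position paired with its index in the
// character stream scanned by TextScanner (posIdx in findSentences below).
const posAndIdx = {pos, idx: 42};
// TextRangeWithIdxDef: a range between two such position/index pairs.
const range = {
  start: posAndIdx,
  end: {pos: {node: someTextNode, offset: 8}, idx: 47},
};
```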

const skipCharRe = /[,.\s\u2022()]/;

/**
@@ -110,6 +131,83 @@ export function canonicalizeString(s) {
return buf.join('');
}

/**
* Canonicalizes strings in sentences and removes empty strings.
* @param {!Array<string>} sentences
* @return {!Array<string>}
*/
function canonicalizeSentences(sentences) {
const ret = [];
for (let i = 0; i < sentences.length; i++) {
const sen = canonicalizeString(sentences[i]);
if (sen) {
ret.push(sen);
}
}
return ret;
}
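
A hedged example of what canonicalizeSentences returns, assuming canonicalizeString lowercases characters and drops those matching skipCharRe (commas, periods, whitespace, bullets, and parentheses):

```js
canonicalizeSentences(['Hello, world.', ' ,. ', 'Foo (bar)']);
// => ['helloworld', 'foobar']
// The second entry canonicalizes to an empty string and is dropped.
```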

/**
* Matcher finds occurrences of a sentence using the bad-character rule of
* the Boyer-Moore algorithm.
*/
class Matcher {
/**
* @param {string} sen
* @param {!CircularBuffer<!TextPosAndIdxDef>} buf
*/
constructor(sen, buf) {
this.sen_ = sen;
this.buf_ = buf;
/** @type {!Array<!TextRangeWithIdxDef>} */
this.matches = [];
// Precomputed table for the bad character rule.
this.skipTable_ = {};
this.skip_ = sen.length - 1;
for (let i = 0; i < sen.length; i++) {
const c = sen[i];
this.skipTable_[c] = sen.length - 1 - i;
}
}

/**
* Peeks at the current char and updates internal matches.
*/
update() {
if (this.skip_ > 0) {
this.skip_--;
return;
}
const {sen_: sen, buf_: buf} = this;
const bufSize = buf.size();
for (let j = 0; j < sen.length; j++) {
const c = canonicalizeChar(textPosChar(buf.get(bufSize - 1 - j).pos));
if (sen[sen.length - 1 - j] == c) {
continue;
}
let skip = this.skipTable_[c];
if (skip == null) {
skip = sen.length;
}
skip -= j;
if (skip < 1) {
skip = 1;
}
this.skip_ = skip - 1;
return;
}
const endPosIdx = buf.get(bufSize - 1);
const endPos = endPosIdx.pos;
this.matches.push({
start: buf.get(bufSize - sen.length),
end: {
pos: {node: endPos.node, offset: endPos.offset + 1},
idx: endPosIdx.idx + 1,
},
});
}
}
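
The table built in the Matcher constructor is the standard Boyer-Moore bad-character precomputation. A standalone sketch, using a hypothetical canonicalized sentence:

```js
const sen = 'abcab';
const skipTable = {};
for (let i = 0; i < sen.length; i++) {
  skipTable[sen[i]] = sen.length - 1 - i;
}
// skipTable is {a: 1, b: 0, c: 2}: for each character, the distance from its
// last occurrence to the end of the sentence. On a mismatch at offset j from
// the end, update() shifts the window by max(1, skipTable[c] - j) when c
// occurs in sen, and by max(1, sen.length - j) otherwise.
```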

/**
* findSentences finds sentences in node and returns a list of TextRangeDef.
* @param {!Window} win
@@ -118,65 +216,79 @@ export function canonicalizeString(s) {
* @return {?Array<!TextRangeDef>}
*/
export function findSentences(win, node, sentences) {
sentences = canonicalizeSentences(sentences);
if (sentences.length <= 0) {
return null;
}
const scanner = new TextScanner(win, node);
const matches = [];
for (let senIdx = 0; senIdx < sentences.length; senIdx++) {
const sen = canonicalizeString(sentences[senIdx]);
if (!sen) {
// Creates a circular buffer whose capacity is the maximum sentence length.
// Don't pass Math.max to reduce directly because reduce passes idx and src
// to the callback.
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/reduce
/** @type {!CircularBuffer<!TextPosAndIdxDef>} */
const buf = new CircularBuffer(
sentences.map(sen => sen.length).reduce((x, y) => Math.max(x, y))
);

// Start with a matcher for the first sentence only; a matcher for the
// next sentence is added each time the previous one finds a match.
/** @type {!Array<!Matcher>} */
const matchers = [new Matcher(sentences[0], buf)];
let posIdx = -1;
while (true) {
posIdx++;
const pos = scanner.next();
if (pos == null) {
// Reached the end of the doc: mismatch.
return null;
}
if (skipCharRe.test(textPosChar(pos))) {
continue;
}
// BM-algorithm with bad-character rules.
const skipTable = {};
for (let i = 0; i < sen.length; i++) {
const c = sen[i];
skipTable[c] = sen.length - 1 - i;
buf.push({pos, idx: posIdx});
for (let i = 0; i < matchers.length; i++) {
matchers[i].update();
}
const buf = new CircularBuffer(sen.length);
let index = -1;
let nextIndex = sen.length - 1;
while (true) {
const pos = scanner.next();
if (pos == null) {
// mismatch
return null;
}
if (skipCharRe.test(textPosChar(pos))) {
continue;
}
buf.add(pos);
index++;
if (index < nextIndex) {
continue;
}
let ok = true;
for (let j = 0; j < sen.length; j++) {
const c = canonicalizeChar(textPosChar(buf.get(sen.length - j - 1)));
if (sen[sen.length - 1 - j] == c) {
continue;
}
ok = false;
let skip = skipTable[c];
if (skip == null) {
skip = sen.length;
}
skip -= j;
if (skip < 1) {
skip = 1;
}
nextIndex += skip;
break;
}
if (ok) {
const endPos = buf.get(sen.length - 1);
matches.push({
start: buf.get(0),
end: {node: endPos.node, offset: endPos.offset + 1},
});
break;
const lastMatcher = matchers[matchers.length - 1];
if (lastMatcher.matches.length == 0) {
// Continues to find a match for the last matcher.
continue;
}
if (matchers.length == sentences.length) {
// Found matches for all sentences.
break;
}
// Found a match for the last matcher. Starts to find the next sentence.
matchers.push(new Matcher(sentences[matchers.length], buf));
}
const matches = /** @type {!Array<!TextRangeWithIdxDef>} */ ([]);
// Walk the matches backward from the last sentence, picking for each
// sentence the latest match that ends at or before the start of the next
// chosen match, so the overall highlighted range is as small as possible.
outerLoop: for (let i = sentences.length - 1; i >= 0; i--) {
const mm = matchers[i].matches;
if (matches.length == 0) {
matches.push(mm[mm.length - 1]);
continue;
}
const prev = matches[matches.length - 1];
for (let j = mm.length - 1; j >= 0; j--) {
const match = mm[j];
if (prev.start.idx >= match.end.idx) {
matches.push(match);
continue outerLoop;
}
}
// This must not happen.
devAssert(false, 'missing valid match');
}
return matches.length > 0 ? matches : null;
const ret = /** @type {!Array<!TextRangeDef>} */ ([]);
for (let i = matches.length - 1; i >= 0; i--) {
const match = matches[i];
ret.push({
start: match.start.pos,
end: match.end.pos,
});
}
return ret;
}
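
A hypothetical usage sketch of the exported entry point (the root element and sentence text are made up):

```js
const ranges = findSentences(
  window,
  document.querySelector('article'), // hypothetical root node
  ['first sentence to highlight', 'second sentence to highlight']
);
// Returns null when any sentence cannot be found. Otherwise each entry is a
// TextRangeDef, {start: {node, offset}, end: {node, offset}}, one per
// non-empty canonicalized input sentence, in document order, ready for the
// caller to highlight.
if (ranges) {
  ranges.forEach(({start, end}) => console.log(start, end));
}
```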

/**
@@ -418,7 +530,7 @@ export class TextScanner {
*/
nextTextPos_() {
const text = this.node_.wholeText;
while (this.textIdx_ < text.length) {
if (this.textIdx_ < text.length) {
const idx = this.textIdx_;
this.textIdx_++;
return {node: /**@type {!Text}*/ (this.node_), offset: idx};
