RunDevelopment · RunDevelopment · Aug 24, 2020 · Aug 9, 2020 · Aug 24, 2020 · Aug 24, 2020
diff --git a/README.md b/README.md
@@ -111,6 +111,7 @@ After:
 | :--- | :--: | :--: | :--- |
 | [confusing-quantifier](https://github.com/RunDevelopment/eslint-plugin-clean-regex/blob/master/docs/rules/confusing-quantifier.md) |  | :warning: | Warn about confusing quantifiers. |
 | [consistent-match-all-characters](https://github.com/RunDevelopment/eslint-plugin-clean-regex/blob/master/docs/rules/consistent-match-all-characters.md) | :wrench: | :warning: | Use one character class consistently whenever all characters have to be matched. |
+| [disjoint-alternatives](https://github.com/RunDevelopment/eslint-plugin-clean-regex/blob/master/docs/rules/disjoint-alternatives.md) |  | :warning: | Disallow different alternatives that can match the same words. |
 | [identity-escape](https://github.com/RunDevelopment/eslint-plugin-clean-regex/blob/master/docs/rules/identity-escape.md) | :wrench: | :warning: | How to handle identity escapes. |
 | [no-constant-capturing-group](https://github.com/RunDevelopment/eslint-plugin-clean-regex/blob/master/docs/rules/no-constant-capturing-group.md) |  | :warning: | Disallow capturing groups that can match only one word. |
 | [no-empty-alternative](https://github.com/RunDevelopment/eslint-plugin-clean-regex/blob/master/docs/rules/no-empty-alternative.md) |  | :warning: | Disallow backreferences that will always be replaced with the empty string. |

diff --git a/docs/rules/disjoint-alternatives.md b/docs/rules/disjoint-alternatives.md
@@ -0,0 +1,75 @@
+# `disjoint-alternatives`
+
+> Disallow different alternatives that can match the same words.
+
+Fixable: `no` <br> Recommended configuration: `"warn"`
+
+<!-- prettier-ignore -->
+[Source file](https://github.com/RunDevelopment/eslint-plugin-clean-regex/blob/master/lib/rules/disjoint-alternatives.js) <br> [Test file](https://github.com/RunDevelopment/eslint-plugin-clean-regex/blob/master/tests/lib/rules/disjoint-alternatives.js)
+
+## Description
+
+This rule will point out alternatives that share at least one (non-empty) word.
+
+Non-disjoint alternatives usually indicate some kind of problem and are usually
+harder to reason about. Examples of non-disjoint alternatives include:
+
+-   **Duplicate alternatives**
+
+    Example: `/foo|bar|foo/`
+
+    Any duplicates can be removed without changing the meaning of the pattern.
+
+-   **Subset alternatives**
+
+    Example: `/\w+|Foo/`
+
+    Any alternative that matches a subset of a previous alternative can be
+    removed without affecting the pattern.
+
+-   **Superset alternatives**
+
+    Example: `/(Foo|\w+)\b/`
+
+    For any alternative that matches a superset of a previous alternative, the
+    other alternative might be unnecessary. It has to be decided on a
+    case-by-case basis whether the previous subset alternative can be removed.
+
+    (In the above example, the `Foo` alternative can be removed but only because
+    of the `\b`.)
+
+However, there are valid use-cases for non-disjoint alternatives, so sometimes
+it's ok to use a comment to disable this rule on certain regular expressions.
+But be careful because even those valid use-cases can still potentially cause
+exponential backtracking.
+
+### Exponential backtracking
+
+Non-disjoint alternatives inside `*` or `+` quantifiers almost always cause
+exponential backtracking.
+
+Example: `/(?:\w|\d)+-/`<br> The `\d` and `\w` alternatives are not disjoint and
+because they are quantified using `+`, they will cause exponential backtracking.
+This example is easy to fix because `\d` is a subset of `\w`, so removing the
+`\d` alternative will fix the exponential backtracking. An example string
+showing the exponential runtime of the unfixed pattern is `01234567890123456789`
+(add more digits to increase the time it takes to reject the string).
+
+### Examples
+
+Examples of **valid** code for this rule:
+
+<!-- prettier-ignore -->
+```js
+/a+|b*/
+/a(?:\w+|[+-]\d+)+/
+```
+
+Examples of **invalid** code for this rule:
+
+<!-- prettier-ignore -->
+```js
+/\w+|\d+/
+/[a-z]+|FOO/i
+/\w+(?:\s+(?:\S+|"[^"]*"))*/
+```
diff --git a/lib/format.js b/lib/format.js
@@ -0,0 +1,56 @@
+"use strict";
+
+const { JS } = require("refa");
+
+/**
+ * @typedef {{ source: string; flags: string; }} Literal
+ * @typedef {import("refa").FiniteAutomaton} FiniteAutomaton
+ * @typedef {import("regexpp/ast").Node} Node
+ */
+
+const format = {
+
+	/**
+	 * @param {string} string
+	 * @param {number} maxLength
+	 * @returns {string}
+	 */
+	shorten(string, maxLength) {
+		if (string.length <= maxLength) {
+			return string;
+		} else {
+			return string.substr(0, maxLength - 1) + "…";
+		}
+	},
+
+	/**
+	 * Converts the given value to the string of a `RegExp` literal.
+	 *
+	 * @param {Literal | FiniteAutomaton} value
+	 * @returns {string}
+	 * @example
+	 * toRegExpString(/foo/i) // returns "/foo/i"
+	 */
+	toRegExpString(value) {
+		if ("toRegex" in value) {
+			const re = value.toRegex();
+			const literal = JS.toLiteral(re);
+			return format.toRegExpString(literal);
+		} else {
+			return `/${value.source}/${value.flags}`;
+		}
+	},
+
+	/**
+	 * Returns a string that mentions the given node.
+	 *
+	 * @param {Node} node
+	 * @returns {string}
+	 */
+	mention(node) {
+		return "`" + node.raw + "`";
+	},
+
+};
+
+module.exports = format;
diff --git a/lib/index.js b/lib/index.js
@@ -11,6 +11,7 @@ module.exports = {
 			rules: {
 				"clean-regex/confusing-quantifier": "warn",
 				"clean-regex/consistent-match-all-characters": "warn",
+				"clean-regex/disjoint-alternatives": "warn",
 				"clean-regex/identity-escape": "warn",
 				"clean-regex/no-constant-capturing-group": "warn",
 				"clean-regex/no-empty-alternative": "warn",

diff --git a/lib/rules/disjoint-alternatives.js b/lib/rules/disjoint-alternatives.js
@@ -0,0 +1,195 @@
+"use strict";
+
+const { createRuleListener, getDocUrl } = require("../rules-util");
+const { toRegExpString, mention } = require("../format");
+const util = require("../util");
+const { JS, NFA } = require("refa");
+
+/**
+ * @typedef {import("regexpp/ast").Node} Node
+ * @typedef {import("regexpp/ast").Alternative} Alternative
+ *
+ * @typedef {import("refa").ReadonlyNFA} ReadonlyNFA
+ */
+
+
+
+/** @type {import("eslint").Rule.RuleModule} */
+module.exports = {
+	meta: {
+		type: "problem",
+		docs: {
+			description: "Disallow different alternatives that can match the same words.",
+			url: getDocUrl(__filename)
+		}
+	},
+
+	create(context) {
+		return createRuleListener(({ visitAST, flags, pattern, reportElement }) => {
+
+			const parser = JS.Parser.fromAst({ pattern, flags });
+
+			/**
+			 * Converts the given alternative to an NFA. The returned NFA does not accept the empty string.
+			 *
+			 * @param {Alternative} alt
+			 * @returns {ReadonlyNFA}
+			 */
+			function toNfa(alt) {
+				const result = parser.parseElement(alt, { lookarounds: "disable" });
+				const nfa = NFA.fromRegex(result.expression, { maxCharacter: result.maxCharacter });
+				nfa.removeEmptyWord();
+				return nfa;
+			}
+
+			/**
+			 * @param {Alternative[]} alternatives
+			 * @param {ReadonlyNFA} subset
+			 * @returns {Alternative | null}
+			 */
+			function findFirstSuperset(alternatives, subset) {
+				for (const alt of alternatives) {
+					if (util.nfaIsSupersetOf(toNfa(alt), subset)) {
+						return alt;
+					}
+				}
+				return null;
+			}
+			/**
+			 * @param {Alternative[]} alternatives
+			 * @param {ReadonlyNFA} set
+			 * @returns {Alternative | null}
+			 */
+			function findSingleNonDisjoint(alternatives, set) {
+				/** @type {Alternative | null} */
+				let nonDisjoint = null;
+				for (const alt of alternatives) {
+					if (!areDisjoint(toNfa(alt), set)) {
+						if (nonDisjoint === null) {
+							nonDisjoint = alt;
+						} else {
+							return null;
+						}
+					}
+				}
+				return nonDisjoint;
+			}
+			/**
+			 * @param {ReadonlyNFA} a
+			 * @param {ReadonlyNFA} b
+			 * @returns {boolean}
+			 */
+			function areDisjoint(a, b) {
+				return a.isDisjointWith(b, {
+					// limit the number of nodes that can be created during the intersection
+					maxNodes: 1000
+				});
+			}
+
+			/**
+			 * @param {readonly Alternative[]} alternatives
+			 * @returns {Result}
+			 *
+			 * @typedef {"disjoint" | "reported"} Result
+			 */
+			function checkAlternatives(alternatives) {
+				if (alternatives.length < 2) {
+					return "disjoint";
+				}
+
+				/** @type {Result} */
+				let result = "disjoint";
+
+				/** @type {NFA | undefined} */
+				let total = undefined;
+				for (const alt of alternatives) {
+					const nfa = toNfa(alt);
+
+					if (nfa.isEmpty) {
+						// skip this alternative
+					} else if (!total) {
+						total = nfa.copy();
+					} else if (areDisjoint(total, nfa)) {
+						total.union(nfa);
+					} else {
+						const altIndex = alternatives.indexOf(alt);
+						const beforeAlternatives = alternatives.slice(0, altIndex);
+
+						const intersection = NFA.intersect(total, nfa);
+						const isSubset = util.nfaEquals(nfa, intersection);
+
+						// try to find the single alternative that is not disjoint with this one
+						const cause = isSubset
+							? findFirstSuperset(beforeAlternatives, nfa)
+							: findSingleNonDisjoint(beforeAlternatives, nfa);
+						const causeMsg = cause
+							? mention(cause)
+							: "the previous one(s)";
+
+						// find out whether this alternative is a superset of the cause
+						const isSuperset = cause ? util.nfaIsSupersetOf(nfa, toNfa(cause)) : false;
+
+						let message;
+						if (isSubset) {
+							message = isSuperset
+								? `This alternative is the same as ${causeMsg} and can be removed.`
+								: `This alternative is a subset of ${causeMsg} and can be removed.`;
+						} else {
+							message = isSuperset
+								? `This alternative is a superset of ${causeMsg}.`
+								: (`This alternative is not disjoint with ${causeMsg}.`
+									+ ` The shared language is ${toRegExpString(intersection)}.`);
+						}
+
+						// whether this ambiguity might cause exponential backtracking
+						if (util.underAStar(alt)) {
+							message += " This alternative is likely to cause exponential backtracking.";
+						}
+
+						context.report({
+							message,
+							...reportElement(alt)
+						});
+						result = "reported";
+					}
+				}
+
+				return result;
+			}
+
+			/** @type {Set<Node>} */
+			const ignoreNodes = new Set();
+			/**
+			 * @param {Node} node
+			 */
+			function ignoreParents(node) {
+				for (let parent = node.parent; parent; parent = parent.parent) {
+					ignoreNodes.add(parent);
+				}
+			}
+			/**
+			 * @param {import("regexpp/ast").Group | import("regexpp/ast").CapturingGroup | import("regexpp/ast").LookaroundAssertion | import("regexpp/ast").Pattern} node
+			 * @returns {void}
+			 */
+			function process(node) {
+				if (!ignoreNodes.has(node)) {
+					if (checkAlternatives(node.alternatives) === "reported") {
+						ignoreParents(node);
+					}
+				}
+			}
+
+			visitAST({
+				onAssertionLeave(node) {
+					if (node.kind === "lookahead" || node.kind === "lookbehind") {
+						process(node);
+					}
+				},
+				onCapturingGroupLeave: process,
+				onGroupLeave: process,
+				onPatternLeave: process,
+			});
+
+		});
+	}
+};