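/*
 * NOTE: The following lines are residue of the code-hosting web page from
 * which this file was extracted; they are not part of regexp.js.
 *
 * Permalink
 * Fetching contributors…
 * Cannot retrieve contributors at this time
 * 1480 lines (1307 sloc) 49.1 KB
 */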
/* vim:set fileencoding=utf-8 tabstop=2 shiftwidth=2 softtabstop=2 expandtab: */
/**
* <title>PointedEars' JSX: RegExp Library</title>
* @filename regexp.js
* @version $Id$
*
* @section Copyright & Disclaimer
*
* @author
* (C) 2005, 2009-2017 Thomas Lahn &lt;js@PointedEars.de&gt;
*
* @partof PointedEars' JavaScript Extensions (JSX)
*
* JSX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* JSX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JSX. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Root namespace of PointedEars' JavaScript Extensions (JSX).
 * It is created here only if the name is not yet bound to a value
 * of type "object".
 *
 * @namespace
 */
var jsx;
if (!(typeof jsx == "object"))
{
  jsx = {};
}
(function () {
"use strict";
/* Local aliases for the members of jsx.object used throughout this file */
var
  _jsx_object = jsx.object,
  _ADD_OVERWRITE = _jsx_object.ADD_OVERWRITE,
  _extend = _jsx_object.extend,
  _getClass = _jsx_object.getClass,
  _getKeys = _jsx_object.getKeys,
  _getDataObject = _jsx_object.getDataObject,
  _isString = _jsx_object.isString;
/**
* @namespace
* @property pattern : String
* The original pattern string, including pattern-match
* modifiers.
* @property _patternGroups : Array
* The part of the pattern string from the opening parenthesis
* of each pattern group to the end of the pattern, before
* character class expansion, and without pattern-match
* modifiers. The first item (index 0) holds the complete
* pattern without modifiers. Used internally; do not modify.
* <em>NOTE: For efficiency, the pattern groups are not isolated;
* further parsing on your part may very well be necessary.</em>
* @property groups : Object
* An Array-like object mapping group indexes to group names.
* Its <code>length</code> property yields the number of grouped
* subpatterns in the original pattern, including named groups.
* @property names : Object
* An object mapping group names to group indexes.
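* @property _flags : Object
* An object holding the state of the flags as boolean values
* (<code>global</code>, <code>ignoreCase</code>,
* <code>multiline</code>, <code>sticky</code>,
* <code>extended</code>, <code>dotAll</code>,
* <code>unicodeMode</code>). Used internally.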
* @property dotAll : boolean
* Flag specifying whether the used expression was built from
* a pattern where the dot matches newline as well (PCRE_DOTALL).
* @property extended : boolean
* Flag specifying whether the used expression was built from
* an extended pattern (PCRE_EXTENDED).
* @property unicodeMode : boolean
* Flag specifying whether the used expression was built using
* Unicode mode.
* @method exec
* A variant of the built-in {@link RegExp.prototype#exec()}
* to support named groups and Unicode mode transparently.
* @method _oldExec
* The original inherited exec() method. Used internally.
* @method _realExec
* The used exec() method. Used internally.
* @function
*/
var _RegExp2 = jsx.object.extend(
(
function () {
var
_destructure = jsx.array.destructure,
_WideString = _jsx_object.getFeature(
jsx, "string", "unicode", "WideString"),
_fromCharCode = function (codePoint) {
if (codePoint > 0xFFFF)
{
return _WideString.fromCharCode(codePoint);
}
return String.fromCharCode(codePoint);
},
propertyClasses,
ucdFields = ["codePoint",, "propertyClass"],
_beyondBMPsupport = jsx.tryThis(
function () {
new RegExp("", "u");
return true;
},
function () {
return false;
}
),
_parseUCDText = function () {
(new jsx.net.http.Request(
_RegExp2.ucdTextPath, "GET", false,
function (xhr) {
var lines = xhr.responseText.split(/\r?\n|\r/).map(
function (e) {
var entry = _destructure(e.split(";"), ucdFields);
entry.codePoint = parseInt(entry.codePoint, 16);
return entry;
});
lines.sort(function (a, b) {
if (a.propertyClass < b.propertyClass) return -1;
if (a.propertyClass > b.propertyClass) return 1;
if (a.codePoint < b.codePoint) return -1;
if (a.codePoint > b.codePoint) return 1;
return 0;
});
propertyClasses = _RegExp2.propertyClasses = {};
for (var i = 0, len = lines.length; i < len; ++i)
{
var
line = lines[i],
propertyClass = line.propertyClass,
prevClass,
codePoint = line.codePoint,
prevCodePoint;
if (isNaN(codePoint)
|| (codePoint > 0xFFFF && (!_WideString || !_beyondBMPsupport)))
{
continue;
}
if (propertyClass != prevClass)
{
if (codePoint != prevCodePoint + 1)
{
if (startRange)
{
propertyClasses[prevClass] += ("-" + _fromCharCode(prevCodePoint));
}
}
propertyClasses[propertyClass] = _fromCharCode(codePoint);
var startRange = false;
}
else
{
if (codePoint != prevCodePoint + 1)
{
if (startRange)
{
propertyClasses[prevClass] += ("-" + _fromCharCode(prevCodePoint));
startRange = false;
}
propertyClasses[propertyClass] += _fromCharCode(codePoint);
}
else
{
startRange = true;
}
}
prevClass = propertyClass,
prevCodePoint = codePoint;
}
if (startRange)
{
propertyClasses[prevClass] += ("-" + _fromCharCode(prevCodePoint));
}
}
)).send();
},
sPropertyEscapes = /\\(p)\{([^\}]+)\}/.source,
rxNegEscape = new RegExp(sPropertyEscapes.toUpperCase() + /|\\([DSW])/.source, "g"),
/**
* @param {String} charClassContent
* @param {boolean} bUnicodeMode
* @return {string}
*/
_normalizeCharClass = function (charClassContent, bUnicodeMode) {
var negEscapes = [];
if (charClassContent == "") return "[]";
if (charClassContent == "^") return "[^]";
var reduced = charClassContent.replace(
rxNegEscape,
function (m, cP, charProperty, cDSW) {
var escapeChar = cP || cDSW;
if (escapeChar == "P" || bUnicodeMode)
{
negEscapes.push("\\" + escapeChar.toLowerCase()
+ (charProperty ? "{" + charProperty + "}" : ""));
return "";
}
return m;
});
if (negEscapes.length > 0)
{
/* Do not let negated empty class from reduction match everything */
if (reduced == "^")
{
reduced = "";
}
if (reduced !== "")
{
jsx.warn(
"jsx.regexp.RegExp: Combined negative escapes in character classes"
+ " require support for non-capturing parentheses");
}
return (reduced ? "(?:[" + reduced + "]|" : "")
+ "[" + (charClassContent.charAt(0) == "^" ? "" : "^")
+ negEscapes.join("") + "]"
+ (reduced ? ")" : "");
}
return "[" + reduced + "]";
},
rxPropertyEscapes = new RegExp(sPropertyEscapes, "gi"),
sNonPropEscInRange = /([^\]\\]|\\[^p])*/.source,
sEscapes =
"\\[(\\^?(" + sNonPropEscInRange + "(" + sPropertyEscapes
+ ")+" + sNonPropEscInRange + ")+)\\]"
+ "|" + sPropertyEscapes + "",
rxEscapes = new RegExp(sEscapes, "gi"),
/*jshint -W072*/
/**
 * Replacement function used with <code>rxEscapes</code>: expands
 * property escapes (<code>\p{…}</code>, <code>\P{…}</code>) to
 * character classes, loading the property class table first if it
 * is not available yet.
 *
 * @param {String} match
 *   The matched substring
 * @param {String} classRanges
 *   The content of a character class that contains property escapes,
 *   if such a class was matched
 * @param p2, p3, p4, p5, p6, p7
 *   Inner captures of the character class alternative; unused
 * @param {String} standalonePropSpec
 *   "p" or "P" if a property escape outside of a character class
 *   was matched
 * @param {String} standaloneClass
 *   The property class name of that standalone escape
 * @return {string}
 *   The replacement for <var>match</var>
 * @throws jsx.regexp.UCDLoadError
 *   if the property classes are not available and
 *   jsx.net.http.Request is not a function
 * @throws jsx.regexp.InvalidPropertyClassError
 *   if a property class value is cyclically defined or contains
 *   a negative property specifier
 * @throws jsx.regexp.UndefinedPropertyClassError
 *   if a referenced property class cannot be resolved
 */
fEscapeMapper = function (match, classRanges, p2, p3, p4, p5, p6, p7,
standalonePropSpec, standaloneClass) {
propertyClasses = _RegExp2.propertyClasses;
/* If the Unicode Character Database (UCD) is not statically loaded */
if (!propertyClasses)
{
/* load it dynamically, ignore exceptions */
var ucdScriptPath = _RegExp2.ucdScriptPath;
if (ucdScriptPath)
{
jsx.tryThis(function () { jsx.importFrom(ucdScriptPath); });
propertyClasses = _RegExp2.propertyClasses;
}
/* if this failed */
if (!propertyClasses)
{
if (!jsx.net || !jsx.net.http || typeof jsx.net.http.Request != "function")
{
jsx.throwThis("jsx.regexp.UCDLoadError",
['"' + _RegExp2.ucdScriptPath + '" (jsx.regexp.RegExp.ucdScriptPath)',
"http.js"]);
}
/* parse the text version of the UCD */
_parseUCDText();
}
}
/*
* Define property classes required for Unicode mode
* if not already defined
* NOTE(review): whether existing definitions are kept depends on the
* default behavior of jsx.object.extend() -- confirm.
*/
_jsx_object.extend(propertyClasses, {
L: "\\p{Ll}\\p{Lm}\\p{Lo}\\p{Lt}\\p{Lu}",
M: "\\p{Mc}\\p{Me}\\p{Mn}",
N: "\\p{Nd}\\p{Nl}\\p{No}",
Digit: "\\p{Nd}",
Space: "\u0009\u000a\u000c\u000d\u0020\u0085\u00a0"
+ "\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005"
+ "\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f"
+ "\u205f\u3000",
Word: "\\p{L}\\p{M}\\p{N}\\p{Pc}"
});
/*
* Stack of the property classes that are currently being expanded;
* used to detect cyclic definitions and to describe the reference
* path in error messages. "seen" maps a class name to its position
* in "items".
*/
var _rangesStack = _getDataObject({
seen: _getDataObject(),
items: [],
indexOf: function (item) {
item = this.seen[item];
return (item ? item.index : -1);
},
pop: function () {
var items = this.items;
var last = items.pop();
delete this.seen[last];
this.length = items.length;
return last;
},
push: function (item) {
var items = this.items;
this.seen[item] = _getDataObject({index: items.length});
items.push(item);
this.length = items.length;
},
/**
* @return {string}
*/
toString: function () {
return this.items.join(" --> ");
}
});
/*
* Replacement function for property escapes found within a property
* class value: negative specifiers are rejected, positive ones are
* expanded recursively.
*/
var _propertyClassReplacer = function (match, propertySpecifier, propertyClass) {
if (propertySpecifier === "P")
{
jsx.throwThis("jsx.regexp.InvalidPropertyClassError",
_rangesStack.pop()
+ " contains the negative property specifier \\P{" + propertyClass + "}");
return;
}
return _getRanges(propertyClass);
};
var _getRanges =
/**
* Retrieves class ranges by property class, and throws a specialized
* exception if this fails.
* @param {String} propertyClass
* @throws jsx.regexp#UndefinedPropertyClassError
*/
function (propertyClass) {
return jsx.tryThis(
function () {
if (_rangesStack.indexOf(propertyClass) > -1)
{
jsx.throwThis("jsx.regexp.InvalidPropertyClassError",
propertyClass + " is cyclically defined ("
+ _rangesStack + " --> " + propertyClass
+ ")");
return;
}
_rangesStack.push(propertyClass);
/* presumably throws jsx.object.PropertyError if there is no such class */
var escapedRange = _jsx_object.getProperty(propertyClasses, propertyClass);
/*
* Resolve property class references in property class values,
* watch for cyclic structures.
*/
var rxPropertyEscapes = new RegExp(sPropertyEscapes, "gi");
var unescapedRange = escapedRange.replace(rxPropertyEscapes, _propertyClassReplacer);
_rangesStack.pop();
return unescapedRange;
},
function (e) {
/* Translate the lookup error, pass on everything else */
if (e.name == "jsx.object.PropertyError")
{
jsx.throwThis("jsx.regexp.UndefinedPropertyClassError",
propertyClass + (_rangesStack.length > 1 ? " in " + _rangesStack : ""));
}
else
{
jsx.rethrowThis(e);
}
});
};
/* We can handle standalone class references … */
if (standaloneClass)
{
var result = _getRanges(standaloneClass);
/* \P{…} becomes a negated character class */
result = "[" + (standalonePropSpec == "P" ? "^" : "") + result + "]";
}
else
{
/* … and class references in character classes */
result = _normalizeCharClass(classRanges);
result = result.replace(
rxPropertyEscapes,
function (match, propertySpecifier, propertyClass) {
var ranges = _getRanges(propertyClass);
return ranges;
});
}
return result;
};
/*jshint +W072*/
/**
* Creates and returns an extended {@link RegExp} object.
*
* This constructor accepts pattern and flags arguments where you
* can use some features of Perl and Perl-compatible regular
* expressions (PCRE); like {@link RegExp()}, it can also be called
* as a function to do the same. The {@link RegExp} instance it
* returns is augmented with properties to support those features
* when matching it against a string.
*
* The following additional features are currently supported:
* <ul>
* <li>Flags:
* <ul>
* <li><tt>s</tt> (PCRE_DOTALL) – the <tt>.</tt> metacharacter
* matches newline as well.</li>
* <li><tt>u</tt> (Unicode mode) – the meaning of
* character class escape sequences <tt>\b</tt>, <tt>\w</tt>,
* and <tt>\W</tt> is extended to include Unicode character
* properties.</li>
* <li><tt>x</tt> (PCRE_EXTENDED) – whitespace within
* the pattern is ignored, so that it is easier
* human-readable.</li>
* </ul><p>
* Flags except for Unicode mode can be set and unset for
* parts of the expression outside of character classes using
* the <tt>(?…)</tt> and <tt>(?-…)</tt> notations.
* </li>
* <li>Unicode property classes using e.g. the \p{…} notation</li>
* <li>Named capturing groups by passing strings with the
* <tt>(?P&lt;name>…)</tt> or <tt>(?P'name'…)</tt> notation,
* where the <tt>P</tt> is optional, respectively.</li>
* </ul><p>
* This is facilitated through the following steps:
* </p><ol>
* <li>The flags <code>x</code>, <code>s</code> and <code>u</code>
* in the optional <var>sFlags</var> argument set the initial
* state of the pattern-match modifiers; the extended
* {@link RegExp}'s <code>extended</code>, <code>dotAll</code>,
* and <code>unicodeMode</code> properties are set accordingly.
* These flags are removed from the <var>sFlags</var>
* argument subsequently, as it is reused to create the
* {@link RegExp} instance. [Conforming implementations of
* ECMA-262 Ed. 5.1 MUST throw a <code>SyntaxError</code>
* exception on flags other than <code>g</code>, <code>i</code>,
* and <code>m</code> (section 15.10.4.1); implementations of
* Ed. 6.0 and later may also support the <code>u</code>
* (Unicode) and <code>y</code> (sticky) flags,
* but nothing else. Note that as of Ed. 8.0 the <em>standard</em>
* <code>u</code> flag does not provide support for Unicode character
* properties.]</li>
* <li>The pattern is run through several passes, where in each
* one it is scanned from left to right using another
* {@link RegExp}:
* <ol style="margin-bottom: 1em; list-style-type: lower-roman">
* <li><p>Capturing groups and pattern-match modifiers in the
* pattern are matched and replaced.
* <p>Capturing groups are replaced with the opening
* parenthesis if they were assigned a name. The
* extended {@link RegExp}'s <code>groups</code>,
* <code>names</code>, and <code>_patternGroups</code>
* properties are set accordingly. They are used in an
* overwritten <code>exec()</code> method and when matching
* against a <code>jsx.regexp.String</code> using its
* <tt>match(…)</tt> method.</p>
* <p style="margin-bottom: 0">
* Pattern-match modifiers are set and unset as they
* are scanned. The corresponding substrings are
* removed from the pattern. If the group is otherwise
* empty, and therefore is not a group at all,
* the entire pseudo-group is removed.</p>
* <ol style="margin-top: 0; list-style-type: lower-latin">
* <li>With PCRE_EXTENDED set, single-line
* comments starting with <tt>#</tt> and unescaped
* whitespace are removed from the pattern. The backslash
* is removed from the pattern when in front of
* whitespace.</li>
* <li>With PCRE_DOTALL set, unescaped <tt>.</tt>
* (period) characters are replaced with the character class
* <tt>[\S\s]</tt> which matches all Unicode characters.</li>
* </ol>
* <p><em>NOTE: Unlike in Perl and PCRE, a pattern-match
* modifier affects all of the pattern that follows,
* even outside the group in which the modifier was
* set/unset. This will be fixed in a later version.</em>
* </p></li>
* <li>When in Unicode mode,
* <ol style="list-style-type: lower-latin">
* <li>in the second pass, character class escape sequences
* <tt>\w</tt> and <tt>\W</tt> are replaced with
* corresponding uses of <tt>\p{Word}</tt>.</li>
* <li>in the third pass, <tt>\b</tt> is replaced with
* corresponding uses of character classes and negative
* lookahead.
* </ol></li>
* <li style="margin-top: 1em">The <tt>\p{…}</tt> and <tt>\P{…}</tt>
* escape sequences are replaced with the corresponding
* character classes.</li>
* </ol></li>
* <li>The resulting expression and remaining flags are passed
* to the {@link RegExp} constructor.</li>
* <li>The created {@link RegExp} instance is augmented with
* properties and returned.</li>
* </ol><p>
* There are the following possibilities to make Unicode property
* classes known to this constructor:
* </p><ol>
* <li>Provide the Unicode Character Database, or parts thereof,
* as an Object;</li>
* <li>Provide the Unicode Character Database, or parts thereof,
* as a plain text resource that is accessed with
* XMLHttpRequest;</li>
* <li>Define property classes manually</li>
* </ol>
* <p>
* Variant #1 requires you to define a mapping object with
* the following namespace and structure:
* </p>
* <pre><code>
* jsx.regexp.RegExp.propertyClasses = {
* ...,
* Sc: "\u20AC...",
* ...
* };
* </code></pre>
* <p>
* The property name is the name of the Unicode property class
* (here: <tt>Sc</tt>). The property value (a string) defines
* which characters belong to that class. You may use "-"
* to specify character ranges, i.e., the range of characters
* including the characters having the boundaries as code point
* value, and all characters that have a code point value
* in-between. (For a literal "-", you may use "\\-".)
* An example file to mirror the Unicode 5.0 Character Database,
* UnicodeData.js, is distributed with this file. Include it
* <em>after</em> the file that declares the constructor (this
* file) to use it. If you do not include it, but use the
* <code>\p{...}</code> notation, an attempt will be made to load
* the file specified by the <code>ucdScriptPath</code> property
* (default: <code>"/scripts/UnicodeData.js"</code>) using
* synchronous XHR (see below).
* </p>
* <p>
* Variant #2 is going to support two different methods:
* Synchronous and asynchronous request-response handling.
* Synchronous request-response handling requests the (partial)
* Unicode Character Database from the resource specified by
* the <code>ucdTextPath</code> property (default:
* <code>"/scripts/UnicodeData.txt"</code>) and halts execution
* until a response has been received or the connection timed out.
* Asynchronous request-response handling allows script execution
* to continue while the request and response are in progress, but
* you need to provide a callback as third argument where actions
* related to the regular expression must be performed.
* Asynchronous handling is recommended for applications that need
* to be responsive to user input. <strong>Currently, only
* synchronous handling is implemented.</strong>
* </p>
* <p>
* Variant #3 can be combined with the other variants.
* The constructor has a definePropertyClasses() method which can
* be used to define and redefine property classes. This allows
* an extended RegExp object to support only a subset of Unicode
* property classes, and to support user-defined character
* property classes.
* </p>
*
* The returned {@link RegExp} has additional properties to
* accommodate syntax extensions in the pattern string:
*
* @constructor
* @param {String|RegExp} expression
* A regular expression pattern string that may use the features
* described above. If it is a {@link RegExp}, its
* <code>source</code> property is used and combined with
* <var>sFlags</var>. That is, <code>jsx.regexp.RegExp(/foo/, "i")</code>
* returns the same as <code>jsx.regexp.RegExp(/foo/i)</code>.
* @param {String} sFlags
* Optional string containing none, one or more of the standard
* {@link RegExp} modifiers and the flags described above.
* Unsupported flags are ignored, but passed on to {@link RegExp}.
* Note that modifiers in <var>expression</var> can temporarily
* unset and set the "s" and "x" flags. Following Perl, the "u"
* flag (Unicode mode) can only be enabled, but not disabled.
* @return {RegExp}
* A regular expression with the property class escape sequences
* expanded according to the specified data, with the specified
* flags set if they are natively supported.
*/
function jsx_regexp_RegExp (expression, sFlags)
{
  /* States of native and extended flags; exposed as rx._flags */
  var flags = _getDataObject();
  var flagsMap = _getDataObject({
    g: "global",
    i: "ignoreCase",
    m: "multiline",
    y: "sticky"
  });
  var flagsMapKeys = _getKeys(flagsMap);
  var sExpressionFlags = "";
  var i, len;

  /* A RegExp argument contributes its source and its native flags */
  if (expression && _getClass(expression) == "RegExp")
  {
    for (i = 0, len = flagsMapKeys.length; i < len; ++i)
    {
      var key = flagsMapKeys[i];
      var expressionFlag = !!expression[flagsMap[key]];
      flags[flagsMap[key]] = expressionFlag;
      if (expressionFlag) sExpressionFlags += key;
    }

    expression = expression.source;
  }

  if (typeof expression != "string")
  {
    expression = (arguments.length < 1 ? "" : String(expression));
  }

  /* The original pattern; exposed as rx.pattern */
  var pattern = expression;

  if (typeof sFlags == "undefined")
  {
    sFlags = sExpressionFlags;
  }
  else
  {
    for (i = 0, len = flagsMapKeys.length; i < len; ++i)
    {
      flags[flagsMap[flagsMapKeys[i]]] = (sFlags.indexOf(flagsMapKeys[i]) > -1);
    }
  }

  /* Current state of the pattern-match modifiers while scanning */
  var extended = false;
  var dotAll = false;
  var unicodeMode = false;

  /* Initial state as specified by the flags argument */
  var originalExtended = false;
  var originalDotAll = false;

  if (sFlags)
  {
    if (sFlags.indexOf("x") > -1)
    {
      originalExtended = extended = true;
    }

    if (sFlags.indexOf("s") > -1)
    {
      originalDotAll = dotAll = true;
    }

    if (sFlags.indexOf("u") > -1)
    {
      unicodeMode = true;
    }

    /* "x" and "s" are emulated here and never passed to RegExp() */
    sFlags = sFlags.replace(/[xs]/g, "");

    if (unicodeMode && !_beyondBMPsupport)
    {
      sFlags = sFlags.replace(/u/g, "");
    }
  }

  _extend(flags, {
    extended: originalExtended,
    dotAll: originalDotAll,
    unicodeMode: unicodeMode
  }, _ADD_OVERWRITE);

  /* Support for capturing and special groups */
  var groupCount = 0;
  var groups = _getDataObject();
  var names = _getDataObject();
  var patternGroups = [expression];

  /*jshint -W072*/
  expression = expression.replace(
    /*
     * NOTE: _concat() is called directly so that this does not depend
     * on RegExp.prototype having been augmented
     * (jsx.options.augmentPrototypes).
     *
     * In the group pattern, a group name must not start with "=" or "!"
     * so that the lookbehind assertions "(?<=" and "(?<!" are not
     * mistaken for named groups; together with "(?:", "(?!" and "(?=",
     * they are passed through as non-capturing constructs.
     */
    _concat.call(
      /(\\\()/,
      "|",
      /(\((\?P?(([adlupimsx]+)?(-([imsx]+))?)(<([^>=!][^>]*)>|'([^']+)'|([:!=]|<[=!]))?(\))?)?)/g,
      "|",
      /(#.*(\r?\n|\r|$))|\\(\s)/,
      "|",
      /\[([^\\\]]|\\.)*\]|(\s+)|\\\.|(\.)/g
    ),
    function (match, escapedLParen,
              group, specialGroup, modifierGroup,
              positiveModifiers, negativeModifiers_opt, negativeModifiers,
              namedGroup, bracketedName, quotedName,
              nonCapturingGroup, emptyGroup,
              comment, newline,
              escapedWS, charClassContent, whitespace,
              plainDot,
              index, all) {
      if (group)
      {
        var capturingGroup = (!nonCapturingGroup && !(modifierGroup && emptyGroup));
        if (capturingGroup) ++groupCount;

        /* Only the "s" and "x" modifiers are emulated */
        if (positiveModifiers)
        {
          if (positiveModifiers.indexOf("s") > -1) dotAll = true;
          if (positiveModifiers.indexOf("x") > -1) extended = true;
        }

        if (negativeModifiers)
        {
          if (negativeModifiers.indexOf("s") > -1) dotAll = false;
          if (negativeModifiers.indexOf("x") > -1) extended = false;
        }

        if (capturingGroup)
        {
          /* Support for named capturing groups (PCRE-compliant) */
          var name = bracketedName || quotedName;
          if (name)
          {
            if (names[name])
            {
              jsx.throwThis("SyntaxError", "Duplicate symbolic name");
            }

            groups[groupCount] = name;
            names[name] = groupCount;
          }

          /*
           * NOTE: Helps with determining in exec() and match()
           * whether \b matched at beginning and \Ws need to be
           * ltrimmed from match
           */
          patternGroups.push(all.substring(index));

          return "(";
        }

        /* Pure modifier group like "(?s)": remove it entirely */
        if (emptyGroup && !nonCapturingGroup)
        {
          return "";
        }

        /*
         * Non-capturing group or assertion, without the modifiers.
         * Empty ones are kept as they are meaningful: "(?!)" never
         * matches.
         */
        return "(?" + nonCapturingGroup + (emptyGroup ? ")" : "");
      }

      /* PCRE_EXTENDED */
      if (extended)
      {
        /* Remove comments */
        if (comment) return "";

        /* Keep escaped whitespace, remove escape */
        if (escapedWS) return escapedWS;

        /* Remove unescaped whitespace */
        if (whitespace) return "";
      }

      /* PCRE_DOTALL */
      if (dotAll && plainDot) return "[\\S\\s]";

      return match;
    });
  /*jshint +W072*/

  groups.length = groupCount;

  /* Unicode mode */
  if (unicodeMode)
  {
    var characterEscapes = {
      "d": "\\p{Digit}",
      "s": "\\p{Space}",
      "w": "\\p{Word}"
    };

    expression = expression.replace(
      /\[(([^\]\\]|\\.)*)\]|(\\([dsw]))/gi,
      function (match, charClassContent, p2, classCharacter, escapeLetter) {
        if (charClassContent)
        {
          /* Do not expand PCRE_DOTALL expansion */
          /* TODO: Never expand all-inclusive character classes */
          if (charClassContent == "\\S\\s")
          {
            return "[" + charClassContent + "]";
          }

          var normalized = _normalizeCharClass(charClassContent, true);

          return normalized.replace(
            /\\\\|\\([dsw])/gi,
            function (match, escapeLetter) {
              if (escapeLetter)
              {
                return characterEscapes[escapeLetter.toLowerCase()];
              }

              return match;
            });
        }

        if (classCharacter)
        {
          /* Upper-case escape letters denote the negated class */
          return "["
            + (escapeLetter >= "A" && escapeLetter <= "Z" ? "^" : "")
            + characterEscapes[escapeLetter.toLowerCase()] + "]";
        }

        return match;
      });

    /* Replace \b */
    var firstGroup = expression.match(/\((\?(P?(<([^>]+)>|'([^']+)')|[:!]))?/);
    var afterFirstGroup = (firstGroup && (firstGroup.index + firstGroup[0].length) || 0);
    var wordEscape = characterEscapes.w;

    expression = expression.replace(
      /\\\\|(\\b)/g,
      function (match, wordBorder, index, all) {
        if (wordBorder)
        {
          /* Handle \b in leading groups properly */
          if (index > afterFirstGroup)
          {
            return "(?!" + wordEscape + ")";
          }

          return "(?:^|[^" + wordEscape + "])";
        }

        return match;
      });
  }

  /* Support for Unicode character property classes (PCRE-compliant) */
  expression = expression.replace(rxEscapes, fEscapeMapper);

  var rx = new RegExp(expression, sFlags);

  /* Augmented properties */
  rx.pattern = pattern;
  rx._patternGroups = patternGroups;
  rx.groups = groups;
  rx.names = names;
  rx._flags = flags;

  /*
   * NOTE: As of ECMAScript 2018, RegExp.prototype.dotAll is an accessor
   * property without a setter, so a plain assignment would throw
   * a TypeError in strict mode; define an own data property instead.
   */
  if (typeof Object.defineProperty == "function")
  {
    Object.defineProperty(rx, "dotAll", {
      value: originalDotAll,
      writable: true,
      enumerable: true,
      configurable: true
    });
  }
  else
  {
    rx.dotAll = originalDotAll;
  }

  rx.extended = originalExtended;
  rx.unicodeMode = unicodeMode;

  /* Keep the inherited exec() and replace it with the extended variant */
  rx._oldExec = rx.exec;
  rx.exec = jsx_regexp_RegExp.exec;

  return rx;
}
return jsx_regexp_RegExp;
}()
),
{
/**
* Path of the script that defines the property classes; the resource
* at this path is imported when a property escape is used and
* <code>propertyClasses</code> is not defined.
*
* @memberOf jsx.regexp.RegExp
*/
ucdScriptPath: "/scripts/UnicodeData.js",
/**
* Path of the text version of the Unicode Character Database; it is
* requested and parsed if the property classes could not be obtained
* otherwise.
*/
ucdTextPath: "/scripts/UnicodeData.txt",
/**
* Determines if an object has been constructed using this constructor.
*
* @param rx
* @return {boolean}
*/
isInstance: function (rx) {
return !!rx.pattern;
},
/**
* @function
*/
exec: (function () {
var rx2;
/**
* @param {string} s
* @param {jsx.regexp.RegExp} rx
* @return {Object|null}
* @see RegExp.prototype.exec()
*/
function _exec (s, rx)
{
/* NOTE: Use passed expression only when called statically */
if (_getClass(this) == "RegExp") rx = this;
rx._realExec = (rx._oldExec || rx.exec);
var matches = rx._realExec(s);
if (matches && _RegExp2.isInstance(rx))
{
matches.groups = _getDataObject();
if (rx.unicodeMode && !rx2)
{
rx2 = new _RegExp2("^\\W+", "u");
}
for (var i = 0, len = matches.length; i < len; ++i)
{
/* Trim leading \b matches */
var patternGroup = rx._patternGroups[i];
if (rx.unicodeMode
&& patternGroup
&& patternGroup.match(
/^(\((\?P?(<([^>]+)>|'([^']+)'))?)*\\b/))
{
matches[i] = matches[i].replace(rx2, "");
}
matches.groups[rx.groups[i] || i] = matches[i];
}
}
return matches;
}
return _exec;
}()),
/**
* (Re-)defines one or more property classes.
*
* @param {Object} o
* Object whose own enumerable properties are used
* for property class definitions
* @return {jsx.regexp.RegExp}
* This object
*/
definePropertyClasses: function (o) {
for (var keys = _jsx_object.getKeys(o), i = 0, len = keys.length;
i < len; ++i)
{
var p = keys[i];
this.propertyClasses[p] = o[p];
}
return this;
},
/**
* Deletes a property class.
*
* @param {String} p
* @return {boolean}
* <code>true</code> if successful, <code>false</code> otherwise.
*/
deletePropertyClass: function (p) {
return (delete this.propertyClasses[p]);
}
}
);
/**
 * Exception thrown if a character property class is referenced,
 * but the Unicode Character Database (UCD) cannot be loaded
 *
 * @constructor
 * @param {String} sUCDScript
 *   The script that contains the UCD in the specified form
 * @param {String} sHTTPScript
 *   The script that contains the HTTP request type to load the UCD
 *   dynamically
 * @type jsx.regexp.UCDLoadError
 * @extends jsx#Error
 */
var _UCDLoadError = function (sUCDScript, sHTTPScript) {
  var hint = " Please include " + sUCDScript + " or " + sHTTPScript + ".";
  var message = "Unable to load the Unicode Character Database." + hint;

  _UCDLoadError._super.call(this, message);
}.extend(jsx.Error, {
  /**
   * @memberOf jsx.regexp.UCDLoadError#prototype
   */
  name: "jsx.regexp.UCDLoadError"
});
/**
 * Exception thrown if a referred character property class
 * cannot be resolved
 *
 * @constructor
 * @param sMsg
 *   Optional detail that is appended to the message
 * @extends jsx.object.PropertyError
 */
var _UndefinedPropertyClassError = function (sMsg) {
  _UndefinedPropertyClassError._super.call(this);

  var detail = "";
  if (arguments.length > 0)
  {
    detail = ": " + sMsg;
  }

  this.message = "Undefined property class" + detail;
}.extend(jsx.object.PropertyError, {
  /**
   * @memberOf jsx.regexp.UndefinedPropertyClassError#prototype
   */
  name: "jsx.regexp.UndefinedPropertyClassError"
});
/**
 * Exception thrown if a property class value can not be expanded
 *
 * @constructor
 * @param sMsg
 *   Optional detail that is appended to the message
 * @extends jsx.object.ObjectError
 */
var _InvalidPropertyClassError = function (sMsg) {
  _InvalidPropertyClassError._super.call(this);

  var detail = "";
  if (arguments.length > 0)
  {
    detail = ": " + sMsg;
  }

  this.message = "Invalid property class value" + detail;
}.extend(jsx.object.ObjectError, {
  name: "jsx.regexp.InvalidPropertyClassError"
});
/**
 * Wrapper for a string value whose <code>match()</code> and
 * <code>replace()</code> methods support the features of
 * {@link jsx.regexp.RegExp}, like named groups.
 *
 * @constructor
 * @param s
 *   The value whose string representation is encapsulated
 * @extends String
 */
var _String = function jsx_regexp_String (s) {
  /* NOTE: In strict mode, this is undefined in a plain function call */
  if (!this || this.constructor != jsx_regexp_String)
  {
    jsx.throwThis("jsx.Error", "Must be called as constructor",
      "jsx.regexp.String");
  }

  this.value = String(s);
}.extend(String, (function () {
  var _replace = String.prototype.replace;

  function _toString () { return this.value; }

  return {
    /**
     * Matches a string against a regular expression, using special features
     * of jsx.regexp.RegExp if possible
     *
     * @function
     */
    match: (function () {
      /* Matches a pattern group that starts with \b */
      var rxLeadingGroups = /^(\((\?P?(<([^>]+)>|'([^']+)'))?)*\\b/;

      /* Matches leading non-word characters; created on first use */
      var rxNonWordChars;

      /* true if the pattern group exists and starts with \b */
      function _hasLeadingWordBorder (patternGroup)
      {
        return typeof patternGroup == "string"
          && rxLeadingGroups.test(patternGroup);
      }

      /*
       * Removes leading non-word characters from a match; groups that
       * did not participate in the match (undefined) are left alone.
       */
      function _trimMatch (match)
      {
        return (typeof match == "string"
          ? match.replace(rxNonWordChars, "")
          : match);
      }

      /**
       * @param {RegExp|jsx.regexp.RegExp} rx
       * @return {Array}
       *   The Array as if returned by String.prototype.match.call(this, rx)
       */
      return function (rx) {
        var matches = this.value.match(rx);
        if (!matches || !_RegExp2.isInstance(rx))
        {
          return matches;
        }

        var unicodeMode = rx.unicodeMode;
        if (unicodeMode && !rxNonWordChars)
        {
          rxNonWordChars = new _RegExp2("^\\W+", "u");
        }

        var i, len;

        if (rx.global)
        {
          /* All items are matches of the whole pattern; trim \b matches */
          if (unicodeMode && _hasLeadingWordBorder(rx._patternGroups[0]))
          {
            for (i = 0, len = matches.length; i < len; ++i)
            {
              matches[i] = _trimMatch(matches[i]);
            }
          }

          return matches;
        }

        matches.groups = _getDataObject();

        for (i = 0, len = matches.length; i < len; ++i)
        {
          if (unicodeMode && _hasLeadingWordBorder(rx._patternGroups[i]))
          {
            matches[i] = _trimMatch(matches[i]);
          }

          matches.groups[rx.groups[i] || i] = matches[i];
        }

        return matches;
      };
    }()),

    /**
     * Replaces matches in a string, and returns the new string.
     *
     * Different to {@link String.prototype.replace()},
     * this method also allows you to refer to backreferences
     * by name.  In a String-like object, you may use
     * <code>"${name}"</code>, and in a replacement function
     * you may use <code>this.groups["name"]</code>; unnamed groups
     * are available there by index, as with <code>match()</code>.
     *
     * NOTE: Because of the latter the replacement function
     * is called as a method of this object, not of
     * the Global Object anymore.  The <code>groups</code>
     * property of this object is retained; that is, the last
     * arguments to this method can be found in there.
     * (Arguments and return value of the replacement function
     * still work as specified in ECMAScript.)
     *
     * @memberOf jsx.regexp.String.prototype
     * @param {jsx.regexp.RegExp|RegExp|String} expression
     * @param {String|Function} replacement
     * @return {string}
     * @see String.prototype.replace
     */
    replace: function (expression, replacement) {
      if (jsx.regexp.RegExp.isInstance(expression))
      {
        var groups = expression.groups;
        var len = groups.length;

        if (typeof replacement == "function")
        {
          var me = this;

          return _replace.call(this, expression, function () {
            me.groups = _getDataObject();

            for (var i = 1; i <= len; ++i)
            {
              me.groups[groups[i] || i] = arguments[i];
            }

            return replacement.apply(me, arguments);
          });
        }

        for (var i = 1; i <= len; ++i)
        {
          var name = groups[i];

          /* Unnamed groups cannot be referred to by name */
          if (!name) continue;

          /*
           * Replace "${name}" with "$index"; the name is escaped as it
           * may contain characters that are special in an expression
           */
          replacement = _replace.call(
            replacement,
            new RegExp("\\$\\{" + _escape(name) + "\\}", "g"),
            "$" + i);
        }
      }

      return _replace.call(this, expression, replacement);
    },

    /**
     * Returns this object's encapsulated string value
     */
    toString: _toString,
    valueOf: _toString
  };
}()));
/**
 * Concatenates strings or regular expressions ({@link RegExp})
 * and returns the resulting <code>RegExp</code>.
 *
 * If flags are set with any <code>RegExp</code> argument, the
 * resulting <code>RegExp</code> has all of those flags set.
 * If any part was created with {@link jsx.regexp.RegExp}, that
 * constructor is used for the result as well.
 *
 * @author Copyright (c) 2005
 *   Thomas Lahn &lt;regexp.js@PointedEars.de&gt;
 * @param {RegExp|String}
 *   Expressions to be concatenated.  If an argument is not
 *   a {@link RegExp}, it is converted to {@link String}; this allows
 *   for expressions to be grouped and used in alternation.
 *   For characters to lose their special meaning, escape them in
 *   a <code>RegExp</code> argument or escape them twice in
 *   a converted (e.g. string) argument.
 *
 *   If this function is called as method of a <code>RegExp</code>,
 *   the expressions given are concatenated beginning with the
 *   <code>this</code> value.
 * @return {RegExp}
 *   The resulting <code>RegExp</code>
 */
function _concat ()
{
  /* Maps each flag character to the property that indicates the flag */
  var flagProperties = {
    g: "global",
    i: "ignoreCase",
    m: "multiline",
    y: "sticky",
    s: "dotAll",
    x: "extended",
    u: "unicodeMode"
  };

  /* Flag characters that were found set in at least one part */
  var flagsSet = {};

  /* Registers the flags that are set on a regular expression */
  function collectFlags (template)
  {
    for (var flag in flagProperties)
    {
      if (!flagsSet[flag] && template[flagProperties[flag]])
      {
        flagsSet[flag] = true;
      }
    }
  }

  var toSource = jsx.regexp.toString2;
  var sources = [];
  var anyExtended = false;

  if (_getClass(this) == "RegExp")
  {
    sources.push(toSource(this));
    collectFlags(this);
    anyExtended = _RegExp2.isInstance(this);
  }

  var argCount = arguments.length;
  for (var index = 0; index < argCount; index++)
  {
    var part = arguments[index];
    if (_getClass(part) != "RegExp")
    {
      sources.push(String(part));
      continue;
    }

    if (!anyExtended) anyExtended = _RegExp2.isInstance(part);
    sources.push(toSource(part));
    collectFlags(part);
  }

  /* Compose the flags string in the order of the flag definitions */
  var flagChars = [];
  for (var flag in flagProperties)
  {
    if (flagsSet[flag] === true) flagChars.push(flag);
  }

  var Constructor = (anyExtended ? _RegExp2 : RegExp);
  return new Constructor(sources.join(""), flagChars.join(""));
}
/**
 * Returns a {@link RegExp} that is an intersection of two
 * regular expressions.
 *
 * Each expression is treated as an alternation ("a|b|c", optionally
 * in parentheses); the result is the alternation of the parts
 * that both expressions have in common.
 *
 * @param {RegExp} pattern2
 * @param {RegExp} pattern1
 *   If not a {@link RegExp}, the <code>this</code> value is used
 *   instead.
 * @return {RegExp}
 *   A regular expression which matches the strings that both
 *   <var>pattern1</var> (or this object) and <var>pattern2</var>
 *   would match; <code>null</code> if one of the expressions
 *   is not a {@link RegExp}.
 */
function _intersect (pattern2, pattern1)
{
  var first = pattern1;
  if (!first || _getClass(first) != "RegExp")
  {
    if (_getClass(this) != "RegExp") return null;
    first = this;
  }

  /* Rule out invalid values */
  if (!pattern2 || _getClass(pattern2) != "RegExp") return null;

  var rxOuterParens = /^\(?([^)]*)\)?$/;

  /* Registers all parts within the alternation of an expression */
  function registerParts (rx)
  {
    /* Remove outer parentheses */
    var parts = rx.source.replace(rxOuterParens, "$1").split("|");
    var registry = {};
    for (var k = 0, count = parts.length; k < count; k++)
    {
      registry[parts[k]] = true;
    }

    return registry;
  }

  var firstParts = registerParts(first);
  var secondParts = registerParts(pattern2);

  var isOwn;
  if (typeof Object.prototype.hasOwnProperty == "function")
  {
    isOwn = function (o, p) {
      return o.hasOwnProperty(p);
    };
  }
  else
  {
    isOwn = function (o, p) {
      /* suffices _here_ */
      return typeof o[p] != "undefined"
        && typeof o.constructor.prototype[p] == "undefined";
    };
  }

  /* Compose the new alternation out of common parts */
  var common = [];
  for (var part in firstParts)
  {
    if (isOwn(secondParts, part)) common.push(part);
  }

  return new RegExp("(" + common.join("|") + ")");
}
/**
 * Returns an escaped version of the string that can be passed
 * as an argument to {@link Global#RegExp(string, string) RegExp()}
 * to match that string.
 *
 * @param {string} s
 *   The string to be escaped.  If this function is called without
 *   arguments as a method of a string, that string is used instead.
 * @return {string}
 */
function _escape (s)
{
  /*
   * NOTE: The this value itself must be tested; its constructor
   * is a function, never a string.
   */
  if (arguments.length === 0 && _isString(this)) s = this;

  return s.replace(/[\]\\^$*+?.(){}|[]/g, "\\$&");
}
/**
 * Returns the string representation of a {@link RegExp}
 * without delimiters.
 *
 * @param {RegExp} rx
 *   If not provided, the <code>this</code> value is used.
 * @return {string}
 *   The <code>source</code> property value of <var>rx</var> if it
 *   is a true-value; otherwise the part of its string representation
 *   between the first and the last "/"
 */
function _toString2 (rx)
{
  var target = (rx ? rx : this);

  var source = target.source;
  if (source)
  {
    return source;
  }

  return target.toString().replace(/[^\/]*\/(.+)\/[^\/]*/, "$1");
}
/**
 * Public interface of the RegExp library.
 *
 * @namespace
 */
jsx.regexp = {
  /**
   * @version
   * @memberOf jsx.regexp
   */
  version: "0.1.$Revision$",
  copyright: "Copyright \xA9 2005, 2009-2017",
  author: "Thomas Lahn",
  email: "js@PointedEars.de",
  path: "http://pointedears.de/scripts/",

  /**
   * Matches a string delimited by single quotes (apostrophes), in which
   * a character may be escaped with a backslash.  The first group
   * captures the content between the delimiters.
   *
   * NOTE: The content is matched repeatedly so that strings of any
   * length are matched, not only those of exactly one character.
   */
  SINGLE_QUOTED_STRING: /'((?:[^\\']|\\.)*)'/,

  /**
   * Matches a string delimited by double quotes, in which a character
   * may be escaped with a backslash.  The first group captures
   * the content between the delimiters.
   */
  DOUBLE_QUOTED_STRING: /"((?:[^\\"]|\\.)*)"/,

  UCDLoadError: _UCDLoadError,
  UndefinedPropertyClassError: _UndefinedPropertyClassError,
  InvalidPropertyClassError: _InvalidPropertyClassError,
  RegExp: _RegExp2,
  String: _String,
  concat: _concat,
  intersect: _intersect,
  escape: _escape,
  toString2: _toString2
};
// jsx.regexp.docURL = jsx.regexp.path + "regexp.htm";
/* Provide the library functions as methods of native objects on request */
if (jsx.options.augmentPrototypes)
{
  var regExpMethods = {
    /**
     * @memberOf RegExp.prototype
     */
    intersect: _intersect,
    concat: _concat,
    toString2: _toString2
  };
  jsx.object.extend(RegExp.prototype, regExpMethods);

  var stringMethods = {
    /**
     * @memberOf String.prototype
     */
    regExpEscape: _escape
  };
  jsx.object.extend(String.prototype, stringMethods);
}
}());
/*jshint -W098*/
/* Global aliases for members of jsx.regexp */
var
  /** @deprecated */
  regexp2str = jsx.regexp.toString2,

  /** @deprecated */
  regexp_concat = jsx.regexp.concat,

  /** @deprecated */
  regexp_intersect = jsx.regexp.intersect,

  /** @deprecated */
  strRegExpEscape = jsx.regexp.escape;