/
defaultSettings.js
114 lines (101 loc) · 4.18 KB
/
defaultSettings.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/** @typedef {import('./index').WPWordCountStrategy} WPWordCountStrategy */
/** @typedef {Partial<{type: WPWordCountStrategy, shortcodes: string[]}>} WPWordCountL10n */
/**
* @typedef WPWordCountSettingsFields
* @property {RegExp} HTMLRegExp Regular expression that matches HTML tags
* @property {RegExp} HTMLcommentRegExp Regular expression that matches HTML comments
* @property {RegExp} spaceRegExp Regular expression that matches spaces in HTML
* @property {RegExp} HTMLEntityRegExp Regular expression that matches HTML entities
* @property {RegExp} connectorRegExp Regular expression that matches word connectors, like em-dash
* @property {RegExp} removeRegExp Regular expression that matches various characters to be removed when counting
* @property {RegExp} astralRegExp Regular expression that matches astral UTF-16 code points
* @property {RegExp} wordsRegExp Regular expression that matches words
* @property {RegExp} characters_excluding_spacesRegExp Regular expression that matches characters excluding spaces
* @property {RegExp} characters_including_spacesRegExp Regular expression that matches characters including spaces
* @property {RegExp} shortcodesRegExp Regular expression that matches WordPress shortcodes
* @property {string[]} shortcodes List of all shortcodes
* @property {WPWordCountStrategy} type Describes what we are counting and how
* @property {WPWordCountL10n} l10n Object with human translations
*/
/**
* Lower-level settings for word counting that can be overridden.
*
* @typedef {Partial<WPWordCountSettingsFields>} WPWordCountUserSettings
*/
// Disable reason: JSDoc linter doesn't seem to parse the intersection (`&`) correctly: https://github.com/jsdoc/jsdoc/issues/1285
/* eslint-disable jsdoc/valid-types */
/**
* Word counting settings that include non-optional values we set if missing
*
* @typedef {WPWordCountUserSettings & typeof defaultSettings} WPWordCountDefaultSettings
*/
/* eslint-enable jsdoc/valid-types */
/**
 * Default word-counting settings: the regular expressions and the `l10n`
 * fallback that are used when the caller does not override them.
 *
 * Every pattern carries the global flag so a single `String.prototype.replace`
 * or `String.prototype.match` call processes all occurrences.
 */
export const defaultSettings = {
	// Opening and closing HTML tags, e.g. `<p>` or `</strong>`.
	HTMLRegExp: /<\/?[a-z][^>]*?>/gi,

	// HTML comments, including multi-line ones.
	HTMLcommentRegExp: /<!--[\s\S]*?-->/g,

	// The HTML entities for a non-breaking space, in named (`&nbsp;`) and
	// decimal numeric (`&#160;`) form. These must be spelled out as entity
	// text: a pattern made of literal whitespace characters never matches
	// the entities as they appear in HTML markup.
	spaceRegExp: /&nbsp;|&#160;/gi,

	// Any remaining HTML entity (`&` followed by non-space characters up to `;`).
	HTMLEntityRegExp: /&\S+?;/g,

	// Word connectors: a double hyphen or \u2014 (em-dash).
	connectorRegExp: /--|\u2014/g,

	// Characters to be removed from input text.
	removeRegExp: new RegExp(
		[
			'[',

			// Basic Latin (extract)
			// NOTE(review): \u0021-\u0040 also spans the ASCII digits
			// (\u0030-\u0039) — confirm that removing digits is intended.
			'\u0021-\u0040\u005B-\u0060\u007B-\u007E',

			// Latin-1 Supplement (extract)
			'\u0080-\u00BF\u00D7\u00F7',

			/*
			 * The following range consists of:
			 * General Punctuation
			 * Superscripts and Subscripts
			 * Currency Symbols
			 * Combining Diacritical Marks for Symbols
			 * Letterlike Symbols
			 * Number Forms
			 * Arrows
			 * Mathematical Operators
			 * Miscellaneous Technical
			 * Control Pictures
			 * Optical Character Recognition
			 * Enclosed Alphanumerics
			 * Box Drawing
			 * Block Elements
			 * Geometric Shapes
			 * Miscellaneous Symbols
			 * Dingbats
			 * Miscellaneous Mathematical Symbols-A
			 * Supplemental Arrows-A
			 * Braille Patterns
			 * Supplemental Arrows-B
			 * Miscellaneous Mathematical Symbols-B
			 * Supplemental Mathematical Operators
			 * Miscellaneous Symbols and Arrows
			 */
			'\u2000-\u2BFF',

			// Supplemental Punctuation
			'\u2E00-\u2E7F',
			']',
		].join( '' ),
		'g'
	),

	// Remove UTF-16 surrogate points, see https://en.wikipedia.org/wiki/UTF-16#U.2BD800_to_U.2BDFFF
	astralRegExp: /[\uD800-\uDBFF][\uDC00-\uDFFF]/g,

	// A non-whitespace character followed by one or more whitespace
	// characters, i.e. the end of a word that has trailing whitespace.
	wordsRegExp: /\S\s+/g,

	// Any single non-whitespace character.
	characters_excluding_spacesRegExp: /\S/g,

	/*
	 * Match anything that is not a formatting character, excluding:
	 * \f = form feed
	 * \n = new line
	 * \r = carriage return
	 * \t = tab
	 * \v = vertical tab
	 * \u00AD = soft hyphen
	 * \u2028 = line separator
	 * \u2029 = paragraph separator
	 */
	characters_including_spacesRegExp: /[^\f\n\r\t\v\u00AD\u2028\u2029]/g,

	// Fallback localisation data: count by words unless told otherwise.
	l10n: {
		type: 'words',
	},
};