mirrored from git://develop.git.wordpress.org/
/
class-wp-html-tag-processor.php
2308 lines (2092 loc) · 72.6 KB
/
class-wp-html-tag-processor.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<?php
/**
* Scans through an HTML document to find specific tags, then
* transforms those tags by adding, removing, or updating the
* values of the HTML attributes within that tag (opener).
*
* Does not fully parse HTML or _recurse_ into the HTML structure
* Instead this scans linearly through a document and only parses
* the HTML tag openers.
*
* ### Possible future direction for this module
*
* - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c".
* This would increase the size of the changes for some operations but leave more
* natural-looking output HTML.
* - Decode HTML character references within class names when matching. E.g. match having
* class `1<"2` needs to recognize `class="1<"2"`. Currently the Tag Processor
* will fail to find the right tag if the class name is encoded as such.
* - Properly decode HTML character references in `get_attribute()`. PHP's
* `html_entity_decode()` is wrong in a couple ways: it doesn't account for the
* no-ambiguous-ampersand rule, and it improperly handles the way semicolons may
* or may not terminate a character reference.
*
* @package WordPress
* @subpackage HTML-API
* @since 6.2.0
*/
/**
* Modifies attributes in an HTML document for tags matching a query.
*
* ## Usage
*
* Use of this class requires three steps:
*
* 1. Create a new class instance with your input HTML document.
* 2. Find the tag(s) you are looking for.
* 3. Request changes to the attributes in those tag(s).
*
* Example:
* ```php
* $tags = new WP_HTML_Tag_Processor( $html );
* if ( $tags->next_tag( 'option' ) ) {
* $tags->set_attribute( 'selected', true );
* }
* ```
*
* ### Finding tags
*
* The `next_tag()` function moves the internal cursor through
* your input HTML document until it finds a tag meeting any of
* the supplied restrictions in the optional query argument. If
* no argument is provided then it will find the next HTML tag,
* regardless of what kind it is.
*
* If you want to _find whatever the next tag is_:
* ```php
* $tags->next_tag();
* ```
*
* | Goal | Query |
* |-----------------------------------------------------------|---------------------------------------------------------------------------------|
* | Find any tag. | `$tags->next_tag();` |
* | Find next image tag. | `$tags->next_tag( array( 'tag_name' => 'img' ) );` |
* | Find next image tag (without passing the array). | `$tags->next_tag( 'img' );` |
* | Find next tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'class_name' => 'fullwidth' ) );` |
* | Find next image tag containing the `fullwidth` CSS class. | `$tags->next_tag( array( 'tag_name' => 'img', 'class_name' => 'fullwidth' ) );` |
*
* If a tag was found meeting your criteria then `next_tag()`
* will return `true` and you can proceed to modify it. If it
* returns `false`, however, it failed to find the tag and
* moved the cursor to the end of the file.
*
* Once the cursor reaches the end of the file the processor
* is done and if you want to reach an earlier tag you will
* need to recreate the processor and start over, as it's
* unable to back up or move in reverse.
*
* See the section on bookmarks for an exception to this
* no-backing-up rule.
*
* #### Custom queries
*
* Sometimes it's necessary to further inspect an HTML tag than
* the query syntax here permits. In these cases one may further
* inspect the search results using the read-only functions
* provided by the processor or external state or variables.
*
* Example:
* ```php
* // Paint up to the first five DIV or SPAN tags marked with the "jazzy" style.
* $remaining_count = 5;
* while ( $remaining_count > 0 && $tags->next_tag() ) {
* if (
* ( 'DIV' === $tags->get_tag() || 'SPAN' === $tags->get_tag() ) &&
* 'jazzy' === $tags->get_attribute( 'data-style' )
* ) {
* $tags->add_class( 'theme-style-everest-jazz' );
* $remaining_count--;
* }
* }
* ```
*
* `get_attribute()` will return `null` if the attribute wasn't present
* on the tag when it was called. It may return `""` (the empty string)
* in cases where the attribute was present but its value was empty.
* For boolean attributes, those whose name is present but no value is
* given, it will return `true` (the only way to set `false` for an
* attribute is to remove it).
*
* ### Modifying HTML attributes for a found tag
*
* Once you've found the start of an opening tag you can modify
* any number of the attributes on that tag. You can set a new
* value for an attribute, remove the entire attribute, or do
* nothing and move on to the next opening tag.
*
* Example:
* ```php
* if ( $tags->next_tag( array( 'class' => 'wp-group-block' ) ) ) {
* $tags->set_attribute( 'title', 'This groups the contained content.' );
* $tags->remove_attribute( 'data-test-id' );
* }
* ```
*
* If `set_attribute()` is called for an existing attribute it will
* overwrite the existing value. Similarly, calling `remove_attribute()`
* for a non-existing attribute has no effect on the document. Both
* of these methods are safe to call without knowing if a given attribute
* exists beforehand.
*
* ### Modifying CSS classes for a found tag
*
* The tag processor treats the `class` attribute as a special case.
* Because it's a common operation to add or remove CSS classes, this
* interface adds helper methods to make that easier.
*
* As with attribute values, adding or removing CSS classes is a safe
* operation that doesn't require checking if the attribute or class
* exists before making changes. If removing the only class then the
* entire `class` attribute will be removed.
*
* Example:
* ```php
* // from `<span>Yippee!</span>`
* // to `<span class="is-active">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<span class="excited">Yippee!</span>`
* // to `<span class="excited is-active">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<span class="is-active heavy-accent">Yippee!</span>`
* // to `<span class="is-active heavy-accent">Yippee!</span>`
* $tags->add_class( 'is-active' );
*
* // from `<input type="text" class="is-active rugby not-disabled" length="24">`
* // to `<input type="text" class="is-active not-disabled" length="24">
* $tags->remove_class( 'rugby' );
*
* // from `<input type="text" class="rugby" length="24">`
* // to `<input type="text" length="24">
* $tags->remove_class( 'rugby' );
*
* // from `<input type="text" length="24">`
* // to `<input type="text" length="24">
* $tags->remove_class( 'rugby' );
* ```
*
* When class changes are enqueued but a direct change to `class` is made via
* `set_attribute` then the changes to `set_attribute` (or `remove_attribute`)
* will take precedence over those made through `add_class` and `remove_class`.
*
* ### Bookmarks
*
* While scanning through the input HTMl document it's possible to set
* a named bookmark when a particular tag is found. Later on, after
* continuing to scan other tags, it's possible to `seek` to one of
* the set bookmarks and then proceed again from that point forward.
*
* Because bookmarks create processing overhead one should avoid
* creating too many of them. As a rule, create only bookmarks
* of known string literal names; avoid creating "mark_{$index}"
* and so on. It's fine from a performance standpoint to create a
* bookmark and update it frequently, such as within a loop.
*
* ```php
* $total_todos = 0;
* while ( $p->next_tag( array( 'tag_name' => 'UL', 'class_name' => 'todo' ) ) ) {
* $p->set_bookmark( 'list-start' );
* while ( $p->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
* if ( 'UL' === $p->get_tag() && $p->is_tag_closer() ) {
* $p->set_bookmark( 'list-end' );
* $p->seek( 'list-start' );
* $p->set_attribute( 'data-contained-todos', (string) $total_todos );
* $total_todos = 0;
* $p->seek( 'list-end' );
* break;
* }
*
* if ( 'LI' === $p->get_tag() && ! $p->is_tag_closer() ) {
* $total_todos++;
* }
* }
* }
* ```
*
* ## Design and limitations
*
* The Tag Processor is designed to linearly scan HTML documents and tokenize
* HTML tags and their attributes. It's designed to do this as efficiently as
* possible without compromising parsing integrity. Therefore it will be
* slower than some methods of modifying HTML, such as those incorporating
* over-simplified PCRE patterns, but will not introduce the defects and
* failures that those methods bring in, which lead to broken page renders
* and often to security vulnerabilities. On the other hand, it will be faster
* than full-blown HTML parsers such as DOMDocument and use considerably
* less memory. It requires a negligible memory overhead, enough to consider
* it a zero-overhead system.
*
* The performance characteristics are maintained by avoiding tree construction
* and semantic cleanups which are specified in HTML5. Because of this, for
* example, it's not possible for the Tag Processor to associate any given
* opening tag with its corresponding closing tag, or to return the inner markup
* inside an element. Systems may be built on top of the Tag Processor to do
* this, but the Tag Processor is and should be constrained so it can remain an
* efficient, low-level, and reliable HTML scanner.
*
* The Tag Processor's design incorporates a "garbage-in-garbage-out" philosophy.
* HTML5 specifies that certain invalid content be transformed into different forms
* for display, such as removing null bytes from an input document and replacing
* invalid characters with the Unicode replacement character U+FFFD �. Where errors
* or transformations exist within the HTML5 specification, the Tag Processor leaves
* those invalid inputs untouched, passing them through to the final browser to handle.
* While this implies that certain operations will be non-spec-compliant, such as
* reading the value of an attribute with invalid content, it also preserves a
* simplicity and efficiency for handling those error cases.
*
* Most operations within the Tag Processor are designed to minimize the difference
* between an input and output document for any given change. For example, the
* `add_class` and `remove_class` methods preserve whitespace and the class ordering
* within the `class` attribute; and when encountering tags with duplicated attributes,
* the Tag Processor will leave those invalid duplicate attributes where they are but
* update the proper attribute which the browser will read for parsing its value. An
* exception to this rule is that all attribute updates store their values as
* double-quoted strings, meaning that attributes on input with single-quoted or
* unquoted values will appear in the output with double-quotes.
*
* @since 6.2.0
*/
class WP_HTML_Tag_Processor {
/**
* The maximum number of bookmarks allowed to exist at
* any given time.
*
* @see set_bookmark()
* @since 6.2.0
* @var int
*/
const MAX_BOOKMARKS = 10;
/**
* Maximum number of times seek() can be called.
* Prevents accidental infinite loops.
*
* @see seek()
* @since 6.2.0
* @var int
*/
const MAX_SEEK_OPS = 1000;
/**
* The HTML document to parse.
*
* @since 6.2.0
* @var string
*/
protected $html;
/**
* The last query passed to next_tag().
*
* @since 6.2.0
* @var array|null
*/
private $last_query;
/**
* The tag name this processor currently scans for.
*
* @since 6.2.0
* @var string|null
*/
private $sought_tag_name;
/**
* The CSS class name this processor currently scans for.
*
* @since 6.2.0
* @var string|null
*/
private $sought_class_name;
/**
* The match offset this processor currently scans for.
*
* @since 6.2.0
* @var int|null
*/
private $sought_match_offset;
/**
* Whether to visit tag closers, e.g. </div>, when walking an input document.
*
* @since 6.2.0
* @var bool
*/
private $stop_on_tag_closers;
/**
* Holds updated HTML as updates are applied.
*
* Updates and unmodified portions of the input document are
* appended to this value as they are applied. It will hold
* a copy of the updated document up until the point of the
* latest applied update. The fully-updated HTML document
* will comprise this value plus the part of the input document
* which follows that latest update.
*
* @see $bytes_already_copied
*
* @since 6.2.0
* @var string
*/
private $output_buffer = '';
/**
* How many bytes from the original HTML document have been read and parsed.
*
* This value points to the latest byte offset in the input document which
* has been already parsed. It is the internal cursor for the Tag Processor
* and updates while scanning through the HTML tokens.
*
* @since 6.2.0
* @var int
*/
private $bytes_already_parsed = 0;
/**
* How many bytes from the input HTML document have already been
* copied into the output buffer.
*
* Lexical updates are enqueued and processed in batches. Prior
* to any given update in the input document, there might exist
* a span of HTML unaffected by any changes. This span ought to
* be copied verbatim into the output buffer before applying the
* following update. This value will point to the starting byte
* offset in the input document where that unaffected span of
* HTML starts.
*
* @since 6.2.0
* @var int
*/
private $bytes_already_copied = 0;
/**
* Byte offset in input document where current tag name starts.
*
* Example:
* ```
* <div id="test">...
* 01234
* - tag name starts at 1
* ```
*
* @since 6.2.0
* @var int|null
*/
private $tag_name_starts_at;
/**
* Byte length of current tag name.
*
* Example:
* ```
* <div id="test">...
* 01234
* --- tag name length is 3
* ```
*
* @since 6.2.0
* @var int|null
*/
private $tag_name_length;
/**
* Byte offset in input document where current tag token ends.
*
* Example:
* ```
* <div id="test">...
* 0 1 |
* 01234567890123456
* --- tag name ends at 14
* ```
*
* @since 6.2.0
* @var int|null
*/
private $tag_ends_at;
/**
* Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
*
* @var bool
*/
private $is_closing_tag;
/**
* Lazily-built index of attributes found within an HTML tag, keyed by the attribute name.
*
* Example:
* ```php
* // supposing the parser is working through this content
* // and stops after recognizing the `id` attribute
* // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
* // ^ parsing will continue from this point
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 )
* );
*
* // when picking up parsing again, or when asking to find the
* // `class` attribute we will continue and add to this array
* $this->attributes = array(
* 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ),
* 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 )
* );
*
* // Note that only the `class` attribute value is stored in the index.
* // That's because it is the only value used by this class at the moment.
* ```
*
* @since 6.2.0
* @var WP_HTML_Attribute_Token[]
*/
private $attributes = array();
/**
* Which class names to add or remove from a tag.
*
* These are tracked separately from attribute updates because they are
* semantically distinct, whereas this interface exists for the common
* case of adding and removing class names while other attributes are
* generally modified as with DOM `setAttribute` calls.
*
* When modifying an HTML document these will eventually be collapsed
* into a single `set_attribute( 'class', $changes )` call.
*
* Example:
* ```php
* // Add the `wp-block-group` class, remove the `wp-group` class.
* $classname_updates = array(
* // Indexed by a comparable class name
* 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS,
* 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS
* );
* ```
*
* @since 6.2.0
* @var bool[]
*/
private $classname_updates = array();
/**
* Tracks a semantic location in the original HTML which
* shifts with updates as they are applied to the document.
*
* @since 6.2.0
* @var WP_HTML_Span[]
*/
protected $bookmarks = array();
const ADD_CLASS = true;
const REMOVE_CLASS = false;
const SKIP_CLASS = null;
/**
* Lexical replacements to apply to input HTML document.
*
* "Lexical" in this class refers to the part of this class which
* operates on pure text _as text_ and not as HTML. There's a line
* between the public interface, with HTML-semantic methods like
* `set_attribute` and `add_class`, and an internal state that tracks
* text offsets in the input document.
*
* When higher-level HTML methods are called, those have to transform their
* operations (such as setting an attribute's value) into text diffing
* operations (such as replacing the sub-string from indices A to B with
* some given new string). These text-diffing operations are the lexical
* updates.
*
* As new higher-level methods are added they need to collapse their
* operations into these lower-level lexical updates since that's the
* Tag Processor's internal language of change. Any code which creates
* these lexical updates must ensure that they do not cross HTML syntax
* boundaries, however, so these should never be exposed outside of this
* class or any classes which intentionally expand its functionality.
*
* These are enqueued while editing the document instead of being immediately
* applied to avoid processing overhead, string allocations, and string
* copies when applying many updates to a single document.
*
* Example:
* ```php
* // Replace an attribute stored with a new value, indices
* // sourced from the lazily-parsed HTML recognizer.
* $start = $attributes['src']->start;
* $end = $attributes['src']->end;
* $modifications[] = new WP_HTML_Text_Replacement( $start, $end, $new_value );
*
* // Correspondingly, something like this will appear in this array.
* $lexical_updates = array(
* WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' )
* );
* ```
*
* @since 6.2.0
* @var WP_HTML_Text_Replacement[]
*/
protected $lexical_updates = array();
/**
* Tracks and limits `seek()` calls to prevent accidental infinite loops.
*
* @see seek
* @since 6.2.0
* @var int
*/
protected $seek_count = 0;
/**
* Constructor.
*
* @since 6.2.0
*
* @param string $html HTML to process.
*/
public function __construct( $html ) {
$this->html = $html;
}
/**
* Finds the next tag matching the $query.
*
* @since 6.2.0
*
* @param array|string|null $query {
* Optional. Which tag name to find, having which class, etc. Default is to find any tag.
*
* @type string|null $tag_name Which tag to find, or `null` for "any tag."
* @type int|null $match_offset Find the Nth tag matching all search criteria.
* 1 for "first" tag, 3 for "third," etc.
* Defaults to first tag.
* @type string|null $class_name Tag must contain this whole class name to match.
* @type string|null $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. </div>.
* }
* @return boolean Whether a tag was matched.
*/
public function next_tag( $query = null ) {
$this->parse_query( $query );
$already_found = 0;
do {
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
return false;
}
// Find the next tag if it exists.
if ( false === $this->parse_next_tag() ) {
$this->bytes_already_parsed = strlen( $this->html );
return false;
}
// Parse all of its attributes.
while ( $this->parse_next_attribute() ) {
continue;
}
// Ensure that the tag closes before the end of the document.
$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
if ( false === $tag_ends_at ) {
return false;
}
$this->tag_ends_at = $tag_ends_at;
$this->bytes_already_parsed = $tag_ends_at;
// Finally, check if the parsed tag and its attributes match the search query.
if ( $this->matches() ) {
++$already_found;
}
/*
* For non-DATA sections which might contain text that looks like HTML tags but
* isn't, scan with the appropriate alternative mode. Looking at the first letter
* of the tag name as a pre-check avoids a string allocation when it's not needed.
*/
$t = $this->html[ $this->tag_name_starts_at ];
if ( ! $this->is_closing_tag && ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) ) {
$tag_name = $this->get_tag();
if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
$this->bytes_already_parsed = strlen( $this->html );
return false;
} elseif (
( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) &&
! $this->skip_rcdata( $tag_name )
) {
$this->bytes_already_parsed = strlen( $this->html );
return false;
}
}
} while ( $already_found < $this->sought_match_offset );
return true;
}
/**
* Sets a bookmark in the HTML document.
*
* Bookmarks represent specific places or tokens in the HTML
* document, such as a tag opener or closer. When applying
* edits to a document, such as setting an attribute, the
* text offsets of that token may shift; the bookmark is
* kept updated with those shifts and remains stable unless
* the entire span of text in which the token sits is removed.
*
* Release bookmarks when they are no longer needed.
*
* Example:
* ```
* <main><h2>Surprising fact you may not know!</h2></main>
* ^ ^
* \-|-- this `H2` opener bookmark tracks the token
*
* <main class="clickbait"><h2>Surprising fact you may no…
* ^ ^
* \-|-- it shifts with edits
* ```
*
* Bookmarks provide the ability to seek to a previously-scanned
* place in the HTML document. This avoids the need to re-scan
* the entire document.
*
* Example:
* ```
* <ul><li>One</li><li>Two</li><li>Three</li></ul>
* ^^^^
* want to note this last item
*
* $p = new WP_HTML_Tag_Processor( $html );
* $in_list = false;
* while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) {
* if ( 'UL' === $p->get_tag() ) {
* if ( $p->is_tag_closer() ) {
* $in_list = false;
* $p->set_bookmark( 'resume' );
* if ( $p->seek( 'last-li' ) ) {
* $p->add_class( 'last-li' );
* }
* $p->seek( 'resume' );
* $p->release_bookmark( 'last-li' );
* $p->release_bookmark( 'resume' );
* } else {
* $in_list = true;
* }
* }
*
* if ( 'LI' === $p->get_tag() ) {
* $p->set_bookmark( 'last-li' );
* }
* }
* ```
*
* Bookmarks intentionally hide the internal string offsets
* to which they refer. They are maintained internally as
* updates are applied to the HTML document and therefore
* retain their "position" - the location to which they
* originally pointed. The inability to use bookmarks with
* functions like `substr` is therefore intentional to guard
* against accidentally breaking the HTML.
*
* Because bookmarks allocate memory and require processing
* for every applied update, they are limited and require
* a name. They should not be created with programmatically-made
* names, such as "li_{$index}" with some loop. As a general
* rule they should only be created with string-literal names
* like "start-of-section" or "last-paragraph".
*
* Bookmarks are a powerful tool to enable complicated behavior.
* Consider double-checking that you need this tool if you are
* reaching for it, as inappropriate use could lead to broken
* HTML structure or unwanted processing overhead.
*
* @since 6.2.0
*
* @param string $name Identifies this particular bookmark.
* @return bool Whether the bookmark was successfully created.
*/
public function set_bookmark( $name ) {
if ( null === $this->tag_name_starts_at ) {
return false;
}
if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= self::MAX_BOOKMARKS ) {
_doing_it_wrong(
__METHOD__,
__( 'Too many bookmarks: cannot create any more.' ),
'6.2.0'
);
return false;
}
$this->bookmarks[ $name ] = new WP_HTML_Span(
$this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ),
$this->tag_ends_at
);
return true;
}
/**
* Removes a bookmark that is no longer needed.
*
* Releasing a bookmark frees up the small
* performance overhead it requires.
*
* @param string $name Name of the bookmark to remove.
* @return bool Whether the bookmark already existed before removal.
*/
public function release_bookmark( $name ) {
if ( ! array_key_exists( $name, $this->bookmarks ) ) {
return false;
}
unset( $this->bookmarks[ $name ] );
return true;
}
/**
* Skips contents of title and textarea tags.
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
* @since 6.2.0
*
* @param string $tag_name – the lowercase tag name which will close the RCDATA region.
* @return bool Whether an end to the RCDATA region was found before the end of the document.
*/
private function skip_rcdata( $tag_name ) {
$html = $this->html;
$doc_length = strlen( $html );
$tag_length = strlen( $tag_name );
$at = $this->bytes_already_parsed;
while ( false !== $at && $at < $doc_length ) {
$at = strpos( $this->html, '</', $at );
// If there is no possible tag closer then fail.
if ( false === $at || ( $at + $tag_length ) >= $doc_length ) {
$this->bytes_already_parsed = $doc_length;
return false;
}
$closer_potentially_starts_at = $at;
$at += 2;
/*
* Find a case-insensitive match to the tag name.
*
* Because tag names are limited to US-ASCII there is no
* need to perform any kind of Unicode normalization when
* comparing; any character which could be impacted by such
* normalization could not be part of a tag name.
*/
for ( $i = 0; $i < $tag_length; $i++ ) {
$tag_char = $tag_name[ $i ];
$html_char = $html[ $at + $i ];
if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
$at += $i;
continue 2;
}
}
$at += $tag_length;
$this->bytes_already_parsed = $at;
/*
* Ensure that the tag name terminates to avoid matching on
* substrings of a longer tag name. For example, the sequence
* "</textarearug" should not match for "</textarea" even
* though "textarea" is found within the text.
*/
$c = $html[ $at ];
if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
continue;
}
while ( $this->parse_next_attribute() ) {
continue;
}
$at = $this->bytes_already_parsed;
if ( $at >= strlen( $this->html ) ) {
return false;
}
if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) {
$this->bytes_already_parsed = $closer_potentially_starts_at;
return true;
}
}
return false;
}
/**
* Skips contents of script tags.
*
* @since 6.2.0
*
* @return bool Whether the script tag was closed before the end of the document.
*/
private function skip_script_data() {
$state = 'unescaped';
$html = $this->html;
$doc_length = strlen( $html );
$at = $this->bytes_already_parsed;
while ( false !== $at && $at < $doc_length ) {
$at += strcspn( $html, '-<', $at );
/*
* For all script states a "-->" transitions
* back into the normal unescaped script mode,
* even if that's the current state.
*/
if (
$at + 2 < $doc_length &&
'-' === $html[ $at ] &&
'-' === $html[ $at + 1 ] &&
'>' === $html[ $at + 2 ]
) {
$at += 3;
$state = 'unescaped';
continue;
}
// Everything of interest past here starts with "<".
if ( $at + 1 >= $doc_length || '<' !== $html[ $at++ ] ) {
continue;
}
/*
* Unlike with "-->", the "<!--" only transitions
* into the escaped mode if not already there.
*
* Inside the escaped modes it will be ignored; and
* should never break out of the double-escaped
* mode and back into the escaped mode.
*
* While this requires a mode change, it does not
* impact the parsing otherwise, so continue
* parsing after updating the state.
*/
if (
$at + 2 < $doc_length &&
'!' === $html[ $at ] &&
'-' === $html[ $at + 1 ] &&
'-' === $html[ $at + 2 ]
) {
$at += 3;
$state = 'unescaped' === $state ? 'escaped' : $state;
continue;
}
if ( '/' === $html[ $at ] ) {
$closer_potentially_starts_at = $at - 1;
$is_closing = true;
++$at;
} else {
$is_closing = false;
}
/*
* At this point the only remaining state-changes occur with the
* <script> and </script> tags; unless one of these appears next,
* proceed scanning to the next potential token in the text.
*/
if ( ! (
$at + 6 < $doc_length &&
( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
( 'i' === $html[ $at + 3 ] || 'I' === $html[ $at + 3 ] ) &&
( 'p' === $html[ $at + 4 ] || 'P' === $html[ $at + 4 ] ) &&
( 't' === $html[ $at + 5 ] || 'T' === $html[ $at + 5 ] )
) ) {
++$at;
continue;
}
/*
* Ensure that the script tag terminates to avoid matching on
* substrings of a non-match. For example, the sequence
* "<script123" should not end a script region even though
* "<script" is found within the text.
*/
if ( $at + 6 >= $doc_length ) {
continue;
}
$at += 6;
$c = $html[ $at ];
if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
++$at;
continue;
}
if ( 'escaped' === $state && ! $is_closing ) {
$state = 'double-escaped';
continue;
}
if ( 'double-escaped' === $state && $is_closing ) {
$state = 'escaped';
continue;
}
if ( $is_closing ) {
$this->bytes_already_parsed = $closer_potentially_starts_at;
if ( $this->bytes_already_parsed >= $doc_length ) {
return false;
}
while ( $this->parse_next_attribute() ) {
continue;
}
if ( '>' === $html[ $this->bytes_already_parsed ] ) {
$this->bytes_already_parsed = $closer_potentially_starts_at;
return true;
}
}
++$at;
}
return false;
}
/**
* Parses the next tag.
*
* This will find and start parsing the next tag, including
* the opening `<`, the potential closer `/`, and the tag
* name. It does not parse the attributes or scan to the
* closing `>`; these are left for other methods.
*
* @since 6.2.0
*
* @return bool Whether a tag was found before the end of the document.
*/
private function parse_next_tag() {
$this->after_tag();
$html = $this->html;
$doc_length = strlen( $html );
$at = $this->bytes_already_parsed;
while ( false !== $at && $at < $doc_length ) {
$at = strpos( $html, '<', $at );
if ( false === $at ) {
return false;
}
if ( '/' === $this->html[ $at + 1 ] ) {
$this->is_closing_tag = true;
$at++;
} else {
$this->is_closing_tag = false;
}
/*
* HTML tag names must start with [a-zA-Z] otherwise they are not tags.
* For example, "<3" is rendered as text, not a tag opener. If at least
* one letter follows the "<" then _it is_ a tag, but if the following