-
Notifications
You must be signed in to change notification settings - Fork 638
/
HtmlParser.php
1032 lines (1007 loc) · 52.1 KB
/
HtmlParser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<?php
/*
* This file is part of the TYPO3 CMS project.
*
* It is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License, either version 2
* of the License, or any later version.
*
* For the full copyright and license information, please read the
* LICENSE.txt file that was distributed with this source code.
*
* The TYPO3 project - inspiring people to share!
*/
namespace TYPO3\CMS\Core\Html;
use TYPO3\CMS\Core\Utility\GeneralUtility;
use TYPO3\CMS\Core\Utility\MathUtility;
/**
* Functions for parsing HTML.
* You are encouraged to use this class in your own applications
*/
class HtmlParser
{
// Void elements that do not have closing tags, as defined by HTML5, except link element
public const VOID_ELEMENTS = 'area|base|br|col|command|embed|hr|img|input|keygen|meta|param|source|track|wbr';
/************************************
*
* Parsing HTML code
*
************************************/
/**
 * Splits $content into alternating "outside" and "block" chunks. A block is the markup spanned
 * by one of the listed tags, start and end tag included; occurrences nested inside an open
 * block stay part of that block.
 * Even indexes of the result hold the content outside the blocks, odd indexes hold the blocks.
 * Use ->removeFirstAndLastTag() to strip the surrounding tags from a block if needed.
 *
 * @param string $tag List of tags, comma separated.
 * @param string $content HTML-content
 * @param bool $eliminateExtraEndTags If set, end tags without a matching start tag are dropped - you should probably set this in most cases.
 * @return array Even indexes are outside the blocks, odd indexes are block-content. Empty array if the content could not be split.
 * @see splitTags()
 * @see removeFirstAndLastTag()
 */
public function splitIntoBlock($tag, $content, $eliminateExtraEndTags = false)
{
    $quotedNames = array_map(
        static fn (string $name): string => preg_quote($name, '/'),
        array_unique(GeneralUtility::trimExplode(',', $tag, true))
    );
    $pattern = '/\\<\\/?(' . implode('|', $quotedNames) . ')(\\s*\\>|\\s[^\\>]*\\>)/si';
    $segments = preg_split($pattern, $content);
    if (empty($segments)) {
        return [];
    }
    $result = [];
    // Text collected for the chunk that is currently being built
    $current = $segments[0];
    // Byte position in $content of the next, not yet consumed tag
    $offset = strlen($current);
    // Number of currently open (nested) blocks
    $depth = 0;
    $segmentCount = count($segments);
    // Every segment after the first one is preceded in $content by exactly one matched tag
    for ($i = 1; $i < $segmentCount; $i++) {
        $textLength = strlen($segments[$i]);
        $isClosing = substr($content, $offset, 2) === '</';
        // The pattern never allows ">" inside a tag, so the tag ends at the next ">"
        $tagLength = strcspn($content, '>', $offset) + 1;
        if (!$isClosing) {
            if ($depth === 0) {
                // Ground level: a new block starts, flush what was collected outside of it
                $result[] = $current;
                $current = '';
            }
            $depth++;
            // Start tag plus the text following it
            $chunk = substr($content, $offset, $textLength + $tagLength);
            $offset += strlen($chunk);
            $current .= $chunk;
            continue;
        }
        // End tag
        $depth--;
        $dropped = $eliminateExtraEndTags && $depth < 0;
        if ($dropped) {
            // Surplus end tag: forget it and stay on ground level
            $depth = 0;
        } else {
            $current .= substr($content, $offset, $tagLength);
        }
        $offset += $tagLength;
        if ($depth === 0 && !$dropped) {
            // The block was closed properly, so it is complete
            $result[] = $current;
            $current = '';
        }
        // Text following the end tag
        $chunk = substr($content, $offset, $textLength);
        $offset += strlen($chunk);
        $current .= $chunk;
    }
    $result[] = $current;
    return $result;
}
/**
 * Splits content into blocks *recursively* and hands tags and content to call back methods.
 * The inner content of a block is processed (one level deeper) before the tag call back of the
 * block itself is invoked.
 *
 * @param string $tag Tag list, see splitIntoBlock()
 * @param string $content Content, see splitIntoBlock()
 * @param object $procObj Object where call back methods are.
 * @param string $callBackContent Name of call back method for content; "function callBackContent($str,$level)". Skipped if empty.
 * @param string $callBackTags Name of call back method for tags; "function callBackTags($tags,$level)". Skipped if empty.
 * @param int $level Indent level
 * @return string Processed content
 * @see splitIntoBlock()
 */
public function splitIntoBlockRecursiveProc($tag, $content, &$procObj, $callBackContent, $callBackTags, $level = 0)
{
    $chunks = $this->splitIntoBlock($tag, $content, true);
    foreach ($chunks as $index => $chunk) {
        if ($index % 2 === 0) {
            // Even index: content outside of the blocks
            if ($callBackContent) {
                $chunks[$index] = $procObj->{$callBackContent}($chunks[$index], $level);
            }
            continue;
        }
        // Odd index: a block including its start and end tag
        $originalTagName = $this->getFirstTagName($chunk, true);
        $tagData = [
            'tag_start' => $this->getFirstTag($chunk),
            'tag_end' => '</' . $originalTagName . '>',
            'tag_name' => strtolower($originalTagName),
        ];
        $tagData['content'] = $this->splitIntoBlockRecursiveProc(
            $tag,
            $this->removeFirstAndLastTag($chunk),
            $procObj,
            $callBackContent,
            $callBackTags,
            $level + 1
        );
        if ($callBackTags) {
            $tagData = $procObj->{$callBackTags}($tagData, $level);
        }
        $chunks[$index] = $tagData['tag_start'] . $tagData['content'] . $tagData['tag_end'];
    }
    return implode('', $chunks);
}
/**
 * Splits $content at every single start tag (including self-closing ones) from the tag list.
 * Even indexes of the result hold the content between the matched tags, odd indexes hold the
 * matched tags themselves. End tags are not matched and stay part of the content.
 *
 * @param string $tag List of tags, comma separated.
 * @param string $content HTML-content
 * @return array Even indexes are content, odd indexes are the matched tags. Empty array if the content could not be split.
 * @see splitIntoBlock()
 * @see removeFirstAndLastTag()
 */
public function splitTags($tag, $content)
{
    $quotedNames = array_map(
        static fn (string $name): string => preg_quote($name, '/'),
        GeneralUtility::trimExplode(',', $tag, true)
    );
    $pattern = '/\\<(' . implode('|', $quotedNames) . ')(\\s[^>]*)?\\/?>/si';
    $segments = preg_split($pattern, $content);
    if (empty($segments)) {
        return [];
    }
    $result = [$segments[0]];
    // Byte position in $content of the next, not yet consumed tag
    $offset = strlen($segments[0]);
    $segmentCount = count($segments);
    // Every segment after the first one is preceded in $content by exactly one matched tag
    for ($i = 1; $i < $segmentCount; $i++) {
        // The pattern never allows ">" inside a tag, so the tag ends at the next ">"
        $matchedTag = substr($content, $offset, strcspn($content, '>', $offset) + 1);
        $result[] = $matchedTag;
        $result[] = $segments[$i];
        $offset += strlen($matchedTag) + strlen($segments[$i]);
    }
    return $result;
}
/**
 * Removes the first and the last element node (tag) found in the string.
 * Anything before the first and after the last tag is removed as well, only what lies
 * between the two is returned.
 *
 * @param string $str String to process
 * @return string Content between the first and the last tag; empty string if there is no tag or only a single one
 */
public function removeFirstAndLastTag($str)
{
    $parser = SimpleParser::fromString($str);
    $firstElement = $parser->getFirstNode(SimpleNode::TYPE_ELEMENT);
    $lastElement = $parser->getLastNode(SimpleNode::TYPE_ELEMENT);
    if ($firstElement === null || $firstElement === $lastElement) {
        return '';
    }
    $nodes = $parser->getNodes();
    $start = $firstElement->getIndex() + 1;
    $length = $lastElement->getIndex() - $firstElement->getIndex() - 1;
    $inner = '';
    foreach (array_slice($nodes, $start, $length) as $node) {
        $inner .= strval($node);
    }
    return $inner;
}
/**
 * Returns the first tag in $str.
 * Actually everything from the beginning of $str up to and including the first tag is
 * returned, so you better make sure the tag is the first thing...
 *
 * @param string $str HTML string with tags
 * @return string Leading part of $str ending with the first tag; empty string if there is no tag
 */
public function getFirstTag($str)
{
    $parser = SimpleParser::fromString($str);
    $firstElement = $parser->getFirstNode(SimpleNode::TYPE_ELEMENT);
    if ($firstElement === null) {
        return '';
    }
    $nodes = $parser->getNodes();
    $leading = '';
    foreach (array_slice($nodes, 0, $firstElement->getIndex() + 1) as $node) {
        $leading .= strval($node);
    }
    return $leading;
}
/**
 * Returns the NAME of the first tag in $str, i.e. of the first element node that reports a name.
 *
 * @param string $str HTML string with tags
 * @param bool $preserveCase If set, the tag name is returned as written; otherwise it is converted to upper case.
 * @return string Tag name (upper case unless $preserveCase is set); empty string if no named tag was found
 * @see getFirstTag()
 */
public function getFirstTagName($str, $preserveCase = false)
{
    foreach (SimpleParser::fromString($str)->getNodes(SimpleNode::TYPE_ELEMENT) as $element) {
        $name = $element->getElementName();
        if ($name !== null) {
            return $preserveCase ? $name : strtoupper($name);
        }
    }
    return '';
}
/**
 * Returns an array with all attributes as keys. Attribute names are lower-cased and reduced to
 * alphanumeric characters, "_", ":" and "-".
 * If an attribute is empty (shorthand), then the value for the key is empty. You can check if it existed with isset()
 *
 * Compared to the method in GeneralUtility::get_tag_attributes this method also returns meta data about each
 * attribute: 'origTag' holds the attribute name as written (before lower-casing) and 'dashType' holds the
 * quote character that enclosed the value ('"', "'" or '' if unquoted). 'dashType' is only set for
 * attributes that were given a value.
 *
 * @param string $tag Tag: $tag is either a whole tag (eg '<TAG OPTION ATTRIB=VALUE>') or the parameterlist (ex ' OPTION ATTRIB=VALUE>')
 * @param bool $deHSC If set, the attribute values are de-htmlspecialchar'ed. Should actually always be set!
 * @return array array(Tag attributes,Attribute meta-data)
 */
public function get_tag_attributes($tag, $deHSC = false)
{
    [$components, $metaC] = $this->split_tag_attributes($tag);
    if (!is_array($components)) {
        return [null, null];
    }
    $attributes = [];
    $attributesMeta = [];
    // Name of the attribute that is waiting for its value
    $name = '';
    // Set after an assignment operator was seen: the next component is a value
    $valuemode = false;
    foreach ($components as $key => $val) {
        // Only an unquoted "=" is the assignment operator. A quoted value that happens to be
        // "=" (like title="=") arrives here with its quotes stripped and must be kept as value.
        if ($val === '=' && ($metaC[$key] ?? '') === '') {
            $valuemode = true;
            continue;
        }
        if ($valuemode) {
            // Only if $name is set (an attribute is waiting for a value) the value is assigned
            if ($name) {
                $attributes[$name] = $deHSC ? htmlspecialchars_decode($val) : $val;
                $attributesMeta[$name]['dashType'] = $metaC[$key];
                $name = '';
            }
        } elseif ($namekey = preg_replace('/[^[:alnum:]_\\:\\-]/', '', $val) ?? '') {
            // A new attribute name; it stays a shorthand attribute unless a value follows
            $name = strtolower((string)$namekey);
            $attributesMeta[$name] = [];
            $attributesMeta[$name]['origTag'] = $namekey;
            $attributes[$name] = '';
        }
        $valuemode = false;
    }
    return [$attributes, $attributesMeta];
}
/**
 * Returns an array with the 'components' from an attribute list.
 * The result is normally analyzed by get_tag_attributes
 * Removes the tag-name if it is found at the beginning and followed by whitespace.
 *
 * The difference between this method and the one in GeneralUtility is that this method actually determines
 * more information on the attribute, e.g. if the value is enclosed by a " or ' character.
 * That's why this method returns two arrays of equal length: the "components" (names, "=" and values with
 * their quotes removed) and the "meta-information" (the quote character of each component, '' if unquoted).
 *
 * @param string $tag The tag or attributes
 * @return array array(components, meta-information)
 * @internal
 * @see \TYPO3\CMS\Core\Utility\GeneralUtility::split_tag_attributes()
 */
public function split_tag_attributes($tag)
{
    $tagMatch = [];
    // Group 2 is the attribute list without a leading "<tagname " and without the closing ">"
    if (preg_match('/(\\<[^\\s]+\\s+)?(.*?)\\s*(\\>)?$/s', $tag, $tagMatch) !== 1) {
        return [[], []];
    }
    $components = [];
    $quotes = [];
    $tokens = [];
    // A token is a quoted string, a run of characters without whitespace/quotes/"=", or a single "="
    $tokenCount = preg_match_all('/("[^"]*"|\'[^\']*\'|[^\\s"\'\\=]+|\\=)/s', $tagMatch[2], $tokens);
    if (!($tokenCount > 0)) {
        return [$components, $quotes];
    }
    foreach ($tokens[1] as $token) {
        $leadingChar = $token[0];
        $isQuoted = $leadingChar === '"' || $leadingChar === '\'';
        $quotes[] = $isQuoted ? $leadingChar : '';
        $components[] = $isQuoted ? substr($token, 1, -1) : $token;
    }
    return [$components, $quotes];
}
/*********************************
*
* Clean HTML code
*
*********************************/
/**
 * Function that can clean up HTML content according to configuration given in the $tags array.
 *
 * Initializing the $tags array to allow a list of tags (in this case <B>,<I>,<U> and <A>), set it like this: $tags = array_flip(explode(',','b,a,i,u'))
 * If the value of the $tags[$tagname] entry is an array, advanced processing of the tags is initialized. These are the options:
 *
 * ```
 * $tags[$tagname] = Array(
 *     'overrideAttribs' => ''        If set, this string is preset as the attributes of the tag
 *     'allowedAttribs' => '0' (zero) = no attributes allowed, '[commalist of attributes]' = only allowed attributes. If blank, all attributes are allowed.
 *     'fixAttrib' => Array(
 *         '[attribute name]' => Array (
 *             'set' => Force the attribute value to this value.
 *             'unset' => Boolean: If set, the attribute is unset.
 *             'default' => If no attribute exists by this name, this value is set as default value (if this value is not blank)
 *             'always' => Boolean. If set, the attribute is always processed. Normally an attribute is processed only if it exists
 *             'trim,intval,lower,upper' => All booleans. If any of these keys are set, the value is passed through the respective PHP-functions.
 *             'range' => Array ('[low limit]','[high limit, optional]') Setting integer range.
 *             'list' => Array ('[value1/default]','[value2]','[value3]') Attribute must be in this list. If not, the value is set to the first element.
 *             'removeIfFalse' => Boolean/'blank'. If set, then the attribute is removed if it is 'FALSE'. If this value is set to 'blank' then the value must be a blank string (that means a 'zero' value will not be removed)
 *             'removeIfEquals' => [value] If the attribute value matches the value set here, then it is removed.
 *             'casesensitiveComp' => 1 If set, then the removeIfEquals and list comparisons will be case sensitive. Otherwise not.
 *             'prefixRelPathWith' => String. Prepended to the value if it has no URL scheme and its path does not start with "/".
 *             'userFunc' => User function called through GeneralUtility::callUserFunction(); its return value becomes the attribute value.
 *                           If 'userFunc.' is an array the current value is passed in its key 'attributeValue', otherwise the value itself is passed.
 *         )
 *     ),
 *     'protect' => '',    Boolean. If set, the tag <> is converted to &lt; and &gt;
 *     'remap' => '',      String. If set, the tagname is remapped to this tagname
 *     'rmTagIfNoAttrib' => '',    Boolean. If set, then the tag is removed if no attributes happened to be there.
 *     'nesting' => '',    Boolean/'global'. If set TRUE, then this tag must have starting and ending tags in the correct order. Any tags not in this order will be discarded. Thus '</B><B><I></B></I></B>' will be converted to '<B><I></B></I>'. Is the value 'global' then true nesting in relation to other tags marked for 'global' nesting control is preserved. This means that if <B> and <I> are set for global nesting then this string '</B><B><I></B></I></B>' is converted to '<B></B>'
 * )
 * ```
 *
 * HTML comments and JavaScript-commented CDATA sections (starting with "/*<![CDATA[*" + "/") are passed through
 * untouched.
 *
 * @param string $content Is the HTML-content being processed. This is also the result being returned.
 * @param array $tags Is an array where each key is a tagname in lowercase. Only tags present as keys in this array are preserved. The value of the key can be an array with a vast number of options to configure.
 * @param mixed $keepAll Boolean/'protect', if set, then all tags are kept regardless of tags present as keys in $tags-array. If 'protect' then the preserved tags have their <> converted to &lt; and &gt;
 * @param int $hSC Values -1,0,1,2: Set to zero= disabled, set to 1 then the content BETWEEN tags is htmlspecialchar()'ed, set to -1 its the opposite and set to 2 the content will be HSC'ed BUT with preservation for real entities (eg. "&amp;" or "&#234;")
 * @param array $addConfig Configuration array send along as $conf to the internal functions
 * @return string Processed HTML content
 */
public function HTMLcleaner($content, $tags = [], $keepAll = 0, $hSC = 0, $addConfig = [])
{
    // Terminator of a JavaScript-commented CDATA section. The token is cut exactly behind this
    // marker, so that any text following it still receives the configured $hSC treatment.
    $cdataEnd = '/*]]>*/';
    $cdataEndLength = strlen($cdataEnd);
    $newContent = [];
    $tokArr = explode('<', $content);
    $newContent[] = $this->bidir_htmlspecialchars(current($tokArr), $hSC);
    // The first token (text before the first "<") is handled above, so it is skipped in the loop
    $tokArrSliced = array_slice($tokArr, 1, null, true);
    // Next free position in $newContent; positions of opening tags are remembered for nesting control
    $c = 1;
    $tagRegister = [];
    $tagStack = [];
    $inComment = false;
    $inCdata = false;
    $skipTag = false;
    foreach ($tokArrSliced as $tok) {
        if ($inComment) {
            if (($eocPos = strpos($tok, '-->')) === false) {
                // End of comment is not found in the token. Go further until end of comment is found in other tokens.
                $newContent[$c++] = '<' . $tok;
                continue;
            }
            // Comment ends in the middle of the token: add comment and proceed with rest of the token
            $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3);
            $tok = substr($tok, $eocPos + 3);
            $inComment = false;
            $skipTag = true;
        } elseif ($inCdata) {
            if (($eocPos = strpos($tok, $cdataEnd)) === false) {
                // End of CDATA is not found in the token. Go further until end of CDATA is found in other tokens.
                $newContent[$c++] = '<' . $tok;
                continue;
            }
            // CDATA ends in the middle of the token: add CDATA and proceed with rest of the token
            $newContent[$c++] = '<' . substr($tok, 0, $eocPos + $cdataEndLength);
            $tok = substr($tok, $eocPos + $cdataEndLength);
            $inCdata = false;
            $skipTag = true;
        } elseif (str_starts_with($tok, '!--')) {
            if (($eocPos = strpos($tok, '-->')) === false) {
                // Comment started in this token but it does not end in the same token. Set a flag to skip till the end of comment
                $newContent[$c++] = '<' . $tok;
                $inComment = true;
                continue;
            }
            // Start and end of comment are both in the current token. Add comment and proceed with rest of the token
            $newContent[$c++] = '<' . substr($tok, 0, $eocPos + 3);
            $tok = substr($tok, $eocPos + 3);
            $skipTag = true;
        } elseif (str_starts_with($tok, '![CDATA[*/')) {
            if (($eocPos = strpos($tok, $cdataEnd)) === false) {
                // CDATA started in this token but it does not end in the same token. Set a flag to skip till the end of CDATA
                $newContent[$c++] = '<' . $tok;
                $inCdata = true;
                continue;
            }
            // Start and end of CDATA are both in the current token. Add CDATA and proceed with rest of the token
            $newContent[$c++] = '<' . substr($tok, 0, $eocPos + $cdataEndLength);
            $tok = substr($tok, $eocPos + $cdataEndLength);
            $skipTag = true;
        }
        $firstChar = $tok[0] ?? null;
        // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..>
        if (!$skipTag && preg_match('/[[:alnum:]\\/]/', (string)$firstChar) === 1) {
            $tagEnd = strpos($tok, '>');
            // If there is and end-bracket... tagEnd can't be 0 as the first character can't be a >
            if ($tagEnd) {
                $endTag = $firstChar === '/' ? 1 : 0;
                $tagContent = substr($tok, $endTag, $tagEnd - $endTag);
                $tagParts = preg_split('/\\s+/s', $tagContent, 2);
                $tagName = strtolower(rtrim($tagParts[0], '/'));
                $emptyTag = 0;
                if (isset($tags[$tagName])) {
                    // If there is processing to do for the tag:
                    if (is_array($tags[$tagName])) {
                        // Void elements get a self-closing " /" appended when the tag is set
                        if (preg_match('/^(' . self::VOID_ELEMENTS . ')$/i', $tagName)) {
                            $emptyTag = 1;
                        }
                        // If NOT an endtag, do attribute processing (added dec. 2003)
                        if (!$endTag) {
                            // Override attributes
                            if (isset($tags[$tagName]['overrideAttribs']) && (string)$tags[$tagName]['overrideAttribs'] !== '') {
                                $tagParts[1] = $tags[$tagName]['overrideAttribs'];
                            }
                            // Allowed attributes
                            if (isset($tags[$tagName]['allowedAttribs']) && (string)$tags[$tagName]['allowedAttribs'] !== '') {
                                // No attribs allowed
                                if ((string)$tags[$tagName]['allowedAttribs'] === '0') {
                                    $tagParts[1] = '';
                                } elseif (isset($tagParts[1]) && trim($tagParts[1])) {
                                    $tagAttrib = $this->get_tag_attributes($tagParts[1]);
                                    $tagParts[1] = '';
                                    $newTagAttrib = [];
                                    $tList = (array)(
                                        $tags[$tagName]['_allowedAttribs']
                                        ?? GeneralUtility::trimExplode(',', strtolower($tags[$tagName]['allowedAttribs']), true)
                                    );
                                    foreach ($tList as $allowTag) {
                                        if (isset($tagAttrib[0][$allowTag])) {
                                            $newTagAttrib[$allowTag] = $tagAttrib[0][$allowTag];
                                        }
                                    }
                                    $tagParts[1] = $this->compileTagAttribs($newTagAttrib, $tagAttrib[1]);
                                }
                            }
                            // Fixed attrib values
                            if (isset($tags[$tagName]['fixAttrib']) && is_array($tags[$tagName]['fixAttrib'])) {
                                $tagAttrib = $this->get_tag_attributes($tagParts[1] ?? '');
                                $tagParts[1] = '';
                                foreach ($tags[$tagName]['fixAttrib'] as $attr => $params) {
                                    if (isset($params['set']) && $params['set'] !== '') {
                                        $tagAttrib[0][$attr] = $params['set'];
                                    }
                                    if (!empty($params['unset'])) {
                                        unset($tagAttrib[0][$attr]);
                                    }
                                    if (!empty($params['default']) && !isset($tagAttrib[0][$attr])) {
                                        $tagAttrib[0][$attr] = $params['default'];
                                    }
                                    if (($params['always'] ?? false) || isset($tagAttrib[0][$attr])) {
                                        if ($params['trim'] ?? false) {
                                            $tagAttrib[0][$attr] = trim($tagAttrib[0][$attr]);
                                        }
                                        if ($params['intval'] ?? false) {
                                            $tagAttrib[0][$attr] = (int)$tagAttrib[0][$attr];
                                        }
                                        if ($params['lower'] ?? false) {
                                            $tagAttrib[0][$attr] = strtolower($tagAttrib[0][$attr]);
                                        }
                                        if ($params['upper'] ?? false) {
                                            $tagAttrib[0][$attr] = strtoupper($tagAttrib[0][$attr]);
                                        }
                                        if ($params['range'] ?? false) {
                                            if (isset($params['range'][1])) {
                                                $tagAttrib[0][$attr] = MathUtility::forceIntegerInRange($tagAttrib[0][$attr], (int)$params['range'][0], (int)$params['range'][1]);
                                            } else {
                                                $tagAttrib[0][$attr] = MathUtility::forceIntegerInRange($tagAttrib[0][$attr], (int)$params['range'][0]);
                                            }
                                        }
                                        if (isset($params['list']) && is_array($params['list'])) {
                                            // For the class attribute, remove from the attribute value any class not in the list
                                            // Classes are case sensitive
                                            if ($attr === 'class') {
                                                $newClasses = [];
                                                $classes = GeneralUtility::trimExplode(' ', $tagAttrib[0][$attr] ?? '', true);
                                                foreach ($classes as $class) {
                                                    if (in_array($class, $params['list'])) {
                                                        $newClasses[] = $class;
                                                    }
                                                }
                                                if (!empty($newClasses)) {
                                                    $tagAttrib[0][$attr] = implode(' ', $newClasses);
                                                } else {
                                                    $tagAttrib[0][$attr] = $params['list'][0];
                                                }
                                            } else {
                                                $normalizedSearchWord = $tagAttrib[0][$attr] ?? '';
                                                $normalizedSearchList = $params['list'];
                                                if (!($params['casesensitiveComp'] ?? false)) {
                                                    // Case-sensitive comparison is not wanted, normalize all values.
                                                    // array_map() is required here: array_walk() would discard the
                                                    // upper-cased values and hand the key as a second argument.
                                                    $normalizedSearchWord = strtoupper($tagAttrib[0][$attr] ?? '');
                                                    $normalizedSearchList = array_map(
                                                        static fn ($listItem): string => strtoupper((string)$listItem),
                                                        $normalizedSearchList
                                                    );
                                                }
                                                if (!in_array($normalizedSearchWord, $normalizedSearchList, true)) {
                                                    $tagAttrib[0][$attr] = $params['list'][0];
                                                }
                                            }
                                        }
                                        if (
                                            (($params['removeIfFalse'] ?? false) && $params['removeIfFalse'] !== 'blank' && !$tagAttrib[0][$attr])
                                            || (($params['removeIfFalse'] ?? false) && $params['removeIfFalse'] === 'blank' && (string)$tagAttrib[0][$attr] === '')
                                        ) {
                                            unset($tagAttrib[0][$attr]);
                                        }
                                        if ((string)($params['removeIfEquals'] ?? '') !== '') {
                                            $normalizedAttribute = $tagAttrib[0][$attr];
                                            $normalizedRemoveIfEquals = $params['removeIfEquals'];
                                            if (!($params['casesensitiveComp'] ?? false)) {
                                                // Case-sensitive comparison is not wanted, normalize all values
                                                $normalizedAttribute = strtoupper($tagAttrib[0][$attr]);
                                                $normalizedRemoveIfEquals = strtoupper($params['removeIfEquals']);
                                            }
                                            if ($normalizedAttribute === $normalizedRemoveIfEquals) {
                                                unset($tagAttrib[0][$attr]);
                                            }
                                        }
                                        if ($params['prefixRelPathWith'] ?? false) {
                                            $urlParts = parse_url($tagAttrib[0][$attr]);
                                            if (is_array($urlParts) && empty($urlParts['scheme']) && !empty($urlParts['path']) && !str_starts_with($urlParts['path'], '/')) {
                                                // If it is NOT an absolute URL (by http: or starting "/")
                                                $tagAttrib[0][$attr] = $params['prefixRelPathWith'] . $tagAttrib[0][$attr];
                                            }
                                        }
                                        if ($params['userFunc'] ?? false) {
                                            if (is_array($params['userFunc.'] ?? null)) {
                                                $params['userFunc.']['attributeValue'] = $tagAttrib[0][$attr];
                                            } else {
                                                $params['userFunc.'] = $tagAttrib[0][$attr];
                                            }
                                            $tagAttrib[0][$attr] = GeneralUtility::callUserFunction($params['userFunc'], $params['userFunc.'], $this);
                                        }
                                    }
                                }
                                $tagParts[1] = $this->compileTagAttribs($tagAttrib[0], $tagAttrib[1]);
                            }
                        } else {
                            // If endTag, remove any possible attributes:
                            $tagParts[1] = '';
                        }
                        // Protecting the tag by converting < and > to &lt; and &gt; ??
                        if (!empty($tags[$tagName]['protect'])) {
                            $lt = '&lt;';
                            $gt = '&gt;';
                        } else {
                            $lt = '<';
                            $gt = '>';
                        }
                        // Remapping tag name?
                        if (!empty($tags[$tagName]['remap'])) {
                            $tagParts[0] = $tags[$tagName]['remap'];
                        }
                        // rmTagIfNoAttrib
                        if ($endTag || empty($tags[$tagName]['rmTagIfNoAttrib']) || trim($tagParts[1] ?? '')) {
                            $setTag = true;
                            // Remove this closing tag if $tagName was among $TSconfig['removeTags']
                            if ($endTag
                                && isset($tags[$tagName]['allowedAttribs']) && $tags[$tagName]['allowedAttribs'] === 0
                                && isset($tags[$tagName]['rmTagIfNoAttrib']) && $tags[$tagName]['rmTagIfNoAttrib'] === 1
                            ) {
                                $setTag = false;
                            }
                            if (isset($tags[$tagName]['nesting'])) {
                                if (!isset($tagRegister[$tagName])) {
                                    $tagRegister[$tagName] = [];
                                }
                                if ($endTag) {
                                    $correctTag = true;
                                    if ($tags[$tagName]['nesting'] === 'global') {
                                        $lastEl = end($tagStack);
                                        if ($tagName !== $lastEl) {
                                            if (in_array($tagName, $tagStack, true)) {
                                                // Discard every tag that was opened after the one being closed now
                                                while (!empty($tagStack) && $tagName !== $lastEl) {
                                                    $elPos = end($tagRegister[$lastEl]);
                                                    unset($newContent[$elPos]);
                                                    array_pop($tagRegister[$lastEl]);
                                                    array_pop($tagStack);
                                                    $lastEl = end($tagStack);
                                                }
                                            } else {
                                                // The tag being closed was never opened, so the end tag is dropped
                                                $correctTag = false;
                                            }
                                        }
                                    }
                                    if (empty($tagRegister[$tagName]) || !$correctTag) {
                                        $setTag = false;
                                    } else {
                                        array_pop($tagRegister[$tagName]);
                                        if ($tags[$tagName]['nesting'] === 'global') {
                                            array_pop($tagStack);
                                        }
                                    }
                                } else {
                                    // Remember where the start tag is written, so it can be removed if it is never closed
                                    $tagRegister[$tagName][] = $c;
                                    if ($tags[$tagName]['nesting'] === 'global') {
                                        $tagStack[] = $tagName;
                                    }
                                }
                            }
                            if ($setTag) {
                                // Setting the tag
                                $newContent[$c++] = $lt . ($endTag ? '/' : '') . trim($tagParts[0] . ' ' . ($tagParts[1] ?? '')) . ($emptyTag ? ' /' : '') . $gt;
                            }
                        }
                    } else {
                        // The tag is allowed without further configuration: keep it as it is
                        $newContent[$c++] = '<' . ($endTag ? '/' : '') . $tagContent . '>';
                    }
                } elseif ($keepAll) {
                    // This is if the tag was not defined in the array for processing:
                    if ($keepAll === 'protect') {
                        $lt = '&lt;';
                        $gt = '&gt;';
                    } else {
                        $lt = '<';
                        $gt = '>';
                    }
                    $newContent[$c++] = $lt . ($endTag ? '/' : '') . $tagContent . $gt;
                }
                // Text following the tag
                $newContent[$c++] = $this->bidir_htmlspecialchars(substr($tok, $tagEnd + 1), $hSC);
            } else {
                // No end-bracket in this token, so it is treated as text
                $newContent[$c++] = $this->bidir_htmlspecialchars('<' . $tok, $hSC);
            }
        } else {
            $newContent[$c++] = $this->bidir_htmlspecialchars(($skipTag ? '' : '<') . $tok, $hSC);
            // It was not a tag anyways
            $skipTag = false;
        }
    }
    // Unsetting tags: start tags under nesting control that were never closed
    foreach ($tagRegister as $tag => $positions) {
        foreach ($positions as $pKey) {
            unset($newContent[$pKey]);
        }
    }
    $newContent = implode('', $newContent);
    $newContent = $this->stripEmptyTagsIfConfigured($newContent, $addConfig);
    return $newContent;
}
/**
 * Converts htmlspecialchars forth ($dir=1 or $dir=2) AND back ($dir=-1)
 *
 * @param string $value Input value
 * @param int $dir Direction: 1 = encode, 2 = encode without double-encoding existing entities, -1 = decode. Any other value leaves the input untouched.
 * @return string Output value
 */
public function bidir_htmlspecialchars($value, $dir)
{
    return match ((int)$dir) {
        1 => htmlspecialchars($value),
        2 => htmlspecialchars($value, ENT_COMPAT, 'UTF-8', false),
        -1 => htmlspecialchars_decode($value),
        default => $value,
    };
}
/**
 * Prefixes the relative paths of background/src/href/action/value/srcset attributes in the tags
 * [embed,td,table,body,img,input,form,link,script,a,param,source] in the $content with the $main_prefix
 * or an alternative given by $alternatives. In addition every url(...) inside <style> blocks is prefixed.
 *
 * A tag is only re-written if one of its attributes was processed; its name is lower-cased then.
 *
 * @param string $main_prefix Prefix string
 * @param string $content HTML content
 * @param array $alternatives Array with alternative prefixes for certain of the tags. key=>value pairs where the keys are the tag element names in uppercase. The prefix for <style> blocks is read from the key "style" (or "STYLE").
 * @param string $suffix Suffix string (put after the resource).
 * @return string Processed HTML content
 */
public function prefixResourcePath($main_prefix, $content, $alternatives = [], $suffix = '')
{
    $parts = $this->splitTags('embed,td,table,body,img,input,form,link,script,a,param,source', $content);
    foreach ($parts as $k => $v) {
        if (!($k % 2)) {
            // Even index: content between the tags, nothing to do
            continue;
        }
        $params = $this->get_tag_attributes($v);
        // Detect tag-ending so that it is re-applied correctly.
        $tagEnd = substr($v, -2) === '/>' ? ' />' : '>';
        // The 'name' of the first tag
        $firstTagName = $this->getFirstTagName($v);
        $tagName = strtolower($firstTagName);
        $prefix = $alternatives[strtoupper($firstTagName)] ?? $main_prefix;
        $prefixedRelPath = false;
        // Tags that carry their resource in one plain attribute
        $attribute = match ($tagName) {
            'td', 'body', 'table' => 'background',
            'img', 'input', 'script', 'embed' => 'src',
            'link', 'a' => 'href',
            'form' => 'action',
            default => null,
        };
        if ($attribute !== null) {
            if (isset($params[0][$attribute])) {
                $params[0][$attribute] = $this->prefixRelPath($prefix, $params[0][$attribute], $suffix);
                $prefixedRelPath = true;
            }
        } elseif ($tagName === 'param') {
            if (isset($params[0]['name']) && $params[0]['name'] === 'movie' && isset($params[0]['value'])) {
                $params[0]['value'] = $this->prefixRelPath($prefix, $params[0]['value'], $suffix);
                $prefixedRelPath = true;
            }
        } elseif ($tagName === 'source') {
            if (isset($params[0]['srcset'])) {
                $srcsetCandidates = GeneralUtility::trimExplode(',', $params[0]['srcset'], true);
                foreach ($srcsetCandidates as $index => $candidate) {
                    // A candidate is "<url>" optionally followed by a width/density descriptor.
                    // Prefix and suffix belong to the URL only, the descriptor is re-attached afterwards.
                    $candidateParts = preg_split('/\\s+/', $candidate, 2);
                    $url = $this->prefixRelPath($prefix, $candidateParts[0], $suffix);
                    $srcsetCandidates[$index] = isset($candidateParts[1]) ? $url . ' ' . $candidateParts[1] : $url;
                }
                $params[0]['srcset'] = implode(', ', $srcsetCandidates);
                $prefixedRelPath = true;
            }
        }
        if ($prefixedRelPath) {
            $parts[$k] = '<' . trim($tagName . ' ' . $this->compileTagAttribs($params[0], $params[1])) . $tagEnd;
        }
    }
    $content = implode('', $parts);
    // Fix <style> section:
    $prefix = $alternatives['style'] ?? $alternatives['STYLE'] ?? $main_prefix;
    if ((string)$prefix !== '') {
        $parts = $this->splitIntoBlock('style', $content);
        foreach ($parts as $k => &$part) {
            if ($k % 2) {
                // A callback is used so that prefix and suffix are inserted literally: inside a
                // replacement string a leading digit, "\" or "$" would be read as backreference.
                $part = preg_replace_callback(
                    '/(url[[:space:]]*\\([[:space:]]*["\']?)([^"\')]*)(["\']?[[:space:]]*\\))/i',
                    static fn (array $match): string => $match[1] . $prefix . $match[2] . $suffix . $match[3],
                    $part
                );
            }
        }
        unset($part);
        $content = implode('', $parts);
    }
    return $content;
}
/**
 * Internal sub-function for ->prefixResourcePath()
 *
 * Wraps $srcVal in $prefix and $suffix unless the value starts with "/" or "#",
 * or parse_url() finds a scheme in it. An empty value counts as a relative
 * path and is therefore prefixed as well.
 *
 * @param string $prefix Prefix string
 * @param string $srcVal Relative path/URL
 * @param string $suffix Suffix string
 * @return string Output path, prefixed if no scheme in input string
 * @internal
 */
public function prefixRelPath($prefix, $srcVal, $suffix = '')
{
    // An attribute value may be empty (e.g. href=""); reading offset 0 of an
    // empty string directly would raise an "Uninitialized string offset" warning.
    $firstCharacter = $srcVal[0] ?? '';
    // Leave root-relative URLs and links to a section within the page untouched.
    if ($firstCharacter === '/' || $firstCharacter === '#') {
        return $srcVal;
    }
    // Only prefix URLs without a scheme. parse_url() may return false for
    // malformed input; isset() then evaluates to false and the value is prefixed.
    $urlParts = parse_url($srcVal);
    if (isset($urlParts['scheme'])) {
        return $srcVal;
    }
    return $prefix . $srcVal . $suffix;
}
/**
 * Builds the attribute part of a tag from an attribute array.
 *
 * For each attribute the original spelling ($meta[...]['origTag']) is used when
 * available. A value is written when it is non-empty or when a quote style
 * ($meta[...]['dashType']) was recorded; otherwise only the attribute name is
 * output. Without a recorded quote style, integer-like values are left
 * unquoted and everything else is wrapped in double quotes.
 *
 * @param array $tagAttrib Tag attributes
 * @param array $meta Meta information about these attributes (like if they were quoted)
 * @return string Imploded attributes, eg: 'attribute="value" attrib2="value2"'
 * @internal
 */
public function compileTagAttribs($tagAttrib, $meta = [])
{
    $compiled = [];
    foreach ($tagAttrib as $name => $value) {
        $piece = $meta[$name]['origTag'] ?? $name;
        $hasValue = strcmp($value, '') !== 0;
        $hasQuoteStyle = isset($meta[$name]['dashType']);
        if ($hasValue || $hasQuoteStyle) {
            if ($hasQuoteStyle) {
                $quote = $meta[$name]['dashType'];
            } else {
                $quote = MathUtility::canBeInterpretedAsInteger($value) ? '' : '"';
            }
            $piece .= '=' . $quote . $value . $quote;
        }
        $compiled[] = $piece;
    }
    return implode(' ', $compiled);
}
/**
 * Converts TSconfig into an array for the HTMLcleaner function.
 *
 * The per-tag configuration is seeded from "allowTags", refined by "tags.",
 * and then adjusted by the list options "localNesting", "globalNesting",
 * "rmTagIfNoAttrib", "noAttrib" and "removeTags" (in that order).
 *
 * @param array $TSconfig TSconfig for HTMLcleaner
 * @param array $keepTags Per-tag configuration to start from; its entries override the plain "allowTags" entries
 * @return array Four items: tag configuration, "keepNonMatchedTags" as string, "htmlSpecialChars" as integer, additional configuration ("stripEmptyTags" settings)
 * @internal
 */
public function HTMLparserConfig($TSconfig, $keepTags = [])
{
    // Allowed tags form the base list; entries handed in by the caller win over them.
    $allowedTags = GeneralUtility::trimExplode(',', strtolower($TSconfig['allowTags'] ?? ''), true);
    $keepTags = array_merge(array_flip($allowedTags), $keepTags);

    $tagSetup = $TSconfig['tags.'] ?? null;
    if (is_array($tagSetup)) {
        // First pass: scalar "tags.<name> = 0|1" switches remove or add a tag.
        foreach ($tagSetup as $tagName => $switch) {
            if (is_array($switch) || $tagName != strtolower($tagName)) {
                continue;
            }
            $switch = (string)$switch;
            if ($switch === '0') {
                unset($keepTags[$tagName]);
            } elseif ($switch === '1' && !isset($keepTags[$tagName])) {
                $keepTags[$tagName] = 1;
            }
        }
        // Second pass: "tags.<name>." sub-arrays carry the detailed tag options.
        foreach ($tagSetup as $dottedName => $tagOptions) {
            if (!is_array($tagOptions) || $dottedName != strtolower($dottedName)) {
                continue;
            }
            // Strip the trailing dot of the TypoScript key.
            $tagName = substr($dottedName, 0, -1);
            if (!is_array($keepTags[$tagName] ?? null)) {
                $keepTags[$tagName] = [];
            }
            $attributeSetup = $tagOptions['fixAttrib.'] ?? null;
            if (is_array($attributeSetup)) {
                foreach ($attributeSetup as $dottedAttribute => $attributeOptions) {
                    if (!is_array($attributeOptions)) {
                        continue;
                    }
                    $attribute = substr($dottedAttribute, 0, -1);
                    $merged = $keepTags[$tagName]['fixAttrib'][$attribute] ?? null;
                    if (!is_array($merged)) {
                        $merged = [];
                    }
                    $merged = array_merge($merged, $attributeOptions);
                    // "range" and "list" are comma separated strings; turn them into arrays.
                    foreach (['range', 'list'] as $listOption) {
                        if ((string)($merged[$listOption] ?? '') !== '') {
                            $merged[$listOption] = GeneralUtility::trimExplode(',', $merged[$listOption]);
                        }
                    }
                    $keepTags[$tagName]['fixAttrib'][$attribute] = $merged;
                }
            }
            // fixAttrib has been handled above and must not be merged in again.
            unset($tagOptions['fixAttrib.'], $tagOptions['fixAttrib']);
            // rmTagIfNoAttrib gets nesting switched on unless nesting is already configured.
            if (!empty($tagOptions['rmTagIfNoAttrib']) && empty($tagOptions['nesting'])) {
                $tagOptions['nesting'] = 1;
            }
            $keepTags[$tagName] = array_merge($keepTags[$tagName], $tagOptions);
        }
    }

    // localNesting / globalNesting: set the nesting mode for tags that are already kept.
    foreach (['localNesting' => 1, 'globalNesting' => 'global'] as $option => $nestingMode) {
        if (empty($TSconfig[$option])) {
            continue;
        }
        foreach (GeneralUtility::trimExplode(',', strtolower($TSconfig[$option]), true) as $tagName) {
            if (!isset($keepTags[$tagName])) {
                continue;
            }
            if (!is_array($keepTags[$tagName])) {
                $keepTags[$tagName] = [];
            }
            $keepTags[$tagName]['nesting'] = $nestingMode;
        }
    }

    // rmTagIfNoAttrib: flag kept tags and make sure nesting is enabled for them.
    if (!empty($TSconfig['rmTagIfNoAttrib'])) {
        foreach (GeneralUtility::trimExplode(',', strtolower($TSconfig['rmTagIfNoAttrib']), true) as $tagName) {
            if (!isset($keepTags[$tagName])) {
                continue;
            }
            if (!is_array($keepTags[$tagName])) {
                $keepTags[$tagName] = [];
            }
            $keepTags[$tagName]['rmTagIfNoAttrib'] = 1;
            if (empty($keepTags[$tagName]['nesting'])) {
                $keepTags[$tagName]['nesting'] = 1;
            }
        }
    }

    // noAttrib: kept tags lose all their attributes.
    if (!empty($TSconfig['noAttrib'])) {
        foreach (GeneralUtility::trimExplode(',', strtolower($TSconfig['noAttrib']), true) as $tagName) {
            if (!isset($keepTags[$tagName])) {
                continue;
            }
            if (!is_array($keepTags[$tagName])) {
                $keepTags[$tagName] = [];
            }
            $keepTags[$tagName]['allowedAttribs'] = 0;
        }
    }

    // removeTags: applied to every listed tag, whether or not it was kept so far;
    // any earlier configuration of the tag is replaced.
    if (!empty($TSconfig['removeTags'])) {
        foreach (GeneralUtility::trimExplode(',', strtolower($TSconfig['removeTags']), true) as $tagName) {
            $keepTags[$tagName] = [
                'allowedAttribs' => 0,
                'rmTagIfNoAttrib' => 1,
            ];
        }
    }

    // Additional configuration: pass the stripEmptyTags settings through unchanged.
    $addConfig = [];
    if (isset($TSconfig['stripEmptyTags'])) {
        $addConfig['stripEmptyTags'] = $TSconfig['stripEmptyTags'];
        if (isset($TSconfig['stripEmptyTags.'])) {
            $addConfig['stripEmptyTags.'] = $TSconfig['stripEmptyTags.'];
        }
    }

    return [
        $keepTags,
        (string)($TSconfig['keepNonMatchedTags'] ?? ''),
        (int)($TSconfig['htmlSpecialChars'] ?? 0),
        $addConfig,
    ];
}
/**
* Strips empty tags from HTML.
*
* @param string $content The content to be stripped of empty tags
* @param string $tagList The comma separated list of tags to be stripped.
* If empty, all empty tags will be stripped
* @param bool $treatNonBreakingSpaceAsEmpty If TRUE tags containing only entities will be treated as empty.
* @param bool $keepTags If true, the provided tags will be kept instead of stripped.
* @return string the stripped content
*/
public function stripEmptyTags($content, $tagList = '', $treatNonBreakingSpaceAsEmpty = false, $keepTags = false)
{
if (!empty($tagList)) {
$tagRegEx = implode('|', GeneralUtility::trimExplode(',', $tagList, true));
if ($keepTags) {
$tagRegEx = '(?!' . $tagRegEx . ')[^ >]+';
}
} else {
$tagRegEx = '[^ >]+'; // all characters until you reach a > or space;
}