diff --git a/PHPCSUtils/Utils/TextStrings.php b/PHPCSUtils/Utils/TextStrings.php
index 21369624..eb798864 100644
--- a/PHPCSUtils/Utils/TextStrings.php
+++ b/PHPCSUtils/Utils/TextStrings.php
@@ -24,6 +24,32 @@
 class TextStrings
 {
 
+    /**
+     * Regex to match the start of an embedded variable/expression.
+     *
+     * Prevents matching escaped variables/expressions: an opener preceded by an odd number
+     * of backslashes is skipped, pairs of backslashes (group 1) are allowed in front of it.
+     * Capture group 2 holds the opener itself: "{$", "${" or a "$" which is directly followed
+     * by a character which can start a variable name.
+     *
+     * NOTE(review): pattern restored from how getStripEmbeds() uses it - confirm against upstream.
+     *
+     * @var string
+     */
+    const START_OF_EMBED = '`(?<!\\\\)(\\\\{2})*(\{\$|\$\{|\$(?=[a-zA-Z_\x7f-\xff]))`';
+
+    /**
+     * Regex to match a "type 1" - directly embedded - variable without the dollar sign.
+     *
+     * Allows for array access and property access in as far as supported (single level).
+     *
+     * NOTE(review): constant name and group name restored from their use in this class - confirm upstream.
+     *
+     * @var string
+     */
+    const TYPE1_EMBED_AFTER_DOLLAR =
+        '`(?P<varname>[a-zA-Z_\x7f-\xff][a-zA-Z0-9_\x7f-\xff]*)(?:\??->(?P>varname)|\[[^\]\'"\s]+\])?`';
+
     /**
      * Get the complete contents of a - potentially multi-line - text string.
      *
@@ -268,4 +294,142 @@ public static function stripQuotes($textString)
     {
         return \preg_replace('`^([\'"])(.*)\1$`Ds', '$2', $textString);
     }
+
+    /**
+     * Get the embedded variables/expressions from an arbitrary string.
+     *
+     * Note: this function gets the complete variables/expressions _as they are embedded_,
+     * i.e. including potential curly brace wrappers, array access, method calls etc.
+     *
+     * @param string $text The contents of a T_DOUBLE_QUOTED_STRING or T_HEREDOC token.
+     *
+     * @return array<int, string> Array of encountered variable names/expressions with the offset at which
+     *                            the variable/expression was found in the string, as the key.
+     */
+    public static function getEmbeds($text)
+    {
+        return self::getStripEmbeds($text)['embeds'];
+    }
+
+    /**
+     * Strip embedded variables/expressions from an arbitrary string.
+     *
+     * @param string $text The contents of a T_DOUBLE_QUOTED_STRING or T_HEREDOC token.
+     *
+     * @return string String without variables/expressions in it.
+     */
+    public static function stripEmbeds($text)
+    {
+        return self::getStripEmbeds($text)['remaining'];
+    }
+
+    /**
+     * Split an arbitrary text string into embedded variables/expressions and remaining text.
+     *
+     * PHP contains four types of embedding syntaxes:
+     * 1. Directly embedding variables ("$foo");
+     * 2. Braces outside the variable ("{$foo}");
+     * 3. Braces after the dollar sign ("${foo}");
+     * 4. Variable variables ("${expr}", equivalent to (string) ${expr}).
+     *
+     * Type 3 and 4 are deprecated as of PHP 8.2 and will be removed in PHP 9.0.
+     *
+     * This method handles all types of embeds, including recognition of whether an embed is escaped or not.
+     *
+     * An embed opened with a brace which is never closed (parse error, or text which is only part of
+     * a multi-line string) is not reported as an embed: scanning stops at its opener and the text
+     * from that point onwards is returned untouched as part of the remaining text.
+     *
+     * @link https://www.php.net/manual/en/language.types.string.php#language.types.string.parsing
+     * @link https://wiki.php.net/rfc/deprecate_dollar_brace_string_interpolation
+     *
+     * @param string $text The contents of a T_DOUBLE_QUOTED_STRING or T_HEREDOC token.
+     *
+     * @return array Array containing two values:
+     *               1. An array containing a string representation of each embed encountered.
+     *                  The keys in this array are the integer offset within the original string
+     *                  where the embed was found.
+     *               2. The textual contents, embeds stripped out of it.
+     *               The format of the array return value is:
+     *               ```php
+     *               array(
+     *                 'embeds'    => array<int, string>,
+     *                 'remaining' => string,
+     *               )
+     *               ```
+     */
+    public static function getStripEmbeds($text)
+    {
+        if (\strpos($text, '$') === false) {
+            return [
+                'embeds'    => [],
+                'remaining' => $text,
+            ];
+        }
+
+        $offset    = 0;
+        $strLen    = \strlen($text); // Bytes, the same unit as the offsets preg_match() reports.
+        $stripped  = '';
+        $variables = [];
+
+        while (\preg_match(self::START_OF_EMBED, $text, $matches, \PREG_OFFSET_CAPTURE, $offset) === 1) {
+            // Group 2 is the embed opener; the text between the previous embed and this opener is kept.
+            $stripped .= \substr($text, $offset, ($matches[2][1] - $offset));
+
+            $matchedExpr   = $matches[2][0];
+            $matchedOffset = $matches[2][1];
+            $braces        = \substr_count($matchedExpr, '{');
+            $newOffset     = $matchedOffset + \strlen($matchedExpr);
+
+            if ($braces === 0) {
+                /*
+                 * Type 1: simple variable embed.
+                 * Regex will always return a match due to the look ahead in the above regex.
+                 */
+                \preg_match(self::TYPE1_EMBED_AFTER_DOLLAR, $text, $endMatch, 0, $newOffset);
+                $matchedExpr              .= $endMatch[0];
+                $variables[$matchedOffset] = $matchedExpr;
+                $offset                    = $newOffset + \strlen($endMatch[0]);
+                continue;
+            }
+
+            // Type 2, 3 and 4: walk forward to the closing brace which balances the opener.
+            for (; $newOffset < $strLen; $newOffset++) {
+                if ($text[$newOffset] === '{') {
+                    ++$braces;
+                    continue;
+                }
+
+                if ($text[$newOffset] === '}') {
+                    --$braces;
+                    if ($braces === 0) {
+                        $matchedExpr               = \substr($text, $matchedOffset, (1 + $newOffset - $matchedOffset));
+                        $variables[$matchedOffset] = $matchedExpr;
+                        $offset                    = ($newOffset + 1);
+                        break;
+                    }
+                }
+            }
+
+            if ($braces !== 0) {
+                /*
+                 * Unbalanced braces: the embed is never closed, so $offset was not advanced.
+                 * Stop here and keep the text from the opener onwards as-is, otherwise
+                 * the while loop would match this same opener again, forever.
+                 */
+                $offset = $matchedOffset;
+                break;
+            }
+        }
+
+        if ($offset < $strLen) {
+            // Add the end of the string.
+            $stripped .= \substr($text, $offset);
+        }
+
+        return [
+            'embeds'    => $variables,
+            'remaining' => $stripped,
+        ];
+    }
 }
diff --git a/Tests/Utils/TextStrings/InterpolatedVariablesTest.php b/Tests/Utils/TextStrings/InterpolatedVariablesTest.php
new file mode 100644
index 00000000..686acdbc
--- /dev/null
+++ b/Tests/Utils/TextStrings/InterpolatedVariablesTest.php
@@ -0,0 +1,581 @@
+
+     */
+    private $embeds = [
+        // Simple.
+        '$foo',
+        '{$foo}',
+        '${foo}',
+
+        // DIM.
+        '$foo[2]',
+        '$foo[-12]',
+        '{$foo[0]}',
+        '${foo[132]}',
+        '$foo[bar]',
+        '{$foo[\'bar\']}',
+        '${foo[\'bar\']}',
+        '{$foo[8][35]}',
+        '{$foo[10][\'bar\']}',
+        '{$foo[\'bar\'][\'baz\']}',
+        '{$foo[\'bar\'][12]}',
+
+        // Property.
+        '$foo->bar',
+        '{$foo->bar}',
+        '$foo?->bar',
+        '{$foo?->bar}',
+        '{${beers::$ale}}',
+        '${beers::$ale}',
+
+        // Class constant.
+        '{${beers::softdrink}}',
+        '${beers::softdrink}',
+
+        // Method.
+        '{$foo->bar()}',
+        '{$foo?->bar()}',
+        '{${$object->getName()}}',
+        '{${$object?->getName()}}',
+
+        // Closure/Function call.
+        '{$foo()}',
+        '{${getName()}}',
+        '{${getName( $test )}}',
+        '{${getName( \'abc\' )}}',
+        '${substr(\'laruence\', 0, 2)}',
+
+        // Chain.
+ '{$foo[42]->baz()()}', + '{$foo[\'bar\']->baz()()}', + '{$foo[42]?->baz()()}', + '{$foo[\'bar\']?->baz()()}', + '{$obj->values[3]->name}', + '{$obj->values[5]?->name}', + + // Variable variables. + '${$bar}', + '{$$bar}', + '${(foo)}', + '${foo->bar}', + '{$foo->$bar}', + '{$foo?->$bar}', + + // Nested. + '${foo["${bar}"]}', + '${foo["${ba23}"]}', + '${foo["${bar[3]}"]}', + '${foo["${bar[\'baz\']}"]}', + '${foo->{$baz}}', + '${foo->{${\'a\'}}}', + '${foo->{"${\'a\'}"}}', + '${foo?->{$baz}}', + '${foo?->{${\'a\'}}}', + '${foo?->{"${\'a\'}"}}', + '{$foo->{$baz[1]}}', + + // Using non-ascii UTF8 variable names. + '$IÑTËRNÂTÎÔNÀLÍŽÆTIØN', + '${IÑTËRNÂTÎÔNÀLÍŽÆTIØN}', + '$Iñtërnâtîônàlížætiøn[nât]', + '$Iñtërnâtîônàlížætiøn?->îôn', + '$МояРабота', + '${$МояРабота}', + '$💟', + '$💟[◾]', + '$😝->🤞', + ]; + + /** + * Collection of phrases to use during the tests. + * + * Each phrase is a sprintf() template with three `%s` placeholders, which the + * dataEmbedsInPhrases() data provider fills with an embed or an empty string. + * + * Phrases used will be selected at random. + * + * @var array<string, string> + */ + private $phrases = [ + 'single line' => "%s this is nonsense %s\tbut that's not the point %s", + 'single line, embed followed by non-space 1' => '%s- dash %s+ plus %s', + 'single line, embed followed by non-space 2' => '%s. 
dash %s= plus %s', + 'single line, embed followed by non-space 3' => '%s` dash %s%% plus %s', + 'single line, embed followed by non-space 4' => '%s\\ dash %s) plus %s', + 'single line, embed followed by non-space 5' => '%s] dash %s} plus %s', + 'single line, embed followed by non-space 6' => '%s\' dash %s# plus %s', + 'single line, contains escaped non-embed 1' => '%s this {\$name} foo %s but that\'s \$mane[not] the point %s', + 'single line, contains escaped non-embed 2' => '%s this $\{name} foo %s but that\'s \$mane->not the point %s', + 'multi line' => "%s this is\nnonsense %s but\nthat's not the point %s", + 'multi line, empty first line' => "\n%s this is\nnonsense %s but\nthat's not the point %s", + 'multi line, empty last line' => "%s this is\nnonsense %s but\nthat's not the point %s\n", + ]; + + /** + * Test getting embedded variables and expressions from an arbitrary text string. + * + * Only the embeds themselves are compared: the offsets (array keys) are discarded via + * array_values(), as the data sets from dataEmbedsInPhrases() do not list offsets. + * + * @dataProvider dataEmbedsInPhrases + * + * @param string $input The input string. + * @param array $expected The expected function output of the respective functions. + * + * @return void + */ + public function testGetEmbeds($input, $expected) + { + $this->assertSame($expected['get'], \array_values(TextStrings::getEmbeds($input))); + } + + /** + * Test getting embedded variables and expressions from an arbitrary text string and verify the offset + * at which the embed was found is correctly set as well. + * + * @dataProvider dataEscaping + * @dataProvider dataSpecificCases + * + * @param string $input The input string. + * @param array $expected The expected function output of the respective functions. + * + * @return void + */ + public function testGetEmbedsAndCheckOffset($input, $expected) + { + $this->assertSame($expected['get'], TextStrings::getEmbeds($input)); + } + + /** + * Test stripping embedded variables and expressions from an arbitrary text string. 
+ * + * @dataProvider dataEmbedsInPhrases + * @dataProvider dataEscaping + * @dataProvider dataSpecificCases + * + * @param string $input The input string. + * @param array $expected The expected function output of the respective functions. + * + * @return void + */ + public function testStripEmbeds($input, $expected) + { + $this->assertSame($expected['stripped'], TextStrings::stripEmbeds($input)); + } + + /** + * Data provider generating eight data sets for each entry in `$embeds`. + * + * The sets are: the bare embed, the embed wrapped in double quotes, and the embed at the start, + * in the middle and at the end of a randomly selected phrase, both plain and wrapped in double quotes. + * + * @see testGetEmbeds() For the array format. + * @see testStripEmbeds() For the array format. + * + * @return array + */ + public function dataEmbedsInPhrases() + { + $data = []; + foreach ($this->embeds as $embed) { + $data[$embed . '| Plain embed (heredoc)'] = [ + 'input' => $embed, + 'expected' => [ + 'get' => [$embed], + 'stripped' => '', + ], + ]; + $data[$embed . '| Double quoted embed'] = [ + 'input' => '"' . $embed . '"', + 'expected' => [ + 'get' => [$embed], + 'stripped' => '""', + ], + ]; + + // Plain, no double quotes (heredoc). + $phraseKey = \array_rand($this->phrases); + $dataKey = $embed . '| Embed at start of plain phrase in: ' . $phraseKey; + $data[$dataKey] = [ + 'input' => \sprintf($this->phrases[$phraseKey], $embed, '', ''), + 'expected' => [ + 'get' => [$embed], + 'stripped' => \sprintf($this->phrases[$phraseKey], '', '', ''), + ], + ]; + + $phraseKey = \array_rand($this->phrases); + $dataKey = $embed . '| Embed in middle of plain phrase in: ' . $phraseKey; + $data[$dataKey] = [ + 'input' => \sprintf($this->phrases[$phraseKey], '', $embed, ''), + 'expected' => [ + 'get' => [$embed], + 'stripped' => \sprintf($this->phrases[$phraseKey], '', '', ''), + ], + ]; + + $phraseKey = \array_rand($this->phrases); + $dataKey = $embed . '| Embed at end of plain phrase in: ' . $phraseKey; + $data[$dataKey] = [ + 'input' => \sprintf($this->phrases[$phraseKey], '', '', $embed), + 'expected' => [ + 'get' => [$embed], + 'stripped' => \sprintf($this->phrases[$phraseKey], '', '', ''), + ], + ]; + + // Phrase in double quotes. 
+ $phraseKey = \array_rand($this->phrases); + $dataKey = $embed . '| Embed at start of quoted phrase in: ' . $phraseKey; + $data[$dataKey] = [ + 'input' => '"' . \sprintf($this->phrases[$phraseKey], $embed, '', '') . '"', + 'expected' => [ + 'get' => [$embed], + 'stripped' => '"' . \sprintf($this->phrases[$phraseKey], '', '', '') . '"', + ], + ]; + + $phraseKey = \array_rand($this->phrases); + $dataKey = $embed . '| Embed in middle of quoted phrase in: ' . $phraseKey; + $data[$dataKey] = [ + 'input' => '"' . \sprintf($this->phrases[$phraseKey], '', $embed, '') . '"', + 'expected' => [ + 'get' => [$embed], + 'stripped' => '"' . \sprintf($this->phrases[$phraseKey], '', '', '') . '"', + ], + ]; + + $phraseKey = \array_rand($this->phrases); + $dataKey = $embed . '| Embed at end of quoted phrase in: ' . $phraseKey; + $data[$dataKey] = [ + 'input' => '"' . \sprintf($this->phrases[$phraseKey], '', '', $embed) . '"', + 'expected' => [ + 'get' => [$embed], + 'stripped' => '"' . \sprintf($this->phrases[$phraseKey], '', '', '') . '"', + ], + ]; + } + + return $data; + } + + /** + * Data provider for the handling of backslashes in front of an embed. + * + * For zero through nine backslashes in front of a `$foo` and a `{$foo}` embed, at the start + * and at the end of a string, a data set is generated. An odd number of backslashes means + * the character directly after the backslashes is escaped. + * + * @see testGetEmbedsAndCheckOffset() For the array format. + * @see testStripEmbeds() For the array format. + * + * @return array + */ + public function dataEscaping() + { + $embedAtEnd = '"Foo: %s%s"'; + $embedAtStart = '%s%s Foo'; // Note: no double quotes! + $data = []; + + for ($i = 0; $i < 10; $i++) { + $escaped = (($i % 2) !== 0); + $slashes = \str_repeat('\\', $i); + $offset = 6 + $i; + + $dataKey = "Escaping handling test, embed at start: slashes before \$ - $i slashes = "; + $dataKey .= ($escaped === true) ? 'escaped' : 'not escaped'; + $data[$dataKey] = [ + 'input' => \sprintf($embedAtStart, $slashes, '$foo'), + 'expected' => [ + 'get' => ($escaped === true) ? [] : [$i => '$foo'], + 'stripped' => ($escaped === true) + ? 
\sprintf($embedAtStart, $slashes, '$foo') + : \sprintf($embedAtStart, $slashes, ''), + ], + ]; + + $dataKey = "Escaping handling test, embed at start: slashes before { - $i slashes = "; + $dataKey .= ($escaped === true) ? 'escaped' : 'not escaped'; + $data[$dataKey] = [ + 'input' => \sprintf($embedAtStart, $slashes, '{$foo}'), + 'expected' => [ + 'get' => ($escaped === true) ? [($i + 1) => '$foo'] : [$i => '{$foo}'], + 'stripped' => ($escaped === true) + ? \sprintf($embedAtStart, $slashes, '{}') + : \sprintf($embedAtStart, $slashes, ''), + ], + ]; + + $dataKey = "Escaping handling test, embed at end: slashes before \$ - $i slashes = "; + $dataKey .= ($escaped === true) ? 'escaped' : 'not escaped'; + $data[$dataKey] = [ + 'input' => \sprintf($embedAtEnd, $slashes, '$foo'), + 'expected' => [ + 'get' => ($escaped === true) ? [] : [$offset => '$foo'], + 'stripped' => ($escaped === true) + ? \sprintf($embedAtEnd, $slashes, '$foo') + : \sprintf($embedAtEnd, $slashes, ''), + ], + ]; + + $dataKey = "Escaping handling test, embed at end: slashes before { - $i slashes = "; + $dataKey .= ($escaped === true) ? 'escaped' : 'not escaped'; + $data[$dataKey] = [ + 'input' => \sprintf($embedAtEnd, $slashes, '{$foo}'), + 'expected' => [ + 'get' => ($escaped === true) ? [($offset + 1) => '$foo'] : [$offset => '{$foo}'], + 'stripped' => ($escaped === true) + ? \sprintf($embedAtEnd, $slashes, '{}') + : \sprintf($embedAtEnd, $slashes, ''), + ], + ]; + } + + return $data; + } + + /** + * Data provider with hand-written cases, including the expected offset of each embed. + * + * Covers strings without (valid) embeds, strings containing multiple embeds and + * variations which the generated data sets do not cover. + * + * @see testGetEmbedsAndCheckOffset() For the array format. + * @see testStripEmbeds() For the array format. + * + * @return array + */ + public function dataSpecificCases() + { + return [ + // No embeds. 
+ 'Text string without any embeds' => [ + 'input' => '"He drank some orange juice."', + 'expected' => [ + 'get' => [], + 'stripped' => '"He drank some orange juice."', + ], + ], + 'Text string without any valid embeds - not a valid variable name 1' => [ + 'input' => '"He drank some orange $--."', + 'expected' => [ + 'get' => [], + 'stripped' => '"He drank some orange $--."', + ], + ], + 'Text string without any valid embeds - not a valid variable name 2' => [ + 'input' => '"He drank some orange $\name."', + 'expected' => [ + 'get' => [], + 'stripped' => '"He drank some orange $\name."', + ], + ], + + // Variations on embeds not tested via the above generated test cases. + 'No braces, one character variable name' => [ + 'input' => '"This is $g"', + 'expected' => [ + 'get' => [ + 9 => '$g', + ], + 'stripped' => '"This is "', + ], + ], + 'Wrappped in outer braces with space between brace and dollar' => [ + 'input' => '"This is { $great}"', + 'expected' => [ + 'get' => [ + 11 => '$great', + ], + 'stripped' => '"This is { }"', + ], + ], + + 'Text string containing multiple embeds 1' => [ + 'input' => '"$people->john drank some $juices[0] juice."', + 'expected' => [ + 'get' => [ + 1 => '$people->john', + 26 => '$juices[0]', + ], + 'stripped' => '" drank some juice."', + ], + ], + 'Text string containing multiple embeds 2' => [ + 'input' => '"$people->john then said hello to $people->jane."', + 'expected' => [ + 'get' => [ + 1 => '$people->john', + 34 => '$people->jane', + ], + 'stripped' => '" then said hello to ."', + ], + ], + 'Text string containing multiple embeds 3' => [ + 'input' => '"$people->john\'s wife greeted $people->robert."', + 'expected' => [ + 'get' => [ + 1 => '$people->john', + // Note: the backslash escaping the ' will be removed, so doesn't count for offset. 
+ 30 => '$people->robert', + ], + 'stripped' => '"\'s wife greeted ."', + ], + ], + 'Text string containing multiple embeds 4' => [ + 'input' => '"This is the value of the var named $name: {${$name}}"', + 'expected' => [ + 'get' => [ + 36 => '$name', + 43 => '{${$name}}', + ], + 'stripped' => '"This is the value of the var named : "', + ], + ], + 'Text string containing multiple embeds 5 (nothing between embeds, plain)' => [ + 'input' => '"This is the value of the var named $name$name"', + 'expected' => [ + 'get' => [ + 36 => '$name', + 41 => '$name', + ], + 'stripped' => '"This is the value of the var named "', + ], + ], + 'Text string containing multiple embeds 6 (nothing between embeds, outer braces)' => [ + 'input' => '"This is the value of the var named {$name}{$name}"', + 'expected' => [ + 'get' => [ + 36 => '{$name}', + 43 => '{$name}', + ], + 'stripped' => '"This is the value of the var named "', + ], + ], + 'Text string containing multiple embeds 7 (nothing between embeds, inner braces)' => [ + 'input' => '"This is the value of the var named ${name}${name}"', + 'expected' => [ + 'get' => [ + 36 => '${name}', + 43 => '${name}', + ], + 'stripped' => '"This is the value of the var named "', + ], + ], + 'Text string containing multiple embeds 8 (nothing between embeds, mixed)' => [ + 'input' => '"This is the value of the var named $name${name}{$name}"', + 'expected' => [ + 'get' => [ + 36 => '$name', + 41 => '${name}', + 48 => '{$name}', + ], + 'stripped' => '"This is the value of the var named "', + ], + ], + + // These can't be tested via the generated code as it won't work without braces. 
+ 'Embed without braces, variable variable will not work' => [ + 'input' => '"$$bar"', + 'expected' => [ + 'get' => [ + 2 => '$bar', + ], + 'stripped' => '"$"', + ], + ], + 'Embed in outer braces followed by number' => [ + 'input' => '"This square is {$square->width}00 centimeters broad."', + 'expected' => [ + 'get' => [ + 16 => '{$square->width}', + ], + 'stripped' => '"This square is 00 centimeters broad."', + ], + ], + 'Embed in inner braces followed by number' => [ + 'input' => '"This square is ${square->width}00 centimeters broad."', + 'expected' => [ + 'get' => [ + 16 => '${square->width}', + ], + 'stripped' => '"This square is 00 centimeters broad."', + ], + ], + 'Without braces, multi-level array access does not work' => [ + 'input' => '"This works: {$arr[4][3]}, but this doesn\'t: $arr[3][4]"', + 'expected' => [ + 'get' => [ + 13 => '{$arr[4][3]}', + // Note: the backslash escaping the ' will be removed, so doesn't count for offset. + 45 => '$arr[3]', + ], + 'stripped' => '"This works: , but this doesn\'t: [4]"', + ], + ], + 'Without braces, multi-level property access does not work' => [ + 'input' => '"This works: {$obj->prop->key}, but this doesn\'t: $obj->prop->key"', + 'expected' => [ + 'get' => [ + 13 => '{$obj->prop->key}', + // Note: the backslash escaping the ' will be removed, so doesn't count for offset. + 50 => '$obj->prop', + ], + 'stripped' => '"This works: , but this doesn\'t: ->key"', + ], + ], + 'Embed in braces, multi-level array access, string key missing quotes' => [ + 'input' => '"This interprets the key foo as a constant: {$arr[foo][3]}"', + 'expected' => [ + 'get' => [ + 44 => '{$arr[foo][3]}', + ], + 'stripped' => '"This interprets the key foo as a constant: "', + ], + ], + + // Multi-line expressions. 
+ 'Embed in dollar brace, multi-line expression' => [ + 'input' => '"Testing ${foo["${bar + [\'baz\'] +}"]} and more testing"', + 'expected' => [ + 'get' => [ + 9 => '${foo["${bar + [\'baz\'] +}"]}', + ], + 'stripped' => '"Testing and more testing"', + ], + ], + 'Embed in braces, multi-line expression' => [ + 'input' => '"Testing {${foo["${bar + [\'baz\'] +}"]}} and more testing"', + 'expected' => [ + 'get' => [ + 9 => '{${foo["${bar + [\'baz\'] +}"]}}', + ], + 'stripped' => '"Testing and more testing"', + ], + ], + ]; + } +}