Permalink
Browse files

Adding `Multibyte` class for working with UTF-8 encoded strings.

Adding tests.
Adding stress test file, creating `tests/resources`.
Adding adapters for interfacing with:
- `ext/iconv`
- `ext/intl`
- `ext/mbstring`
  • Loading branch information...
1 parent 84c3cba commit b7f2e0858759be0d78ba849aa4b3618727a77f27 @davidpersson davidpersson committed with nateabele Jan 28, 2012
View
@@ -0,0 +1,87 @@
+<?php
+/**
+ * Lithium: the most rad php framework
+ *
+ * @copyright Copyright 2012, Union of RAD (http://union-of-rad.org)
+ * @license http://opensource.org/licenses/bsd-license.php The BSD License
+ */
+namespace lithium\g11n;
+use lithium\core\Libraries;
+
+/**
+ * The `Multibyte` class provides methods to operate on UTF-8 encoded strings.
+ * Here multibyte is synonymous with UTF-8. This class has become necessary as
+ * over time more and more extensions for dealing with multibyte encoded strings
+ * in PHP have been created. While these extensions have different
+ * implementations they all still try to solve one problem.
+ *
+ * This class is not so much an abstraction, as it abstracts very little away from
+ * the actual functions being used. With this class Lithium provides a way to
+ * make your and the framework's code more portable when it is required to work
+ * with multibyte encoded strings.
+ *
+ * While some environments will feature extension X and others extension Y, the
+ * only thing you've got to do is use/switch to the right adapter.
+ *
+ * @see lithium\util\Validator
+ */
+class Multibyte extends \lithium\core\Adaptable {
+
+	/**
+	 * Dot-delimited path under which the adapters of this class are looked
+	 * up, in the format understood by `Libraries::locate()`.
+	 *
+	 * @see lithium\core\Libraries::locate()
+	 * @var string
+	 */
+	protected static $_adapters = 'adapter.g11n.multibyte';
+
+	/**
+	 * Decides on the byte level whether a string is well-formed UTF-8.
+	 *
+	 * Full mode (the default) requires the _entire_ string to be made up of
+	 * tab, line feed, carriage return, printable ASCII (0x20-0x7E) and
+	 * well-formed multibyte sequences; overlong forms, surrogates and
+	 * anything beyond plane 16 are rejected, as is any other control byte.
+	 * An empty string passes in full mode.
+	 *
+	 * _Quick_ mode merely reports whether at least one byte outside of the
+	 * ASCII set listed above occurs, hinting at some multibyte encoding
+	 * being used. It does not prove the string to be intact UTF-8, so do
+	 * not rely on it for integrity validation.
+	 *
+	 * @link http://www.w3.org/International/questions/qa-forms-utf-8.en
+	 * @param string $string The string to inspect.
+	 * @param array $options Supports the `'quick'` key which switches to
+	 *        quick mode when truthy; defaults to `false`.
+	 * @return boolean `true` if the pattern of the selected mode matched,
+	 *         `false` otherwise (including when `preg_match()` fails).
+	 */
+	public static function is($string, array $options = array()) {
+		$options += array('quick' => false);
+
+		if ($options['quick']) {
+			// A single byte outside the plain ASCII set is enough here.
+			return (bool) preg_match('/[^\x09\x0A\x0D\x20-\x7E]/m', $string);
+		}
+
+		$sequences = array(
+			'[\x09\x0A\x0D\x20-\x7E]',            // ASCII
+			'[\xC2-\xDF][\x80-\xBF]',             // non-overlong 2-byte
+			'\xE0[\xA0-\xBF][\x80-\xBF]',         // excluding overlongs
+			'[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}',  // straight 3-byte
+			'\xED[\x80-\x9F][\x80-\xBF]',         // excluding surrogates
+			'\xF0[\x90-\xBF][\x80-\xBF]{2}',      // planes 1-3
+			'[\xF1-\xF3][\x80-\xBF]{3}',          // planes 4-15
+			'\xF4[\x80-\x8F][\x80-\xBF]{2}'       // plane 16
+		);
+		// Anchored at both ends so every byte must belong to a listed sequence.
+		$pattern = '/\A(' . implode('|', $sequences) . ')*\z/m';
+
+		return (bool) preg_match($pattern, $string);
+	}
+
+	/**
+	 * Multibyte enabled version of `strlen()`; the actual measuring is
+	 * handed over to the `strlen()` method of the selected adapter.
+	 *
+	 * @link http://php.net/manual/en/function.strlen.php
+	 * @param string $string The string whose length is wanted.
+	 * @param array $options Supports the `'name'` key naming the adapter
+	 *        configuration to use; defaults to `'default'`.
+	 * @return integer The length of the string on success; on failure the
+	 *         adapter's own return value is passed through unchanged.
+	 */
+	public static function strlen($string, array $options = array()) {
+		$options += array('name' => 'default');
+
+		$adapter = static::adapter($options['name']);
+		return $adapter->strlen($string);
+	}
+}
+
+?>
@@ -0,0 +1,44 @@
+<?php
+/**
+ * Lithium: the most rad php framework
+ *
+ * @copyright Copyright 2012, Union of RAD (http://union-of-rad.org)
+ * @license http://opensource.org/licenses/bsd-license.php The BSD License
+ */
+
+namespace lithium\g11n\multibyte\adapter;
+
+/**
+ * The `Iconv` class is an adapter which uses certain string functions from
+ * `ext/iconv`. You will need to have the extension installed to use this
+ * adapter.
+ *
+ * No known limitations affecting used functionality. Returns `false` when
+ * seeing badly formed UTF-8 sequences. Additionally triggers an error.
+ *
+ * @link http://php.net/manual/en/book.iconv.php
+ */
+class Iconv extends \lithium\core\Object {
+
+	/**
+	 * Tells whether this adapter can be used, which is the case exactly
+	 * when the `iconv` extension is loaded.
+	 *
+	 * @return boolean `true` if `ext/iconv` is loaded, `false` if not.
+	 */
+	public static function enabled() {
+		$available = extension_loaded('iconv');
+		return $available;
+	}
+
+	/**
+	 * Multibyte aware counterpart of `strlen()`. Delegates to
+	 * `iconv_strlen()` with the charset pinned to UTF-8.
+	 *
+	 * @link http://php.net/manual/en/function.iconv-strlen.php
+	 * @param string $string The string to measure, interpreted as UTF-8.
+	 * @return integer|boolean The result of `iconv_strlen()`: the character
+	 *         count, or `false` when the input cannot be processed.
+	 */
+	public function strlen($string) {
+		$length = iconv_strlen($string, 'UTF-8');
+		return $length;
+	}
+}
+
+?>
@@ -0,0 +1,45 @@
+<?php
+/**
+ * Lithium: the most rad php framework
+ *
+ * @copyright Copyright 2012, Union of RAD (http://union-of-rad.org)
+ * @license http://opensource.org/licenses/bsd-license.php The BSD License
+ */
+
+namespace lithium\g11n\multibyte\adapter;
+
+/**
+ * The `Intl` class is an adapter which uses certain string functions from
+ * `ext/intl`. You will need to have the extension installed to use this
+ * adapter.
+ *
+ * Internally works with a fixed encoding of UTF-8. This means you can't use
+ * this adapter for anything different than UTF-8 encoded strings. Silently
+ * returns `null` when input string contains badly formed UTF-8 sequences.
+ *
+ * @link http://php.net/manual/en/book.intl.php
+ */
+class Intl extends \lithium\core\Object {
+
+	/**
+	 * Tells whether this adapter can be used, which is the case exactly
+	 * when the `intl` extension is loaded.
+	 *
+	 * @return boolean `true` if `ext/intl` is loaded, `false` if not.
+	 */
+	public static function enabled() {
+		$available = extension_loaded('intl');
+		return $available;
+	}
+
+	/**
+	 * Multibyte aware counterpart of `strlen()`. Delegates to
+	 * `grapheme_strlen()`, which takes no encoding argument.
+	 *
+	 * NOTE(review): `grapheme_strlen()` counts grapheme units, so a base
+	 * character followed by combining marks presumably counts once here
+	 * while code point based functions may report more - confirm that this
+	 * difference is acceptable for callers.
+	 *
+	 * @link http://php.net/manual/en/function.grapheme-strlen.php
+	 * @param string $string The string to measure.
+	 * @return integer|null The result of `grapheme_strlen()`. A failure
+	 *         value (`null`, or `false` per the PHP manual - verify for the
+	 *         targeted PHP version) is passed through unchanged.
+	 */
+	public function strlen($string) {
+		$length = grapheme_strlen($string);
+		return $length;
+	}
+}
+
+?>
@@ -0,0 +1,44 @@
+<?php
+/**
+ * Lithium: the most rad php framework
+ *
+ * @copyright Copyright 2012, Union of RAD (http://union-of-rad.org)
+ * @license http://opensource.org/licenses/bsd-license.php The BSD License
+ */
+
+namespace lithium\g11n\multibyte\adapter;
+
+/**
+ * The `Mbstring` class is an adapter which uses certain string functions from
+ * `ext/mbstring`. You will need to have the extension installed to use this
+ * adapter.
+ *
+ * No known limitations affecting used functionality. Silently strips
+ * out badly formed UTF-8 sequences.
+ *
+ * @link http://php.net/manual/en/book.mbstring.php
+ */
+class Mbstring extends \lithium\core\Object {
+
+	/**
+	 * Tells whether this adapter can be used, which is the case exactly
+	 * when the `mbstring` extension is loaded.
+	 *
+	 * @return boolean `true` if `ext/mbstring` is loaded, `false` if not.
+	 */
+	public static function enabled() {
+		$available = extension_loaded('mbstring');
+		return $available;
+	}
+
+	/**
+	 * Multibyte aware counterpart of `strlen()`. Delegates to `mb_strlen()`
+	 * with the encoding pinned to UTF-8.
+	 *
+	 * @link http://php.net/manual/en/function.mb-strlen.php
+	 * @param string $string The string to measure, interpreted as UTF-8.
+	 * @return integer The character count reported by `mb_strlen()`.
+	 */
+	public function strlen($string) {
+		$length = mb_strlen($string, 'UTF-8');
+		return $length;
+	}
+}
+
+?>
Oops, something went wrong.

0 comments on commit b7f2e08

Please sign in to comment.