Permalink
Browse files

bug 2735: fix/improve non latin language tags

a. non-Latin tags (Greek/Cyrillic...) are not sorted case-insensitively, and the group-by-letter view in the tag list is not case-insensitive
b. quick search on tag names does not correctly perform accent folding (e.g. Köln and Koln do not match) or case-insensitive matching for non-Latin letters
c. missing from remove_accents characters in romanian language (Latin Extended-B)
  Ș c8 98 = LATIN CAPITAL LETTER S WITH COMMA BELOW
  ș c8 99 = LATIN SMALL LETTER S WITH COMMA BELOW
  Ț c8 9a = LATIN CAPITAL LETTER T WITH COMMA BELOW
  ț c8 9b = LATIN SMALL LETTER T WITH COMMA BELOW
d. str2url allows non-Latin letters in its output only if the input does not contain any valid Latin letter/digit. We should always allow non-Latin letters in the output.

git-svn-id: http://piwigo.org/svn/trunk@17748 68402e56-0260-453c-a942-63ccdbb3a9ee
  • Loading branch information...
modus75 committed Sep 4, 2012
1 parent 0257b32 commit 528c75ab35b915b574b4977b8d19412b69845d26
Showing with 49 additions and 34 deletions.
  1. +41 −13 include/functions.inc.php
  2. +1 −1 include/functions_html.inc.php
  3. +6 −4 include/functions_metadata.inc.php
  4. +0 −15 include/functions_search.inc.php
  5. +1 −1 tags.php
@@ -203,32 +203,42 @@ function mkgetdir($dir, $flags=MKGETDIR_DEFAULT)
/* Returns true if the string appears to be encoded in UTF-8. (from wordpress)
* @param string Str
*/
function seems_utf8($Str) { # by bmorel at ssi dot fr
function seems_utf8($Str) {
// OBSOLETE !!!
return qualify_utf8($Str) >= 0;
}
/* returns 0 if $str is Ascii, 1 if utf-8, -1 otherwise */
function qualify_utf8($Str)
{
$ret = 0;
for ($i=0; $i<strlen($Str); $i++) {
if (ord($Str[$i]) < 0x80) continue; # 0bbbbbbb
elseif ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
$ret = 1;
if ((ord($Str[$i]) & 0xE0) == 0xC0) $n=1; # 110bbbbb
elseif ((ord($Str[$i]) & 0xF0) == 0xE0) $n=2; # 1110bbbb
elseif ((ord($Str[$i]) & 0xF8) == 0xF0) $n=3; # 11110bbb
elseif ((ord($Str[$i]) & 0xFC) == 0xF8) $n=4; # 111110bb
elseif ((ord($Str[$i]) & 0xFE) == 0xFC) $n=5; # 1111110b
else return false; # Does not match any model
else return -1; # Does not match any model
for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80))
return false;
return -1;
}
}
return true;
return $ret;
}
/* Remove accents from a UTF-8 or ISO-8859-1 string (from wordpress)
* @param string string - an UTF-8 or ISO-8859-1 string
*/
function remove_accents($string)
{
if ( !preg_match('/[\x80-\xff]/', $string) )
return $string;
$utf = qualify_utf8($string);
if ( $utf == 0 )
return $string; // ascii
if (seems_utf8($string)) {
if ( $utf > 0 ) {
$chars = array(
// Decompositions for Latin-1 Supplement
"\xc3\x80"=>'A', "\xc3\x81"=>'A',
@@ -323,6 +333,9 @@ function remove_accents($string)
"\xc5\xba"=>'z', "\xc5\xbb"=>'Z',
"\xc5\xbc"=>'z', "\xc5\xbd"=>'Z',
"\xc5\xbe"=>'z', "\xc5\xbf"=>'s',
// Decompositions for Latin Extended-B
"\xc8\x98"=>'S', "\xc8\x99"=>'s',
"\xc8\x9a"=>'T', "\xc8\x9b"=>'t',
// Euro Sign
"\xe2\x82\xac"=>'E',
// GBP (Pound) Sign
@@ -353,6 +366,23 @@ function remove_accents($string)
return $string;
}
// transliterate() is defined conditionally: the variant is chosen when this
// statement executes, depending on whether mb_strtolower() exists and the
// PWG_CHARSET constant is already defined.
if (function_exists('mb_strtolower') && defined('PWG_CHARSET'))
{
/**
 * Lowercases $term with mb_strtolower() using the PWG_CHARSET encoding, then
 * passes the result through remove_accents().
 *
 * @param string $term text to fold
 * @return string the value returned by remove_accents()
 */
function transliterate($term)
{
return remove_accents( mb_strtolower($term, PWG_CHARSET) );
}
}
else
{
/**
 * Fallback variant: lowercases $term with strtolower(), then passes the
 * result through remove_accents().
 * NOTE(review): strtolower() presumably does not lowercase multi-byte
 * (non-ASCII) letters, so non-Latin input may keep its case here -- confirm.
 *
 * @param string $term text to fold
 * @return string the value returned by remove_accents()
 */
function transliterate($term)
{
return remove_accents( strtolower($term) );
}
}
/**
* simplify a string to insert it into an URL
*
@@ -361,16 +391,14 @@ function remove_accents($string)
*/
function str2url($str)
{
$raw = $str;
$str = remove_accents($str);
$str = preg_replace('/[^a-z0-9_\s\'\:\/\[\],-]/','',strtolower($str));
$str = $safe = transliterate($str);
$str = preg_replace('/[^\x80-\xffa-z0-9_\s\'\:\/\[\],-]/','',$str);
$str = preg_replace('/[\s\'\:\/\[\],-]+/',' ',trim($str));
$res = str_replace(' ','_',$str);
if (empty($res))
{
$res = str_replace(' ','_', $raw);
$res = str_replace(' ','_', $safe);
}
return $res;
@@ -300,7 +300,7 @@ function tag_alpha_compare($a, $b)
{
if (!isset($cache[__FUNCTION__][ $tag['name'] ]))
{
$cache[__FUNCTION__][ $tag['name'] ] = strtolower(str2url($tag['name']));
$cache[__FUNCTION__][ $tag['name'] ] = transliterate($tag['name']);
}
}
@@ -90,10 +90,12 @@ function clean_iptc_value($value)
// apparently mac uses some MacRoman crap encoding. I don't know
// how to detect it so a plugin should do the trick.
$value = trigger_event('clean_iptc_value', $value);
$is_utf8 = seems_utf8($value);
$value = convert_charset( $value,
$is_utf8 ? 'utf-8' : 'iso-8859-1',
get_pwg_charset() );
if ( ($qual = qualify_utf8($value)) != 0)
{// has non ascii chars
$value = convert_charset( $value,
$qual>0 ? 'utf-8' : 'iso-8859-1',
get_pwg_charset() );
}
}
return $value;
}
@@ -266,21 +266,6 @@ function get_regular_search_results($search, $images_where)
}
if (function_exists('mb_strtolower'))
{
function transliterate($term)
{
return remove_accents( mb_strtolower($term) );
}
}
else
{
function transliterate($term)
{
return remove_accents( strtolower($term) );
}
}
function is_word_char($ch)
{
return ($ch>='0' && $ch<='9') || ($ch>='a' && $ch<='z') || ($ch>='A' && $ch<='Z') || ord($ch)>127;
@@ -99,7 +99,7 @@ function id_compare($a, $b)
foreach ($tags as $tag)
{
$tag_letter = strtoupper(mb_substr(str2url($tag['name']), 0, 1, 'utf-8'));
$tag_letter = mb_strtoupper(mb_substr(transliterate($tag['name']), 0, 1, PWG_CHARSET), PWG_CHARSET);
if ($current_tag_idx==0) {
$current_letter = $tag_letter;

0 comments on commit 528c75a

Please sign in to comment.