Skip to content

Commit

Permalink
Add an is_utf8 implementation
Browse files Browse the repository at this point in the history
This avoids allocating memory when simply testing if a string is UTF-8
or not.
  • Loading branch information
sjoelund authored and OpenModelica-Hudson committed Jan 23, 2017
1 parent b7ec83f commit 399ee41
Show file tree
Hide file tree
Showing 5 changed files with 339 additions and 5 deletions.
5 changes: 4 additions & 1 deletion COPYING
@@ -1,7 +1,7 @@
Upstream Author: Open Source Modelica Consortium
http://www.openmodelica.org

Copyright: OpenModelica as a whole is licensed under the OSMC Public
Copyright: OpenModelica as a whole is licensed under the OSMC Public
License:

--- Start of Definition of OSMC Public License ---
Expand Down Expand Up @@ -282,6 +282,8 @@ Additional software statically linked by OpenModelica:
online http://opensource.org/licenses/BSD-3-Clause, or in the
OpenModelica source distribution if the sources were used.

[BSD-2-Clause] https://opensource.org/licenses/BSD-2-Clause

[BSD-1] Like [BSD] but only requires the text reproduced if distributed as source code.

[LGPL 2.1] Full text might be in the file '/usr/share/common-licenses/LGPL-2.1',
Expand All @@ -297,6 +299,7 @@ or online http://opensource.org/licenses/LGPL-2.1.
2006-2011 The University of Colorado Denver
[BSD] TinyMT Copyright (c) 2011, 2013 Mutsuo Saito, Makoto Matsumoto, Hiroshima University and The University of Tokyo
[BSD] csv-compare Tubes.cs Copyright (c) 2013, ITI GmbH Dresden
[BSD-2-Clause] is_utf8 Copyright (c) 2013, Palard Julien
[BSD-1] Copyright (c) 2005-2013, Troy D. Hanson http://troydhanson.github.com/uthash/
[LGPL 2.1] lpsolve55
[LGPL 2.1+] Copyright (C) 2008 Robert Gamble libcsv.c
Expand Down
3 changes: 2 additions & 1 deletion Compiler/runtime/Makefile.common
Expand Up @@ -24,7 +24,8 @@ OMC_OBJ_SHARED = Dynload_omc$(OBJEXT) Error_omc$(OBJEXT) \
ErrorMessage$(OBJEXT) systemimplmisc.o System_omc$(OBJEXT) \
Lapack_omc.o Settings_omc$(OBJEXT) \
UnitParserExt_omc.o unitparser.o \
IOStreamExt_omc.o Socket_omc.o getMemorySize.o
IOStreamExt_omc.o Socket_omc.o getMemorySize.o \
is_utf8.o

OMC_OBJ_STUBS = corbaimpl_stub_omc.o

Expand Down
265 changes: 265 additions & 0 deletions Compiler/runtime/is_utf8.c
@@ -0,0 +1,265 @@
#include "is_utf8.h"

/*
  Check whether the `len` bytes starting at `str` are well-formed UTF-8.

  Parameters:
    str           Bytes to validate. No NUL terminator is required; exactly
                  `len` bytes are examined.
    len           Number of bytes in `str`.
    message       Out, must not be NULL. Set to NULL when the input is valid,
                  otherwise to a static string describing the problem (the
                  caller must not free or modify it).
    faulty_bytes  Out, must not be NULL. Set to 0 when the input is valid,
                  otherwise to the number of bytes, counted from the returned
                  offset, that take part in the error.

  Return value:
    0 when the whole buffer is well-formed UTF-8. Otherwise the ZERO-BASED
    offset of the first byte of the first ill-formed sequence.
    An error located at offset 0 therefore also returns 0, so the return
    value alone cannot be used to detect failure: test `*message != NULL`
    (or `*faulty_bytes != 0`) instead.
    The offset is returned as an `int`, so offsets larger than INT_MAX cannot
    be represented.

  A sequence that is cut short by the end of the buffer is reported with
  `*faulty_bytes == 1`, whatever the number of trailing bytes present.

  Source:
  http://www.unicode.org/versions/Unicode7.0.0/UnicodeStandard-7.0.pdf
  page 124, 3.9 "Unicode Encoding Forms", "UTF-8"

  Table 3-7. Well-Formed UTF-8 Byte Sequences
  -----------------------------------------------------------------------------
  | Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
  | U+0000..U+007F     | 00..7F     |             |            |             |
  | U+0080..U+07FF     | C2..DF     | 80..BF      |            |             |
  | U+0800..U+0FFF     | E0         | A0..BF      | 80..BF     |             |
  | U+1000..U+CFFF     | E1..EC     | 80..BF      | 80..BF     |             |
  | U+D000..U+D7FF     | ED         | 80..9F      | 80..BF     |             |
  | U+E000..U+FFFF     | EE..EF     | 80..BF      | 80..BF     |             |
  | U+10000..U+3FFFF   | F0         | 90..BF      | 80..BF     | 80..BF      |
  | U+40000..U+FFFFF   | F1..F3     | 80..BF      | 80..BF     | 80..BF      |
  | U+100000..U+10FFFF | F4         | 80..8F      | 80..BF     | 80..BF      |
  -----------------------------------------------------------------------------
*/

/* One multi-byte row of Table 3-7 together with its diagnostics. */
struct utf8_sequence_rule
{
    unsigned char lead_lo;   /* lowest first byte covered by this row */
    unsigned char lead_hi;   /* highest first byte covered by this row */
    unsigned char second_lo; /* lowest acceptable second byte */
    unsigned char second_hi; /* highest acceptable second byte */
    size_t length;           /* total length of the sequence, 2..4 */
    char *truncated_msg;     /* buffer ends before the sequence does */
    char *trail_msg[3];      /* bad 2nd, 3rd, 4th byte (NULL when unused) */
};

/* The rows are disjoint, so the lookup order does not matter. */
static const struct utf8_sequence_rule utf8_sequence_rules[] = {
    { 0xC2, 0xDF, 0x80, 0xBF, 2,
      "After a first byte between C2 and DF, expecting a 2nd byte.",
      { "After a first byte between C2 and DF, expecting a 2nd byte between 80 and BF",
        NULL,
        NULL } },
    { 0xE0, 0xE0, 0xA0, 0xBF, 3,
      "After a first byte of E0, expecting two following bytes.",
      { "After a first byte of E0, expecting a 2nd byte between A0 and BF.",
        "After a first byte of E0, expecting a 3rd byte between 80 and BF.",
        NULL } },
    { 0xE1, 0xEC, 0x80, 0xBF, 3,
      "After a first byte between E1 and EC, expecting two following bytes.",
      { "After a first byte between E1 and EC, expecting the 2nd byte between 80 and BF.",
        "After a first byte between E1 and EC, expecting the 3rd byte between 80 and BF.",
        NULL } },
    { 0xED, 0xED, 0x80, 0x9F, 3,
      "After a first byte of ED, expecting two following bytes.",
      { "After a first byte of ED, expecting 2nd byte between 80 and 9F.",
        "After a first byte of ED, expecting 3rd byte between 80 and BF.",
        NULL } },
    { 0xEE, 0xEF, 0x80, 0xBF, 3,
      "After a first byte between EE and EF, expecting two following bytes.",
      { "After a first byte between EE and EF, expecting 2nd byte between 80 and BF.",
        "After a first byte between EE and EF, expecting 3rd byte between 80 and BF.",
        NULL } },
    { 0xF0, 0xF0, 0x90, 0xBF, 4,
      "After a first byte of F0, expecting three following bytes.",
      { "After a first byte of F0, expecting 2nd byte between 90 and BF.",
        "After a first byte of F0, expecting 3rd byte between 80 and BF.",
        "After a first byte of F0, expecting 4th byte between 80 and BF." } },
    { 0xF1, 0xF3, 0x80, 0xBF, 4,
      "After a first byte of F1, F2, or F3, expecting three following bytes.",
      { "After a first byte of F1, F2, or F3, expecting a 2nd byte between 80 and BF.",
        "After a first byte of F1, F2, or F3, expecting a 3rd byte between 80 and BF.",
        "After a first byte of F1, F2, or F3, expecting a 4th byte between 80 and BF." } },
    { 0xF4, 0xF4, 0x80, 0x8F, 4,
      "After a first byte of F4, expecting three following bytes.",
      { "After a first byte of F4, expecting 2nd byte between 80 and 8F.",
        "After a first byte of F4, expecting 3rd byte between 80 and BF.",
        "After a first byte of F4, expecting 4th byte between 80 and BF." } }
};

int is_utf8(unsigned char *str, size_t len, char **message, int *faulty_bytes)
{
    const size_t rule_count =
        sizeof utf8_sequence_rules / sizeof utf8_sequence_rules[0];
    size_t i = 0;

    *message = NULL;
    *faulty_bytes = 0;
    while (i < len)
    {
        const struct utf8_sequence_rule *rule = NULL;
        size_t r;
        size_t k;

        if (str[i] <= 0x7F) /* 00..7F: single-byte sequence */
        {
            i += 1;
            continue;
        }

        /* Find the table row whose first-byte range contains str[i]. */
        for (r = 0; r < rule_count; r++)
        {
            if (str[i] >= utf8_sequence_rules[r].lead_lo &&
                str[i] <= utf8_sequence_rules[r].lead_hi)
            {
                rule = &utf8_sequence_rules[r];
                break;
            }
        }
        if (rule == NULL) /* 80..C1 or F5..FF can never start a sequence */
        {
            *message = "Expecting bytes in the following ranges: 00..7F C2..F4.";
            *faulty_bytes = 1;
            return (int)i;
        }

        /* i < len holds here, so len - i cannot wrap (unlike i + k < len). */
        if (len - i < rule->length)
        {
            *message = rule->truncated_msg;
            *faulty_bytes = 1;
            return (int)i;
        }

        /* Only the 2nd byte has a row-specific range; the rest are 80..BF. */
        for (k = 1; k < rule->length; k++)
        {
            const unsigned char lo = (k == 1) ? rule->second_lo : 0x80;
            const unsigned char hi = (k == 1) ? rule->second_hi : 0xBF;

            if (str[i + k] < lo || str[i + k] > hi)
            {
                *message = rule->trail_msg[k - 1];
                *faulty_bytes = (int)k + 1;
                return (int)i;
            }
        }
        i += rule->length;
    }
    /* *message is still NULL and *faulty_bytes still 0 from the top. */
    return 0;
}
37 changes: 37 additions & 0 deletions Compiler/runtime/is_utf8.h
@@ -0,0 +1,37 @@
/*
is_utf8 is distributed under the following terms:
Copyright (c) 2013 Palard Julien. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/

/* Guard name avoids the reserved "_" + uppercase identifier space. */
#ifndef IS_UTF8_H
#define IS_UTF8_H

#include <stdlib.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Check whether the `len` bytes starting at `str` are well-formed UTF-8.
 *
 * @param str          Bytes to validate; exactly `len` bytes are examined,
 *                     no NUL terminator is required.
 * @param len          Number of bytes in `str`.
 * @param message      Out, must not be NULL. Receives NULL when the input is
 *                     valid, otherwise a static string describing the error
 *                     (do not free or modify it).
 * @param faulty_bytes Out, must not be NULL. Receives 0 when the input is
 *                     valid, otherwise the number of bytes taking part in
 *                     the error.
 * @return 0 when the input is valid, otherwise the zero-based offset of the
 *         first byte of the first ill-formed sequence. An error at offset 0
 *         also returns 0, so check `*message != NULL` to detect failure.
 */
int is_utf8(unsigned char *str, size_t len, char **message, int *faulty_bytes);

#ifdef __cplusplus
}
#endif

#endif /* IS_UTF8_H */

0 comments on commit 399ee41

Please sign in to comment.