From 0bf8e97e46ff85d02d7f158c6b476369f7912cc1 Mon Sep 17 00:00:00 2001 From: Qiang Kou Date: Tue, 2 Aug 2016 10:41:52 -0700 Subject: [PATCH] rm string representation of encoding; CE_UTF8 as default encoding --- ChangeLog | 6 +++ inst/include/Rcpp/String.h | 85 +++++++++-------------------------- inst/unitTests/cpp/String.cpp | 8 ++-- inst/unitTests/runit.String.R | 4 +- 4 files changed, 32 insertions(+), 71 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8e96f9fd7..ac45288a9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2016-08-02 Qiang Kou + + * inst/include/Rcpp/String.h: CE_UTF8 as default encoding + * inst/unitTests/cpp/String.cpp: Update unit test + * inst/unitTests/runit.String.R: Idem + 2016-08-01 Nathan Russell * inst/include/Rcpp/vector/Vector.h: Added decreasing option for Vector diff --git a/inst/include/Rcpp/String.h b/inst/include/Rcpp/String.h index 17434320e..18e332f91 100644 --- a/inst/include/Rcpp/String.h +++ b/inst/include/Rcpp/String.h @@ -40,7 +40,6 @@ #define RCPP_STRING_DEBUG_3(fmt, M1, M2, M3) #endif - namespace Rcpp { /** @@ -53,7 +52,7 @@ namespace Rcpp { typedef internal::const_string_proxy const_StringProxy; /** default constructor */ - String(): data(Rf_mkChar("")), buffer(), valid(true), buffer_ready(true), enc(CE_NATIVE) { + String(): data(Rf_mkCharCE("", CE_UTF8)), buffer(), valid(true), buffer_ready(true), enc(CE_UTF8) { Rcpp_PreserveObject(data); RCPP_STRING_DEBUG("String()"); } @@ -64,12 +63,6 @@ namespace Rcpp { RCPP_STRING_DEBUG("String(const String&)"); } - String(const String& other, const std::string& enc) : data(other.get_sexp()), valid(true), buffer_ready(false) { - Rcpp_PreserveObject(data); - set_encoding(enc); - RCPP_STRING_DEBUG("String(const String&)"); - } - /** construct a string from a single CHARSXP SEXP */ String(SEXP charsxp) : data(R_NilValue) { if (TYPEOF(charsxp) == STRSXP) { @@ -88,33 +81,16 @@ namespace Rcpp { RCPP_STRING_DEBUG("String(SEXP)"); } - String(SEXP charsxp, const std::string& enc) : data(R_NilValue) { - if (TYPEOF(charsxp) == STRSXP) { - data = STRING_ELT(charsxp, 0); - } else if (TYPEOF(charsxp) == CHARSXP) { - data = charsxp; - } - - if (::Rf_isString(data) && ::Rf_length(data) != 1) - throw ::Rcpp::not_compatible("expecting a single value"); - - valid = true; - buffer_ready = false; - Rcpp_PreserveObject(data); - set_encoding(enc); - RCPP_STRING_DEBUG("String(SEXP)"); - } - /** from string proxy */ String(const StringProxy& proxy): data(proxy.get()), valid(true), buffer_ready(false), enc(Rf_getCharCE(proxy.get())) { Rcpp_PreserveObject(data); RCPP_STRING_DEBUG("String(const StringProxy&)"); } - String(const StringProxy& proxy, const std::string& enc): data(proxy.get()), valid(true), buffer_ready(false) { + String(const StringProxy& proxy, cetype_t enc): data(proxy.get()), valid(true), buffer_ready(false) { Rcpp_PreserveObject(data); set_encoding(enc); - RCPP_STRING_DEBUG("String(const StringProxy&)"); + RCPP_STRING_DEBUG("String(const StringProxy&, cetype_t)"); } /** from string proxy */ @@ -123,40 +99,40 @@ namespace Rcpp { RCPP_STRING_DEBUG("String(const const_StringProxy&)"); } - String(const const_StringProxy& proxy, const std::string& enc): data(proxy.get()), valid(true), buffer_ready(false) { + String(const const_StringProxy& proxy, cetype_t enc): data(proxy.get()), valid(true), buffer_ready(false) { Rcpp_PreserveObject(data); set_encoding(enc); - RCPP_STRING_DEBUG("String(const const_StringProxy&)"); + RCPP_STRING_DEBUG("String(const const_StringProxy&, cetype_t)"); } /** from a std::string */ - String(const std::string& s) : buffer(s), valid(false), buffer_ready(true), enc(CE_NATIVE) { + String(const std::string& s, cetype_t enc = CE_UTF8) : buffer(s), valid(false), buffer_ready(true), enc(enc) { data = R_NilValue; - RCPP_STRING_DEBUG("String(const std::string&)"); + RCPP_STRING_DEBUG("String(const std::string&, cetype_t)"); } - String(const std::wstring& s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false), enc(CE_NATIVE) { + String(const std::wstring& s, cetype_t enc = CE_UTF8) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false), enc(enc) { Rcpp_PreserveObject(data); - RCPP_STRING_DEBUG("String(const std::wstring&)"); + RCPP_STRING_DEBUG("String(const std::wstring&, cetype_t)"); } /** from a const char* */ - String(const char* s) : buffer(s), valid(false), buffer_ready(true), enc(CE_NATIVE) { + String(const char* s, cetype_t enc = CE_UTF8) : buffer(s), valid(false), buffer_ready(true), enc(enc) { data = R_NilValue; - RCPP_STRING_DEBUG("String(const char*)"); + RCPP_STRING_DEBUG("String(const char*, cetype_t)"); } - String(const wchar_t* s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false), enc(CE_NATIVE) { + String(const wchar_t* s, cetype_t enc = CE_UTF8) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false), enc(enc) { Rcpp_PreserveObject(data); - RCPP_STRING_DEBUG("String(const wchar_t* s)"); + RCPP_STRING_DEBUG("String(const wchar_t* s, cetype_t)"); } /** constructors from R primitives */ - String(int x) : data(internal::r_coerce(x)), valid(true), buffer_ready(false), enc(CE_NATIVE) {Rcpp_PreserveObject(data);} - String(double x) : data(internal::r_coerce(x)), valid(true), buffer_ready(false), enc(CE_NATIVE) {Rcpp_PreserveObject(data);} - String(bool x) : data(internal::r_coerce(x)), valid(true) , buffer_ready(false), enc(CE_NATIVE) {Rcpp_PreserveObject(data);} - String(Rcomplex x) : data(internal::r_coerce(x)), valid(true), buffer_ready(false), enc(CE_NATIVE) {Rcpp_PreserveObject(data);} - String(Rbyte x) : data(internal::r_coerce(x)), valid(true), buffer_ready(false), enc(CE_NATIVE) {Rcpp_PreserveObject(data);} + String(int x) : data(internal::r_coerce(x)), valid(true), buffer_ready(false), enc(CE_UTF8) {Rcpp_PreserveObject(data);} + String(double x) : data(internal::r_coerce(x)), valid(true), buffer_ready(false), enc(CE_UTF8) {Rcpp_PreserveObject(data);} + String(bool x) : data(internal::r_coerce(x)), valid(true) , buffer_ready(false), enc(CE_UTF8) {Rcpp_PreserveObject(data);} + String(Rcomplex x) : data(internal::r_coerce(x)), valid(true), buffer_ready(false), enc(CE_UTF8) {Rcpp_PreserveObject(data);} + String(Rbyte x) : data(internal::r_coerce(x)), valid(true), buffer_ready(false), enc(CE_UTF8) {Rcpp_PreserveObject(data);} ~String() { Rcpp_ReleaseObject(data); @@ -406,17 +382,8 @@ namespace Rcpp { return buffer_ready ? buffer.c_str() : CHAR(data); } - inline const std::string get_encoding() const { - switch (enc) { - case CE_BYTES: - return "bytes"; - case CE_LATIN1: - return "latin1"; - case CE_UTF8: - return "UTF-8"; - default: - return "unknown"; - } + inline cetype_t get_encoding() const { + return enc; } inline void set_encoding(cetype_t encoding) { @@ -431,18 +398,6 @@ namespace Rcpp { } } - inline void set_encoding(const std::string & encoding) { - if (encoding == "bytes") { - set_encoding(CE_BYTES); - } else if (encoding == "latin1") { - set_encoding(CE_LATIN1); - } else if (encoding == "UTF-8") { - set_encoding(CE_UTF8); - } else { - set_encoding(CE_ANY); - } - } - bool operator<(const Rcpp::String& other) const { return strcmp(get_cstring(), other.get_cstring()) < 0; } diff --git a/inst/unitTests/cpp/String.cpp b/inst/unitTests/cpp/String.cpp index 08e0dfa6c..4adf839df 100644 --- a/inst/unitTests/cpp/String.cpp +++ b/inst/unitTests/cpp/String.cpp @@ -90,20 +90,20 @@ String test_push_front(String x) { } // [[Rcpp::export]] -String test_String_encoding(String x) { +int test_String_encoding(String x) { return x.get_encoding(); } // [[Rcpp::export]] String test_String_set_encoding(String x) { - x.set_encoding("UTF-8"); + x.set_encoding(CE_UTF8); return x; } // [[Rcpp::export]] String test_String_ctor_encoding(String x) { String y(x); - y.set_encoding("UTF-8"); + y.set_encoding(CE_UTF8); return y; } @@ -111,6 +111,6 @@ String test_String_ctor_encoding(String x) { // [[Rcpp::export]] String test_String_ctor_encoding2() { String y("å"); - y.set_encoding("UTF-8"); + y.set_encoding(CE_UTF8); return y; } diff --git a/inst/unitTests/runit.String.R b/inst/unitTests/runit.String.R index b8ca20126..9820c58a8 100644 --- a/inst/unitTests/runit.String.R +++ b/inst/unitTests/runit.String.R @@ -87,8 +87,8 @@ if (.runThisTest) { a <- b <- "å" Encoding(a) <- "unknown" Encoding(b) <- "UTF-8" - checkEquals(test_String_encoding(a), "unknown") - checkEquals(test_String_encoding(b), "UTF-8") + checkEquals(test_String_encoding(a), 0) + checkEquals(test_String_encoding(b), 1) checkEquals(Encoding(test_String_set_encoding(a)), "UTF-8") checkEquals(Encoding(test_String_ctor_encoding(a)), "UTF-8") checkEquals(Encoding(test_String_ctor_encoding2()), "UTF-8")