Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 82 additions & 26 deletions inst/include/Rcpp/String.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,54 +53,74 @@ namespace Rcpp {
typedef internal::const_string_proxy<STRSXP> const_StringProxy;

/** default constructor */
String( ): data( Rf_mkChar("") ), buffer(), valid(true), buffer_ready(true) {
String( ): data( Rf_mkChar("") ), buffer(), valid(true), buffer_ready(true), enc(CE_NATIVE) {
RCPP_STRING_DEBUG( "String()" ) ;
}

/** copy constructor */
String( const String& other) : data( other.get_sexp()), valid(true), buffer_ready(false) {
String( const String& other) : data( other.get_sexp()), valid(true), buffer_ready(false), enc(Rf_getCharCE(other.get_sexp())) {
RCPP_STRING_DEBUG( "String(const String&)" ) ;
}

String( const String& other, const std::string& enc) : data( other.get_sexp()), valid(true), buffer_ready(false) {
set_encoding(enc);
RCPP_STRING_DEBUG( "String(const String&)" ) ;
}

/** construct a string from a single CHARSXP SEXP */
String(SEXP charsxp) : data(charsxp), valid(true), buffer_ready(false) {
String(SEXP charsxp) : data(charsxp), valid(true), buffer_ready(false), enc(Rf_getCharCE(charsxp)) {
RCPP_STRING_DEBUG( "String(SEXP)" ) ;
}

String(SEXP charsxp, const std::string& enc) : data(charsxp), valid(true), buffer_ready(false) {
set_encoding(enc);
RCPP_STRING_DEBUG( "String(SEXP)" ) ;
}

/** from string proxy */
String( const StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false){
String( const StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false), enc(Rf_getCharCE(proxy.get())){
RCPP_STRING_DEBUG( "String( const StringProxy&)" ) ;
}

String( const StringProxy& proxy, const std::string& enc ): data( proxy.get() ), valid(true), buffer_ready(false) {
set_encoding(enc);
RCPP_STRING_DEBUG( "String( const StringProxy&)" ) ;
}

/** from string proxy */
String( const const_StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false){
String( const const_StringProxy& proxy ): data( proxy.get() ), valid(true), buffer_ready(false), enc(Rf_getCharCE(proxy.get())){
RCPP_STRING_DEBUG( "String( const const_StringProxy&)" ) ;
}

String( const const_StringProxy& proxy, const std::string& enc ): data( proxy.get() ), valid(true), buffer_ready(false) {
set_encoding(enc);
RCPP_STRING_DEBUG( "String( const const_StringProxy&)" ) ;
}

/** from a std::string */
String( const std::string& s) : buffer(s), valid(false), buffer_ready(true) {
String( const std::string& s) : buffer(s), valid(false), buffer_ready(true), enc(CE_NATIVE) {
RCPP_STRING_DEBUG( "String(const std::string& )" ) ;
}

String( const std::wstring& s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false) {
String( const std::wstring& s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false), enc(CE_NATIVE) {
RCPP_STRING_DEBUG( "String(const std::wstring& )" ) ;
}

/** from a const char* */
String( const char* s) : buffer(s), valid(false), buffer_ready(true){
String( const char* s) : buffer(s), valid(false), buffer_ready(true), enc(CE_NATIVE){
RCPP_STRING_DEBUG( "String(const char*)" ) ;
}

String( const wchar_t* s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false) {
String( const wchar_t* s) : data(internal::make_charsexp(s)), valid(true), buffer_ready(false), enc(CE_NATIVE) {
RCPP_STRING_DEBUG( "String(const wchar_t* s)" ) ;
}


/** constructors from R primitives */
String( int x ) : data( internal::r_coerce<INTSXP,STRSXP>(x) ), valid(true), buffer_ready(false) {}
String( double x ) : data( internal::r_coerce<REALSXP,STRSXP>(x) ), valid(true), buffer_ready(false){}
String( bool x ) : data( internal::r_coerce<LGLSXP,STRSXP>(x) ), valid( true ) , buffer_ready(false){}
String( Rcomplex x ) : data( internal::r_coerce<CPLXSXP,STRSXP>(x) ), valid( true ), buffer_ready(false){}
String( Rbyte x ) : data( internal::r_coerce<RAWSXP,STRSXP>(x) ), valid(true), buffer_ready(false){}
String( int x ) : data( internal::r_coerce<INTSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}
String( double x ) : data( internal::r_coerce<REALSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}
String( bool x ) : data( internal::r_coerce<LGLSXP,STRSXP>(x) ), valid( true ) , buffer_ready(false), enc(CE_NATIVE){}
String( Rcomplex x ) : data( internal::r_coerce<CPLXSXP,STRSXP>(x) ), valid( true ), buffer_ready(false), enc(CE_NATIVE){}
String( Rbyte x ) : data( internal::r_coerce<RAWSXP,STRSXP>(x) ), valid(true), buffer_ready(false), enc(CE_NATIVE){}


inline String& operator=( int x ){ data = internal::r_coerce<INTSXP ,STRSXP>( x ) ; valid = true ; buffer_ready = false ; return *this ; }
Expand All @@ -127,13 +147,13 @@ namespace Rcpp {
inline String& operator=( const std::wstring& s){ return assign_wide_string(s) ; }
inline String& operator=( const wchar_t* s){ return assign_wide_string(s) ; }


inline String& operator+=( const std::string& s){
RCPP_STRING_DEBUG( "String::operator+=( std::string )" ) ;
if( is_na() ) return *this ;
setBuffer() ; buffer += s ; valid = false ;
return *this ;
}

inline String& operator+=( const char* s){
RCPP_STRING_DEBUG( "String::operator+=( const char*)" ) ;
if( is_na() ) return *this ;
Expand All @@ -157,8 +177,8 @@ namespace Rcpp {

public:

inline String& operator+=( const std::wstring& s){ return append_wide_string( s ); }
inline String& operator+=( const wchar_t* s){ return append_wide_string( s ); }
inline String& operator+=( const std::wstring& s){ return append_wide_string( s ); }
inline String& operator+=( const wchar_t* s){ return append_wide_string( s ); }

inline String& operator+=( const String& other ){
RCPP_STRING_DEBUG( "String::operator+=( const char*)" ) ;
Expand Down Expand Up @@ -214,8 +234,6 @@ namespace Rcpp {
return replace_first( s.get_cstring(), news.get_cstring() ) ;
}



inline String& replace_last( const char* s, const char* news ){
RCPP_STRING_DEBUG_2( "String::replace_last( const char* = '%s' , const char* = '%s')", s, news ) ;
if( is_na() ) return *this ;
Expand Down Expand Up @@ -312,7 +330,7 @@ namespace Rcpp {

inline SEXP get_sexp() const {
RCPP_STRING_DEBUG_1( "String::get_sexp const ( valid = %d) ", valid ) ;
return valid ? data : Rf_mkChar( buffer.c_str() ) ;
return valid ? data : Rf_mkCharCE( buffer.c_str(), enc ) ;
}

inline SEXP get_sexp() {
Expand All @@ -329,11 +347,46 @@ namespace Rcpp {
return std::wstring( s, s + strlen(s) );
}


inline const char* get_cstring() const {
return buffer_ready ? buffer.c_str() : CHAR(data) ;
}

inline const std::string get_encoding() const {
switch (enc) {
case CE_BYTES:
return "bytes";
case CE_LATIN1:
return "latin1";
case CE_UTF8:
return "UTF-8";
default:
return "unknown";
}
}

inline void set_encoding( cetype_t encoding ) {
enc = encoding;

if (valid) {
data = Rf_mkCharCE(Rf_translateCharUTF8(data), encoding);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is the underlying SEXP for the String class managed? When we create it to do we PROTECT it and/or do we need to? When we assign over the top of it (as is done here) do we need to do any additional UNPROTECT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have to admit I am not sure about this problem.

According to some similar SEXP assignment code, I don't think we need additional UNPROTECT here.

@eddelbuettel , can you please give us a clear answer? Thank you!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the underlying SEXP is a CHARSXP, I think its lifetime is essentially 'infinite' since once a string enters the string pool R won't clean it out. At least, that's the assumption the String class is making all over the place; this probably won't be safe assumption 'some day' but is probably permissible for now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, agreed. I think we are good to merge (if I hear no objections by
tomorrow AM I'll go ahead and merge).

On Wed, Jul 1, 2015 at 6:19 PM, Kevin Ushey notifications@github.com
wrote:

In inst/include/Rcpp/String.h
#310 (comment):

  •            case CE_BYTES:
    
  •                return "bytes";
    
  •            case CE_LATIN1:
    
  •                return "latin1";
    
  •            case CE_UTF8:
    
  •                return "UTF-8";
    
  •            default:
    
  •                return "unknown";
    
  •        }
    
  •    }
    
  •    inline void set_encoding( cetype_t encoding ) {
    
  •        enc = encoding;
    
  •        if (valid) {
    
  •            data = Rf_mkCharCE(Rf_translateCharUTF8(data), encoding);
    

Because the underlying SEXP is a CHARSXP, I think its lifetime is
essentially 'infinite' since once a string enters the string pool R won't
clean it out. At least, that's the assumption the String class is making
all over the place; this probably won't be safe assumption 'some day' but
is probably permissible for now.


Reply to this email directly or view it on GitHub
https://github.com/RcppCore/Rcpp/pull/310/files#r33695590.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kevinushey are you sure that's true? I thought CHARSXPs were garbage collected, but normally you don't have to worry about PROTECT() because they immediately go into a STRSXP. (But I can't think of any easy way to test that hypothesis)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think CHARSXPs enter a global string cache (it's something I vaguely remember reading about a long time ago) but I think it's worth confirming / testing. It should be easy to make an example with e.g. an unprotected Rf_mkChar + a following Rf_allocVector + gctorture; presumedly that CHARSXP would be cleaned up if the GC touched it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From Writing R Extensions (
http://cran.rstudio.com/doc/manuals/r-devel/R-exts.html#Handling-character-data
):

CHARSXPs are read-only objects and must never be modified. In particular,
the C-style string contained in a CHARSXP should be treated as read-only
and for this reason the CHAR function used to access the character data of
a CHARSXP returns (const char *) (this also allows compilers to issue
warnings about improper use). Since CHARSXPs are immutable, the same
CHARSXP can
be shared by any STRSXP needing an element representing the same string. R
maintains a global cache of CHARSXPs so that there is only ever one
CHARSXP representing
a given string in memory.

On Mon, Jul 6, 2015 at 1:25 PM, Kevin Ushey notifications@github.com
wrote:

In inst/include/Rcpp/String.h
#310 (comment):

  •            case CE_BYTES:
    
  •                return "bytes";
    
  •            case CE_LATIN1:
    
  •                return "latin1";
    
  •            case CE_UTF8:
    
  •                return "UTF-8";
    
  •            default:
    
  •                return "unknown";
    
  •        }
    
  •    }
    
  •    inline void set_encoding( cetype_t encoding ) {
    
  •        enc = encoding;
    
  •        if (valid) {
    
  •            data = Rf_mkCharCE(Rf_translateCharUTF8(data), encoding);
    

I think CHARSXPs enter a global string cache (it's something I vaguely
remember reading about a long time ago) but I think it's worth confirming /
testing. It should be easy to make an example with e.g. an unprotected
Rf_mkChar + a following Rf_allocVector + gctorture; presumedly that
CHARSXP would be cleaned up if the GC touched it.


Reply to this email directly or view it on GitHub
https://github.com/RcppCore/Rcpp/pull/310/files#r33959013.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think either of those speak to whether CHARSXPs are gc'd or not. This PR by @kevinushey suggests that they are: hadley/reshape#40

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just added a comment at the end of the PR showing that they indeed can be GCed, so we'll have to handle that. :(

} else {
data = Rf_mkCharCE(buffer.c_str(), encoding) ;
valid = true ;
}
}

inline void set_encoding(const std::string & encoding) {
if ( encoding == "bytes" ) {
set_encoding( CE_BYTES );
} else if ( encoding == "latin1" ) {
set_encoding( CE_LATIN1 );
} else if ( encoding == "UTF-8" ) {
set_encoding( CE_UTF8 );
} else {
set_encoding( CE_ANY );
}
}

bool operator<( const Rcpp::String& other ) const {
return strcmp( get_cstring(), other.get_cstring() ) < 0;
}
Expand Down Expand Up @@ -363,6 +416,9 @@ namespace Rcpp {
/** is the buffer initialized */
bool buffer_ready ;

/** the encoding of encapsulated CHARSXP */
cetype_t enc;

inline bool is_na() const { return data == NA_STRING ; }
inline void setBuffer(){
if( !buffer_ready){
Expand All @@ -373,7 +429,7 @@ namespace Rcpp {
inline void setData(){
RCPP_STRING_DEBUG( "setData" ) ;
if(!valid) {
data = Rf_mkChar(buffer.c_str()) ;
data = Rf_mkCharCE(buffer.c_str(), enc) ;
valid = true ;
}
}
Expand Down Expand Up @@ -403,8 +459,8 @@ namespace Rcpp {
return s.get_sexp() ;
}

template <int RTYPE>
template <typename T>
template <int RTYPE>
template <typename T>
string_proxy<RTYPE>& string_proxy<RTYPE>::operator+=(const T& rhs) {
String tmp = get() ;
tmp += rhs ;
Expand All @@ -415,7 +471,7 @@ namespace Rcpp {
}


template <>
template <>
inline SEXP wrap<Rcpp::String>( const Rcpp::String& object) {
RCPP_STRING_DEBUG( "wrap<String>()" ) ;
Shield<SEXP> res( Rf_allocVector( STRSXP, 1 ) ) ;
Expand Down
26 changes: 26 additions & 0 deletions inst/unitTests/cpp/String.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,29 @@ String test_push_front(String x) {
x.push_front("abc");
return x;
}

// [[Rcpp::export]]
String test_String_encoding(String x) {
return x.get_encoding();
}

// [[Rcpp::export]]
String test_String_set_encoding(String x) {
x.set_encoding("UTF-8");
return x;
}

// [[Rcpp::export]]
String test_String_ctor_encoding(String x) {
String y(x);
y.set_encoding("UTF-8");
return y;
}


// [[Rcpp::export]]
String test_String_ctor_encoding2() {
String y("å");
y.set_encoding("UTF-8");
return y;
}
12 changes: 12 additions & 0 deletions inst/unitTests/runit.String.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,16 @@ if (.runThisTest) {
res <- test_push_front("def")
checkIdentical(res, "abcdef")
}

test.String.encoding <- function() {
a <- b <- "å"
Encoding(a) <- "unknown"
Encoding(b) <- "UTF-8"
checkEquals(test_String_encoding(a), "unknown")
checkEquals(test_String_encoding(b), "UTF-8")
checkEquals(Encoding(test_String_set_encoding(a)), "UTF-8")
checkEquals(Encoding(test_String_ctor_encoding(a)), "UTF-8")
checkEquals(Encoding(test_String_ctor_encoding2()), "UTF-8")
}

}