Skip to content

Commit

Permalink
fwrite > 1e6 columns fixed stack overflow segfault by removing VLAs a…
Browse files Browse the repository at this point in the history
…t C level. Closes #1903. #1664.
  • Loading branch information
mattdowle committed Nov 11, 2016
1 parent 9162592 commit 4fb148f
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 29 deletions.
9 changes: 5 additions & 4 deletions NEWS.md
Expand Up @@ -11,11 +11,12 @@

#### NEW FEATURES

1. `fwrite` - *parallel file writer*:
1. `fwrite` - *parallel .csv writer*:
* Thanks to Otto Seiskari for the initial pull request [#580](https://github.com/Rdatatable/data.table/issues/580) that provided C code, R wrapper, manual page and extensive tests.
* From there Matt parallelized and specialized C functions for writing integer/numeric values. See [this blog post](http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/) for implementation details and benchmarks.
* Caught in development before release to CRAN: thanks to Francesco Grossetti for [#1725](https://github.com/Rdatatable/data.table/issues/1725) (NA handling) and Torsten Betz for [#1847](https://github.com/Rdatatable/data.table/issues/1847) (rounding of 9.999999999999998).
* `fwrite` status is being tracked here: [#1664](https://github.com/Rdatatable/data.table/issues/1664)
* From there Matt parallelized and specialized C functions for writing integer/numeric exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, dates (between 0000-03-01 and 9999-12-31), times down to microseconds in POSIXct, automatic quoting, `bit64::integer64`, \code{row.names} and \code{sep2} for \code{list} columns where each cell can itself be a vector. See [this blog post](http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/) for implementation details and benchmarks.
* Accepts any `list` of same length vectors; e.g. `data.frame` and `data.table`.
* Caught in development before release to CRAN: thanks to Francesco Grossetti for [#1725](https://github.com/Rdatatable/data.table/issues/1725) (NA handling), Torsten Betz for [#1847](https://github.com/Rdatatable/data.table/issues/1847) (rounding of 9.999999999999998) and @ambils for [#1903](https://github.com/Rdatatable/data.table/issues/1903) (> 1 million columns).
* `fwrite` status was tracked here: [#1664](https://github.com/Rdatatable/data.table/issues/1664)

2. `fread()`:
* gains `quote` argument. `quote = ""` disables quoting altogether which reads each field *as is*, [#1367](https://github.com/Rdatatable/data.table/issues/1367). Thanks @manimal.
Expand Down
9 changes: 9 additions & 0 deletions inst/tests/tests.Rraw
Expand Up @@ -9700,6 +9700,15 @@ options(old)
# check that extra digits made it into output
test(1741.6, sum(nchar(x1)) < sum(nchar(x2)) && sum(nchar(x2)) < sum(nchar(x3)))

# > 1e6 columns (there used to be VLAs at C level that caused stack overflow), #1903
set.seed(1)
L = lapply(1:1e6, sample, x=100, size=2)
x = capture.output(fwrite(L))
test(1742.1, nchar(x), c(2919861L, 2919774L)) # tests 2 very long lines, too
test(1742.2, substring(x,1,10), c("27,58,21,9","38,91,90,6"))
test(1742.3, L[[1L]], c(27L,38L))
test(1742.4, L[[1000000L]], c(76L, 40L))
test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40"))


##########################
Expand Down
64 changes: 39 additions & 25 deletions src/fwrite.c
Expand Up @@ -35,6 +35,7 @@ static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd),
#define ET_ITIME 2
#define ET_INT64 3
#define ET_POSIXCT 4
#define ET_FACTOR 5

static inline void writeInteger(long long x, char **thisCh)
{
Expand Down Expand Up @@ -528,23 +529,44 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g.
int firstListColumn = 0;
clock_t t0=clock();

// Allocate a shallow copy place-holder. Solely for when dateTimeAs="write.csv" to hold
// the results of as.character(POSIXct). But saves switches to do it always.
SEXP DF = PROTECT(allocVector(VECSXP, ncol));
SEXP DF = DFin;
int protecti = 0;
if (dateTimeAs == DATETIMEAS_WRITECSV) {
int j=0; while(j<ncol && !INHERITS(VECTOR_ELT(DFin,j), char_POSIXct)) j++;
if (j<ncol) {
// dateTimeAs=="write.csv" && there exist some POSIXct columns; coerce them
DF = PROTECT(allocVector(VECSXP, ncol));
protecti++;
// potentially large if ncol=1e6 as reported in #1903 where using large VLA caused stack overflow
SEXP s = PROTECT(allocList(2));
SET_TYPEOF(s, LANGSXP);
SETCAR(s, install("format.POSIXct"));
for (int j=0; j<ncol; j++) {
SEXP column = VECTOR_ELT(DFin, j);
if (INHERITS(column, char_POSIXct)) {
SETCAR(CDR(s), column);
SET_VECTOR_ELT(DF, j, eval(s, R_GlobalEnv));
} else {
SET_VECTOR_ELT(DF, j, column);
}
}
UNPROTECT(1); // s, not DF
}
}

// Store column type tests in lookups for efficiency
int sameType = TYPEOF(VECTOR_ELT(DFin, 0)); // to avoid deep switch later
SEXP levels[ncol]; // VLA. NULL means not-factor too, else the levels
int extraType[ncol]; // ET_INT64, ET_ITIME, ET_DATE, ET_POSIXCT

// Store column type tests in lookup for efficiency
// ET_INT64, ET_ITIME, ET_DATE, ET_POSIXCT, ET_FACTOR
char *extraType = (char *)R_alloc(ncol, sizeof(char)); // not a VLA as ncol could be > 1e6 columns

for (int j=0; j<ncol; j++) {
SEXP column = VECTOR_ELT(DFin, j);
SET_VECTOR_ELT(DF, j, column);
SEXP column = VECTOR_ELT(DF, j);
if (nrow != length(column))
error("Column %d's length (%d) is not the same as column 1's length (%d)", j+1, length(column), nrow);
levels[j] = NULL;
extraType[j] = 0;
if (isFactor(column)) {
levels[j] = getAttrib(column, R_LevelsSymbol);
extraType[j] = ET_FACTOR;
} else if (INHERITS(column, char_integer64)) {
if (TYPEOF(column)!=REALSXP) error("Column %d inherits from 'integer64' but is type '%s' not REALSXP", j+1, type2char(TYPEOF(column)));
extraType[j] = ET_INT64;
Expand All @@ -553,16 +575,8 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g.
} else if (INHERITS(column, char_Date)) { // including IDate which inherits from Date
extraType[j] = ET_DATE;
} else if (INHERITS(column, char_POSIXct)) {
if (dateTimeAs == DATETIMEAS_WRITECSV) {
SEXP s = PROTECT(allocList(2));
SET_TYPEOF(s, LANGSXP);
SETCAR(s, install("format.POSIXct"));
SETCAR(CDR(s), column);
SET_VECTOR_ELT(DF, j, column = eval(s, R_GlobalEnv));
UNPROTECT(1);
} else {
extraType[j] = ET_POSIXCT;
}
if (dateTimeAs==DATETIMEAS_WRITECSV) error("Internal error: column should have already been coerced to character");
extraType[j] = ET_POSIXCT;
}
if (TYPEOF(column)!=sameType || getAttrib(column,R_ClassSymbol)!=R_NilValue) {
sameType = 0; // only all plain INTSXP or all plain REALSXP save the deep switch() below.
Expand Down Expand Up @@ -616,7 +630,7 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g.
case INTSXP: {
int i32 = INTEGER(column)[i];
if (i32 == NA_INTEGER) ch += na_len;
else if (levels[j] != NULL) ch += LENGTH(STRING_ELT(levels[j], i32-1));
else if (extraType[j] == ET_FACTOR) ch += LENGTH(STRING_ELT(getAttrib(column,R_LevelsSymbol), i32-1));
else if (extraType[j] == ET_ITIME) ch += 8;
else if (extraType[j] == ET_DATE) writeDate(i32, &ch);
else writeInteger(i32, &ch); }
Expand Down Expand Up @@ -761,7 +775,7 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g.
if (nrow == 0) {
if (verbose) Rprintf("No data rows present (nrow==0)\n");
if (f!=-1 && CLOSE(f)) error("%s: '%s'", strerror(errno), filename);
UNPROTECT(1); // DF
UNPROTECT(protecti);
return(R_NilValue);
}

Expand Down Expand Up @@ -888,8 +902,8 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g.
case INTSXP:
if (INTEGER(column)[i] == NA_INTEGER) {
writeChars(na, &ch);
} else if (levels[j] != NULL) { // isFactor(column) == TRUE
writeString(STRING_ELT(levels[j], INTEGER(column)[i]-1), &ch);
} else if (extraType[j] == ET_FACTOR) {
writeString(STRING_ELT(getAttrib(column,R_LevelsSymbol), INTEGER(column)[i]-1), &ch);
} else if (extraType[j] == ET_ITIME) {
writeITime(INTEGER(column)[i], &ch);
} else if (extraType[j] == ET_DATE) {
Expand Down Expand Up @@ -1119,7 +1133,7 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g.
}
if (verbose) Rprintf("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n",
nth, anyBufferGrown?"yes":"no", maxBuffUsedPC);
UNPROTECT(1); // DF; the shallow copy of DFin just for dateTimeAs="write.csv"
UNPROTECT(protecti);
return(R_NilValue);
}

Expand Down

0 comments on commit 4fb148f

Please sign in to comment.