Skip to content

Commit

Permalink
moved yaml write to C level as Philippe suggested; now supports gzip'…
Browse files Browse the repository at this point in the history
…d yaml and bom when no column names too
  • Loading branch information
mattdowle committed May 23, 2019
1 parent db39ba3 commit 5024343
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 61 deletions.
34 changes: 13 additions & 21 deletions R/fwrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
# validate arguments
if (is.matrix(x)) { # coerce to data.table if input object is matrix
message("x being coerced from class: matrix to data.table")
x <- as.data.table(x)
x = as.data.table(x)
}
stopifnot(is.list(x),
identical(quote,"auto") || isTRUEorFALSE(quote),
Expand All @@ -44,21 +44,20 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
length(compress) == 1L && compress %chin% c("auto", "none", "gzip"),
isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names),
isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01),
isTRUEorFALSE(bom),
isTRUEorFALSE(bom),
length(na) == 1L, #1725, handles NULL or character(0) input
is.character(file) && length(file)==1L && !is.na(file),
length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024,
length(nThread)==1L && !is.na(nThread) && nThread>=1L
)

is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))
is_gzip = compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))

file <- path.expand(file) # "~/foo/bar"
file = path.expand(file) # "~/foo/bar"
if (append && missing(col.names) && (file=="" || file.exists(file))) {
col.names = FALSE # test 1658.16 checks this
bom = FALSE
}
if (bom && !col.names) stop("bom can be TRUE only if col.names is TRUE") # nocov
if (identical(quote,"auto")) quote=NA # logical NA
if (file=="") {
# console output which it seems isn't thread safe on Windows even when one-batch-at-a-time
Expand All @@ -81,12 +80,12 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
}

# process YAML after potentially short-circuiting due to irregularities
if (yaml) {
yaml = if (yaml) {
if (!requireNamespace('yaml', quietly=TRUE))
stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
if (append || is_gzip) {
if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.")
if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.")
if (append && (file=="" || file.exists(file))) {
warning("Ignoring yaml=TRUE because append=TRUE and the file already exists. YAML will only be written to the top of a file.")
""
} else {
schema_vec = sapply(x, class)
# multi-class objects reduced to first class
Expand All @@ -106,20 +105,13 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na,
dec=dec, qmethod=qmethod, logical01=logical01
)
if (bom) {
# writeBin cannot overwrite, so wipe the file
if (file.exists(file)) close(file(file, open='w'))
writeBin(as.raw(c(0xEF, 0xBB, 0xBF)), file)
}
# NB: as.yaml adds trailing newline
cat('---', yaml::as.yaml(yaml_header, line.sep=eol), '---', sep=eol, file=file, append=bom)
bom = FALSE
append = TRUE
paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline
}
}
file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
} else ""
file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
row.names, col.names, logical01, dateTimeAs, buffMB, nThread,
showProgress, is_gzip, bom, verbose)
showProgress, is_gzip, bom, yaml, verbose)
invisible()
}

31 changes: 15 additions & 16 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -6514,13 +6514,13 @@ if (test_xts) {
setcolorder(dt, c(2, 3, 1))
dt[ , char_col := 'a']
test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric')

# 890 -- key argument for as.data.table.xts
x = xts(1:10, as.Date(1:10, origin = "1970-01-01"))
test(1465.18, capture.output(as.data.table(x, key="index")),
c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2",
" 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5",
" 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8",
c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2",
" 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5",
" 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8",
" 9: 1970-01-10 9", "10: 1970-01-11 10"))

Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE)
Expand Down Expand Up @@ -9466,7 +9466,7 @@ test(1658.25, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"')
# integer NA
DT = data.table(A=c(2L,NA,3L), B=c(NA,4:5))
test(1658.26, fwrite(DT), output='A,B\n2,\n,4\n3,5')
test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing column names.*"A","B".*2,NA\nNA,4\n3,5')
test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing bom .false., yaml .0 characters. and column names .true.*"A","B".*2,NA\nNA,4\n3,5')

# wrong argument types
test(1658.28, fwrite(ok_dt, 1), error="is.character\\(file\\).*not TRUE")
Expand Down Expand Up @@ -14448,18 +14448,18 @@ if (test_yaml) { # csvy; #1701
# force eol for platform independence
fwrite(DT, tmp, yaml = TRUE, eol = '\n')
as_read = readLines(tmp)
test(2033.01, as_read[c(1L, 25L)], c('---', '---'))
test(2033.01, as_read[c(1L, 24L)], c('---', '---'))
test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L]))
test(2033.03, grepl('creation_time_utc', as_read[3L]))
test(2033.04, as_read[4:24],
test(2033.04, as_read[4:23],
c("schema:", " fields:", " - name: a", " type: integer",
" - name: b", " type: numeric", " - name: c", " type: character",
"header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''",
# NB: apparently \n is encoded like this in YAML
"eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double",
"logical01: no", ""))
"logical01: no"))
tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e")
test(2033.05, as_read[26:31], tbl_body)
test(2033.05, as_read[25:30], tbl_body)

# windows eol
fwrite(DT, tmp, yaml = TRUE, eol = '\r\n')
Expand All @@ -14480,13 +14480,12 @@ if (test_yaml) { # csvy; #1701
attr(DT2, 'yaml_metadata') = NULL
test(2033.08, all.equal(DT, DT2))

# unsupported operations
test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L],
warning = 'Skipping yaml writing because append = TRUE')
test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body,
warning = 'Skipping yaml writing because is_gzip = TRUE')
# yaml + bom arguments
warning = 'Ignoring yaml=TRUE because append=TRUE and the file already exists. YAML.*only.*top of a file')

# TODO: test gzip'd yaml which is now supported

# yaml + bom arguments
DT <- data.table(l=letters, n=1:26)
fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE)
n_lines = length(readLines(f, warn = FALSE))
Expand Down Expand Up @@ -14808,7 +14807,7 @@ test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural joi
options(datatable.naturaljoin=FALSE)

#tests for adding key to as.data.table, #890
## as.data.table.numeric (should cover as.data.table.factor,
## as.data.table.numeric (should cover as.data.table.factor,
## *.ordered, *.integer, *.logical, *.character, and *.Date since
## all are the same function in as.data.table.R)
nn = c(a=0.1, c=0.2, b=0.3, d=0.4)
Expand Down
47 changes: 23 additions & 24 deletions src/fwrite.c
Original file line number Diff line number Diff line change
Expand Up @@ -533,16 +533,6 @@ void writeCategString(void *col, int64_t row, char **pch)
write_string(getCategString(col, row), pch);
}

void writeBom(char **pch)
{
char *ch = *pch;
*ch++ = 0xEF;
*ch++ = 0xBB;
*ch++ = 0xBF;
*pch = ch;
}


int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceLen)
{
z_stream stream;
Expand Down Expand Up @@ -682,30 +672,39 @@ void fwriteMain(fwriteMainArgs args)
}
}

int yamlLen = strlen(args.yaml);
if (args.verbose) {
DTPRINT("Writing column names ... ");
DTPRINT("Writing bom (%s), yaml (%d characters) and column names (%s) ... ",
args.bom?"true":"false", yamlLen, args.colNames?"true":"false");
if (f==-1) DTPRINT("\n");
}
size_t headerLen = 0;
if (args.bom) headerLen += 3;
headerLen += yamlLen;
if (args.colNames) {
size_t headerLen = 0;
for (int j=0; j<args.ncol; j++) headerLen += getStringLen(args.colNames, j)*2; // *2 in case quotes are escaped or doubled
headerLen += args.ncol*(1/*sep*/+(doQuote!=0)*2) + eolLen + 3; // 3 in case doRowNames and doQuote (the first blank <<"",>> column name)
}
if (headerLen) {
char *buff = malloc(headerLen);
if (!buff) STOP("Unable to allocate %d MiB for header: %s", headerLen / 1024 / 1024, strerror(errno));
char *ch = buff;
if (args.bom)
writeBom(&ch);
if (args.doRowNames) {
// Unusual: the extra blank column name when row_names are added as the first column
if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv
*ch++ = sep;
}
for (int j=0; j<args.ncol; j++) {
writeString(args.colNames, j, &ch);
*ch++ = sep;
if (args.bom) {*ch++=0xEF; *ch++=0xBB; *ch++=0xBF; } // 3 appears above (search for "bom")
memcpy(ch, args.yaml, yamlLen);
ch += yamlLen;
if (args.colNames) {
if (args.doRowNames) {
// Unusual: the extra blank column name when row_names are added as the first column
if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv
*ch++ = sep;
}
for (int j=0; j<args.ncol; j++) {
writeString(args.colNames, j, &ch);
*ch++ = sep;
}
ch--; // backup over the last sep
write_chars(args.eol, &ch);
}
ch--; // backup over the last sep
write_chars(args.eol, &ch);
if (f==-1) {
*ch = '\0';
DTPRINT(buff);
Expand Down
1 change: 1 addition & 0 deletions src/fwrite.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ typedef struct fwriteMainArgs
bool showProgress;
bool is_gzip;
bool bom;
const char *yaml;
bool verbose;
} fwriteMainArgs;

Expand Down
2 changes: 2 additions & 0 deletions src/fwriteR.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,15 @@ SEXP fwriteR(
SEXP showProgress_Arg,
SEXP is_gzip_Arg,
SEXP bom_Arg,
SEXP yaml_Arg,
SEXP verbose_Arg
)
{
if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table");
fwriteMainArgs args;
args.is_gzip = LOGICAL(is_gzip_Arg)[0];
args.bom = LOGICAL(bom_Arg)[0];
args.yaml = CHAR(STRING_ELT(yaml_Arg, 0));
args.verbose = LOGICAL(verbose_Arg)[0];
args.filename = CHAR(STRING_ELT(filename_Arg, 0));
args.ncol = length(DF);
Expand Down

0 comments on commit 5024343

Please sign in to comment.