Skip to content

Commit

Permalink
fwrite gzip on Solaris 10 with zlib 1.2.8 (#3965)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattdowle committed Oct 14, 2019
1 parent 4e3740b commit b2d618c
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 31 deletions.
2 changes: 2 additions & 0 deletions .dev/CRAN_Release.cmd
Expand Up @@ -16,6 +16,8 @@ rm ./src/*.so
rm ./src/*.o
rm -rf ./data.table.Rcheck

checkbashisms ./configure # for portability; e.g. Solaris 10 running Bourne shell; #3964

# Ensure no non-ASCII, other than in README.md is ok
# tests.Rraw in particular have failed CRAN Solaris (only) due to this.
grep -RI --exclude-dir=".git" --exclude="*.md" --exclude="*~" --color='auto' -P -n "[\x80-\xFF]" ./
Expand Down
2 changes: 1 addition & 1 deletion NEWS.md
Expand Up @@ -12,7 +12,7 @@

2. Compilation failed on CRAN's MacOS due to an older version of `zlib.h/zconf.h` which did not have `z_const` defined, [#3939](https://github.com/Rdatatable/data.table/issues/3939). Other open-source projects unrelated to R have experienced this problem on MacOS too. We have followed the common practice of removing `z_const` to support the older `zlib` versions, and data.table's release procedures have gained a `grep` to ensure `z_const` isn't used again by accident in future. The library `zlib` is used for `fwrite`'s new feature of multithreaded compression on-the-fly; see item 3 of 1.12.4 below.

3. An error, again in `fwrite`'s compression, but only observed so far on Solaris 32bit has been fixed, [#3931](https://github.com/Rdatatable/data.table/issues/3931): `Error -2: one or more threads failed to allocate buffers or there was a compression error.` In case it happens again, this area has been made more robust and the error more detailed.
3. A runtime error in `fwrite`'s compression, but only observed so far on Solaris 10 32bit with zlib 1.2.8 (Apr 2013), [#3931](https://github.com/Rdatatable/data.table/issues/3931): `Error -2: one or more threads failed to allocate buffers or there was a compression error.` In case it happens again, this area has been made more robust and the error more detailed. As is often the case, investigating the Solaris problem revealed secondary issues in the same area of the code. In this case, some `%d` in verbose output should have been `%lld`. This obliquity that CRAN's Solaris provides is greatly appreciated.

4. A leak could occur in the event of an unsupported column type error, or if working memory cannot all be allocated; [#3940](https://github.com/Rdatatable/data.table/issues/3940). Found thanks to `clang`'s Leak Sanitizer (prompted by CRAN's diligent use of latest tools), and two tests in the test suite which tested the unsupported type error.

Expand Down
22 changes: 15 additions & 7 deletions configure
Expand Up @@ -2,17 +2,20 @@
# Let's keep this simple. If pkg-config is available, use it. Otherwise print
# the helpful message to aid user if compilation does fail. Note 25 of R-exts:
# "[pkg-config] is available on the machines used to produce the CRAN binary packages"
# This script should pass `checkbashisms` for portability; e.g. CRAN's Solaris 10

if ! hash pkg-config 2>/dev/null; then
pkg-config --version >/dev/null 2>&1
if [ $? -ne 0 ]; then
echo "*** pkg-config is not installed. It is used to check that zlib is installed. Compilation"
echo "*** will now be attempted and if it works you can ignore this message. If it fails and"
echo "*** will now be attempted and if it works you can ignore this message. If compilation fails"
echo "*** and you cannot install pkg-config, try: locate zlib.h zconf.h. If they are not found"
echo "*** then try installing zlib1g-dev (Debian/Ubuntu), zlib-devel (Fedora) or zlib (OSX)"
exit 0 # now that the advice has been printed, exit with success and continue
fi

if ! `pkg-config --exists zlib`; then
echo "pkg-config did not detect zlib is installed. Try installing:"
pkg-config --exists zlib
if [ $? -ne 0 ]; then
echo "pkg-config did not detect zlib is installed. Please install it:"
echo "* deb: zlib1g-dev (Debian, Ubuntu, ...)"
echo "* rpm: zlib-devel (Fedora, EPEL, ...)"
echo "* brew: zlib (OSX)"
Expand All @@ -23,9 +26,14 @@ version=`pkg-config --modversion zlib`
echo "zlib ${version} is available ok"

lib=`pkg-config --libs zlib`
if test $lib != "-lz"; then
echo "zlib is linked using ${lib} not the expected standard -lz. Please report to data.table issue tracker on GitHub."
exit 1
expr "$lib" : ".*-lz$" >/dev/null
if [ $? -ne 0 ]; then
expr "$lib" : ".*-lz " >/dev/null
# would use \b in one expr but MacOS does not support \b
if [ $? -ne 0 ]; then
echo "zlib is linked using ${lib} which does not include the standard -lz. Please report to data.table issue tracker on GitHub."
exit 1
fi
fi

exit 0
Expand Down
7 changes: 4 additions & 3 deletions inst/tests/tests.Rraw
Expand Up @@ -9639,9 +9639,10 @@ test(1658.40, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c(
# fwrite compress
test(1658.41, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console
DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25))
fwrite(DT, file=f1<-tempfile(fileext=".gz"))
fwrite(DT, file=f2<-tempfile())
test(1658.42, file.info(f1)$size < file.info(f2)$size) # 74 < 804 (file.size() isn't available in R 3.1.0)
test(1658.421, fwrite(DT, file=f1<-tempfile(fileext=".gz"), verbose=TRUE), NULL,
output="args.nrow=200 args.ncol=2.*maxLineLen=5[12].*Writing 200 rows in 1 batches of 200 rows.*nth=1") # [12] for Windows where eolLen==2
test(1658.422, fwrite(DT, file=f2<-tempfile()), NULL)
test(1658.423, file.info(f1)$size < file.info(f2)$size) # 74 < 804 (file.size() isn't available in R 3.1.0)
if (test_R.utils) test(1658.43, fread(f1), DT) # use fread to decompress gz (works cross-platform)
fwrite(DT, file=f3<-tempfile(), compress="gzip") # compress to filename not ending .gz
test(1658.44, file.info(f3)$size, file.info(f1)$size)
Expand Down
44 changes: 24 additions & 20 deletions src/fwrite.c
Expand Up @@ -551,6 +551,7 @@ void writeCategString(void *col, int64_t row, char **pch)
}

int init_stream(z_stream *stream) {
stream->next_in = Z_NULL;
stream->zalloc = Z_NULL;
stream->zfree = Z_NULL;
stream->opaque = Z_NULL;
Expand Down Expand Up @@ -615,8 +616,8 @@ void fwriteMain(fwriteMainArgs args)
DTPRINT("... ");
for (int j=args.ncol-10; j<args.ncol; j++) DTPRINT("%d ", args.whichFun[j]);
}
DTPRINT("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%d args.ncol=%d eolLen=%d\n",
args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol, eolLen);
DTPRINT("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%lld args.ncol=%d eolLen=%d\n",
args.doRowNames, args.rowNames, doQuote, (long long)args.nrow, args.ncol, eolLen);
}

// Calculate upper bound for line length. Numbers use a fixed maximum (e.g. 12 for integer) while strings find the longest
Expand Down Expand Up @@ -734,7 +735,7 @@ void fwriteMain(fwriteMainArgs args)
STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); // # nocov
}
size_t zbuffUsed = zbuffSize;
ret1 = compressbuff(&stream, zbuff, &zbuffUsed, buff, (int)(ch-buff));
ret1 = compressbuff(&stream, zbuff, &zbuffUsed, buff, (size_t)(ch-buff));
if (ret1==Z_OK) ret2 = WRITE(f, zbuff, (int)zbuffUsed);
deflateEnd(&stream);
free(zbuff);
Expand Down Expand Up @@ -772,9 +773,8 @@ void fwriteMain(fwriteMainArgs args)
int nth = args.nth;
if (numBatches < nth) nth = numBatches;
if (args.verbose) {
DTPRINT("Writing %d rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d) ... ",
args.nrow, numBatches, rowsPerBatch, args.buffMB, args.showProgress, nth);
if (f==-1) DTPRINT("\n");
DTPRINT("Writing %lld rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d)\n",
(long long)args.nrow, numBatches, rowsPerBatch, args.buffMB, args.showProgress, nth);
}
t0 = wallclock();

Expand Down Expand Up @@ -813,6 +813,7 @@ void fwriteMain(fwriteMainArgs args)

bool failed = false; // naked (unprotected by atomic) write to bool ok because only ever write true in this special paradigm
int failed_compress = 0; // the first thread to fail writes their reason here when they first get to ordered section
char failed_msg[1001] = ""; // to hold zlib's msg; copied out of zlib in ordered section just in case the msg is allocated within zlib
int failed_write = 0; // same. could use +ve and -ve in the same code but separate it out to trace Solaris problem, #3931

#pragma omp parallel num_threads(nth)
Expand All @@ -827,7 +828,7 @@ void fwriteMain(fwriteMainArgs args)
z_stream mystream;
if (args.is_gzip) {
myzBuff = zbuffPool + me*zbuffSize;
if (init_stream(&mystream)) {
if (init_stream(&mystream)) { // this should be thread safe according to zlib documentation
failed = true; // # nocov
my_failed_compress = -998; // # nocov
}
Expand Down Expand Up @@ -862,28 +863,29 @@ void fwriteMain(fwriteMainArgs args)
// compress buffer if gzip
if (args.is_gzip && !failed) {
myzbuffUsed = zbuffSize;
int ret = compressbuff(&mystream, myzBuff, &myzbuffUsed, myBuff, (int)(ch-myBuff));
int ret = compressbuff(&mystream, myzBuff, &myzbuffUsed, myBuff, (size_t)(ch-myBuff));
if (ret) { failed=true; my_failed_compress=ret; }
else deflateReset(&mystream);
}
#pragma omp ordered
{
if (failed) {
if (failed_compress==0 && my_failed_compress!=0) failed_compress = my_failed_compress; // # nocov
// # nocov start
if (failed_compress==0 && my_failed_compress!=0) {
failed_compress = my_failed_compress;
if (mystream.msg!=NULL) strncpy(failed_msg, mystream.msg, 1000); // copy zlib's msg for safe use after deflateEnd just in case zlib allocated the message
}
// else another thread could have failed below while I was working or waiting above; their reason got here first
// # nocov end
} else {
errno=0;
if (f==-1) {
*ch='\0'; // standard C string end marker so DTPRINT knows where to stop
DTPRINT(myBuff);
} else if (args.is_gzip) {
if (WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) {
failed=true; // # nocov
failed_write=errno; // # nocov
}
} else if (WRITE(f, myBuff, (int)(ch - myBuff)) == -1) {
failed=true; // # nocov
failed_write=errno; // # nocov
} else if ((args.is_gzip ? WRITE(f, myzBuff, (int)myzbuffUsed)
: WRITE(f, myBuff, (int)(ch-myBuff))) == -1) {
failed=true; // # nocov
failed_write=errno; // # nocov
}

int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB
Expand All @@ -898,9 +900,9 @@ void fwriteMain(fwriteMainArgs args)
int ETA = (int)((args.nrow-end)*((now-startTime)/end));
if (hasPrinted || ETA >= 2) {
if (args.verbose && !hasPrinted) DTPRINT("\n");
DTPRINT("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. "
DTPRINT("\rWritten %.1f%% of %lld rows in %d secs using %d thread%s. "
"maxBuffUsed=%d%%. ETA %d secs. ",
(100.0*end)/args.nrow, args.nrow, (int)(now-startTime), nth, nth==1?"":"s",
(100.0*end)/args.nrow, (long long)args.nrow, (int)(now-startTime), nth, nth==1?"":"s",
maxBuffUsedPC, ETA);
// TODO: use progress() as in fread
nextTime = now+1;
Expand Down Expand Up @@ -956,7 +958,9 @@ void fwriteMain(fwriteMainArgs args)
if (failed) {
// # nocov start
if (failed_compress)
STOP("Error %d: compression error. Please retry with verbose=TRUE and search online for this error message.\n", failed_compress);
STOP("zlib v%s deflate() returned error %d with z_stream.msg '%s'. %s\n", zlibVersion(), failed_compress, failed_msg,
args.verbose ? "Please include the full output above in your data.table bug report."
: "Please retry fwrite() with verbose=TRUE and include the full output with your data.table bug report.");
if (failed_write)
STOP("%s: '%s'", strerror(failed_write), args.filename);
// # nocov end
Expand Down

0 comments on commit b2d618c

Please sign in to comment.