Closes #1701 -- fread support for csvy format

Rdatatable · Feb 10, 2019 · acf13b6 · acf13b6
1 parent 5cd3a46
commit acf13b6
Show file tree

Hide file tree

Showing 14 changed files with 441 additions and 7 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -15,7 +15,7 @@ Authors@R: c(
   person("Scott","Ritchie",   role="ctb"))
 Depends: R (>= 3.1.0)
 Imports: methods
-Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo
+Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo, yaml
 Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development.
 License: MPL-2.0 | file LICENSE
 URL: http://r-datatable.com

diff --git a/NEWS.md b/NEWS.md
@@ -6,6 +6,8 @@
 
 1. `:=` no longer recycles length>1 RHS vectors. There was a warning when recycling left a remainder but no warning when the LHS length was an exact multiple of the RHS length (the same behaviour as base R). Consistent feedback for several years has been that recycling is more often a bug. In rare cases where you need to recycle a length>1 vector, use `rep()` explicitly. Single values are still recycled silently as before. Early warning was given in [this tweet](https://twitter.com/MattDowle/status/1088544083499311104). The 758 CRAN and Bioconductor packages using data.table were tested and the maintainers of the 16 packages affected (2%) were consulted before going ahead, [#3310](https://github.com/Rdatatable/data.table/pull/3310).
 
+2. `fread` gains argument `yaml` (default `FALSE`) and the ability to parse CSVY-formatted input files (i.e., csv files with metadata in a header formatted as YAML; see http://csvy.org/), [#1701](https://github.com/Rdatatable/data.table/issues/1701). See `?fread` and files in `/inst/tests/csvy/` for sample formats; please provide feedback if you find this feature useful and would like extended capabilities. For now, consider it experimental, meaning the API/arguments may change. Thanks to @leeper at [`rio`](https://github.com/leeper/rio) for the inspiration and @MichaelChirico for implementing.
+
 #### BUG FIXES
 
 1. `rbindlist()` of a malformed factor missing levels attribute is now a helpful error rather than a cryptic error about `STRING_ELT`, [#3315](https://github.com/Rdatatable/data.table/issues/3315). Thanks to Michael Chirico for reporting.

diff --git a/R/fread.R b/R/fread.R
@@ -1,5 +1,5 @@
 
-fread <- function(input="",file=NULL,text=NULL,cmd=NULL,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings=getOption("datatable.na.strings","NA"),stringsAsFactors=FALSE,verbose=getOption("datatable.verbose",FALSE),skip="__auto__",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01", FALSE), autostart=NA)
+fread <- function(input="",file=NULL,text=NULL,cmd=NULL,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings=getOption("datatable.na.strings","NA"),stringsAsFactors=FALSE,verbose=getOption("datatable.verbose",FALSE),skip="__auto__",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01", FALSE), yaml=FALSE, autostart=NA)
 {
   if (missing(input)+is.null(file)+is.null(text)+is.null(cmd) < 3L) stop("Used more than one of the arguments input=, file=, text= and cmd=.")
   input_has_vars = length(all.vars(substitute(input)))>0L  # see news for v1.11.6
@@ -18,7 +18,8 @@ fread <- function(input="",file=NULL,text=NULL,cmd=NULL,sep="auto",sep2="auto",d
   isTrueFalse = function(x) isTRUE(x) || identical(FALSE, x)
   isTrueFalseNA = function(x) isTRUE(x) || identical(FALSE, x) || identical(NA, x)
   stopifnot( isTrueFalse(strip.white), isTrueFalse(blank.lines.skip), isTrueFalse(fill), isTrueFalse(showProgress),
-             isTrueFalse(stringsAsFactors), isTrueFalse(verbose), isTrueFalse(check.names), isTrueFalse(logical01) )
+             isTrueFalse(stringsAsFactors), isTrueFalse(verbose), isTrueFalse(check.names), isTrueFalse(logical01),
+             isTrueFalse(yaml))
   stopifnot( is.numeric(nrows), length(nrows)==1L )
   if (is.na(nrows) || nrows<0) nrows=Inf   # accept -1 to mean Inf, as read.table does
   if (identical(header,"auto")) header=NA
@@ -124,8 +125,13 @@ fread <- function(input="",file=NULL,text=NULL,cmd=NULL,sep="auto",sep2="auto",d
     }
   }
   stopifnot(length(skip)==1L, !is.na(skip), is.character(skip) || is.numeric(skip))
-  if (skip=="__auto__") skip=-1L   # skip="string" so long as "string" is not "__auto__". Best conveys to user something is automatic there (than -1 or NA).
-  if (is.double(skip)) skip = as.integer(skip)
+  if (skip=="__auto__") {
+    if (yaml) {
+      skip=0L
+    } else {
+      skip=-1L   # skip="string" so long as "string" is not "__auto__". Best conveys to user something is automatic there (than -1 or NA).
+    }
+  } else if (is.double(skip)) skip = as.integer(skip)
   stopifnot(is.null(na.strings) || is.character(na.strings))
   tt = grep("^\\s+$", na.strings)
   if (length(tt)) {
@@ -142,6 +148,138 @@ fread <- function(input="",file=NULL,text=NULL,cmd=NULL,sep="auto",sep2="auto",d
     }
     # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804
   }
+  if (yaml) {
+    if (!requireNamespace('yaml', quietly = TRUE))
+      stop("'data.table' relies on the package 'yaml' to ",
+           "parse the file header; please add this to your ",
+           "library with install.packages('yaml') and try again.")
+    if (is.character(skip))
+      warning("Combining a search string as 'skip' and reading a ",
+              "YAML header may not work as expected -- currently, ",
+              "reading will proceed to search for 'skip' from ",
+              "the beginning of the file, NOT from the end of ",
+              "the metadata; please file an issue on GitHub if ",
+              "you'd like to see more intuitive behavior supported.")
+    # create connection to stream header lines from file:
+    #   https://stackoverflow.com/questions/9871307
+    f = base::file(input, 'r')
+    first_line = readLines(f, n=1L)
+    n_read = 1L
+    yaml_border_re = '^#?---'
+    # Validate that the first line opens a YAML header.
+    # readLines() yields character(0) for an empty input, and if() cannot take a
+    #   zero-length condition, so the length is tested before the regex is applied.
+    if (!length(first_line) || !grepl(yaml_border_re, first_line)) {
+      close(f)
+      # normalise so substring()/nchar() below still produce length-1 results
+      if (!length(first_line)) first_line = ''
+      # 'skip' may be a character search string at this point (permitted, with a
+      #   warning, earlier in this branch); arithmetic on it would raise an
+      #   unrelated error and mask the message below, so only add when numeric.
+      line_no = if (is.numeric(skip)) 1L+skip else 1L
+      stop('Encountered <', substring(first_line, 1L, 50L),
+           if (nchar(first_line) > 50L) '...', '> at the first ',
+           'unskipped line (', line_no, '), which does not ',
+           'constitute the start to a valid YAML header ',
+           '(expecting something matching regex "', yaml_border_re,
+           '"); please check your input and try again.')
+    }
+
+    yaml_comment_re = '^#'
+    yaml_string = character(0L)
+    while (TRUE) {
+      this_line = readLines(f, n=1L)
+      n_read = n_read + 1L
+      if (!length(this_line)){
+        close(f)
+        stop('Reached the end of the file before finding ',
+             'a completion to the YAML header. A valid ',
+             'YAML header is bookended by lines matching ',
+             'the regex "', yaml_border_re, '". Please ',
+             'double check the input file is a valid csvy.')
+      }
+      if (grepl(yaml_border_re, this_line)) break
+      if (grepl(yaml_comment_re, this_line))
+        this_line = sub(yaml_comment_re, '', this_line)
+      yaml_string = paste(yaml_string, this_line, sep='\n')
+    }
+    close(f) # when #561 is implemented, no need to close f.
+
+    yaml_header = yaml::yaml.load(yaml_string)
+    yaml_names = names(yaml_header)
+    if (verbose) cat('Processed', n_read, 'lines of YAML',
+                     'metadata with the following top-level fields:',
+                     brackify(yaml_names), '\n')
+    if ('fields' %chin% yaml_names) {
+      new_types = sapply(yaml_header$fields, `[[`, 'type')
+      if (any(null_idx <- sapply(new_types, is.null)))
+        new_types = do.call(c, new_types)
+      synonms = data.table(
+        r_type = rep(c('character', 'integer', 'numeric'),
+                     c(2L, 2L, 3L)),
+        syn = c('character', 'string', 'integer', 'int',
+                'numeric', 'number', 'double'),
+        key = 'syn'
+      )
+      new_types = synonms[list(new_types)]$r_type
+      new_names = sapply(yaml_header$fields[!null_idx], `[[`, 'name')
+
+      # resolve any conflicts with colClasses, if supplied;
+      #   colClasses (if present) is already in list form by now
+      if (!is.null(colClasses)) {
+        if (any(idx_name <- new_names %chin% unlist(colClasses))) {
+          matched_name_idx = which(idx_name)
+          # idx_type[k] is TRUE when the user's colClasses assigns the k-th matched
+          #   column to the same type the YAML header implies; FALSE marks a conflict.
+          if (!all(idx_type <- sapply(matched_name_idx, function(ii) {
+            new_names[ii] %chin% colClasses[[ new_types[ii] ]]
+          }))) {
+            # pluralise on the number of *conflicting* columns (the ones listed in
+            #   the warning), not on the number of columns that agree
+            plural = sum(!idx_type) > 1L
+            warning('colClasses dictated by user input and ',
+                    'those read from YAML header are in conflict ',
+                    '(specifically, for column', if (plural) 's',
+                    ' [', paste(new_names[matched_name_idx[!idx_type]],
+                                collapse = ','),
+                    ']); the proceeding assumes the user input was ',
+                    'an intentional override and will ignore the types ',
+                    'implied by the YAML header; please exclude ',
+                    if (plural) 'these columns' else 'this column',
+                    ' from colClasses if this was unintentional.')
+          }
+        }
+        # only add unmentioned columns
+        for (ii in which(!idx_name)) {
+          colClasses[[ new_types[ii] ]] =
+            c(colClasses[[ new_types[ii] ]], new_names[ii])
+        }
+      } else colClasses = tapply(new_names, new_types, c, simplify=FALSE)
+      if (!missing(col.names)) {
+        warning("User-supplied column names in 'col.names' will ",
+                "override those found in YAML metadata.")
+      }
+    }
+    sep_syn = c('sep', 'delimiter')
+    if (any(sep_idx <- sep_syn %chin% yaml_names)) {
+      if (sep != 'auto')
+        warning("User-supplied 'sep' will override that found in metadata.")
+      else sep = yaml_header[[ sep_syn[sep_idx][1L] ]]
+    }
+    if ('header' %chin% yaml_names) {
+      if (!is.na(header))
+        warning("User-supplied 'header' will override that found in metadata.")
+      else header = as.logical(yaml_header$header)
+    }
+    quote_syn = c('quote', 'quoteChar', 'quote_char')
+    if (any(quote_idx <- quote_syn %chin% yaml_names)) {
+      # won't catch if user explicitly set quote='"'
+      if (quote != '"')
+        warning("User-supplied 'quote' will override that found in metadata.")
+      else quote = yaml_header[[ quote_syn[quote_idx][1L] ]]
+    }
+    dec_syn = c('dec', 'decimal')
+    if (any(dec_idx <- dec_syn %chin% yaml_names)) {
+      # won't catch if user explicitly set dec="."
+      if (dec != '.')
+        warning("User-supplied 'dec' will override that found in metadata.")
+      else dec = yaml_header[[ dec_syn[dec_idx][1L] ]]
+    }
+    # Take na.strings from the YAML metadata unless the caller supplied their own.
+    if ('na.strings' %chin% yaml_names) {
+      # won't catch if user explicitly set na.strings="NA"
+      # identical() rather than != because na.strings may be NULL or have
+      #   length > 1, and neither yields a valid length-1 condition for if()
+      if (!identical(na.strings, 'NA'))
+        warning("User-supplied 'na.strings' will override that found in metadata.")
+      else na.strings = yaml_header$na.strings
+    }
+    if (is.integer(skip)) skip = skip + n_read
+  }
   warnings2errors = getOption("warn") >= 2
   ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
               fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding)
@@ -191,6 +329,7 @@ fread <- function(input="",file=NULL,text=NULL,cmd=NULL,sep="auto",sep2="auto",d
     }
     setkeyv(ans, key)
   }
+  if (yaml) setattr(ans, 'yaml_metadata', yaml_header)
   if (!is.null(index) && data.table) {
     if (!all(sapply(index, is.character)))
       stop("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")

diff --git a/inst/tests/csvy/test.csvy b/inst/tests/csvy/test.csvy
@@ -0,0 +1,20 @@
+---
+name: my-dataset
+source: https://github.com/leeper/csvy/tree/master/inst/examples
+fields:
+  - name: var1
+    title: variable 1
+    type: string
+    description: explaining var1
+    constraints:
+      - required: true
+  - name: var2
+    title: variable 2
+    type: integer
+  - name: var3
+    title: variable 3
+    type: number
+---
+var1,var2,var3
+A,1,2.5
+B,3,4.3
diff --git a/inst/tests/csvy/test_attr_wrong.csvy b/inst/tests/csvy/test_attr_wrong.csvy
@@ -0,0 +1,24 @@
+---
+name:
+fields:
+  - name: var1
+    title: variable 1
+    type: string
+    description: a single-quoted character variable
+  - name: var2
+    title: variable 2
+    type: integer
+  - name: var3
+    title: variable 3
+    type: number
+    description: European-style numeric
+header: true
+sep: "-"
+dec: ","
+quote: "'"
+na.strings: '@'
+---
+var1|var2|var3
+'A'|1|2,5
+'B'|@|4,3
+
diff --git a/inst/tests/csvy/test_attributes.csvy b/inst/tests/csvy/test_attributes.csvy
@@ -0,0 +1,24 @@
+---
+name:
+fields:
+  - name: var1
+    title: variable 1
+    type: string
+    description: a single-quoted character variable
+  - name: var2
+    title: variable 2
+    type: integer
+  - name: var3
+    title: variable 3
+    type: number
+    description: European-style numeric
+header: true
+sep: "|"
+dec: ","
+quote: "'"
+na.strings: '@'
+---
+var1|var2|var3
+'A'|1|2,5
+'B'|@|4,3
+
diff --git a/inst/tests/csvy/test_comment.csvy b/inst/tests/csvy/test_comment.csvy
@@ -0,0 +1,21 @@
+#---
+#name: my-dataset
+#source: https://github.com/leeper/csvy/tree/master/inst/examples
+#fields:
+#  - name: var1
+#    title: variable 1
+#    type: string
+#    description: explaining var1
+#    constraints:
+#      - required: true
+#  - name: var2
+#    title: variable 2
+#    type: integer
+#  - name: var3
+#    title: variable 3
+#    type: number
+#---
+var1,var2,var3
+A,1,2.5
+B,3,4.3
+
diff --git a/inst/tests/csvy/test_extraneous.csvy b/inst/tests/csvy/test_extraneous.csvy
@@ -0,0 +1,47 @@
+---
+names:
+- Date
+- WTI
+class: data.frame
+title: Cushing, OK WTI Spot Price FOB
+filename: data.csv
+fileurl: https://raw.githubusercontent.com/jrovegno/csvy/master/data.csv
+sourceurl: http://www.eia.gov/dnav/pet/hist/LeafHandler.ashx?n=PET&s=RWTC&f=D
+source_csvy: https://github.com/leeper/csvy/tree/master/inst/examples
+item: PET
+sourcekey: RWTC
+freq: Daily
+rate: MID
+type: price
+units: Dollars per Barrel
+latestdate: '2015-08-31'
+releasedate: '2015-09-02'
+nextreleasedate: '2015-09-10'
+source: Thomson Reuters
+contactemail: infoctr@eia.doe.gov
+contactphone: (202) 586-8800
+---
+"Date","WTI"
+"1986-01-02",25.56
+"1986-01-03",26
+"1986-01-06",26.53
+"1986-01-07",25.85
+"1986-01-08",25.87
+"1986-01-09",26.03
+"1986-01-10",25.65
+"1986-01-13",25.08
+"1986-01-14",24.97
+"1986-01-15",25.18
+"1986-01-16",23.98
+"1986-01-17",23.63
+"1986-01-20",21.33
+"1986-01-21",20.61
+"1986-01-22",20.25
+"1986-01-23",19.93
+"1986-01-24",19.45
+"1986-01-27",20.87
+"1986-01-28",19.45
+"1986-01-29",19.61
+"1986-01-30",19.58
+"1986-01-31",18.95
+
diff --git a/inst/tests/csvy/test_incomplete_header.csvy b/inst/tests/csvy/test_incomplete_header.csvy
@@ -0,0 +1,14 @@
+---
+name: my-dataset
+source: https://github.com/leeper/csvy/tree/master/inst/examples
+fields:
+  - name: var1
+  - name: var2
+    type: integer
+  - name: var3
+    type: number
+
+var1,var2,var3
+A,1,2.5
+B,3,4.3
+
diff --git a/inst/tests/csvy/test_missing_type.csvy b/inst/tests/csvy/test_missing_type.csvy
@@ -0,0 +1,14 @@
+---
+name: my-dataset
+source: https://github.com/leeper/csvy/tree/master/inst/examples
+fields:
+  - name: var1
+  - name: var2
+    type: integer
+  - name: var3
+    type: number
+---
+var1,var2,var3
+A,1,2.5
+B,3,4.3
+
diff --git a/inst/tests/csvy/test_skip.csvy b/inst/tests/csvy/test_skip.csvy
@@ -0,0 +1,15 @@
+---
+fields:
+  - name: var1
+    type: string
+  - name: var2
+    type: integer
+  - name: var3
+    type: number
+---
+
+
+var1,var2,var3
+A,1,2.5
+B,3,4.3
+