Merge pull request #508 from SebKrantz/development

Development
SebKrantz · Dec 7, 2023 · fb5253c · fb5253c
2 parents 990a6fa + d0138e3
commit fb5253c
Show file tree

Hide file tree

Showing 7 changed files with 94 additions and 91 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: collapse
 Title: Advanced and Fast Data Transformation
-Version: 2.0.6
-Date: 2023-11-11
+Version: 2.0.7
+Date: 2023-12-07
 Authors@R: c(
            person("Sebastian", "Krantz", role = c("aut", "cre"), 
                   email = "sebastian.krantz@graduateinstitute.ch", 

diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,10 @@
 
 * Added argument `multiple = FALSE` to `join()`. Setting `multiple = TRUE` performs a multiple-matching join where a row in `x` is matched to all matching rows in `y`. The default `FALSE` just takes the first matching row in `y`. 
 
+* Improved recode/replace functions. Notably, `replace_outliers()` now supports option `value = "clip"` to replace outliers with the respective upper/lower bounds, and also has option `single.limit = "mad"` which removes outliers exceeding a certain number of median absolute deviations. Furthermore, all functions now have a `set` argument which fully applies the transformations by reference. 
+
+* Functions `replace_NA` and `replace_Inf` were renamed to `replace_na` and `replace_inf` to make the namespace a bit more consistent. The earlier versions remain available. 
+
 # collapse 2.0.6
 
 * Fixed a serious bug in `qsu()` where higher-order weighted statistics were erroneous, i.e. whenever `qsu(x, ..., w = weights, higher = TRUE)` was invoked, the 'SD', 'Skew' and 'Kurt' columns were wrong (if `higher = FALSE` the weighted 'SD' is correct). The reason is that there appears to be no straightforward generalization of Welford's Online Algorithm to higher-order weighted statistics. This was not detected earlier because the algorithm was only tested with unit weights. The fix involved replacing Welford's Algorithm for the higher-order weighted case with a 2-pass method that additionally uses long doubles for higher-order terms. Thanks @randrescastaneda for reporting. 

diff --git a/R/global_macros.R b/R/global_macros.R
diff --git a/man/recode-replace.Rd b/man/recode-replace.Rd
@@ -28,7 +28,7 @@ replace_outliers(X, limits, value = NA,
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
-  \item{X}{a vector, matrix, array, data frame or list of atomic objects. \code{replace_outliers} has internal methods for \link[fgroup_by]{grouped} and \link[findex_by]{indexed} data.}
+  \item{X}{a vector, matrix, array, data frame or list of atomic objects. \code{replace_outliers} has internal methods for \link[=fgroup_by]{grouped} and \link[=findex_by]{indexed} data.}
   \item{\dots}{comma-separated recode arguments of the form: \code{value = replacement, `2` = 0, Secondary = "SEC"} etc. \code{recode_char} with \code{regex = TRUE} also supports regular expressions i.e. \code{`^S|D$` = "STD"} etc.}
 \item{default}{optional argument to specify a scalar value to replace non-matched elements with.}
 \item{missing}{optional argument to specify a scalar value to replace missing elements with. \emph{Note} that, to increase efficiency, this is done before the rest of the recoding, i.e. the recoding is performed on data where missing values are already filled!}

diff --git a/src/data.table_rbindlist.c b/src/data.table_rbindlist.c
@@ -394,7 +394,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
           const char *str = isString(s) ? CHAR(STRING_ELT(s,w2)) : "";
             snprintf(buff, sizeof(buff), "Column %d ['%s'] of item %d is missing in item %d. Use fill=TRUE to fill with NA (NULL for list columns), or use.names=FALSE to ignore column names.%s",
                         w2+1, str, i+1, missi+1, extra );
-          if (usenames==TRUE) error((const char*)buff);
+          if (usenames==TRUE) error("%s", buff);
           i = ll; // break from outer i loop
           break;         // break from inner j loop
         }
@@ -675,7 +675,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
       }
       for (int k=0; k<nLevel; ++k) SET_TRUELENGTH(levelsRaw[k], 0);
       savetl_end();
-      if (warnStr[0]) warning((const char*)warnStr);  // now savetl_end() has happened it's safe to call warning (could error if options(warn=2))
+      if (warnStr[0]) warning("%s", warnStr);  // now savetl_end() has happened it's safe to call warning (could error if options(warn=2))
       copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB.
       SEXP levelsSxp;
       setAttrib(target, R_LevelsSymbol, levelsSxp=allocVector(STRSXP, nLevel));

diff --git a/src/data.table_subset.c b/src/data.table_subset.c
@@ -504,7 +504,7 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols, SEXP checkrows) { // , SEXP fastret
       SEXP max = PROTECT(ScalarInteger(nrow)); nprotect++;
       rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE))); nprotect++;
       const char *err = check_idx(rows, nrow, &anyNA); // , &orderedSubset
-      if (err!=NULL) error((const char*)err);
+      if (err!=NULL) error("%s", err);
     }
 
       // Adding sf geometry column if not already selected...
@@ -612,7 +612,7 @@ SEXP subsetVector(SEXP x, SEXP idx, SEXP checkidx) { // idx is 1-based passed fr
     SEXP max = PROTECT(ScalarInteger(length(x))); nprotect++;
     idx = PROTECT(convertNegAndZeroIdx(idx, max, ScalarLogical(TRUE))); nprotect++;
     const char *err = check_idx(idx, length(x), &anyNA); // , &orderedSubset
-    if (err != NULL) error((const char*)err);
+    if (err != NULL) error("%s", err);
   }
   SEXP ans = PROTECT(allocVector(TYPEOF(x), length(idx))); nprotect++;
   copyMostAttrib(x, ans);

diff --git a/src/join.c b/src/join.c
@@ -345,7 +345,7 @@ SEXP multi_match(SEXP m, SEXP g) {
   int *cgs = (int*)R_alloc(ng+2, sizeof(int)); cgs[1] = 1;
   for(int i = 1; i != ngp; ++i) cgs[i+1] = cgs[i] + gs[i];
   int *restrict cnt = (int*)Calloc(ngp, int);
-  int *po = (int*) R_alloc(l, sizeof(int)); --po;
+  int *po = (int*)R_alloc(l, sizeof(int)); --po;
   for(int i = 1; i != lp; ++i) po[cgs[pg[i]] + cnt[pg[i]]++] = i;
   Free(cnt);
 
@@ -354,7 +354,7 @@ SEXP multi_match(SEXP m, SEXP g) {
   // Indices to duplicate y (this is the normal fmatch(x, y) vector but now accounting for multiple matches)
   SEXP y_ind = PROTECT(allocVector(INTSXP, n));
   int *px_ind = INTEGER(x_ind), *py_ind = INTEGER(y_ind);
-  for(int i = 0, j = 0, q, k, s; i != lm; ++i) {
+  for(int i = 0, j = 0, q = 0, k = 0, s = 0; i != lm; ++i) {
     if(pm[i] == NA_INTEGER) {
       px_ind[j] = i+1;
       py_ind[j++] = NA_INTEGER;