Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat
DT[, c(..cols, "colC")] # same as DT[, .(colB,colC)]
DT[, -..cols] # all columns other than colB
```
Thus, `with=` should no longer be needed in any cases. Please change to using the `..` prefix and over the next few years we will start to formally deprecate and remove the `with=` parameter. If this is well received, the `..` prefix could be expanded to symbols appearing in `i=` and `by=`, too. Note that column names should not now start with `..`. If a symbol `..var` is used in `j=` but `..var` exists as a column name, the column still takes precendence, for backwards compatibility. Over the next few years, data.table will start issuing warnings/errors when it sees column names starting with `..`. This affects one CRAN package out of 475 using data.table, so we do not believe this restriction to be unreasonable. Our main focus here which we believe `..` achieves is to resolve the more common ambiguity when `var` is in calling scope and `var` is a column name, too.
Thus, `with=` should no longer be needed in any cases. Please change to using the `..` prefix and over the next few years we will start to formally deprecate and remove the `with=` parameter. If this is well received, the `..` prefix could be expanded to symbols appearing in `i=` and `by=`, too. Note that column names should not now start with `..`. If a symbol `..var` is used in `j=` but `..var` exists as a column name, the column still takes precedence, for backwards compatibility. Over the next few years, data.table will start issuing warnings/errors when it sees column names starting with `..`. This affects one CRAN package out of 475 using data.table, so we do not believe this restriction to be unreasonable. Our main focus here which we believe `..` achieves is to resolve the more common ambiguity when `var` is in calling scope and `var` is a column name, too.

19. `setindexv` can now assign multiple (separate) indices by accepting a `list` in the `cols` argument.

Expand Down
10 changes: 10 additions & 0 deletions inst/tests/genotypes_genome.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

Return chromosome done

Done simulating ARG ...
Total number of SNPs = 3287
Samples:
POP1: 00000000000000000010001000100000000000000000001000001110001110000000000000000001111100000000000000000000000000000111111000000000000000000111111100000000001100001110000000000000000000000000000000000000011000000000000000000000010000011111100000000000000000000000111111111001000000000000000000000000111000000000000001111110001000000000011100000000000000000000000001111110000000000000001111100000000000000100000000000000000000000011111110000000000111100000000000000000000000100000000000000110100000000000000000000000000000000000000000000100010000000000000000000000001111100000000000000000000000000000000000000111111111111111111111110000000000000000000000000110100000000000000000000001111111111111100000000000000000000000000000000000000000000011000010000000100000000000000000110000001111111111100000000000000000000000000000011100010000111111000000000000000001111000000000001110000000000000000000001111100000001111111110000000000000000001000000000111111100000000000000000000000011100000000000000000000100000011100000000000000000000000011001111111100000001110000000000000000000001000100000000000000001100000000000000000000000000000000000011000000000000010000000000000000000000011111111000000000000000000000111000000000000000000000000000001001111000000000000000000000011100000110000000000000000010000000111111111000000000000000000000000000000000000000111110001110000000000000000000000011111110000000001100000000000000000111000000000000000000000000000000011111110000000000000000000000000001100100000000000000000000011000000011100000000000000000000000000000001100111111110000100000001000000000010111100000000000000100000000000000011000111111000000000000000000000000011110000000000000000001100000000000000000000000000000000000000000000000000000001100000000000000000000000000000011000000000000000111111100000000000000000000000011110000000010000000000000111111000000000000000000000000000001110010000000010000000001000000001111000000000001000000000111111000000000000000000000000000000111000001000000000100000
000000000100000001111110000000000000000000001111111111100001110000000000001000000000000000000000011111100000001100000000000000000000001100000000001000110000010000000111111100000100000000000000000000000000000011111110000000100000000000000001000000000000010000000000000000000000000110110000000000000000000000000000011001100000000000000000010000000000000110000000000000000000001111111100000000000000000000001100000000000000000011000000000000001000000000000000000000011111001111110000000000000011000000000000000001111100000000000000000000000000011111111111111111111111111111110000000000000000000000000000000000000000001100000000000000010000000000000000001111111111111110000000000010000000000000010000001111000000000000000000000000000001111111000000000000000000000100000000000000000011000010000000000000001111111000000000000100000010000001000000100000000000000101000000000100000000111000000000011000000111110000000000000000000000000000000000000011000000000000001111000000000000000000000000000000111100000000000000000000001000000000001111110000000000000000010000000000000000000011111110000000001000000000000000011111110000000000000000001001000100000000000001000000000000000001100101111110000000000000000000000010100000111000000000000000000000000111010000000000000010001000000000000000000000010111111
POP1: 00000000000000000000001000100000000000000100000000011110001110000000000000000101111100000000011000000000000000000111110000000001000000000111111100000000001100000000000000000000000000000000000000000000000100000000100000000000000000111111100000000000000000111111000000000110000000000000000001001111000000000000000010000000000000111111100000000000000011000000000000001110000000000000001111000000000000000100000000000000000000000011111110000000000111100000000000000000000000000001000000000110100000000000000100010000000000010000000000010000100000000000000000000000000000010100000000010000000000000000000000010000000000111111111111111110010000000000000000000000000000100000000001110000000000000000000001111111111000000000000000000000111000000000111110000000010000000000000000000010000000000000000111111111000000000000000001100000000000000000000000011100000001111000000000000000000011111111111000000000000000000000000000000000000000001100000010001000000000000000000000000001000000000000100010000000000010000111100000000000000111110100000000000000000000000000000000000000000000000100000000000000001110000010000000000100000000000000000000011000000000000001100000000000001111111100000000000000100000000000111000000001100000000000000000000000010000000001100000000000000000000001111000000001100000000000000111111111000000000000000000000000000000000001110000000010000000010000000111000000000000000000000000000000000110111110000000000000000000000000011000011100000000000000000000000000110000001100000010000000000000000011000000011100000000000000000000010000000010011111111110000000000010000111111100000000000000000000000000000001110000000000000111110000001000001010000000000000000100000000001100000000000000000000000000000000000010000000010000000001100000000000001001000000000000000111111111110000000000000000000000000000000000000000111100000000000000000111111001100000000000000000000000010000010000000000000000000000111110000000000000000100000000000000000000111111110010000000000000000000000000000000000000
000000100011111110000000000100000000000000001111111111100000000000000000000000000000011111111111100000000000000000000000000110000000011000000000000100000000000011100000000011111000000000000000000000000101100101111110000000010000000000000000000000000000001100000000000000100001110001010000000000000000000000000111000111100000000000000011110000000000000110000000001000000000000000000011111111111000000000000000000000000011000000111110000000000000000000000000000000000000001111111111000000000000000000001000000011111100000000000000001000000000011111111111111000000000000000000000001000000001000001111111111111110000000000000000100000001101111111111111110000000000000000000000000000000001000000000000111110000000000000000010011111111110000000000000000000100000000100100000000010001101000000000000000011100001111000000000000001111100011101000000000000000000010001000000001000001100000110000000000110000000001111100000000000000000011110000000000000000000111111100000000000000000000000000000000000111100000000000000001100001000000000000111110010000000000000000000000000000000000111111110000000000000000000010000011111110000000000000000000101000100000000000000110000000000000000011001111110000000000000000000000000011111111000000000000000000000111000100000000000000000000011100000001100000010000000000
POP1: 00000000000000000000100110011110000000000000000000100001001110000000000000000010000011111100000000000000100000000011110000000000000110000111111100000000000000000000001000000010000000000000000000000000000100000000000000000000010000011111110000000000000000000000111111111000000000000000000000000000110000000000000001111110001000000000011100000000000000000000000001111110000100000000001111010000000000011100000000000000000000000011111110000000000111100000000000000000000000000001000000000110100000000000000100010010000000000000000000010000101000000000000000000000000000010110000000000000000000000000000000011000000000111111111111111110000000000000000000001001100000000000000000001000000000000000011111111111111100000000111000000000000000000000111110000000000000000000000000000000001111111111111000000000000000000000000000000000010000000000000000000000000000000111001111100000000000000000000001000000001111111111111110000100000000001100000010001000000000000000000000000001000000000000000010000000000010000111100000000000000111110000000000000000000000000000000000000000000000000100000000000000001110000001000000001000000000000000000011000000010000000000000111111111110000000011111110000000000000000000111000000000000000000000010000000000110000100000011000000000000000011110000000000000000000000110000000000000011111111111111000000000000000000001110000000010000000000000110000001111100001100000000001100000000001000000111000000000000000000011100000000001111110000001000000000000001111110000010001100000000000001000011111100000000001000000000000000000000000011111111110000010000001000000000010111100000000000000000000000000000000111000000111110000000000000000100011110000001000000000001100000001000000000000000000000000001100000000000000001000000000000000110000000000000000000000000000001111000011100000000000000000100000000000111100000000000000000000000010011111110000000000000000000001100000000000000000010000000001111000000000000000010000000000000000111111110010000000000000111000000000000000100000
000100000100000001111100000001000000000000100000000000011110000000000000000000000001000000000000011111100000000000000000000000000000001100000000000000000000000100000000000011111000000000000000000000000010011011111110000000000000100000000000000000000100001100000000000000000000000100110000000000001000000000000000000001111100000000000000000000000011111001100000000000000000000000000011111111111000000000000010000000000000111100001110000000000000000000000000000000000000001111111111000000000000000000000000000011111100100000000100000000000000011111111111111000000000000000000000000100000001000001111111111111110000000000000000000000001101111111111111110000000000000000000000000000111100000100000111000000000000000000010010011111111110000000000000000000000000000100000000000000000011000010000000000000001111111000000000000100000010000001000000100000000000000101000000000100000000111000000000011000000111110000000000011000000000000110000000000000011111111111100000000000110000000000000000111111000000000000000000001100001000000000000111110010000000000000000000000000000110000000111110000000000000000000000111111111110000000000000000000000111000000000000000000000000000001110000001111100000000000000000000100000000000111000000000000000000010000111010001000001100000001100000000010000000000010111111

20 changes: 16 additions & 4 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -7476,10 +7476,10 @@ input = "a,b\n\n\n1,3\n\n2,4\n\n"
test(1578.5, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))

f = testDir("530_fread.txt")
test(1578.5, fread(f, skip=47L), data.table(a=logical(), b=logical()), warning="Stopped early.*discarded.*<<1,3>>")
test(1578.6, fread(f, skip=49L), data.table(V1=1:2, V2=3:4))
test(1578.7, fread(f, skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4))
test(1578.8, fread(f, skip=48L), data.table(V1=1:2, V2=3:4)) # start on blank line 49 and skip="auto" to first data row on line 50
test(1578.6, fread(f, skip=47L, verbose=TRUE), data.table(V1=1:2, V2=3:4), output="Positioned on line 48 starting: <<a,b>>")
test(1578.7, fread(f, skip=49L), data.table(V1=1:2, V2=3:4))
test(1578.8, fread(f, skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4))
test(1578.9, fread(f, skip=48L), data.table(V1=1:2, V2=3:4)) # start on blank line 49 and skip="auto" to first data row on line 50

# gforce optimisations
dt = data.table(x = sample(letters, 300, TRUE),
Expand Down Expand Up @@ -11731,6 +11731,18 @@ observed <- capture.output(print(data.table(x = seq_len(6L)), topn = 2L, class =
expected <- c(" x", " <int>", " 1: 1", " 2: 2", "--- ", " 5: 5", " 6: 6")
test(1908, observed, expected)

# skip= is now consistent as if the file started on that line.
# Found via rev dep checking (package PhenotypeSimulator), #2786. It is still a breaking change that PhenotypeSimulator will need to accommodate please.
test(1909.1, names(ans<-fread(testDir("genotypes_genome.txt"), skip="Samples:", sep=" ", colClasses="character")),
c("V1","Samples:"),
warning="Detected 1 column name.*but the data has 2 columns.*Added 1 extra default column name for the first column")
test(1909.2, ans$V1, c("POP1:","POP1:","POP1:"))
test(1909.3, nchar(ans[["Samples:"]]), INT(3287,3287,3287))
test(1909.4, names(ans<-fread(testDir("genotypes_genome.txt"), skip="POP1:", sep=" ", colClasses="character", header=FALSE)),
c("V1","V2"))
test(1909.5, ans$V1, c("POP1:","POP1:","POP1:"))
test(1909.6, nchar(ans$V2), INT(3287,3287,3287))


###################################
# Add new tests above this line #
Expand Down
9 changes: 3 additions & 6 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -1351,7 +1351,6 @@ int freadMain(freadMainArgs _args) {
//*********************************************************************************************
const char *pos = sof; // Location where the actual data in the file begins
int row1line = 1; // The line number where the data starts. Normally row 1 is column names and row1line ends up == 2.
bool skipAuto = true;
{
ch = pos;
if (verbose) DTPRINT("[05] Skipping initial rows if needed\n");
Expand All @@ -1369,14 +1368,12 @@ int freadMain(freadMainArgs _args) {
if (verbose) DTPRINT("Found skip='%s' on line %llu. Taking this to be header row or first row of data.\n",
args.skipString, (llu)row1line);
ch = pos;
skipAuto = false;
}
else if (args.skipNrow >= 0) {
// Skip the first `skipNrow` lines of input, including 0 to force the first line to be the start
while (ch<eof && row1line<=args.skipNrow) row1line+=(*ch++=='\n');
if (ch>=eof) STOP("skip=%llu but the input only has %llu line%s", (llu)args.skipNrow, (llu)row1line, row1line>1?"s":"");
pos = ch;
skipAuto = false;
}

// skip blank input at the start
Expand Down Expand Up @@ -1491,7 +1488,7 @@ int freadMain(freadMainArgs _args) {
thisBlockLines++;
continue;
}
if ((lastncol>1 && thisBlockLines>1) || !skipAuto) break; // found and finished the first 2x2 (or bigger) block
if (lastncol>1 && thisBlockLines>1) break; // found and finished the first 2x2 (or bigger) block
while (ch<eof && thisncol==0) {
prevLineStart=NULL; lineStart=ch; thisRow++;
thisncol = countfields(&ch);
Expand Down Expand Up @@ -1553,7 +1550,7 @@ int freadMain(freadMainArgs _args) {
sep = topSep;
whiteChar = (sep==' ' ? '\t' : (sep=='\t' ? ' ' : 0));
ncol = topNumFields;
if (fill || !skipAuto || sep==127) {
if (fill || sep==127) {
// leave pos on the first populated line; that is start of data
ch = pos;
} else {
Expand Down Expand Up @@ -1718,7 +1715,7 @@ int freadMain(freadMainArgs _args) {
}
}

if (args.header==NA_BOOL8 && prevStart!=NULL && skipAuto) {
if (args.header==NA_BOOL8 && prevStart!=NULL) {
// The first data row matches types in the row after that, and user didn't override default auto detection.
// Maybe previous line (if there is one, prevStart!=NULL) contains column names but there are too few (which is why it didn't become the first data row).
ch = prevStart;
Expand Down