Merge pull request #104 from cnrrobertson/pivot_options

Ability to specify lists, Not lists, colon, or nothing for @pivot_longer
TidierOrg · Jun 9, 2024 · 7f1ac79 · 7f1ac79 · kdpsingh · Jun 9, 2024
2 parents c9bc480 + 82516e3
commit 7f1ac79
Show file tree

Hide file tree

Showing 10 changed files with 194 additions and 10 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # TidierData.jl updates
 
+## v0.16.1 - 2024-06-09
+- Adds support for tuples and vectors as arguments to select multiple columns. Prefixing tuples/vectors with a `-` or `!` will exclude the selected columns.
+- The `:` selector from Julia is now available and equivalent to `everything()`
+- `@pivot_longer()` now pivots all columns if no column selectors are provided
+
 ## v0.16.0 - 2024-06-07
 - `unique()`, `mad()`, and `iqr()` are no longer auto-vectorized
 - Bugfix: `@ungroup()` now preserves row-ordering (and is faster)

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.16.0"
+version = "0.16.1"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"

diff --git a/docs/examples/UserGuide/interpolation.jl b/docs/examples/UserGuide/interpolation.jl
@@ -4,7 +4,7 @@
 
 # Note: You can only interpolate values from variables in the parent environment. If you would like to interpolate column names, you have two options: you can either use `across()` or you can use `@aside` with `@pull()` to create variables in the parent environment containing the values of those columns, which can then be accessed using interpolation.
 
-# myvar = :b`, `myvar = (:a, :b)`, and `myvar = [:a, :b]` all refer to *columns* with those names. On the other hand, `myvar = "b"`, `myvar = ("a", "b")` and `myvar = ["a", "b"]` will interpolate those *values*. See below for examples.
+# `myvar = :b` and `myvar = Cols(:a, :b)` both refer to *columns* with those names. On the other hand, `myvar = "b"`, `myvar = ("a", "b")` and `myvar = ["a", "b"]` will interpolate the *values*. If you intend to interpolate column names, the preferred way is to use `Cols()` as in the examples below.
 
 using TidierData
 
@@ -20,9 +20,19 @@ myvar = :b
   @select(!!myvar)
 end
 
-# ## Select multiple variables (vector of symbols)
+# ## Select multiple variables
 
-myvars = [:a, :b]
+# You can also use a vector as in `[:a, :b]`, but `Cols()` is preferred because it lets you mix and match column names and column numbers.
+
+myvars = Cols(:a, :b)
+
+@chain df begin
+  @select(!!myvars)
+end
+
+# The selection above is equivalent to the following, which mixes a column name with a column number:
+
+myvars = Cols(:a, 2)
 
 @chain df begin
   @select(!!myvars)
@@ -86,7 +96,7 @@ end
 
 # ## Summarize across multiple variables
 
-myvars = [:b, :c]
+myvars = Cols(:b, :c)
 
 @chain df begin
   @summarize(across(!!myvars, (mean, minimum, maximum)))
@@ -103,7 +113,9 @@ end
 
 # ## Group by multiple interpolated variables
 
-myvars = [:a, :b]
+# Once again, you can mix and match column selectors within `Cols()`
+
+myvars = Cols(:a, 2)
 
 @chain df begin
   @group_by(!!myvars)

diff --git a/docs/examples/UserGuide/pivots.jl b/docs/examples/UserGuide/pivots.jl
@@ -36,6 +36,18 @@ df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4])
 
 @pivot_longer(df_wide, -id)
 
+# The selected columns can also be included as an array
+
+@pivot_longer(df_wide, [id, B])
+
+# or excluded
+
+@pivot_longer(df_wide, -[id, B])
+
+# If all columns should be included, they can be specified with `everything()`, with `:`, or by leaving the column selector out entirely
+
+@pivot_longer(df_wide, everything())
+
 # In this example, we set the `names_to` and `values_to` arguments. Either argument can be left out and will revert to the default value. The `names_to` and `values_to` arguments can be provided as strings or as bare unquoted variable names.
 
 # Here is an example with `names_to` and `values_to` containing strings:
@@ -45,3 +57,4 @@ df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4])
 # And here is an example with `names_to` and `values_to` containing bare unquoted variables:
 
 @pivot_longer(df_wide, A:B, names_to = letter, values_to = number)
+
diff --git a/src/TidierData.jl b/src/TidierData.jl
@@ -449,6 +449,7 @@ macro group_by(df, exprs...)
 
   tidy_exprs = parse_tidy.(tidy_exprs)
   grouping_exprs = parse_group_by.(exprs)
+  grouping_exprs = parse_tidy.(grouping_exprs)
 
   df_expr = quote
     local any_expressions = any(typeof.($tidy_exprs) .!= QuoteNode)

diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -260,6 +260,28 @@ julia> @chain df @select(!(a:b))
    4 │    14
    5 │    15
 
+julia> @chain df @select(-(a, b))
+5×1 DataFrame
+ Row │ c     
+     │ Int64 
+─────┼───────
+   1 │    11
+   2 │    12
+   3 │    13
+   4 │    14
+   5 │    15
+
+julia> @chain df @select(!(a, b))
+5×1 DataFrame
+ Row │ c     
+     │ Int64 
+─────┼───────
+   1 │    11
+   2 │    12
+   3 │    13
+   4 │    14
+   5 │    15
+
 julia> @chain df begin
          @select(contains("b"), starts_with("c"))
        end
@@ -667,6 +689,34 @@ julia> @chain df begin
    3 │ C         3.0
    4 │ D         4.0
    5 │ E         5.0
+
+julia> @chain df begin
+         @group_by(-(b, c)) # same as `a`
+         @summarize(b = mean(b))
+       end
+5×2 DataFrame
+ Row │ a     b       
+     │ Char  Float64 
+─────┼───────────────
+   1 │ a         1.0
+   2 │ b         2.0
+   3 │ c         3.0
+   4 │ d         4.0
+   5 │ e         5.0
+
+julia> @chain df begin
+         @group_by(!(b, c)) # same as `a`
+         @summarize(b = mean(b))
+       end
+5×2 DataFrame
+ Row │ a     b       
+     │ Char  Float64 
+─────┼───────────────
+   1 │ a         1.0
+   2 │ b         2.0
+   3 │ c         3.0
+   4 │ d         4.0
+   5 │ e         5.0
 ```
 """
 

diff --git a/src/parsing.jl b/src/parsing.jl
@@ -1,5 +1,5 @@
 # Not exported
-function parse_tidy(tidy_expr::Union{Expr,Symbol,Number}; # Can be symbol or expression
+function parse_tidy(tidy_expr::Union{Expr,Symbol,Number, QuoteNode}; # Can be an expression, symbol, number, or quoted symbol
                     autovec::Bool=true, subset::Bool=false, from_across::Bool=false,
                     from_slice::Bool = false)
   if @capture(tidy_expr, across(vars_, funcs_))
@@ -55,7 +55,11 @@ function parse_tidy(tidy_expr::Union{Expr,Symbol,Number}; # Can be symbol or exp
     var = QuoteNode(var)
     return :(Not($var))
   elseif @capture(tidy_expr, var_Symbol)
-    return QuoteNode(var)
+    if var == Symbol(":")
+      return var
+    else
+      return QuoteNode(var)
+    end
   elseif @capture(tidy_expr, var_Number)
     if var > 0
       return var
@@ -67,13 +71,28 @@ function parse_tidy(tidy_expr::Union{Expr,Symbol,Number}; # Can be symbol or exp
     end
   elseif @capture(tidy_expr, !var_Number)
     return :(Not($var))
+  elseif @capture(tidy_expr, (tuple__,))
+    tuple = parse_tidy.(tuple)
+    return :(Cols($(tuple...)))
+  elseif @capture(tidy_expr, [vec__])
+    vec = parse_tidy.(vec)
+    return :(Cols($(vec...)))
+  elseif @capture(tidy_expr, -[vec__])
+    vec = parse_tidy.(vec)
+    return :(Not(Cols($(vec...)))) # can simplify to Not($(vec...)) in DataFrames 1.6+
+  elseif @capture(tidy_expr, ![vec__])
+    vec = parse_tidy.(vec)
+    return :(Not(Cols($(vec...)))) # can simplify to Not($(vec...)) in DataFrames 1.6+
   elseif !subset & @capture(tidy_expr, -fn_(args__)) # negated selection helpers
     return :(Cols(!($(esc(fn))($(args...))))) # change the `-` to a `!` and return
   elseif !subset & @capture(tidy_expr, fn_(args__)) # selection helpers
     if from_across || fn == :Cols # fn == :Cols is to deal with interpolated columns
       return tidy_expr
     elseif fn == :where
       return :(Cols(all.(broadcast($(esc(args...)), eachcol(DataFrame(df_copy))))))
+    elseif fn == :- || fn == :! # for negated selection as in -(A, B), which is internally represented as a function call
+      args = parse_tidy.(args)
+      return :(Not(Cols($(args...)))) # can simplify to Not($(args...)) in DataFrames 1.6+
     else
       return :(Cols($(esc(tidy_expr))))
     end
@@ -499,4 +518,4 @@ function parse_blocks(exprs...)
     return (MacroTools.rmlines(exprs[1]).args...,)
   end
   return exprs
-end
+end
diff --git a/src/pivots.jl b/src/pivots.jl
@@ -41,6 +41,9 @@ end
 $docstring_pivot_longer
 """
 macro pivot_longer(df, exprs...)
+    if length(exprs) == 0
+        exprs = (:(everything()),)
+    end
     exprs = parse_blocks(exprs...)
 
     # take the expressions and return arg => value dictionary 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -8,4 +8,18 @@ DocMeta.setdocmeta!(TidierData, :DocTestSetup, :(using TidierData); recursive=tr
 
 doctest(TidierData)
 
-end
+end
+
+using TidierData
+using Test
+using DataFrames
+
+test_df = DataFrame(
+    label = [1, 1, 2, 2],
+    name = ["A", "B", "A", "B"],
+    num = [1, 2, 3, 4]
+)
+
+@testset "TidierData" verbose = true begin
+    include("test_pivots.jl")
+end
diff --git a/test/test_pivots.jl b/test/test_pivots.jl
@@ -0,0 +1,67 @@
+@testset "pivots" verbose = true begin # groups the @pivot_wider and @pivot_longer tests; relies on `test_df` from runtests.jl
+@testset "pivot_wider" begin # names_from/values_from given as strings, bare names, and Symbols
+    true_wide = DataFrame( # expected wide table for `test_df`
+            label = [1, 2],
+            A = [1, 3],
+            B = [2, 4]
+        )
+    test_wide = @pivot_wider(test_df, names_from="name", values_from="num") # string arguments
+    test_wide2 = @pivot_wider(test_df, names_from=name, values_from=num) # bare column names
+    test_wide3 = @pivot_wider(test_df, names_from=:name, values_from=:num) # Symbol arguments
+    @test all(Array(true_wide .== test_wide)) # element-wise comparison against the expected table
+    @test all(Array(true_wide .== test_wide2))
+    @test all(Array(true_wide .== test_wide3))
+end
+
+@testset "pivot_longer" begin # one expected table per distinct column selection
+    true_long1 = DataFrame( # expected when `name` and `num` are pivoted and `label` is kept
+            label = [1,1,2,2,1,1,2,2],
+            variable = ["name","name","name","name","num","num","num","num"],
+            value = ["A","B","A","B",1,2,3,4],
+        )
+    test_long1 = @pivot_longer(test_df, -label) # exclude a single column with `-`
+    test_long2 = @pivot_longer(test_df, name:num) # range selector
+
+    true_long3 = DataFrame( # expected when only `label` is pivoted
+        name = ["A","B","A","B"],
+        num = [1,2,3,4],
+        variable = ["label","label","label","label"],
+        value = [1,1,2,2]
+    )
+    test_long3 = @pivot_longer(test_df, -(name:num)) # negated range selector
+    test_long4 = @pivot_longer(test_df, label) # single bare column
+
+    true_long5 = DataFrame( # expected when `label` and `num` are pivoted and `name` is kept
+        name = ["A","B","A","B","A","B","A","B"],
+        variable = ["label","label","label","label","num","num","num","num"],
+        value = [1,1,2,2,1,2,3,4],
+    )
+    test_long5 = @pivot_longer(test_df, [label,num]) # vector of columns
+
+    true_long6 = DataFrame( # expected when only `name` is pivoted
+        label = [1,1,2,2],
+        num = [1,2,3,4],
+        variable = ["name","name","name","name"],
+        value = ["A","B","A","B"],
+    )
+    test_long6 = @pivot_longer(test_df, -[label,num]) # negated vector of columns
+
+    true_long7 = DataFrame( # expected when every column is pivoted
+        variable = ["label","label","label","label","name","name","name","name","num","num","num","num"],
+        value = [1,1,2,2,"A","B","A","B",1,2,3,4],
+    )
+    test_long7 = @pivot_longer(test_df, :) # Julia's `:` selector
+    test_long8 = @pivot_longer(test_df) # no column selector given
+    test_long9 = @pivot_longer(test_df, everything()) # `everything()` helper
+
+    @test all(Array(true_long1 .== test_long1)) # each result is compared element-wise with its expected table
+    @test all(Array(true_long1 .== test_long2))
+    @test all(Array(true_long3 .== test_long3))
+    @test all(Array(true_long3 .== test_long4))
+    @test all(Array(true_long5 .== test_long5))
+    @test all(Array(true_long6 .== test_long6))
+    @test all(Array(true_long7 .== test_long7))
+    @test all(Array(true_long7 .== test_long8))
+    @test all(Array(true_long7 .== test_long9))
+end
+end