Merge pull request #59 from drizk1/slice_min_max

@slice_min and @slice_max
TidierOrg · Nov 18, 2023 · ca0c6e0 · ca0c6e0
2 parents 84d1322 + 5656c87
commit ca0c6e0
Show file tree

Hide file tree

Showing 7 changed files with 322 additions and 4 deletions.
diff --git a/Project.toml b/Project.toml
@@ -20,6 +20,7 @@ DataFrames = "1.5"
 MacroTools = "0.5"
 Reexport = "0.2, 1"
 ShiftedArrays = "2"
+Statistics = "1.6"
 StatsBase = "0.34, 1"
 julia = "1.6"
 

diff --git a/README.md b/README.md
@@ -82,7 +82,7 @@ TidierData.jl currently supports the following top-level macros:
 - `@mutate()` and `@transmute()` 
 - `@summarize()` and `@summarise()`
 - `@filter()`
-- `@slice()` and `@slice_sample()`
+- `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()`
 - `@group_by()` and `@ungroup()`
 - `@arrange()`
 - `@pull()`

diff --git a/docs/examples/UserGuide/slice.jl b/docs/examples/UserGuide/slice.jl
@@ -65,4 +65,26 @@ end
 
 @chain df begin
   @slice_sample(5)
-end
+end
+
+# ## Slice the min
+
+# This line selects all rows with the minimum value of the desired column.
+
+@chain df begin
+  @slice_min(b)
+end
+
+# This line will only show the first row.
+
+@chain df begin
+  @slice_min(b, with_ties = false)
+end
+
+# ## Slice the max
+
+# The optional `prop` argument will slice a proportion of the full data frame.
+
+@chain df begin 
+  @slice_max(b, prop = .5)
+end
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -94,7 +94,7 @@ TidierData.jl currently supports the following top-level macros:
     - `@mutate()` and `@transmute()` 
     - `@summarize()` and `@summarise()`
     - `@filter()`
-    - `@slice()` and `@slice_sample()`
+    - `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()`
     - `@group_by()` and `@ungroup()`
     - `@arrange()`
     - `@pull()`

diff --git a/src/TidierData.jl b/src/TidierData.jl
@@ -19,7 +19,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end
       as_float, as_integer, as_string, is_float, is_integer, is_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter,
       @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join,
       @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate,
-      @unite, @summary, @fill_missing, @slice_sample
+      @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max
 
 # Package global variables
 const code = Ref{Bool}(false) # output DataFrames.jl code?

diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -2441,3 +2441,130 @@ julia> @chain df begin
    5 │    25      5     15
 ```
 """
+
+const docstring_slice_max =
+"""
+    @slice_max(df, column; with_ties, n, prop, missing_rm)
+
+Retrieve rows with the maximum value(s) from the specified column of a DataFrame.
+
+# Arguments
+- `df`: The source data frame or grouped data frame from which to slice rows.
+- `column`: The column for which to slice the maximum values.
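+- `with_ties`: Whether to return all rows tied for the maximum. Defaults to `true`. When `false`, only the first row (or the first `n` tied rows, if `n` is given) is returned.
+- `prop`: The proportion of rows to slice, between 0 and 1. The resulting number of rows is rounded down.
+- `n`: An optional integer argument to specify the number of maximum rows to retrieve. It is only used when `with_ties = false`; if `with_ties = true`, all tied rows are returned and `n` is overridden.
+- `missing_rm`: Defaults to `true`. When slicing with `prop`, skips the rows with a missing value in `column` when determining the proportion of the data frame to slice.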
+
+# Examples
+```jldoctest
+julia> df = DataFrame(
+           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
+           b = [0.3, 2, missing, 3, 6, 5, 7, 7],
+           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);
+
+julia> @chain df begin
+       @slice_max(b)
+       end 
+2×3 DataFrame
+ Row │ a         b         c        
+     │ Float64?  Float64?  Float64? 
+─────┼──────────────────────────────
+   1 │      5.0       7.0       5.0
+   2 │      6.0       7.0       6.0
+
+julia> @chain df begin
+       @slice_max(b, with_ties = false)
+       end 
+1×3 DataFrame
+ Row │ a         b         c        
+     │ Float64?  Float64?  Float64? 
+─────┼──────────────────────────────
+   1 │      5.0       7.0       5.0
+
+julia> @chain df begin
+       @slice_max(b, with_ties = false, n = 2)
+       end 
+2×3 DataFrame
+ Row │ a         b         c        
+     │ Float64?  Float64?  Float64? 
+─────┼──────────────────────────────
+   1 │      5.0       7.0       5.0
+   2 │      6.0       7.0       6.0
+   
+julia> @chain df begin
+       @slice_max(b, prop = 0.5, missing_rm = true)
+       end
+3×3 DataFrame
+ Row │ a         b         c        
+     │ Float64?  Float64?  Float64? 
+─────┼──────────────────────────────
+   1 │      5.0       7.0       5.0
+   2 │      6.0       7.0       6.0
+   3 │      1.0       6.0       1.0
+```
+"""
+
+const docstring_slice_min =
+"""
+    @slice_min(df, column; with_ties, n, prop, missing_rm)
+
+Retrieve rows with the minimum value(s) from the specified column of a DataFrame.
+
+# Arguments
+- `df`: The source data frame or grouped data frame from which to slice rows.
+- `column`: The column for which to slice the minimum values.
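+- `with_ties`: Whether to return all rows tied for the minimum. Defaults to `true`, which shows all ties. When `false`, only the first row (or the first `n` tied rows, if `n` is given) is returned.
+- `prop`: The proportion of rows to slice, between 0 and 1. The resulting number of rows is rounded down.
+- `n`: An optional integer argument to specify the number of minimum rows to retrieve. It is only used when `with_ties = false`; if `with_ties = true`, all tied rows are returned and `n` is overridden.
+- `missing_rm`: Defaults to `true`. When slicing with `prop`, skips the rows with a missing value in `column` when determining the proportion of the data frame to slice.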
+
+# Examples
+```jldoctest
+julia> df = DataFrame(
+           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
+           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],
+           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);
+
+julia> @chain df begin
+       @slice_min(b)
+       end 
+2×3 DataFrame
+ Row │ a         b         c         
+     │ Float64?  Float64?  Float64?  
+─────┼───────────────────────────────
+   1 │  missing       0.3        0.2
+   2 │  missing       0.3  missing
+
+julia> @chain df begin
+       @slice_min(b, with_ties = false)
+       end 
+1×3 DataFrame
+ Row │ a         b         c        
+     │ Float64?  Float64?  Float64? 
+─────┼──────────────────────────────
+   1 │  missing       0.3       0.2
+
+julia> @chain df begin
+       @slice_min(b, with_ties = true, n = 1)
+       end 
+2×3 DataFrame
+ Row │ a         b         c         
+     │ Float64?  Float64?  Float64?  
+─────┼───────────────────────────────
+   1 │  missing       0.3        0.2
+   2 │  missing       0.3  missing   
+  
+   
+julia> @chain df begin
+       @slice_min(b, prop = 0.5, missing_rm = true)
+       end
+3×3 DataFrame
+ Row │ a          b         c         
+     │ Float64?   Float64?  Float64?  
+─────┼────────────────────────────────
+   1 │ missing         0.3        0.2
+   2 │ missing         0.3  missing   
+   3 │       0.2       2.0        0.2
+```
+"""
diff --git a/src/slice.jl b/src/slice.jl
@@ -71,4 +71,172 @@ macro slice_sample(df, exprs...)
   end
 
   return df_expr
+end
+
+"""
+$docstring_slice_max
+"""
+macro slice_max(df, exprs...)  # Keep the rows holding the largest value(s) of one column; options arrive as `key = value` expressions.
+  expr_dict = Dict()  # option name => raw, unevaluated right-hand-side expression
+  column = nothing
+  missing_rm = true
+  with_ties = true
+  arranged = false  # becomes true when `prop` is given: sort-then-take-first instead of filtering on the maximum
+  for expr in exprs
+      if @capture(expr, lhs_ = rhs_)
+          expr_dict[lhs] = rhs
+          if lhs == :missing_rm
+              missing_rm = rhs
+          elseif lhs == :prop
+              arranged = true
+          end
+      else
+          column = expr  # any argument that is not `key = value` is taken as the column; the last such argument wins
+      end
+  end
+  if haskey(expr_dict, :with_ties)
+      with_ties = expr_dict[:with_ties]
+  end
+  if column === nothing
+      throw(ArgumentError("No column provided"))  # raised while the macro is being expanded, not when the generated code runs
+  end
+  return quote
+      grouping_cols = Symbol[]
+      if $(esc(df)) isa DataFrames.GroupedDataFrame  # NOTE(review): `df` is spliced in several places below, so the expression is evaluated more than once
+          grouping_cols = DataFrames.groupcols($(esc(df)))  # remembered so the result can be re-grouped at the end
+      end
+      temp_df = if $arranged
+          if $missing_rm  # NOTE(review): spliced without `esc`; presumably only literal `true`/`false` is expected here - confirm
+              @chain $(esc(df)) begin 
+                  @filter(!ismissing($column))  # drop missing values first so they do not count toward the row total used for `prop`
+                  @arrange(desc($column))  # presumably sorts largest-first so `first` below takes the top rows - confirm against `desc`
+              end
+          else
+              @chain $(esc(df)) begin 
+                  @arrange(desc($column))
+              end
+          end
+      else
+          @filter($(esc(df)), $column == maximum(skipmissing($column)))  # keep only rows equal to the maximum; missing values are ignored when computing it
+      end
+      if temp_df isa DataFrames.GroupedDataFrame
+          result_dfs = []  # one sliced piece per group, concatenated after the loop
+          for sdf in temp_df
+              local prop_val
+              if haskey($expr_dict, :prop)  # the Dict built at expansion time is spliced into the generated code as a value
+                  prop_val = $expr_dict[:prop]  # NOTE(review): this is the raw expression given for `prop`; a non-literal would arrive as a Symbol/Expr - confirm
+                  if prop_val < 0.0 || prop_val > 1.0
+                      throw(ArgumentError("Prop value should be between 0 and 1"))
+                  end
+                  num_rows = floor(Int, nrow(sdf) * prop_val)  # row count is rounded down, computed per group
+                  push!(result_dfs, first(sdf, num_rows))
+              elseif $with_ties
+                  push!(result_dfs, sdf)  # all tied rows of this group are kept; `n` is not consulted on this path
+              else
+                  n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1  # without ties: first `n` rows, defaulting to a single row
+                  push!(result_dfs, first(sdf, n))
+              end
+          end
+          temp_df = vcat(result_dfs...)
+          temp_df = DataFrames.groupby(temp_df, grouping_cols)  # re-group by the original grouping columns; this is the value produced for grouped input
+      else
+          local prop_val
+          if haskey($expr_dict, :prop)
+              prop_val = $expr_dict[:prop]  # NOTE(review): same raw-expression caveat as in the grouped branch
+              if prop_val < 0.0 || prop_val > 1.0
+                  throw(ArgumentError("Prop value should be between 0 and 1"))
+              end
+              num_rows = floor(Int, nrow(temp_df) * prop_val)  # row count is rounded down
+              temp_df = first(temp_df, num_rows)
+          elseif !$with_ties
+              n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1  # first `n` of the tied rows, defaulting to a single row
+              temp_df = first(temp_df, n)
+          end
+          temp_df  # with ties and no `prop`, the filtered rows are returned unchanged
+      end
+  end
+end
+
+"""
+$docstring_slice_min
+"""
+macro slice_min(df, exprs...)  # Keep the rows holding the smallest value(s) of one column; options arrive as `key = value` expressions.
+  expr_dict = Dict()  # option name => raw, unevaluated right-hand-side expression
+  column = nothing
+  missing_rm = true
+  with_ties = true
+  arranged = false  # becomes true when `prop` is given: sort-then-take-first instead of filtering on the minimum
+  for expr in exprs
+      if @capture(expr, lhs_ = rhs_)
+          expr_dict[lhs] = rhs
+          if lhs == :missing_rm
+              missing_rm = rhs
+          elseif lhs == :prop
+              arranged = true
+          end
+      else
+          column = expr  # any argument that is not `key = value` is taken as the column; the last such argument wins
+      end
+  end
+  if haskey(expr_dict, :with_ties)
+      with_ties = expr_dict[:with_ties]
+  end
+  if column === nothing
+      throw(ArgumentError("No column provided"))  # raised while the macro is being expanded, not when the generated code runs
+  end
+  return quote
+      grouping_cols = Symbol[]
+      if $(esc(df)) isa DataFrames.GroupedDataFrame  # NOTE(review): `df` is spliced in several places below, so the expression is evaluated more than once
+          grouping_cols = DataFrames.groupcols($(esc(df)))  # remembered so the result can be re-grouped at the end
+      end
+      temp_df = if $arranged
+          if $missing_rm  # NOTE(review): spliced without `esc`; presumably only literal `true`/`false` is expected here - confirm
+              @chain $(esc(df)) begin 
+                  @filter(!ismissing($column))  # drop missing values first so they do not count toward the row total used for `prop`
+                  @arrange($column)  # presumably sorts smallest-first so `first` below takes the bottom rows - confirm against `@arrange`
+              end
+          else
+              @chain $(esc(df)) begin 
+                  @arrange($column)
+              end
+          end
+      else
+          @filter($(esc(df)), $column == minimum(skipmissing($column)))  # keep only rows equal to the minimum; missing values are ignored when computing it
+      end
+      if temp_df isa DataFrames.GroupedDataFrame
+          result_dfs = []  # one sliced piece per group, concatenated after the loop
+          for sdf in temp_df
+              local prop_val
+              if haskey($expr_dict, :prop)  # the Dict built at expansion time is spliced into the generated code as a value
+                  prop_val = $expr_dict[:prop]  # NOTE(review): this is the raw expression given for `prop`; a non-literal would arrive as a Symbol/Expr - confirm
+                  if prop_val < 0.0 || prop_val > 1.0
+                      throw(ArgumentError("Prop value should be between 0 and 1"))
+                  end
+                  num_rows = floor(Int, nrow(sdf) * prop_val)  # row count is rounded down, computed per group
+                  push!(result_dfs, first(sdf, num_rows))
+              elseif $with_ties
+                  push!(result_dfs, sdf)  # all tied rows of this group are kept; `n` is not consulted on this path
+              else
+                  n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1  # without ties: first `n` rows, defaulting to a single row
+                  push!(result_dfs, first(sdf, n))
+              end
+          end
+          temp_df = vcat(result_dfs...)
+          temp_df = DataFrames.groupby(temp_df, grouping_cols)  # re-group by the original grouping columns; this is the value produced for grouped input
+      else
+          local prop_val
+          if haskey($expr_dict, :prop)
+              prop_val = $expr_dict[:prop]  # NOTE(review): same raw-expression caveat as in the grouped branch
+              if prop_val < 0.0 || prop_val > 1.0
+                  throw(ArgumentError("Prop value should be between 0 and 1"))
+              end
+              num_rows = floor(Int, nrow(temp_df) * prop_val)  # row count is rounded down
+              temp_df = first(temp_df, num_rows)
+          elseif !$with_ties
+              n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1  # first `n` of the tied rows, defaulting to a single row
+              temp_df = first(temp_df, n)
+          end
+          temp_df  # with ties and no `prop`, the filtered rows are returned unchanged
+      end
+  end
 end