From 50c7702de0fd117071319b9a1b55715979d133e7 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 12 Nov 2023 18:34:08 -0500 Subject: [PATCH 1/4] slice_min and max --- docs/examples/UserGuide/slice.jl | 24 ++++- src/TidierData.jl | 2 +- src/docstrings.jl | 126 +++++++++++++++++++++++ src/slice.jl | 168 +++++++++++++++++++++++++++++++ 4 files changed, 318 insertions(+), 2 deletions(-) diff --git a/docs/examples/UserGuide/slice.jl b/docs/examples/UserGuide/slice.jl index 0c24b53..ef87a80 100644 --- a/docs/examples/UserGuide/slice.jl +++ b/docs/examples/UserGuide/slice.jl @@ -65,4 +65,26 @@ end @chain df begin @slice_sample(5) -end \ No newline at end of file +end + +# ## Slice the min + +# This line selects all rows with the minimum value of the desired column. + +@chain df begin + @slice_min(b) +end + +# This line will only show the first row. + +@chain df begin + @slice_min(b, with_ties = false) +end + +# ## Slice the max + +# The optional prop argument will slice a proportion of the full dataframe. + +@chain df begin + @slice_max(b, prop = .5) +end diff --git a/src/TidierData.jl b/src/TidierData.jl index 6fc2c3b..7cac670 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -19,7 +19,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end as_float, as_integer, as_string, is_float, is_integer, is_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, - @unite, @summary, @fill_missing, @slice_sample + @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? 
diff --git a/src/docstrings.jl b/src/docstrings.jl index 6afa139..bd9a1d4 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2441,3 +2441,129 @@ julia> @chain df begin 5 │ 25 5 15 ``` """ + +const docstring_slice_max = +""" + @slice_max(df, column; with_ties, n, prop, missing_rm) + +Retrieve rows with the maximum value(s) from the specified column of a DataFrame. + +# Arguments +- `df`: The source data frame or grouped data frame from which to slice rows. +- `column`: The column for which to slice the maximum values. +- `with_ties`: Whether or not all ties will be shown, defaults to true. When false it will only show the first row. +- `prop`: The proportion of rows to slice. +- `n = integer`: An optional argument to specify the number of maximum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden. +- `missing_rm`: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice. + +# Examples +```jldoctest +julia> df = DataFrame( + dt1 = [missing, 0.2, missing, missing, 1, missing, 5, 6], + dt2 = [0.3, 2, missing, 3, 6, 5, 7, 7], + dt3 = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]); + +julia> @chain df begin + @slice_max(dt2) + end +2×3 DataFrame + Row │ dt1 dt2 dt3 + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ 5.0 7.0 5.0 + 2 │ 6.0 7.0 6.0 + +julia> @chain df begin + @slice_max(dt2, with_ties = false) + end +1×3 DataFrame + Row │ dt1 dt2 dt3 + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ 5.0 7.0 5.0 + +julia> @chain df begin + @slice_max(dt2, with_ties = false, n=2) + end +2×3 DataFrame + Row │ dt1 dt2 dt3 + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ 5.0 7.0 5.0 + 2 │ 6.0 7.0 6.0 + +julia> @chain df begin + @slice_max(dt2, prop = .5, missing_rm=true) + end +3×3 DataFrame + Row │ dt1 dt2 dt3 + │ Float64? Float64? Float64? 
+─────┼────────────────────────────── + 1 │ 5.0 7.0 5.0 + 2 │ 6.0 7.0 6.0 + 3 │ 1.0 6.0 1.0 +``` +""" + +const docstring_slice_min = +""" + @slice_min(df, column; with_ties, n, prop, missing_rm) + +Retrieve rows with the minimum value(s) from the specified column of a DataFrame. + +# Arguments +- `df`: The source data frame or grouped data frame from which to slice rows. +- `column`: The column for which to slice the minimum values. +- `with_ties`: Whether or not all ties will be shown, defaults to true and shows all ties. When false it will only show the first row. +- `prop`: The proportion of rows to slice. +- `n = integer`: An optional argument to specify the number of minimum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden. +- `missing_rm`: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice. + +# Examples +```jldoctest +julia> df = DataFrame( + dt1 = [missing, 0.2, missing, missing, 1, missing, 5, 6], + dt2 = [0.3, 2, missing, 0.3, 6, 5, 7, 7], + dt3 = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]); + +julia> @chain df begin + @slice_min(dt2) + end +2×3 DataFrame + Row │ dt1 dt2 dt3 + │ Float64? Float64? Float64? +─────┼─────────────────────────────── + 1 │ missing 0.3 0.2 + 2 │ missing 0.3 missing + +julia> @chain df begin + @slice_min(dt2, with_ties = false) + end +1×3 DataFrame + Row │ dt1 dt2 dt3 + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ missing 0.3 0.2 + +julia> @chain df begin + @slice_min(dt2, with_ties = true, n=1) + end +2×3 DataFrame + Row │ dt1 dt2 dt3 + │ Float64? Float64? Float64? +─────┼─────────────────────────────── + 1 │ missing 0.3 0.2 + 2 │ missing 0.3 missing + +julia> @chain df begin + @slice_min(dt2, prop = .5, missing_rm = true) + end +3×3 DataFrame + Row │ dt1 dt2 dt3 + │ Float64? Float64? Float64? 
+─────┼──────────────────────────────── + 1 │ missing 0.3 0.2 + 2 │ missing 0.3 missing + 3 │ 0.2 2.0 0.2 +``` +""" \ No newline at end of file diff --git a/src/slice.jl b/src/slice.jl index a5a5b06..276af60 100644 --- a/src/slice.jl +++ b/src/slice.jl @@ -71,4 +71,172 @@ macro slice_sample(df, exprs...) end return df_expr +end + +""" +$docstring_slice_max +""" +macro slice_max(df, exprs...) + expr_dict = Dict() + column = nothing + missing_rm = true + with_ties = true + arranged = false + for expr in exprs + if @capture(expr, lhs_ = rhs_) + expr_dict[lhs] = rhs + if lhs == :missing_rm + missing_rm = rhs + elseif lhs == :prop + arranged = true + end + else + column = expr + end + end + if haskey(expr_dict, :with_ties) + with_ties = expr_dict[:with_ties] + end + if column === nothing + throw(ArgumentError("No column provided")) + end + return quote + grouping_cols = Symbol[] + if $(esc(df)) isa DataFrames.GroupedDataFrame + grouping_cols = DataFrames.groupcols($(esc(df))) + end + temp_df = if $arranged + if $missing_rm + @chain $(esc(df)) begin + @filter(!ismissing($column)) + @arrange(desc($column)) + end + else + @chain $(esc(df)) begin + @arrange(desc($column)) + end + end + else + @filter($(esc(df)), $column == maximum(skipmissing($column))) + end + if temp_df isa DataFrames.GroupedDataFrame + result_dfs = [] + for sdf in temp_df + local prop_val + if haskey($expr_dict, :prop) + prop_val = $expr_dict[:prop] + if prop_val < 0.0 || prop_val > 1.0 + throw(ArgumentError("Prop value should be between 0 and 1")) + end + num_rows = floor(Int, nrow(sdf) * prop_val) + push!(result_dfs, first(sdf, num_rows)) + elseif $with_ties + push!(result_dfs, sdf) + else + n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1 + push!(result_dfs, first(sdf, n)) + end + end + temp_df = vcat(result_dfs...) 
+ temp_df = DataFrames.groupby(temp_df, grouping_cols) + else + local prop_val + if haskey($expr_dict, :prop) + prop_val = $expr_dict[:prop] + if prop_val < 0.0 || prop_val > 1.0 + throw(ArgumentError("Prop value should be between 0 and 1")) + end + num_rows = floor(Int, nrow(temp_df) * prop_val) + temp_df = first(temp_df, num_rows) + elseif !$with_ties + n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1 + temp_df = first(temp_df, n) + end + temp_df + end + end +end + +""" +$docstring_slice_min +""" +macro slice_min(df, exprs...) + expr_dict = Dict() + column = nothing + missing_rm = true + with_ties = true + arranged = false + for expr in exprs + if @capture(expr, lhs_ = rhs_) + expr_dict[lhs] = rhs + if lhs == :missing_rm + missing_rm = rhs + elseif lhs == :prop + arranged = true + end + else + column = expr + end + end + if haskey(expr_dict, :with_ties) + with_ties = expr_dict[:with_ties] + end + if column === nothing + throw(ArgumentError("No column provided")) + end + return quote + grouping_cols = Symbol[] + if $(esc(df)) isa DataFrames.GroupedDataFrame + grouping_cols = DataFrames.groupcols($(esc(df))) + end + temp_df = if $arranged + if $missing_rm + @chain $(esc(df)) begin + @filter(!ismissing($column)) + @arrange($column) + end + else + @chain $(esc(df)) begin + @arrange($column) + end + end + else + @filter($(esc(df)), $column == minimum(skipmissing($column))) + end + if temp_df isa DataFrames.GroupedDataFrame + result_dfs = [] + for sdf in temp_df + local prop_val + if haskey($expr_dict, :prop) + prop_val = $expr_dict[:prop] + if prop_val < 0.0 || prop_val > 1.0 + throw(ArgumentError("Prop value should be between 0 and 1")) + end + num_rows = floor(Int, nrow(sdf) * prop_val) + push!(result_dfs, first(sdf, num_rows)) + elseif $with_ties + push!(result_dfs, sdf) + else + n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1 + push!(result_dfs, first(sdf, n)) + end + end + temp_df = vcat(result_dfs...) 
+ temp_df = DataFrames.groupby(temp_df, grouping_cols) + else + local prop_val + if haskey($expr_dict, :prop) + prop_val = $expr_dict[:prop] + if prop_val < 0.0 || prop_val > 1.0 + throw(ArgumentError("Prop value should be between 0 and 1")) + end + num_rows = floor(Int, nrow(temp_df) * prop_val) + temp_df = first(temp_df, num_rows) + elseif !$with_ties + n = haskey($expr_dict, :n) ? $expr_dict[:n] : 1 + temp_df = first(temp_df, n) + end + temp_df + end + end end \ No newline at end of file From 2369c30fe211af96d06a3b115e1798cfc431fa86 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Sat, 18 Nov 2023 09:58:32 -0500 Subject: [PATCH 2/4] Minor updates to docstring --- src/docstrings.jl | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index bd9a1d4..0fe8139 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2453,50 +2453,50 @@ Retrieve rows with the maximum value(s) from the specified column of a DataFrame - `column`: The column for which to slice the maximum values. - `with_ties`: Whether or not all ties will be shown, defaults to true. When false it will only show the first row. - `prop`: The proportion of rows to slice. -- `n = integer`: An optional argument to specify the number of maximum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden. +- `n`: An optional integer argument to specify the number of maximum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden. - `missing_rm`: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice. 
# Examples ```jldoctest julia> df = DataFrame( - dt1 = [missing, 0.2, missing, missing, 1, missing, 5, 6], - dt2 = [0.3, 2, missing, 3, 6, 5, 7, 7], - dt3 = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]); + a = [missing, 0.2, missing, missing, 1, missing, 5, 6], + b = [0.3, 2, missing, 3, 6, 5, 7, 7], + c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]); julia> @chain df begin - @slice_max(dt2) + @slice_max(b) end 2×3 DataFrame - Row │ dt1 dt2 dt3 + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 2 │ 6.0 7.0 6.0 julia> @chain df begin - @slice_max(dt2, with_ties = false) + @slice_max(b, with_ties = false) end 1×3 DataFrame - Row │ dt1 dt2 dt3 + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 julia> @chain df begin - @slice_max(dt2, with_ties = false, n=2) + @slice_max(dt2, with_ties = false, n = 2) end 2×3 DataFrame - Row │ dt1 dt2 dt3 + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 2 │ 6.0 7.0 6.0 julia> @chain df begin - @slice_max(dt2, prop = .5, missing_rm=true) + @slice_max(b, prop = .5, missing_rm = true) end 3×3 DataFrame - Row │ dt1 dt2 dt3 + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 @@ -2516,50 +2516,50 @@ Retrieve rows with the minimum value(s) from the specified column of a DataFrame - `column`: The column for which to slice the minimum values. - `with_ties`: Whether or not all ties will be shown, defaults to true and shows all ties. When false it will only show the first row. - `prop`: The proportion of rows to slice. -- `n = integer`: An optional argument to specify the number of minimum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden. +- `n`: An optional integer argument to specify the number of minimum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden. 
- `missing_rm`: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice. # Examples ```jldoctest julia> df = DataFrame( - dt1 = [missing, 0.2, missing, missing, 1, missing, 5, 6], - dt2 = [0.3, 2, missing, 0.3, 6, 5, 7, 7], - dt3 = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]); + a = [missing, 0.2, missing, missing, 1, missing, 5, 6], + b = [0.3, 2, missing, 0.3, 6, 5, 7, 7], + c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]); julia> @chain df begin - @slice_min(dt2) + @slice_min(b) end 2×3 DataFrame - Row │ dt1 dt2 dt3 + Row │ a b c │ Float64? Float64? Float64? ─────┼─────────────────────────────── 1 │ missing 0.3 0.2 2 │ missing 0.3 missing julia> @chain df begin - @slice_min(dt2, with_ties = false) + @slice_min(b, with_ties = false) end 1×3 DataFrame - Row │ dt1 dt2 dt3 + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ missing 0.3 0.2 julia> @chain df begin - @slice_min(dt2, with_ties = true, n=1) + @slice_min(b, with_ties = true, n = 1) end 2×3 DataFrame - Row │ dt1 dt2 dt3 + Row │ a b c │ Float64? Float64? Float64? ─────┼─────────────────────────────── 1 │ missing 0.3 0.2 2 │ missing 0.3 missing julia> @chain df begin - @slice_min(dt2, prop = .5, missing_rm = true) + @slice_min(b, prop = .5, missing_rm = true) end 3×3 DataFrame - Row │ dt1 dt2 dt3 + Row │ a c c │ Float64? Float64? Float64? ─────┼──────────────────────────────── 1 │ missing 0.3 0.2 From 68112512127c6a9bce6488259a647048c77502ce Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Sat, 18 Nov 2023 10:04:56 -0500 Subject: [PATCH 3/4] Fix docstring error that I introduced, and add Statistics version to compat per new compat requirements. 
--- Project.toml | 1 + src/docstrings.jl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index e3d197e..c4b8da7 100644 --- a/Project.toml +++ b/Project.toml @@ -20,6 +20,7 @@ DataFrames = "1.5" MacroTools = "0.5" Reexport = "0.2, 1" ShiftedArrays = "2" +Statistics = "1.6" StatsBase = "0.34, 1" julia = "1.6" diff --git a/src/docstrings.jl b/src/docstrings.jl index 0fe8139..8cbd76a 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2559,7 +2559,7 @@ julia> @chain df begin @slice_min(b, prop = .5, missing_rm = true) end 3×3 DataFrame - Row │ a c c + Row │ a b c │ Float64? Float64? Float64? ─────┼──────────────────────────────── 1 │ missing 0.3 0.2 From 5656c87b15f53deee6c9bdd0401ee171513a7d84 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Sat, 18 Nov 2023 10:25:54 -0500 Subject: [PATCH 4/4] Fix one more docstring error and update README and index. --- README.md | 2 +- docs/src/index.md | 2 +- src/docstrings.jl | 25 +++++++++++++------------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index bf97f23..c9b2840 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ TidierData.jl currently supports the following top-level macros: - `@mutate()` and `@transmute()` - `@summarize()` and `@summarise()` - `@filter()` -- `@slice()` and `@slice_sample()` +- `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()` - `@group_by()` and `@ungroup()` - `@arrange()` - `@pull()` diff --git a/docs/src/index.md b/docs/src/index.md index ceaddf7..5ed7eff 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -94,7 +94,7 @@ TidierData.jl currently supports the following top-level macros: - `@mutate()` and `@transmute()` - `@summarize()` and `@summarise()` - `@filter()` - - `@slice()` and `@slice_sample()` + - `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()` - `@group_by()` and `@ungroup()` - `@arrange()` - `@pull()` diff --git a/src/docstrings.jl 
b/src/docstrings.jl index 8cbd76a..5fc9a5e 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2467,7 +2467,7 @@ julia> @chain df begin @slice_max(b) end 2×3 DataFrame - Row │ a b c + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 @@ -2477,26 +2477,26 @@ julia> @chain df begin @slice_max(b, with_ties = false) end 1×3 DataFrame - Row │ a b c + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 julia> @chain df begin - @slice_max(dt2, with_ties = false, n = 2) + @slice_max(b, with_ties = false, n = 2) end 2×3 DataFrame - Row │ a b c + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 2 │ 6.0 7.0 6.0 julia> @chain df begin - @slice_max(b, prop = .5, missing_rm = true) + @slice_max(b, prop = 0.5, missing_rm = true) end 3×3 DataFrame - Row │ a b c + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ 5.0 7.0 5.0 @@ -2530,17 +2530,17 @@ julia> @chain df begin @slice_min(b) end 2×3 DataFrame - Row │ a b c + Row │ a b c │ Float64? Float64? Float64? ─────┼─────────────────────────────── 1 │ missing 0.3 0.2 - 2 │ missing 0.3 missing + 2 │ missing 0.3 missing julia> @chain df begin @slice_min(b, with_ties = false) end 1×3 DataFrame - Row │ a b c + Row │ a b c │ Float64? Float64? Float64? ─────┼────────────────────────────── 1 │ missing 0.3 0.2 @@ -2549,17 +2549,18 @@ julia> @chain df begin @slice_min(b, with_ties = true, n = 1) end 2×3 DataFrame - Row │ a b c + Row │ a b c │ Float64? Float64? Float64? ─────┼─────────────────────────────── 1 │ missing 0.3 0.2 2 │ missing 0.3 missing + julia> @chain df begin - @slice_min(b, prop = .5, missing_rm = true) + @slice_min(b, prop = 0.5, missing_rm = true) end 3×3 DataFrame - Row │ a b c + Row │ a b c │ Float64? Float64? Float64? ─────┼──────────────────────────────── 1 │ missing 0.3 0.2