diff --git a/Project.toml b/Project.toml index e3d197e..c4b8da7 100644 --- a/Project.toml +++ b/Project.toml @@ -20,6 +20,7 @@ DataFrames = "1.5" MacroTools = "0.5" Reexport = "0.2, 1" ShiftedArrays = "2" +Statistics = "1.6" StatsBase = "0.34, 1" julia = "1.6" diff --git a/README.md b/README.md index bf97f23..c9b2840 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ TidierData.jl currently supports the following top-level macros: - `@mutate()` and `@transmute()` - `@summarize()` and `@summarise()` - `@filter()` -- `@slice()` and `@slice_sample()` +- `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()` - `@group_by()` and `@ungroup()` - `@arrange()` - `@pull()` diff --git a/docs/examples/UserGuide/slice.jl b/docs/examples/UserGuide/slice.jl index 0c24b53..ef87a80 100644 --- a/docs/examples/UserGuide/slice.jl +++ b/docs/examples/UserGuide/slice.jl @@ -65,4 +65,26 @@ end @chain df begin @slice_sample(5) -end \ No newline at end of file +end + +# ## Slice the min + +# This line selects all rows with the minimum value of the desired column + +@chain df begin + @slice_min(b) +end + +# This line will only show the first row. + +@chain df begin + @slice_min(b, with_ties = false) +end + +# ## Slice the max + +# The optional prop argument will slice a proportion of the full dataframe. 
+ +@chain df begin + @slice_max(b, prop = .5) +end diff --git a/docs/src/index.md b/docs/src/index.md index ceaddf7..5ed7eff 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -94,7 +94,7 @@ TidierData.jl currently supports the following top-level macros: - `@mutate()` and `@transmute()` - `@summarize()` and `@summarise()` - `@filter()` - - `@slice()` and `@slice_sample()` + - `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()` - `@group_by()` and `@ungroup()` - `@arrange()` - `@pull()` diff --git a/src/TidierData.jl b/src/TidierData.jl index 6fc2c3b..7cac670 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -19,7 +19,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end as_float, as_integer, as_string, is_float, is_integer, is_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, - @unite, @summary, @fill_missing, @slice_sample + @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? diff --git a/src/docstrings.jl b/src/docstrings.jl index 6afa139..5fc9a5e 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2441,3 +2441,130 @@ julia> @chain df begin 5 │ 25 5 15 ``` """ + +const docstring_slice_max = +""" + @slice_max(df, column; with_ties, n, prop, missing_rm) + +Retrieve rows with the maximum value(s) from the specified column of a DataFrame. + +# Arguments +- `df`: The source data frame or grouped data frame from which to slice rows. +- `column`: The column for which to slice the maximum values. +- `with_ties`: Whether or not all ties will be shown, defaults to true. When false it will only show the first row. 
+- `prop`: The proportion of rows to slice. +- `n`: An optional integer argument to specify the number of maximum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden. +- `missing_rm`: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice. + +# Examples +```jldoctest +julia> df = DataFrame( + a = [missing, 0.2, missing, missing, 1, missing, 5, 6], + b = [0.3, 2, missing, 3, 6, 5, 7, 7], + c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]); + +julia> @chain df begin + @slice_max(b) + end +2×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ 5.0 7.0 5.0 + 2 │ 6.0 7.0 6.0 + +julia> @chain df begin + @slice_max(b, with_ties = false) + end +1×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ 5.0 7.0 5.0 + +julia> @chain df begin + @slice_max(b, with_ties = false, n = 2) + end +2×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ 5.0 7.0 5.0 + 2 │ 6.0 7.0 6.0 + +julia> @chain df begin + @slice_max(b, prop = 0.5, missing_rm = true) + end +3×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ 5.0 7.0 5.0 + 2 │ 6.0 7.0 6.0 + 3 │ 1.0 6.0 1.0 +``` +""" + +const docstring_slice_min = +""" + @slice_min(df, column; with_ties, n, prop, missing_rm) + +Retrieve rows with the minimum value(s) from the specified column of a DataFrame. + +# Arguments +- `df`: The source data frame or grouped data frame from which to slice rows. +- `column`: The column for which to slice the minimum values. +- `with_ties`: Whether or not all ties will be shown, defaults to true and shows all ties. When false it will only show the first row. +- `prop`: The proportion of rows to slice. +- `n`: An optional integer argument to specify the number of minimum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden. 
+- `missing_rm`: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice. + +# Examples +```jldoctest +julia> df = DataFrame( + a = [missing, 0.2, missing, missing, 1, missing, 5, 6], + b = [0.3, 2, missing, 0.3, 6, 5, 7, 7], + c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]); + +julia> @chain df begin + @slice_min(b) + end +2×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼─────────────────────────────── + 1 │ missing 0.3 0.2 + 2 │ missing 0.3 missing + +julia> @chain df begin + @slice_min(b, with_ties = false) + end +1×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼────────────────────────────── + 1 │ missing 0.3 0.2 + +julia> @chain df begin + @slice_min(b, with_ties = true, n = 1) + end +2×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼─────────────────────────────── + 1 │ missing 0.3 0.2 + 2 │ missing 0.3 missing + + +julia> @chain df begin + @slice_min(b, prop = 0.5, missing_rm = true) + end +3×3 DataFrame + Row │ a b c + │ Float64? Float64? Float64? +─────┼──────────────────────────────── + 1 │ missing 0.3 0.2 + 2 │ missing 0.3 missing + 3 │ 0.2 2.0 0.2 +``` +""" \ No newline at end of file diff --git a/src/slice.jl b/src/slice.jl index a5a5b06..276af60 100644 --- a/src/slice.jl +++ b/src/slice.jl @@ -71,4 +71,172 @@ macro slice_sample(df, exprs...) end return df_expr +end + +""" +$docstring_slice_max +""" +macro slice_max(df, exprs...) 
+    # All argument parsing and code generation is shared with `@slice_min`;
+    # `true` asks the builder for the rows holding the largest values.
+    return slice_extreme_expr(df, exprs, true)
+end
+
+"""
+$docstring_slice_min
+"""
+macro slice_min(df, exprs...)
+    # All argument parsing and code generation is shared with `@slice_max`;
+    # `false` asks the builder for the rows holding the smallest values.
+    return slice_extreme_expr(df, exprs, false)
+end
+
+# Keyword arguments understood by `@slice_max` and `@slice_min`.
+const slice_extreme_keywords = (:with_ties, :n, :prop, :missing_rm)
+
+# Build the expression returned by `@slice_max` (`descending = true`) and
+# `@slice_min` (`descending = false`).
+#
+# - `df`: the expression for the data frame / grouped data frame argument.
+# - `exprs`: the remaining macro arguments: exactly one column expression plus
+#   optional `with_ties = ...`, `n = ...`, `prop = ...`, `missing_rm = ...`.
+# - `descending`: whether the largest (`true`) or smallest (`false`) values are kept.
+#
+# Throws `ArgumentError` (at macro-expansion time) when no column is given, when
+# more than one column is given, or when an unknown keyword is used.
+function slice_extreme_expr(df, exprs, descending::Bool)
+    column = nothing
+    given = Dict{Symbol, Any}()
+    for expr in exprs
+        if @capture(expr, lhs_ = rhs_)
+            if !(lhs in slice_extreme_keywords)
+                throw(ArgumentError("Unsupported argument `$lhs`; expected one of with_ties, n, prop, missing_rm"))
+            end
+            # Escape the value so that it is evaluated in the caller's scope. This
+            # lets callers pass variables or computed values (`prop = p`, `n = k + 1`)
+            # and not only literals.
+            given[lhs] = esc(rhs)
+        elseif column === nothing
+            column = expr
+        else
+            throw(ArgumentError("Only one column can be provided"))
+        end
+    end
+    if column === nothing
+        throw(ArgumentError("No column provided"))
+    end
+
+    with_ties = get(given, :with_ties, true)
+    missing_rm = get(given, :missing_rm, true)
+    n = get(given, :n, 1)
+    prop = get(given, :prop, nothing)
+
+    order_by = descending ? :(desc($column)) : column
+    extremum = descending ? :maximum : :minimum
+
+    # With `prop`, the rows are sorted so that a leading fraction can be taken;
+    # otherwise only the rows equal to the extreme value are kept.
+    candidates = if haskey(given, :prop)
+        quote
+            if $missing_rm
+                @chain $(esc(df)) begin
+                    @filter(!ismissing($column))
+                    @arrange($order_by)
+                end
+            else
+                @chain $(esc(df)) begin
+                    @arrange($order_by)
+                end
+            end
+        end
+    else
+        quote
+            @filter($(esc(df)), $column == $extremum(skipmissing($column)))
+        end
+    end
+
+    return quote
+        # `local` keeps these temporaries from becoming globals when the macro is
+        # used at top level.
+        local grouping_cols = if $(esc(df)) isa DataFrames.GroupedDataFrame
+            DataFrames.groupcols($(esc(df)))
+        else
+            Symbol[]
+        end
+        local candidate_rows = $candidates
+        slice_extreme_take(candidate_rows, grouping_cols, $with_ties, $n, $prop)
+    end
+end
+
+# Runtime half of `@slice_max` / `@slice_min`: trim the candidate rows.
+#
+# - `data`: the filtered (or, when `prop` is used, sorted) data frame or grouped data frame.
+# - `grouping_cols`: columns used to re-group the result when `data` is grouped.
+# - `with_ties`: when `true` (and `prop` is not used) every candidate row is kept.
+# - `n`: number of leading rows kept when `with_ties` is `false`.
+#   NOTE: as documented for the macros, `n` is not consulted when `with_ties` is `true`.
+# - `prop`: `nothing`, or the fraction (0 to 1) of leading rows to keep; takes
+#   precedence over `with_ties` and `n`.
+#
+# Throws `ArgumentError` when `prop` lies outside 0..1.
+function slice_extreme_take(data, grouping_cols, with_ties, n, prop)
+    # Validate once instead of once per group.
+    if prop !== nothing && (prop < 0.0 || prop > 1.0)
+        throw(ArgumentError("Prop value should be between 0 and 1"))
+    end
+    function keep(part)
+        if prop !== nothing
+            return first(part, floor(Int, DataFrames.nrow(part) * prop))
+        elseif with_ties
+            return part
+        else
+            return first(part, n)
+        end
+    end
+    if data isa DataFrames.GroupedDataFrame
+        parts = [keep(sdf) for sdf in data]
+        # A grouped frame without groups has nothing to concatenate; fall back to
+        # an empty copy of the parent so the result is still a grouped data frame.
+        combined = isempty(parts) ? first(parent(data), 0) : reduce(vcat, parts)
+        return DataFrames.groupby(combined, grouping_cols)
+    end
+    return keep(data)
+end
\ No newline at end of file