From 6086a5d6f3dc87b27ca005e80d393062ea052307 Mon Sep 17 00:00:00 2001
From: drizk1
Date: Sat, 18 Nov 2023 18:58:42 -0500
Subject: [PATCH] added @slice_head and @slice_tail

---
 README.md                        |   2 +-
 docs/examples/UserGuide/slice.jl |  12 ++++
 src/TidierData.jl                |   2 +-
 src/docstrings.jl                |  83 ++++++++++++++++++++++++++
 src/slice.jl                     | 102 ++++++++++++++++++++++++++++++++
 5 files changed, 199 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 949f1b5..cd8c53b 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ TidierData.jl currently supports the following top-level macros:
 - `@mutate()` and `@transmute()`
 - `@summarize()` and `@summarise()`
 - `@filter()`
-- `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()`
+- `@slice()`, `@slice_sample()`, `@slice_min()`, `@slice_max()`, `@slice_head()`, and `@slice_tail()`
 - `@group_by()` and `@ungroup()`
 - `@arrange()`
 - `@pull()`
diff --git a/docs/examples/UserGuide/slice.jl b/docs/examples/UserGuide/slice.jl
index ef87a80..de0e886 100644
--- a/docs/examples/UserGuide/slice.jl
+++ b/docs/examples/UserGuide/slice.jl
@@ -88,3 +88,15 @@ end
 @chain df begin
     @slice_max(b, prop = .5)
 end
+
+# ## Slice the tail
+
+@chain df begin
+    @slice_tail(prop = .5)
+end
+
+# ## Slice the head
+
+@chain df begin
+    @slice_head(n = 3)
+end
\ No newline at end of file
diff --git a/src/TidierData.jl b/src/TidierData.jl
index 833176b..a89c05b 100644
--- a/src/TidierData.jl
+++ b/src/TidierData.jl
@@ -19,7 +19,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end
     as_float, as_integer, as_string, is_float, is_integer, is_string, missing_if, replace_missing, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter,
     @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols,
     @clean_names, @count, @tally, @drop_missing, @glimpse, @separate,
-    @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @rename_with
+    @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @rename_with, @slice_head, @slice_tail
 
 # Package global variables
 const code = Ref{Bool}(false) # output DataFrames.jl code?
diff --git a/src/docstrings.jl b/src/docstrings.jl
index 0cc2516..e2e4ba4 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -2568,6 +2568,89 @@ julia> @chain df begin
    3 │       0.2       2.0        0.2
 ```
 """
+
+const docstring_slice_head =
+"""
+    @slice_head(df; n, prop)
+
+Retrieve rows from the beginning of a DataFrame.
+
+# Arguments
+- `df`: The source data frame or grouped data frame from which to slice rows.
+- `prop`: The proportion of rows to slice (between 0 and 1, rounded down to a whole number of rows). When supplied, it is used instead of `n`.
+- `n`: An optional integer argument to specify the number of rows at the beginning of the data frame to retrieve. Defaults to 1.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(
+           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
+           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],
+           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);
+
+julia> @chain df begin
+         @slice_head(n = 3)
+       end
+3×3 DataFrame
+ Row │ a          b          c
+     │ Float64?   Float64?   Float64?
+─────┼────────────────────────────────
+   1 │ missing          0.3       0.2
+   2 │       0.2        2.0       0.2
+   3 │ missing    missing         0.2
+
+julia> @chain df begin
+         @slice_head(prop = .25)
+       end
+2×3 DataFrame
+ Row │ a          b         c
+     │ Float64?   Float64?  Float64?
+─────┼───────────────────────────────
+   1 │ missing         0.3       0.2
+   2 │       0.2       2.0       0.2
+```
+"""
+
+const docstring_slice_tail =
+"""
+    @slice_tail(df; n, prop)
+
+Retrieve rows from the end of a DataFrame.
+
+# Arguments
+- `df`: The source data frame or grouped data frame from which to slice rows.
+- `prop`: The proportion of rows to slice (between 0 and 1, rounded down to a whole number of rows). When supplied, it is used instead of `n`.
+- `n`: An optional integer argument to specify the number of rows at the end of the data frame to retrieve. Defaults to 1.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(
+           a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
+           b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],
+           c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);
+
+julia> @chain df begin
+         @slice_tail(n = 3)
+       end
+3×3 DataFrame
+ Row │ a          b         c
+     │ Float64?   Float64?  Float64?
+─────┼────────────────────────────────
+   1 │ missing         5.0  missing
+   2 │       5.0       7.0        5.0
+   3 │       6.0       7.0        6.0
+
+julia> @chain df begin
+         @slice_tail(prop = .25)
+       end
+2×3 DataFrame
+ Row │ a         b         c
+     │ Float64?  Float64?  Float64?
+─────┼──────────────────────────────
+   1 │      5.0       7.0       5.0
+   2 │      6.0       7.0       6.0
+```
+"""
+
 const docstring_missing_if =
 """
     missing_if(x, value)
diff --git a/src/slice.jl b/src/slice.jl
index 276af60..45aa3d8 100644
--- a/src/slice.jl
+++ b/src/slice.jl
@@ -239,4 +239,106 @@ macro slice_min(df, exprs...)
             temp_df
         end
     end
+end
+
+"""
+$docstring_slice_head
+"""
+macro slice_head(df, exprs...)
+    # Collect the `key = value` arguments (`n`, `prop`) into a Dict expression.
+    expr_dict = :(Dict())
+    for expr in exprs
+        if @capture(expr, lhs_ = rhs_)
+            push!(expr_dict.args, :($(QuoteNode(lhs)) => $(esc(rhs))))
+        end
+    end
+    return quote
+        expr_dict = $expr_dict
+        temp_df = $(esc(df))
+        grouping_cols = Symbol[]
+
+        if temp_df isa DataFrames.GroupedDataFrame
+            grouping_cols = DataFrames.groupcols(temp_df)
+        end
+        local n = get(expr_dict, :n, 1)
+        # `nothing` means `prop` was not supplied, so an explicit `prop = 1.0` keeps every row.
+        local prop_val = get(expr_dict, :prop, nothing)
+        if prop_val !== nothing && (prop_val < 0.0 || prop_val > 1.0)
+            throw(ArgumentError("Prop value should be between 0 and 1"))
+        end
+        if temp_df isa DataFrames.GroupedDataFrame
+            # Slice each group separately, then stack the pieces back together.
+            result_dfs = []
+            for sdf in temp_df
+                local group_n = n
+                if prop_val !== nothing
+                    group_n = floor(Int, DataFrames.nrow(sdf) * prop_val)
+                end
+                push!(result_dfs, first(sdf, group_n))
+            end
+            temp_df = vcat(result_dfs...)
+        else
+            if prop_val !== nothing
+                n = floor(Int, DataFrames.nrow(temp_df) * prop_val)
+            end
+            temp_df = first(temp_df, n)
+        end
+
+        # Re-apply the original grouping so grouped input yields grouped output.
+        if !isempty(grouping_cols)
+            temp_df = DataFrames.groupby(temp_df, grouping_cols)
+        end
+        temp_df
+    end
+end
+
+"""
+$docstring_slice_tail
+"""
+macro slice_tail(df, exprs...)
+    # Collect the `key = value` arguments (`n`, `prop`) into a Dict expression.
+    expr_dict = :(Dict())
+    for expr in exprs
+        if @capture(expr, lhs_ = rhs_)
+            push!(expr_dict.args, :($(QuoteNode(lhs)) => $(esc(rhs))))
+        end
+    end
+    return quote
+        expr_dict = $expr_dict
+        temp_df = $(esc(df))
+        grouping_cols = Symbol[]
+
+        if temp_df isa DataFrames.GroupedDataFrame
+            grouping_cols = DataFrames.groupcols(temp_df)
+        end
+        local n = get(expr_dict, :n, 1)
+        # `nothing` means `prop` was not supplied, so an explicit `prop = 1.0` keeps every row.
+        local prop_val = get(expr_dict, :prop, nothing)
+        if prop_val !== nothing && (prop_val < 0.0 || prop_val > 1.0)
+            throw(ArgumentError("Prop value should be between 0 and 1"))
+        end
+        if temp_df isa DataFrames.GroupedDataFrame
+            # Slice each group separately, then stack the pieces back together.
+            result_dfs = []
+            for sdf in temp_df
+                local group_n = n
+                if prop_val !== nothing
+                    group_n = floor(Int, DataFrames.nrow(sdf) * prop_val)
+                end
+                push!(result_dfs, last(sdf, group_n))
+            end
+            temp_df = vcat(result_dfs...)
+        else
+            if prop_val !== nothing
+                n = floor(Int, DataFrames.nrow(temp_df) * prop_val)
+            end
+            temp_df = last(temp_df, n)
+        end
+
+        # Re-apply the original grouping so grouped input yields grouped output.
+        if !isempty(grouping_cols)
+            temp_df = DataFrames.groupby(temp_df, grouping_cols)
+        end
+        temp_df
+    end
 end
\ No newline at end of file