Skip to content

Commit

Permalink
Merge pull request #59 from drizk1/slice_min_max
Browse files Browse the repository at this point in the history
@slice_min and @slice_max
  • Loading branch information
Karandeep Singh committed Nov 18, 2023
2 parents 84d1322 + 5656c87 commit ca0c6e0
Show file tree
Hide file tree
Showing 7 changed files with 322 additions and 4 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ DataFrames = "1.5"
MacroTools = "0.5"
Reexport = "0.2, 1"
ShiftedArrays = "2"
Statistics = "1.6"
StatsBase = "0.34, 1"
julia = "1.6"

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ TidierData.jl currently supports the following top-level macros:
- `@mutate()` and `@transmute()`
- `@summarize()` and `@summarise()`
- `@filter()`
- `@slice()` and `@slice_sample()`
- `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()`
- `@group_by()` and `@ungroup()`
- `@arrange()`
- `@pull()`
Expand Down
24 changes: 23 additions & 1 deletion docs/examples/UserGuide/slice.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,26 @@ end

@chain df begin
@slice_sample(5)
end
end

# ## Slice the min

# This line selects all rows with the minimum value of the desired column.

@chain df begin
@slice_min(b)
end

# This line will only show the first row.

@chain df begin
@slice_min(b, with_ties = false)
end

# ## Slice the max

# The optional `prop` argument keeps that proportion of the rows of the data frame, taking the rows with the largest values of the column first.

@chain df begin
@slice_max(b, prop = .5)
end
2 changes: 1 addition & 1 deletion docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ TidierData.jl currently supports the following top-level macros:
- `@mutate()` and `@transmute()`
- `@summarize()` and `@summarise()`
- `@filter()`
- `@slice()` and `@slice_sample()`
- `@slice()`, `@slice_sample()`, `@slice_min()`, and `@slice_max()`
- `@group_by()` and `@ungroup()`
- `@arrange()`
- `@pull()`
Expand Down
2 changes: 1 addition & 1 deletion src/TidierData.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end
as_float, as_integer, as_string, is_float, is_integer, is_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter,
@group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join,
@pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate,
@unite, @summary, @fill_missing, @slice_sample
@unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max

# Package global variables
const code = Ref{Bool}(false) # output DataFrames.jl code?
Expand Down
127 changes: 127 additions & 0 deletions src/docstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2441,3 +2441,130 @@ julia> @chain df begin
5 │ 25 5 15
```
"""

const docstring_slice_max =
"""
@slice_max(df, column; with_ties, n, prop, missing_rm)
Retrieve rows with the maximum value(s) from the specified column of a DataFrame.
# Arguments
- `df`: The source data frame or grouped data frame from which to slice rows.
- `column`: The column for which to slice the maximum values.
- `with_ties`: Whether or not all ties will be shown, defaults to true. When false it will only show the first row.
- `prop`: The proportion of rows to slice.
- `n`: An optional integer argument to specify the number of maximum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden.
- `missing_rm`: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice.
# Examples
```jldoctest
julia> df = DataFrame(
a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
b = [0.3, 2, missing, 3, 6, 5, 7, 7],
c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);
julia> @chain df begin
@slice_max(b)
end
2×3 DataFrame
Row │ a b c
│ Float64? Float64? Float64?
─────┼──────────────────────────────
1 │ 5.0 7.0 5.0
2 │ 6.0 7.0 6.0
julia> @chain df begin
@slice_max(b, with_ties = false)
end
1×3 DataFrame
Row │ a b c
│ Float64? Float64? Float64?
─────┼──────────────────────────────
1 │ 5.0 7.0 5.0
julia> @chain df begin
@slice_max(b, with_ties = false, n = 2)
end
2×3 DataFrame
Row │ a b c
│ Float64? Float64? Float64?
─────┼──────────────────────────────
1 │ 5.0 7.0 5.0
2 │ 6.0 7.0 6.0
julia> @chain df begin
@slice_max(b, prop = 0.5, missing_rm = true)
end
3×3 DataFrame
Row │ a b c
│ Float64? Float64? Float64?
─────┼──────────────────────────────
1 │ 5.0 7.0 5.0
2 │ 6.0 7.0 6.0
3 │ 1.0 6.0 1.0
```
"""

const docstring_slice_min =
"""
@slice_min(df, column; with_ties, n, prop, missing_rm)
Retrieve rows with the minimum value(s) from the specified column of a DataFrame.
# Arguments
- `df`: The source data frame or grouped data frame from which to slice rows.
- `column`: The column for which to slice the minimum values.
- `with_ties`: Whether or not all ties will be shown, defaults to true and shows all ties. When false it will only show the first row.
- `prop`: The proportion of rows to slice.
- `n`: An optional integer argument to specify the number of minimum rows to retrieve. If with_ties = true, and the ties > n, n will be overridden.
- `missing_rm`: Defaults to true, skips the missing values when determining the proportion of the dataframe to slice.
# Examples
```jldoctest
julia> df = DataFrame(
a = [missing, 0.2, missing, missing, 1, missing, 5, 6],
b = [0.3, 2, missing, 0.3, 6, 5, 7, 7],
c = [0.2, 0.2, 0.2, missing, 1, missing, 5, 6]);
julia> @chain df begin
@slice_min(b)
end
2×3 DataFrame
Row │ a b c
│ Float64? Float64? Float64?
─────┼───────────────────────────────
1 │ missing 0.3 0.2
2 │ missing 0.3 missing
julia> @chain df begin
@slice_min(b, with_ties = false)
end
1×3 DataFrame
Row │ a b c
│ Float64? Float64? Float64?
─────┼──────────────────────────────
1 │ missing 0.3 0.2
julia> @chain df begin
@slice_min(b, with_ties = true, n = 1)
end
2×3 DataFrame
Row │ a b c
│ Float64? Float64? Float64?
─────┼───────────────────────────────
1 │ missing 0.3 0.2
2 │ missing 0.3 missing
julia> @chain df begin
@slice_min(b, prop = 0.5, missing_rm = true)
end
3×3 DataFrame
Row │ a b c
│ Float64? Float64? Float64?
─────┼────────────────────────────────
1 │ missing 0.3 0.2
2 │ missing 0.3 missing
3 │ 0.2 2.0 0.2
```
"""
168 changes: 168 additions & 0 deletions src/slice.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,172 @@ macro slice_sample(df, exprs...)
end

return df_expr
end

"""
$docstring_slice_max
"""
macro slice_max(df, exprs...)
    # Split the arguments into `key = value` options and the target column.
    # Any argument that is not an assignment is taken as the column; if several
    # are given, the last one wins.
    expr_dict = Dict()
    column = nothing
    for expr in exprs
        if @capture(expr, lhs_ = rhs_)
            expr_dict[lhs] = rhs
        else
            column = expr
        end
    end
    if column === nothing
        throw(ArgumentError("No column provided"))
    end

    # Which options were supplied is known at expansion time. Their values are
    # escaped so that they are evaluated in the caller's scope at run time; this
    # lets `prop`, `n`, `with_ties` and `missing_rm` be arbitrary expressions
    # (for example a variable) instead of literals only.
    has_prop = haskey(expr_dict, :prop)
    has_n = haskey(expr_dict, :n)
    prop_expr = esc(get(expr_dict, :prop, nothing))
    n_expr = esc(get(expr_dict, :n, 1))
    with_ties_expr = esc(get(expr_dict, :with_ties, true))
    missing_rm_expr = esc(get(expr_dict, :missing_rm, true))

    return quote
        grouping_cols = Symbol[]
        if $(esc(df)) isa DataFrames.GroupedDataFrame
            grouping_cols = DataFrames.groupcols($(esc(df)))
        end

        keep_ties = $with_ties_expr
        drop_missing = $missing_rm_expr
        n_rows = $n_expr
        prop_val = $prop_expr

        # Validate `prop` once, up front, rather than once per group.
        if $has_prop && (prop_val < 0.0 || prop_val > 1.0)
            throw(ArgumentError("Prop value should be between 0 and 1"))
        end

        # Sort the rows in descending order whenever a fixed number of rows is
        # taken from the top: always for `prop`, and for an explicit `n` when
        # ties are not kept. Sorting (instead of keeping only the rows equal to
        # the maximum) is what allows `n` to exceed the number of tied rows.
        # Otherwise keep exactly the rows equal to the maximum.
        temp_df = if $has_prop || ($has_n && !keep_ties)
            if drop_missing
                @chain $(esc(df)) begin
                    @filter(!ismissing($column))
                    @arrange(desc($column))
                end
            else
                @chain $(esc(df)) begin
                    @arrange(desc($column))
                end
            end
        else
            @filter($(esc(df)), $column == maximum(skipmissing($column)))
        end

        if temp_df isa DataFrames.GroupedDataFrame
            # Slice every group separately, then stack the pieces and restore
            # the original grouping columns.
            result_dfs = DataFrames.AbstractDataFrame[]
            for sdf in temp_df
                if $has_prop
                    push!(result_dfs, first(sdf, floor(Int, nrow(sdf) * prop_val)))
                elseif keep_ties
                    # `n` is not applied here: every tied row is returned.
                    push!(result_dfs, sdf)
                else
                    push!(result_dfs, first(sdf, n_rows))
                end
            end
            temp_df = reduce(vcat, result_dfs)
            temp_df = DataFrames.groupby(temp_df, grouping_cols)
        else
            if $has_prop
                temp_df = first(temp_df, floor(Int, nrow(temp_df) * prop_val))
            elseif !keep_ties
                temp_df = first(temp_df, n_rows)
            end
            temp_df
        end
    end
end

"""
$docstring_slice_min
"""
macro slice_min(df, exprs...)
    # Split the arguments into `key = value` options and the target column.
    # Any argument that is not an assignment is taken as the column; if several
    # are given, the last one wins.
    expr_dict = Dict()
    column = nothing
    for expr in exprs
        if @capture(expr, lhs_ = rhs_)
            expr_dict[lhs] = rhs
        else
            column = expr
        end
    end
    if column === nothing
        throw(ArgumentError("No column provided"))
    end

    # Which options were supplied is known at expansion time. Their values are
    # escaped so that they are evaluated in the caller's scope at run time; this
    # lets `prop`, `n`, `with_ties` and `missing_rm` be arbitrary expressions
    # (for example a variable) instead of literals only.
    has_prop = haskey(expr_dict, :prop)
    has_n = haskey(expr_dict, :n)
    prop_expr = esc(get(expr_dict, :prop, nothing))
    n_expr = esc(get(expr_dict, :n, 1))
    with_ties_expr = esc(get(expr_dict, :with_ties, true))
    missing_rm_expr = esc(get(expr_dict, :missing_rm, true))

    return quote
        grouping_cols = Symbol[]
        if $(esc(df)) isa DataFrames.GroupedDataFrame
            grouping_cols = DataFrames.groupcols($(esc(df)))
        end

        keep_ties = $with_ties_expr
        drop_missing = $missing_rm_expr
        n_rows = $n_expr
        prop_val = $prop_expr

        # Validate `prop` once, up front, rather than once per group.
        if $has_prop && (prop_val < 0.0 || prop_val > 1.0)
            throw(ArgumentError("Prop value should be between 0 and 1"))
        end

        # Sort the rows in ascending order whenever a fixed number of rows is
        # taken from the top: always for `prop`, and for an explicit `n` when
        # ties are not kept. Sorting (instead of keeping only the rows equal to
        # the minimum) is what allows `n` to exceed the number of tied rows.
        # Otherwise keep exactly the rows equal to the minimum.
        temp_df = if $has_prop || ($has_n && !keep_ties)
            if drop_missing
                @chain $(esc(df)) begin
                    @filter(!ismissing($column))
                    @arrange($column)
                end
            else
                @chain $(esc(df)) begin
                    @arrange($column)
                end
            end
        else
            @filter($(esc(df)), $column == minimum(skipmissing($column)))
        end

        if temp_df isa DataFrames.GroupedDataFrame
            # Slice every group separately, then stack the pieces and restore
            # the original grouping columns.
            result_dfs = DataFrames.AbstractDataFrame[]
            for sdf in temp_df
                if $has_prop
                    push!(result_dfs, first(sdf, floor(Int, nrow(sdf) * prop_val)))
                elseif keep_ties
                    # `n` is not applied here: every tied row is returned.
                    push!(result_dfs, sdf)
                else
                    push!(result_dfs, first(sdf, n_rows))
                end
            end
            temp_df = reduce(vcat, result_dfs)
            temp_df = DataFrames.groupby(temp_df, grouping_cols)
        else
            if $has_prop
                temp_df = first(temp_df, floor(Int, nrow(temp_df) * prop_val))
            elseif !keep_ties
                temp_df = first(temp_df, n_rows)
            end
            temp_df
        end
    end
end

0 comments on commit ca0c6e0

Please sign in to comment.