From a1d7261bf0d9cc5f266d9a1d895845369ae3a48a Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 17 Dec 2023 12:24:16 -0500 Subject: [PATCH 1/6] added support for separate_rows --- docs/examples/UserGuide/sep_unite.jl | 17 +++++- src/TidierData.jl | 2 +- src/docstrings.jl | 48 +++++++++++++++++ src/separate_unite.jl | 80 ++++++++++++++++++++++++++++ 4 files changed, 144 insertions(+), 3 deletions(-) diff --git a/docs/examples/UserGuide/sep_unite.jl b/docs/examples/UserGuide/sep_unite.jl index 9c0888d..d60c0f7 100644 --- a/docs/examples/UserGuide/sep_unite.jl +++ b/docs/examples/UserGuide/sep_unite.jl @@ -4,7 +4,9 @@ using TidierData df = DataFrame(a = ["1-1", "2-2", "3-3-3"]); -# ## Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter +# ## @separate + +# Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter @chain df begin @separate(a, (b, c, d), "-") @@ -16,9 +18,11 @@ new_names = ["x$(i)" for i in 1:3]; # or new_names = ["b", "c", "d"], or new_nam @separate(df, a, !!new_names, "-") +# ## @unite + # The `@unite` macro brings together multiple columns into one, separate the characters by a user specified delimiter +# Here, the `@unite` macro combines the "b", "c", and "d" columns columns into a single new "new_col" column using the "/" delimiter -# ## Here, the `@unite` macro combines the "b", "c", and "d" columns columns into a single new "new_col" column using the "/" delimiter df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]); @@ -26,3 +30,12 @@ df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, @unite(new_col, (b, c, d), "/") end + +# @separate_rows + +# ## Separate rows into multiple rows based on a chosen delimiter. + +df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"], e = ["11", "22;33;44", "55;66"]); + +@separate_rows(df, b:5, ";") + diff --git a/src/TidierData.jl b/src/TidierData.jl index af840da..ccc3288 100644 --- a/src/TidierData.jl +++ b/src/TidierData.jl @@ -19,7 +19,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end as_float, as_integer, as_string, is_float, is_integer, is_string, missing_if, replace_missing, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join, @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate, - @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with + @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows # Package global variables const code = Ref{Bool}(false) # output DataFrames.jl code? diff --git a/src/docstrings.jl b/src/docstrings.jl index d36a469..0446bc2 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2908,4 +2908,52 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a) 2 │ banana doc2 2 3 │ cherry doc3 3 ``` +""" + +const docstring_separate_rows = +""" + separate_rows(df, column(s), delimiter) + +Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter. + +# Arguments +- `df`: A DataFrame +- `columns`: A column or collection of columns to be split. Can be a mix of integers and symbols +- `delimiter`: The string or character or regular expression used to split the column values. + +# Examples +```jldoctest +julia> df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"], e = ["11", "22;33;44", "55;66"]) +3×5 DataFrame + Row │ a b c d e + │ Int64 String String String String +─────┼─────────────────────────────────────────── + 1 │ 1 a 1 7 11 + 2 │ 2 aa;bb;cc 2;3;4 8;9;10 22;33;44 + 3 │ 3 dd;ee 5;6 11;12 55;66 + +julia> @separate_rows(df, 2, 3, 5, ";" ) +6×5 DataFrame + Row │ a b c d e + │ Int64 SubStrin… SubStrin… SubStrin… String +─────┼────────────────────────────────────────────────── + 1 │ 1 a 1 7 11 + 2 │ 2 aa 2 8 22;33;44 + 3 │ 2 bb 3 9 22;33;44 + 4 │ 2 cc 4 10 22;33;44 + 5 │ 3 dd 5 11 55;66 + 6 │ 3 ee 6 12 55;66 + +julia>@separate_rows(df, b:5, ";") +6×5 DataFrame + Row │ a b c d e + │ Int64 SubStrin… SubStrin… SubStrin… SubStrin… +─────┼─────────────────────────────────────────────────── + 1 │ 1 a 1 7 11 + 2 │ 2 aa 2 8 22 + 3 │ 2 bb 3 9 33 + 4 │ 2 cc 4 10 44 + 5 │ 3 dd 5 11 55 + 6 │ 3 ee 6 12 66 +``` """ \ No newline at end of file diff --git a/src/separate_unite.jl b/src/separate_unite.jl index 133289f..4d89a3b 100644 --- a/src/separate_unite.jl +++ b/src/separate_unite.jl @@ -82,3 +82,83 @@ macro unite(df, new_col, from_cols, sep) unite($(esc(df)), $new_col_quoted, $(from_cols_expr), $(esc(sep))) end end + + +### separate_rows +function separate_rows(df::DataFrame, columns, delimiter::Union{Regex, String}) + temp_df = copy(df) + + # Convert all references to column symbols + column_symbols = [] + for col in columns + if col isa Integer + push!(column_symbols, Symbol(names(df)[col])) + elseif col isa AbstractRange + append!(column_symbols, Symbol.(names(df)[collect(col)])) + elseif typeof(col) <: Between + # Get the column indices for the Between range + col_indices = DataFrames.index(df)[col] + append!(column_symbols, Symbol.(names(df)[col_indices])) + else + push!(column_symbols, Symbol(col)) + end + end + + # Initialize an array to hold expanded data for each column + expanded_data = Dict{Symbol, Vector{Any}}() + + for column in column_symbols + expanded_data[column] = [] + + for row in eachrow(temp_df) + value = row[column] + # Handle missing values and non-string types + if ismissing(value) || typeof(value) != String + push!(expanded_data[column], [value]) + else + push!(expanded_data[column], split(value, delimiter)) + end + end + end + + # Replace the columns with expanded data + for column in column_symbols + temp_df[!, column] = expanded_data[column] + end + + # Flatten the DataFrame only once after all columns have been expanded + temp_df = flatten(temp_df, column_symbols) + + return temp_df + end + + """ + $docstring_separate_rows + """ + macro separate_rows(df, args...) + delimiter = esc(last(args)) + exprs = Base.front(args) + + interpolated_exprs = parse_interpolation.(exprs) + + tidy_exprs = [i[1] for i in interpolated_exprs] + any_found_n = any([i[2] for i in interpolated_exprs]) + any_found_row_number = any([i[3] for i in interpolated_exprs]) + + tidy_exprs = parse_tidy.(tidy_exprs) + df_expr = quote + local df_copy = $(esc(df)) # not a copy + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end + local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + end + df_output + end + return df_expr + end \ No newline at end of file From 291e342c7aeb68af5bf17c57df607201493a901b Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 17 Dec 2023 12:33:04 -0500 Subject: [PATCH 2/6] small docstring fix --- src/docstrings.jl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index 0446bc2..de3ac09 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2934,15 +2934,15 @@ julia> df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4" julia> @separate_rows(df, 2, 3, 5, ";" ) 6×5 DataFrame - Row │ a b c d e - │ Int64 SubStrin… SubStrin… SubStrin… String -─────┼────────────────────────────────────────────────── - 1 │ 1 a 1 7 11 - 2 │ 2 aa 2 8 22;33;44 - 3 │ 2 bb 3 9 22;33;44 - 4 │ 2 cc 4 10 22;33;44 - 5 │ 3 dd 5 11 55;66 - 6 │ 3 ee 6 12 55;66 + Row │ a b c d e + │ Int64 SubStrin… SubStrin… String SubStrin… +─────┼──────────────────────────────────────────────── + 1 │ 1 a 1 7 11 + 2 │ 2 aa 2 8;9;10 22 + 3 │ 2 bb 3 8;9;10 33 + 4 │ 2 cc 4 8;9;10 44 + 5 │ 3 dd 5 11;12 55 + 6 │ 3 ee 6 11;12 66 julia>@separate_rows(df, b:5, ";") 6×5 DataFrame From 3447dffdcbd4cdd22b267eed06be25084540d6e8 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 17 Dec 2023 12:40:08 -0500 Subject: [PATCH 3/6] simplified sep_rows docstring. --- src/docstrings.jl | 64 +++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index de3ac09..669a3d6 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2923,37 +2923,37 @@ Split the contents of specified columns in a DataFrame into multiple rows based # Examples ```jldoctest -julia> df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"], e = ["11", "22;33;44", "55;66"]) -3×5 DataFrame - Row │ a b c d e - │ Int64 String String String String -─────┼─────────────────────────────────────────── - 1 │ 1 a 1 7 11 - 2 │ 2 aa;bb;cc 2;3;4 8;9;10 22;33;44 - 3 │ 3 dd;ee 5;6 11;12 55;66 - -julia> @separate_rows(df, 2, 3, 5, ";" ) -6×5 DataFrame - Row │ a b c d e - │ Int64 SubStrin… SubStrin… String SubStrin… -─────┼──────────────────────────────────────────────── - 1 │ 1 a 1 7 11 - 2 │ 2 aa 2 8;9;10 22 - 3 │ 2 bb 3 8;9;10 33 - 4 │ 2 cc 4 8;9;10 44 - 5 │ 3 dd 5 11;12 55 - 6 │ 3 ee 6 11;12 66 - -julia>@separate_rows(df, b:5, ";") -6×5 DataFrame - Row │ a b c d e - │ Int64 SubStrin… SubStrin… SubStrin… SubStrin… -─────┼─────────────────────────────────────────────────── - 1 │ 1 a 1 7 11 - 2 │ 2 aa 2 8 22 - 3 │ 2 bb 3 9 33 - 4 │ 2 cc 4 10 44 - 5 │ 3 dd 5 11 55 - 6 │ 3 ee 6 12 66 +julia> df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"]) +3×4 DataFrame + Row │ a b c d + │ Int64 String String String +─────┼───────────────────────────────── + 1 │ 1 a 1 7 + 2 │ 2 aa;bb;cc 2;3;4 8;9;10 + 3 │ 3 dd;ee 5;6 11;12 + +julia> @separate_rows(df, 2, 4, ";" ) +6×4 DataFrame + Row │ a b c d + │ Int64 SubStrin… String SubStrin… +─────┼───────────────────────────────────── + 1 │ 1 a 1 7 + 2 │ 2 aa 2;3;4 8 + 3 │ 2 bb 2;3;4 9 + 4 │ 2 cc 2;3;4 10 + 5 │ 3 dd 5;6 11 + 6 │ 3 ee 5;6 12 + +julia> @separate_rows(df, b:d, ";") +6×4 DataFrame + Row │ a b c d + │ Int64 SubStrin… SubStrin… SubStrin… +─────┼──────────────────────────────────────── + 1 │ 1 a 1 7 + 2 │ 2 aa 2 8 + 3 │ 2 bb 3 9 + 4 │ 2 cc 4 10 + 5 │ 3 dd 5 11 + 6 │ 3 ee 6 12 ``` """ \ No newline at end of file From de9b8edf1967fc6591b1bb946d78bb4ddd852ec8 Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 17 Dec 2023 16:36:32 -0500 Subject: [PATCH 4/6] gdf support to @separate_rows --- src/docstrings.jl | 4 +-- src/separate_unite.jl | 68 +++++++++++++++++++++++++++++++++---------- 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index 669a3d6..7aa7f5b 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2944,8 +2944,8 @@ julia> @separate_rows(df, 2, 4, ";" ) 5 │ 3 dd 5;6 11 6 │ 3 ee 5;6 12 -julia> @separate_rows(df, b:d, ";") -6×4 DataFrame + local df_output = separate_rows2(df_copy, [$(exprs...)], $delimiter) + 6×4 DataFrame Row │ a b c d │ Int64 SubStrin… SubStrin… SubStrin… ─────┼──────────────────────────────────────── diff --git a/src/separate_unite.jl b/src/separate_unite.jl index 4d89a3b..f108773 100644 --- a/src/separate_unite.jl +++ b/src/separate_unite.jl @@ -85,20 +85,25 @@ end ### separate_rows -function separate_rows(df::DataFrame, columns, delimiter::Union{Regex, String}) - temp_df = copy(df) +function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimiter::Union{Regex, String}) + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + + # Ungroup if necessary + temp_df = copy(is_grouped ? parent(df) : df) + # temp_df = copy(df) # Convert all references to column symbols column_symbols = [] for col in columns if col isa Integer - push!(column_symbols, Symbol(names(df)[col])) + push!(column_symbols, Symbol(names(temp_df)[col])) elseif col isa AbstractRange - append!(column_symbols, Symbol.(names(df)[collect(col)])) + append!(column_symbols, Symbol.(names(temp_df)[collect(col)])) elseif typeof(col) <: Between # Get the column indices for the Between range - col_indices = DataFrames.index(df)[col] - append!(column_symbols, Symbol.(names(df)[col_indices])) + col_indices = DataFrames.index(temp_df)[col] + append!(column_symbols, Symbol.(names(temp_df)[col_indices])) else push!(column_symbols, Symbol(col)) end @@ -128,17 +133,18 @@ function separate_rows(df::DataFrame, columns, delimiter::Union{Regex, String}) # Flatten the DataFrame only once after all columns have been expanded temp_df = flatten(temp_df, column_symbols) - + if is_grouped + temp_df = groupby(temp_df, grouping_columns) + end return temp_df end - + """ $docstring_separate_rows """ - macro separate_rows(df, args...) - delimiter = esc(last(args)) - exprs = Base.front(args) - + macro separate_rows(df, exprs...) + delimiter = esc(last(exprs)) + exprs = Base.front(exprs) interpolated_exprs = parse_interpolation.(exprs) tidy_exprs = [i[1] for i in interpolated_exprs] @@ -147,18 +153,48 @@ function separate_rows(df::DataFrame, columns, delimiter::Union{Regex, String}) tidy_exprs = parse_tidy.(tidy_exprs) df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = transform($(esc(df)); ungroup = false) + else + local df_copy = copy($(esc(df))) + end + else local df_copy = $(esc(df)) # not a copy + end + + if $(esc(df)) isa GroupedDataFrame + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end + + local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) + end + else if $any_found_n - transform!(df_copy, nrow => :TidierData_n) + transform!(df_copy, nrow => :TidierData_n) end if $any_found_row_number - transform!(df_copy, eachindex => :TidierData_row_number) + transform!(df_copy, eachindex => :TidierData_row_number) end + local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + if $any_found_n || $any_found_row_number - select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end - df_output + end + + df_output + end + if code[] + @info MacroTools.prettify(df_expr) end return df_expr end \ No newline at end of file From 471e382e184734b3638d58aafc75e3dba9b77e0a Mon Sep 17 00:00:00 2001 From: drizk1 Date: Sun, 17 Dec 2023 16:41:56 -0500 Subject: [PATCH 5/6] docstring fix --- src/docstrings.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/docstrings.jl b/src/docstrings.jl index 7aa7f5b..eb72a5e 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2944,8 +2944,8 @@ julia> @separate_rows(df, 2, 4, ";" ) 5 │ 3 dd 5;6 11 6 │ 3 ee 5;6 12 - local df_output = separate_rows2(df_copy, [$(exprs...)], $delimiter) - 6×4 DataFrame +julia> @separate_rows(df, b:d, ";" ) +6×4 DataFrame Row │ a b c d │ Int64 SubStrin… SubStrin… SubStrin… ─────┼──────────────────────────────────────── From fbd478394561bdb093b6aaeec6cd55c4760c3692 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Tue, 19 Dec 2023 00:53:11 -0500 Subject: [PATCH 6/6] Minor cleanup, bump version to 0.14.1. --- NEWS.md | 4 + Project.toml | 2 +- README.md | 2 +- docs/examples/UserGuide/sep_unite.jl | 24 ++- docs/src/index.md | 2 +- src/docstrings.jl | 9 +- src/separate_unite.jl | 258 +++++++++++++-------------- 7 files changed, 157 insertions(+), 144 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4233641..edaa349 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # TidierData.jl updates +## v0.14.1 - 2023-12-19 +- `@separate()` now supports regular expressions +- Adds `@separate_rows()` + ## v0.14.0 - 2023-12-12 - Update parsing engine so that non-function reserved names from the Base and Core modules (like `missing`, `pi`, and `Real`) are auto-escaped now, with the exception of names in the not_escaped[] array, which are never escaped - Add `collect()` to not_vectorized[] array diff --git a/Project.toml b/Project.toml index d305da7..db633d7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.14.0" +version = "0.14.1" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/README.md b/README.md index 532b6c9..2fde6cb 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ TidierData.jl currently supports the following top-level macros: - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()` - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` -- `@separate()` and `@unite()` +- `@separate()`, `@separate_rows()`, and `@unite()` - `@drop_missing()` and `@fill_missing()` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) diff --git a/docs/examples/UserGuide/sep_unite.jl b/docs/examples/UserGuide/sep_unite.jl index d60c0f7..f74184d 100644 --- a/docs/examples/UserGuide/sep_unite.jl +++ b/docs/examples/UserGuide/sep_unite.jl @@ -4,7 +4,7 @@ using TidierData df = DataFrame(a = ["1-1", "2-2", "3-3-3"]); -# ## @separate +# ## `@separate` # Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter @@ -12,30 +12,38 @@ df = DataFrame(a = ["1-1", "2-2", "3-3-3"]); @separate(a, (b, c, d), "-") end -# The into columns can also be designated as follows +# The `into` columns can also be designated as follows: new_names = ["x$(i)" for i in 1:3]; # or new_names = ["b", "c", "d"], or new_names = [:b, :c, :d] @separate(df, a, !!new_names, "-") -# ## @unite +# ## `@unite` # The `@unite` macro brings together multiple columns into one, separate the characters by a user specified delimiter # Here, the `@unite` macro combines the "b", "c", and "d" columns columns into a single new "new_col" column using the "/" delimiter -df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]); +df = DataFrame( + b = ["1", "2", "3"], + c = ["1", "2", "3"], + d = [missing, missing, "3"]); @chain df begin @unite(new_col, (b, c, d), "/") end -# @separate_rows +# ## `@separate_rows` -# ## Separate rows into multiple rows based on a chosen delimiter. +# Separate rows into multiple rows based on a chosen delimiter. -df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"], e = ["11", "22;33;44", "55;66"]); +df = DataFrame( + a = 1:3, + b = ["a", "aa;bb;cc", "dd;ee"], + c = ["1", "2;3;4", "5;6"], + d = ["7", "8;9;10", "11;12"], + e = ["11", "22;33;44", "55;66"]); -@separate_rows(df, b:5, ";") +@separate_rows(df, b:e, ";") diff --git a/docs/src/index.md b/docs/src/index.md index 4ff05cc..3cb50a3 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -103,7 +103,7 @@ TidierData.jl currently supports the following top-level macros: - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()` - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` - - `@separate()` and `@unite()` + - `@separate()`, `@separate_rows()`, and `@unite()` - `@drop_missing()` and `@fill_missing` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) diff --git a/src/docstrings.jl b/src/docstrings.jl index eb72a5e..3d10deb 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2912,18 +2912,21 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a) const docstring_separate_rows = """ - separate_rows(df, column(s), delimiter) + separate_rows(df, columns..., delimiter) Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter. # Arguments - `df`: A DataFrame -- `columns`: A column or collection of columns to be split. Can be a mix of integers and symbols +- `columns`: A column or multiple columns to be split. Can be a mix of integers and column names. - `delimiter`: The string or character or regular expression used to split the column values. # Examples ```jldoctest -julia> df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"]) +julia> df = DataFrame(a = 1:3, + b = ["a", "aa;bb;cc", "dd;ee"], + c = ["1", "2;3;4", "5;6"], + d = ["7", "8;9;10", "11;12"]) 3×4 DataFrame Row │ a b c d │ Int64 String String String diff --git a/src/separate_unite.jl b/src/separate_unite.jl index f108773..0f265c9 100644 --- a/src/separate_unite.jl +++ b/src/separate_unite.jl @@ -6,24 +6,6 @@ function safe_getindex(arr, index, default_value="") end end -function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String}) - new_df = df[:, :] - new_cols = map(x -> split(x, sep), new_df[:, col]) - max_cols = maximum(length.(new_cols)) - - if length(into) < max_cols - error("Not enough names provided in `into` for all split columns.") - end - - for i in 1:max_cols - new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols) - end - - new_df = select(new_df, Not(col)) - - return new_df -end - """ $docstring_separate """ @@ -50,11 +32,22 @@ macro separate(df, from, into, sep) end end +function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String}) + new_df = df[:, :] + new_cols = map(x -> split(x, sep), new_df[:, col]) + max_cols = maximum(length.(new_cols)) -function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_") - new_df = df[:, :] - new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])] - return new_df + if length(into) < max_cols + error("Not enough names provided in `into` for all split columns.") + end + + for i in 1:max_cols + new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols) + end + + new_df = select(new_df, Not(col)) + + return new_df end """ @@ -83,118 +76,123 @@ macro unite(df, new_col, from_cols, sep) end end +function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_") + new_df = df[:, :] + new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])] + return new_df +end -### separate_rows -function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimiter::Union{Regex, String}) - is_grouped = df isa GroupedDataFrame - grouping_columns = is_grouped ? groupcols(df) : Symbol[] - - # Ungroup if necessary - temp_df = copy(is_grouped ? parent(df) : df) - # temp_df = copy(df) - - # Convert all references to column symbols - column_symbols = [] - for col in columns - if col isa Integer - push!(column_symbols, Symbol(names(temp_df)[col])) - elseif col isa AbstractRange - append!(column_symbols, Symbol.(names(temp_df)[collect(col)])) - elseif typeof(col) <: Between - # Get the column indices for the Between range - col_indices = DataFrames.index(temp_df)[col] - append!(column_symbols, Symbol.(names(temp_df)[col_indices])) - else - push!(column_symbols, Symbol(col)) - end - end - - # Initialize an array to hold expanded data for each column - expanded_data = Dict{Symbol, Vector{Any}}() - - for column in column_symbols - expanded_data[column] = [] - - for row in eachrow(temp_df) - value = row[column] - # Handle missing values and non-string types - if ismissing(value) || typeof(value) != String - push!(expanded_data[column], [value]) - else - push!(expanded_data[column], split(value, delimiter)) - end - end - end - - # Replace the columns with expanded data - for column in column_symbols - temp_df[!, column] = expanded_data[column] - end - - # Flatten the DataFrame only once after all columns have been expanded - temp_df = flatten(temp_df, column_symbols) - if is_grouped - temp_df = groupby(temp_df, grouping_columns) - end - return temp_df - end - - """ - $docstring_separate_rows - """ - macro separate_rows(df, exprs...) - delimiter = esc(last(exprs)) - exprs = Base.front(exprs) - interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs) - df_expr = quote - if $any_found_n || $any_found_row_number - if $(esc(df)) isa GroupedDataFrame - local df_copy = transform($(esc(df)); ungroup = false) - else - local df_copy = copy($(esc(df))) - end +""" +$docstring_separate_rows +""" +macro separate_rows(df, exprs...) + delimiter = esc(last(exprs)) # extract the delimiter + exprs = Base.front(exprs) # select all but the last value + interpolated_exprs = parse_interpolation.(exprs) + + tidy_exprs = [i[1] for i in interpolated_exprs] + any_found_n = any([i[2] for i in interpolated_exprs]) + any_found_row_number = any([i[3] for i in interpolated_exprs]) + + tidy_exprs = parse_tidy.(tidy_exprs) + df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = transform($(esc(df)); ungroup = false) else - local df_copy = $(esc(df)) # not a copy + local df_copy = copy($(esc(df))) + end + else + local df_copy = $(esc(df)) # not a copy + end + + if $(esc(df)) isa GroupedDataFrame + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end - if $(esc(df)) isa GroupedDataFrame - if $any_found_n - transform!(df_copy, nrow => :TidierData_n; ungroup = false) - end - if $any_found_row_number - transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) - end - - local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) + end + else + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end - if $any_found_n || $any_found_row_number - select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) - end - else - if $any_found_n - transform!(df_copy, nrow => :TidierData_n) - end - if $any_found_row_number - transform!(df_copy, eachindex => :TidierData_row_number) - end - - local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) - - if $any_found_n || $any_found_row_number - select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) - end + local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end - - df_output end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr - end \ No newline at end of file + + df_output + end + if code[] + @info MacroTools.prettify(df_expr) + end + return df_expr +end + +### separate_rows +function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimiter::Union{Regex, String}) + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + + # Ungroup if necessary + temp_df = copy(is_grouped ? parent(df) : df) + # temp_df = copy(df) + + # Convert all references to column symbols + column_symbols = [] + for col in columns + if col isa Integer + push!(column_symbols, Symbol(names(temp_df)[col])) + elseif col isa AbstractRange + append!(column_symbols, Symbol.(names(temp_df)[collect(col)])) + elseif typeof(col) <: Between + # Get the column indices for the Between range + col_indices = DataFrames.index(temp_df)[col] + append!(column_symbols, Symbol.(names(temp_df)[col_indices])) + else + push!(column_symbols, Symbol(col)) + end + end + + # Initialize an array to hold expanded data for each column + expanded_data = Dict{Symbol, Vector{Any}}() + + for column in column_symbols + expanded_data[column] = [] + + for row in eachrow(temp_df) + value = row[column] + # Handle missing values and non-string types + if ismissing(value) || typeof(value) != String + push!(expanded_data[column], [value]) + else + push!(expanded_data[column], split(value, delimiter)) + end + end + end + + # Replace the columns with expanded data + for column in column_symbols + temp_df[!, column] = expanded_data[column] + end + + # Flatten the DataFrame only once after all columns have been expanded + temp_df = flatten(temp_df, column_symbols) + if is_grouped + temp_df = groupby(temp_df, grouping_columns) + end + return temp_df +end \ No newline at end of file