diff --git a/NEWS.md b/NEWS.md
index 4233641..edaa349 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # TidierData.jl updates
 
+## v0.14.1 - 2023-12-19
+- `@separate()` now supports regular expressions
+- Adds `@separate_rows()`
+
 ## v0.14.0 - 2023-12-12
 - Update parsing engine so that non-function reserved names from the Base and Core modules (like `missing`, `pi`, and `Real`) are auto-escaped now, with the exception of names in the not_escaped[] array, which are never escaped
 - Add `collect()` to not_vectorized[] array
diff --git a/Project.toml b/Project.toml
index d305da7..db633d7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.14.0"
+version = "0.14.1"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"
diff --git a/README.md b/README.md
index 532b6c9..2fde6cb 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ TidierData.jl currently supports the following top-level macros:
 - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()`
 - `@bind_rows()` and `@bind_cols()`
 - `@pivot_wider()` and `@pivot_longer()`
-- `@separate()` and `@unite()`
+- `@separate()`, `@separate_rows()`, and `@unite()`
 - `@drop_missing()` and `@fill_missing()`
 - `@clean_names()` (as in R's `janitor::clean_names()` function)
 - `@summary()` (as in R's `summary()` function)
diff --git a/docs/examples/UserGuide/sep_unite.jl b/docs/examples/UserGuide/sep_unite.jl
index 9c0888d..f74184d 100644
--- a/docs/examples/UserGuide/sep_unite.jl
+++ b/docs/examples/UserGuide/sep_unite.jl
@@ -4,25 +4,46 @@ using TidierData
 
 df = DataFrame(a = ["1-1", "2-2", "3-3-3"]);
 
-# ## Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter
+# ## `@separate`
+
+# Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter
 
 @chain df begin
     @separate(a, (b, c, d), "-")
 end
 
-# The into columns can also be designated as follows
+# The `into` columns can also be designated as follows:
 
 new_names = ["x$(i)" for i in 1:3]; # or new_names = ["b", "c", "d"], or new_names = [:b, :c, :d]
 
 @separate(df, a, !!new_names, "-")
 
+# ## `@unite`
+
 # The `@unite` macro brings together multiple columns into one, separate the characters by a user specified delimiter
+# Here, the `@unite` macro combines the "b", "c", and "d" columns into a single new "new_col" column using the "/" delimiter
 
-# ## Here, the `@unite` macro combines the "b", "c", and "d" columns columns into a single new "new_col" column using the "/" delimiter
-df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);
+df = DataFrame(
+    b = ["1", "2", "3"],
+    c = ["1", "2", "3"],
+    d = [missing, missing, "3"]);
 
 @chain df begin
     @unite(new_col, (b, c, d), "/")
 end
+
+# ## `@separate_rows`
+
+# Separate rows into multiple rows based on a chosen delimiter.
+
+df = DataFrame(
+    a = 1:3,
+    b = ["a", "aa;bb;cc", "dd;ee"],
+    c = ["1", "2;3;4", "5;6"],
+    d = ["7", "8;9;10", "11;12"],
+    e = ["11", "22;33;44", "55;66"]);
+
+@separate_rows(df, b:e, ";")
+
 
 
diff --git a/docs/src/index.md b/docs/src/index.md
index 4ff05cc..3cb50a3 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -103,7 +103,7 @@ TidierData.jl currently supports the following top-level macros:
     - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()`
     - `@bind_rows()` and `@bind_cols()`
     - `@pivot_wider()` and `@pivot_longer()`
-    - `@separate()` and `@unite()`
+    - `@separate()`, `@separate_rows()`, and `@unite()`
     - `@drop_missing()` and `@fill_missing`
     - `@clean_names()` (as in R's `janitor::clean_names()` function)
     - `@summary()` (as in R's `summary()` function)
diff --git a/src/TidierData.jl b/src/TidierData.jl
index af840da..ccc3288 100644
--- a/src/TidierData.jl
+++ b/src/TidierData.jl
@@ -19,7 +19,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end
       as_float, as_integer, as_string, is_float, is_integer, is_string, missing_if, replace_missing, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter,
       @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join,
       @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate,
-      @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with
+      @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows
 
 # Package global variables
 const code = Ref{Bool}(false) # output DataFrames.jl code?
diff --git a/src/docstrings.jl b/src/docstrings.jl
index d36a469..3d10deb 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -2908,4 +2908,55 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a)
    2 │ banana  doc2          2
    3 │ cherry  doc3          3
 ```
+"""
+
+const docstring_separate_rows =
+"""
+    @separate_rows(df, columns..., delimiter)
+
+Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.
+
+# Arguments
+- `df`: A DataFrame
+- `columns`: A column or multiple columns to be split. Can be a mix of integers and column names.
+- `delimiter`: The string or character or regular expression used to split the column values.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(a = 1:3,
+                      b = ["a", "aa;bb;cc", "dd;ee"],
+                      c = ["1", "2;3;4", "5;6"],
+                      d = ["7", "8;9;10", "11;12"])
+3×4 DataFrame
+ Row │ a      b         c       d
+     │ Int64  String    String  String
+─────┼─────────────────────────────────
+   1 │     1  a         1       7
+   2 │     2  aa;bb;cc  2;3;4   8;9;10
+   3 │     3  dd;ee     5;6     11;12
+
+julia> @separate_rows(df, 2, 4, ";" )
+6×4 DataFrame
+ Row │ a      b          c       d
+     │ Int64  SubStrin…  String  SubStrin…
+─────┼─────────────────────────────────────
+   1 │     1  a          1       7
+   2 │     2  aa         2;3;4   8
+   3 │     2  bb         2;3;4   9
+   4 │     2  cc         2;3;4   10
+   5 │     3  dd         5;6     11
+   6 │     3  ee         5;6     12
+
+julia> @separate_rows(df, b:d, ";" )
+6×4 DataFrame
+ Row │ a      b          c          d
+     │ Int64  SubStrin…  SubStrin…  SubStrin…
+─────┼────────────────────────────────────────
+   1 │     1  a          1          7
+   2 │     2  aa         2          8
+   3 │     2  bb         3          9
+   4 │     2  cc         4          10
+   5 │     3  dd         5          11
+   6 │     3  ee         6          12
+```
 """
\ No newline at end of file
diff --git a/src/separate_unite.jl b/src/separate_unite.jl
index 133289f..0f265c9 100644
--- a/src/separate_unite.jl
+++ b/src/separate_unite.jl
@@ -6,24 +6,6 @@ function safe_getindex(arr, index, default_value="")
     end
 end
 
-function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
-    new_df = df[:, :]
-    new_cols = map(x -> split(x, sep), new_df[:, col])
-    max_cols = maximum(length.(new_cols))
-
-    if length(into) < max_cols
-        error("Not enough names provided in `into` for all split columns.")
-    end
-
-    for i in 1:max_cols
-        new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
-    end
-
-    new_df = select(new_df, Not(col))
-
-    return new_df
-end
-
 """
 $docstring_separate
 """
@@ -50,11 +32,22 @@ macro separate(df, from, into, sep)
     end
 end
 
+function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
+    new_df = df[:, :]
+    new_cols = map(x -> split(x, sep), new_df[:, col])
+    max_cols = maximum(length.(new_cols))
 
-function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_")
-    new_df = df[:, :]
-    new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])]
-    return new_df
+    if length(into) < max_cols
+        error("Not enough names provided in `into` for all split columns.")
+    end
+
+    for i in 1:max_cols
+        new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
+    end
+
+    new_df = select(new_df, Not(col))
+
+    return new_df
 end
 
 """
@@ -82,3 +75,121 @@ macro unite(df, new_col, from_cols, sep)
         unite($(esc(df)), $new_col_quoted, $(from_cols_expr), $(esc(sep)))
     end
 end
+
+function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_")
+    new_df = df[:, :]
+    new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])]
+    return new_df
+end
+
+"""
+$docstring_separate_rows
+"""
+macro separate_rows(df, exprs...)
+    delimiter = esc(last(exprs)) # extract the delimiter
+    exprs = Base.front(exprs) # select all but the last value
+    interpolated_exprs = parse_interpolation.(exprs)
+
+    tidy_exprs = [i[1] for i in interpolated_exprs]
+    any_found_n = any([i[2] for i in interpolated_exprs])
+    any_found_row_number = any([i[3] for i in interpolated_exprs])
+
+    tidy_exprs = parse_tidy.(tidy_exprs)
+    df_expr = quote
+        if $any_found_n || $any_found_row_number
+            if $(esc(df)) isa GroupedDataFrame
+                local df_copy = transform($(esc(df)); ungroup = false)
+            else
+                local df_copy = copy($(esc(df)))
+            end
+        else
+            local df_copy = $(esc(df)) # not a copy
+        end
+
+        if $(esc(df)) isa GroupedDataFrame
+            if $any_found_n
+                transform!(df_copy, nrow => :TidierData_n; ungroup = false)
+            end
+            if $any_found_row_number
+                transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false)
+            end
+
+            local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter)
+
+            if $any_found_n || $any_found_row_number
+                select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false)
+            end
+        else
+            if $any_found_n
+                transform!(df_copy, nrow => :TidierData_n)
+            end
+            if $any_found_row_number
+                transform!(df_copy, eachindex => :TidierData_row_number)
+            end
+
+            local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter)
+
+            if $any_found_n || $any_found_row_number
+                select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")))
+            end
+        end
+
+        df_output
+    end
+    if code[]
+        @info MacroTools.prettify(df_expr)
+    end
+    return df_expr
+end
+
+# Split the values in `columns` of `df` on `delimiter`, producing one row per piece.
+# `columns` may mix integer positions, ranges of positions, `Between` selectors, and
+# column names. Values that are not strings (including `missing`) are kept as a
+# single piece. A `GroupedDataFrame` input is regrouped by its original grouping
+# columns before being returned.
+function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimiter::Union{Regex, AbstractString, AbstractChar})
+    is_grouped = df isa GroupedDataFrame
+    grouping_columns = is_grouped ? groupcols(df) : Symbol[]
+
+    # Work on a copy of the ungrouped data so the input is left untouched
+    temp_df = copy(is_grouped ? parent(df) : df)
+
+    # Convert all column references to column symbols
+    column_symbols = Symbol[]
+    for col in columns
+        if col isa Integer
+            push!(column_symbols, Symbol(names(temp_df)[col]))
+        elseif col isa AbstractRange
+            append!(column_symbols, Symbol.(names(temp_df)[collect(col)]))
+        elseif col isa Between
+            # Get the column indices for the Between range
+            col_indices = DataFrames.index(temp_df)[col]
+            append!(column_symbols, Symbol.(names(temp_df)[col_indices]))
+        else
+            push!(column_symbols, Symbol(col))
+        end
+    end
+
+    # Expand every requested column before replacing any of them
+    expanded_data = Dict{Symbol, Vector{Any}}()
+    for column in column_symbols
+        # Any string type is split (e.g. the `SubString`s produced by an earlier
+        # split); everything else, including `missing`, becomes a one-element vector
+        expanded_data[column] = Any[
+            value isa AbstractString ? split(value, delimiter) : [value]
+            for value in temp_df[!, column]
+        ]
+    end
+
+    # Replace the columns with expanded data
+    for column in column_symbols
+        temp_df[!, column] = expanded_data[column]
+    end
+
+    # Flatten the DataFrame only once after all columns have been expanded
+    temp_df = flatten(temp_df, column_symbols)
+    if is_grouped
+        temp_df = groupby(temp_df, grouping_columns)
+    end
+    return temp_df
+end
\ No newline at end of file