From fbd478394561bdb093b6aaeec6cd55c4760c3692 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Tue, 19 Dec 2023 00:53:11 -0500 Subject: [PATCH] Minor cleanup, bump version to 0.14.1. --- NEWS.md | 4 + Project.toml | 2 +- README.md | 2 +- docs/examples/UserGuide/sep_unite.jl | 24 ++- docs/src/index.md | 2 +- src/docstrings.jl | 9 +- src/separate_unite.jl | 258 +++++++++++++-------------- 7 files changed, 157 insertions(+), 144 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4233641..edaa349 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # TidierData.jl updates +## v0.14.1 - 2023-12-19 +- `@separate()` now supports regular expressions +- Adds `@separate_rows()` + ## v0.14.0 - 2023-12-12 - Update parsing engine so that non-function reserved names from the Base and Core modules (like `missing`, `pi`, and `Real`) are auto-escaped now, with the exception of names in the not_escaped[] array, which are never escaped - Add `collect()` to not_vectorized[] array diff --git a/Project.toml b/Project.toml index d305da7..db633d7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TidierData" uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80" authors = ["Karandeep Singh"] -version = "0.14.0" +version = "0.14.1" [deps] Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" diff --git a/README.md b/README.md index 532b6c9..2fde6cb 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ TidierData.jl currently supports the following top-level macros: - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()` - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` -- `@separate()` and `@unite()` +- `@separate()`, `@separate_rows()`, and `@unite()` - `@drop_missing()` and `@fill_missing()` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) diff --git a/docs/examples/UserGuide/sep_unite.jl b/docs/examples/UserGuide/sep_unite.jl index d60c0f7..f74184d 100644 --- a/docs/examples/UserGuide/sep_unite.jl +++ b/docs/examples/UserGuide/sep_unite.jl @@ -4,7 +4,7 @@ using TidierData df = DataFrame(a = ["1-1", "2-2", "3-3-3"]); -# ## @separate +# ## `@separate` # Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter @@ -12,30 +12,38 @@ df = DataFrame(a = ["1-1", "2-2", "3-3-3"]); @separate(a, (b, c, d), "-") end -# The into columns can also be designated as follows +# The `into` columns can also be designated as follows: new_names = ["x$(i)" for i in 1:3]; # or new_names = ["b", "c", "d"], or new_names = [:b, :c, :d] @separate(df, a, !!new_names, "-") -# ## @unite +# ## `@unite` # The `@unite` macro brings together multiple columns into one, separate the characters by a user specified delimiter # Here, the `@unite` macro combines the "b", "c", and "d" columns columns into a single new "new_col" column using the "/" delimiter -df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]); +df = DataFrame( + b = ["1", "2", "3"], + c = ["1", "2", "3"], + d = [missing, missing, "3"]); @chain df begin @unite(new_col, (b, c, d), "/") end -# @separate_rows +# ## `@separate_rows` -# ## Separate rows into multiple rows based on a chosen delimiter. +# Separate rows into multiple rows based on a chosen delimiter. -df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"], e = ["11", "22;33;44", "55;66"]); +df = DataFrame( + a = 1:3, + b = ["a", "aa;bb;cc", "dd;ee"], + c = ["1", "2;3;4", "5;6"], + d = ["7", "8;9;10", "11;12"], + e = ["11", "22;33;44", "55;66"]); -@separate_rows(df, b:5, ";") +@separate_rows(df, b:e, ";") diff --git a/docs/src/index.md b/docs/src/index.md index 4ff05cc..3cb50a3 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -103,7 +103,7 @@ TidierData.jl currently supports the following top-level macros: - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()` - `@bind_rows()` and `@bind_cols()` - `@pivot_wider()` and `@pivot_longer()` - - `@separate()` and `@unite()` + - `@separate()`, `@separate_rows()`, and `@unite()` - `@drop_missing()` and `@fill_missing` - `@clean_names()` (as in R's `janitor::clean_names()` function) - `@summary()` (as in R's `summary()` function) diff --git a/src/docstrings.jl b/src/docstrings.jl index eb72a5e..3d10deb 100644 --- a/src/docstrings.jl +++ b/src/docstrings.jl @@ -2912,18 +2912,21 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a) const docstring_separate_rows = """ - separate_rows(df, column(s), delimiter) + separate_rows(df, columns..., delimiter) Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter. # Arguments - `df`: A DataFrame -- `columns`: A column or collection of columns to be split. Can be a mix of integers and symbols +- `columns`: A column or multiple columns to be split. Can be a mix of integers and column names. - `delimiter`: The string or character or regular expression used to split the column values. # Examples ```jldoctest -julia> df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"]) +julia> df = DataFrame(a = 1:3, + b = ["a", "aa;bb;cc", "dd;ee"], + c = ["1", "2;3;4", "5;6"], + d = ["7", "8;9;10", "11;12"]) 3×4 DataFrame Row │ a b c d │ Int64 String String String diff --git a/src/separate_unite.jl b/src/separate_unite.jl index f108773..0f265c9 100644 --- a/src/separate_unite.jl +++ b/src/separate_unite.jl @@ -6,24 +6,6 @@ function safe_getindex(arr, index, default_value="") end end -function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String}) - new_df = df[:, :] - new_cols = map(x -> split(x, sep), new_df[:, col]) - max_cols = maximum(length.(new_cols)) - - if length(into) < max_cols - error("Not enough names provided in `into` for all split columns.") - end - - for i in 1:max_cols - new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols) - end - - new_df = select(new_df, Not(col)) - - return new_df -end - """ $docstring_separate """ @@ -50,11 +32,22 @@ macro separate(df, from, into, sep) end end +function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String}) + new_df = df[:, :] + new_cols = map(x -> split(x, sep), new_df[:, col]) + max_cols = maximum(length.(new_cols)) -function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_") - new_df = df[:, :] - new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])] - return new_df + if length(into) < max_cols + error("Not enough names provided in `into` for all split columns.") + end + + for i in 1:max_cols + new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols) + end + + new_df = select(new_df, Not(col)) + + return new_df end """ @@ -83,118 +76,123 @@ macro unite(df, new_col, from_cols, sep) end end +function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_") + new_df = df[:, :] + new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])] + return new_df +end -### separate_rows -function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimiter::Union{Regex, String}) - is_grouped = df isa GroupedDataFrame - grouping_columns = is_grouped ? groupcols(df) : Symbol[] - - # Ungroup if necessary - temp_df = copy(is_grouped ? parent(df) : df) - # temp_df = copy(df) - - # Convert all references to column symbols - column_symbols = [] - for col in columns - if col isa Integer - push!(column_symbols, Symbol(names(temp_df)[col])) - elseif col isa AbstractRange - append!(column_symbols, Symbol.(names(temp_df)[collect(col)])) - elseif typeof(col) <: Between - # Get the column indices for the Between range - col_indices = DataFrames.index(temp_df)[col] - append!(column_symbols, Symbol.(names(temp_df)[col_indices])) - else - push!(column_symbols, Symbol(col)) - end - end - - # Initialize an array to hold expanded data for each column - expanded_data = Dict{Symbol, Vector{Any}}() - - for column in column_symbols - expanded_data[column] = [] - - for row in eachrow(temp_df) - value = row[column] - # Handle missing values and non-string types - if ismissing(value) || typeof(value) != String - push!(expanded_data[column], [value]) - else - push!(expanded_data[column], split(value, delimiter)) - end - end - end - - # Replace the columns with expanded data - for column in column_symbols - temp_df[!, column] = expanded_data[column] - end - - # Flatten the DataFrame only once after all columns have been expanded - temp_df = flatten(temp_df, column_symbols) - if is_grouped - temp_df = groupby(temp_df, grouping_columns) - end - return temp_df - end - - """ - $docstring_separate_rows - """ - macro separate_rows(df, exprs...) - delimiter = esc(last(exprs)) - exprs = Base.front(exprs) - interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs) - df_expr = quote - if $any_found_n || $any_found_row_number - if $(esc(df)) isa GroupedDataFrame - local df_copy = transform($(esc(df)); ungroup = false) - else - local df_copy = copy($(esc(df))) - end +""" +$docstring_separate_rows +""" +macro separate_rows(df, exprs...) + delimiter = esc(last(exprs)) # extract the delimiter + exprs = Base.front(exprs) # select all but the last value + interpolated_exprs = parse_interpolation.(exprs) + + tidy_exprs = [i[1] for i in interpolated_exprs] + any_found_n = any([i[2] for i in interpolated_exprs]) + any_found_row_number = any([i[3] for i in interpolated_exprs]) + + tidy_exprs = parse_tidy.(tidy_exprs) + df_expr = quote + if $any_found_n || $any_found_row_number + if $(esc(df)) isa GroupedDataFrame + local df_copy = transform($(esc(df)); ungroup = false) else - local df_copy = $(esc(df)) # not a copy + local df_copy = copy($(esc(df))) + end + else + local df_copy = $(esc(df)) # not a copy + end + + if $(esc(df)) isa GroupedDataFrame + if $any_found_n + transform!(df_copy, nrow => :TidierData_n; ungroup = false) end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) + end - if $(esc(df)) isa GroupedDataFrame - if $any_found_n - transform!(df_copy, nrow => :TidierData_n; ungroup = false) - end - if $any_found_row_number - transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) - end - - local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) + end + else + if $any_found_n + transform!(df_copy, nrow => :TidierData_n) + end + if $any_found_row_number + transform!(df_copy, eachindex => :TidierData_row_number) + end - if $any_found_n || $any_found_row_number - select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) - end - else - if $any_found_n - transform!(df_copy, nrow => :TidierData_n) - end - if $any_found_row_number - transform!(df_copy, eachindex => :TidierData_row_number) - end - - local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) - - if $any_found_n || $any_found_row_number - select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) - end + local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) + + if $any_found_n || $any_found_row_number + select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) end - - df_output end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr - end \ No newline at end of file + + df_output + end + if code[] + @info MacroTools.prettify(df_expr) + end + return df_expr +end + +### separate_rows +function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimiter::Union{Regex, String}) + is_grouped = df isa GroupedDataFrame + grouping_columns = is_grouped ? groupcols(df) : Symbol[] + + # Ungroup if necessary + temp_df = copy(is_grouped ? parent(df) : df) + # temp_df = copy(df) + + # Convert all references to column symbols + column_symbols = [] + for col in columns + if col isa Integer + push!(column_symbols, Symbol(names(temp_df)[col])) + elseif col isa AbstractRange + append!(column_symbols, Symbol.(names(temp_df)[collect(col)])) + elseif typeof(col) <: Between + # Get the column indices for the Between range + col_indices = DataFrames.index(temp_df)[col] + append!(column_symbols, Symbol.(names(temp_df)[col_indices])) + else + push!(column_symbols, Symbol(col)) + end + end + + # Initialize an array to hold expanded data for each column + expanded_data = Dict{Symbol, Vector{Any}}() + + for column in column_symbols + expanded_data[column] = [] + + for row in eachrow(temp_df) + value = row[column] + # Handle missing values and non-string types + if ismissing(value) || typeof(value) != String + push!(expanded_data[column], [value]) + else + push!(expanded_data[column], split(value, delimiter)) + end + end + end + + # Replace the columns with expanded data + for column in column_symbols + temp_df[!, column] = expanded_data[column] + end + + # Flatten the DataFrame only once after all columns have been expanded + temp_df = flatten(temp_df, column_symbols) + if is_grouped + temp_df = groupby(temp_df, grouping_columns) + end + return temp_df +end \ No newline at end of file