Merge pull request #72 from drizk1/add-@separate_rows

added support for @separate_rows
TidierOrg · Dec 19, 2023 · 9c42273 · 9c42273 · kdpsingh · Dec 19, 2023
2 parents ee49f0b + fbd4783
commit 9c42273
Show file tree

Hide file tree

Showing 8 changed files with 220 additions and 30 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # TidierData.jl updates
 
+## v0.14.1 - 2023-12-19
+- `@separate()` now supports regular expressions
+- Adds `@separate_rows()`
+
 ## v0.14.0 - 2023-12-12
 - Update parsing engine so that non-function reserved names from the Base and Core modules (like `missing`, `pi`, and `Real`) are auto-escaped now, with the exception of names in the not_escaped[] array, which are never escaped
 - Add `collect()` to not_vectorized[] array

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.14.0"
+version = "0.14.1"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"

diff --git a/README.md b/README.md
@@ -91,7 +91,7 @@ TidierData.jl currently supports the following top-level macros:
 - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()`
 - `@bind_rows()` and `@bind_cols()`
 - `@pivot_wider()` and `@pivot_longer()`
-- `@separate()` and `@unite()`
+- `@separate()`, `@separate_rows()`, and `@unite()`
 - `@drop_missing()` and `@fill_missing()`
 - `@clean_names()` (as in R's `janitor::clean_names()` function)
 - `@summary()` (as in R's `summary()` function)

diff --git a/docs/examples/UserGuide/sep_unite.jl b/docs/examples/UserGuide/sep_unite.jl
@@ -4,25 +4,46 @@ using TidierData
 
 df = DataFrame(a = ["1-1", "2-2", "3-3-3"]);
 
-# ## Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter
+# ## `@separate`
+
+# Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter
 
 @chain df begin
     @separate(a, (b, c, d), "-")
 end
 
-# The into columns can also be designated as follows
+# The `into` columns can also be designated as follows:
 
 new_names = ["x$(i)" for i in 1:3]; # or new_names = ["b", "c", "d"], or new_names = [:b, :c, :d]
 
 @separate(df, a, !!new_names, "-")
 
+# ## `@unite`
+
 # The `@unite` macro brings together multiple columns into one, separating the values with a user-specified delimiter.
+# Here, the `@unite` macro combines the "b", "c", and "d" columns into a single new "new_col" column using the "/" delimiter
 
-# ## Here, the `@unite` macro combines the "b", "c", and "d" columns columns into a single new "new_col" column using the "/" delimiter
 
-df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);
+df = DataFrame(
+       b = ["1", "2", "3"],
+       c = ["1", "2", "3"],
+       d = [missing, missing, "3"]);
 
 @chain df begin
     @unite(new_col, (b, c, d), "/")
 end
 
+
+# ## `@separate_rows` 
+
+# Separate rows into multiple rows based on a chosen delimiter.
+
+df = DataFrame(
+       a = 1:3,
+       b = ["a", "aa;bb;cc", "dd;ee"],
+       c = ["1", "2;3;4", "5;6"],
+       d = ["7", "8;9;10", "11;12"],
+       e = ["11", "22;33;44", "55;66"]);
+
+@separate_rows(df, b:e, ";")
+
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -103,7 +103,7 @@ TidierData.jl currently supports the following top-level macros:
     - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()`
     - `@bind_rows()` and `@bind_cols()`
     - `@pivot_wider()` and `@pivot_longer()`
-    - `@separate()` and `@unite()`
+    - `@separate()`, `@separate_rows()`, and `@unite()`
     - `@drop_missing()` and `@fill_missing()`
     - `@clean_names()` (as in R's `janitor::clean_names()` function)
     - `@summary()` (as in R's `summary()` function)

diff --git a/src/TidierData.jl b/src/TidierData.jl
@@ -19,7 +19,7 @@ export TidierData_set, across, desc, n, row_number, everything, starts_with, end
       as_float, as_integer, as_string, is_float, is_integer, is_string, missing_if, replace_missing, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter,
       @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, @anti_join, @semi_join,
       @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_missing, @glimpse, @separate,
-      @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with
+      @unite, @summary, @fill_missing, @slice_sample, @slice_min, @slice_max, @slice_head, @slice_tail, @rename_with, @separate_rows
 
 # Package global variables
 const code = Ref{Bool}(false) # output DataFrames.jl code?

diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -2908,4 +2908,55 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a)
    2 │ banana  doc2          2
    3 │ cherry  doc3          3
 ```
+"""
+
+const docstring_separate_rows =
+"""
+    @separate_rows(df, columns..., delimiter)
+
+Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.
+
+# Arguments
+- `df`: A DataFrame
+- `columns`: A column or multiple columns to be split. Can be a mix of integers and column names.
+- `delimiter`: The string or character or regular expression used to split the column values.
+
+# Examples
+```jldoctest
+julia> df = DataFrame(a = 1:3,
+                      b = ["a", "aa;bb;cc", "dd;ee"],
+                      c = ["1", "2;3;4", "5;6"],
+                      d = ["7", "8;9;10", "11;12"])
+3×4 DataFrame
+ Row │ a      b         c       d      
+     │ Int64  String    String  String 
+─────┼─────────────────────────────────
+   1 │     1  a         1       7
+   2 │     2  aa;bb;cc  2;3;4   8;9;10
+   3 │     3  dd;ee     5;6     11;12
+
+julia> @separate_rows(df, 2, 4, ";" )
+6×4 DataFrame
+ Row │ a      b          c       d         
+     │ Int64  SubStrin…  String  SubStrin… 
+─────┼─────────────────────────────────────
+   1 │     1  a          1       7
+   2 │     2  aa         2;3;4   8
+   3 │     2  bb         2;3;4   9
+   4 │     2  cc         2;3;4   10
+   5 │     3  dd         5;6     11
+   6 │     3  ee         5;6     12
+
+julia> @separate_rows(df, b:d, ";" )
+6×4 DataFrame
+ Row │ a      b          c          d         
+     │ Int64  SubStrin…  SubStrin…  SubStrin… 
+─────┼────────────────────────────────────────
+   1 │     1  a          1          7
+   2 │     2  aa         2          8
+   3 │     2  bb         3          9
+   4 │     2  cc         4          10
+   5 │     3  dd         5          11
+   6 │     3  ee         6          12
+```
 """
diff --git a/src/separate_unite.jl b/src/separate_unite.jl
@@ -6,24 +6,6 @@ function safe_getindex(arr, index, default_value="")
     end
 end
 
-function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
-    new_df = df[:, :]
-    new_cols = map(x -> split(x, sep), new_df[:, col])
-    max_cols = maximum(length.(new_cols))
-
-    if length(into) < max_cols
-        error("Not enough names provided in `into` for all split columns.")
-    end
-
-    for i in 1:max_cols
-        new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
-    end
-
-    new_df = select(new_df, Not(col))
-
-    return new_df
-end
-
 """
 $docstring_separate
 """
@@ -50,11 +32,22 @@ macro separate(df, from, into, sep)
     end
 end
 
+function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
+  new_df = df[:, :]
+  new_cols = map(x -> split(x, sep), new_df[:, col])
+  max_cols = maximum(length.(new_cols))
 
-function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_")
-    new_df = df[:, :]
-    new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])]
-    return new_df
+  if length(into) < max_cols
+      error("Not enough names provided in `into` for all split columns.")
+  end
+
+  for i in 1:max_cols
+      new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
+  end
+
+  new_df = select(new_df, Not(col))
+
+  return new_df
 end
 
 """
@@ -82,3 +75,124 @@ macro unite(df, new_col, from_cols, sep)
         unite($(esc(df)), $new_col_quoted, $(from_cols_expr), $(esc(sep)))
     end
 end
+
+# Combine the values of `cols` in each row into one string column named
+# `new_col_name`, joining them with `sep` and leaving out missing values.
+# `df` itself is not modified; a copy carrying the extra column is returned.
+function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_")
+  source_rows = eachrow(df[:, cols])
+  united = collect(join(skipmissing(row), sep) for row in source_rows)
+
+  result = df[:, :]
+  result[:, new_col_name] = united
+  return result
+end
+
+"""
+$docstring_separate_rows
+"""
+macro separate_rows(df, exprs...) # exprs = the column selectors followed by the delimiter
+  delimiter = esc(last(exprs)) # the final argument is the delimiter; escaped so it is evaluated in the caller's scope
+  exprs = Base.front(exprs) # everything except the last value, i.e. the column selectors
+  interpolated_exprs = parse_interpolation.(exprs) # each result is indexed below as [1], [2] and [3]
+
+  tidy_exprs = [i[1] for i in interpolated_exprs] # [1]: the expression that is handed on to parse_tidy
+  any_found_n = any([i[2] for i in interpolated_exprs]) # [2]: presumably a "n() was used" flag -- confirm in parse_interpolation
+  any_found_row_number = any([i[3] for i in interpolated_exprs]) # [3]: presumably a "row_number() was used" flag -- confirm
+
+  tidy_exprs = parse_tidy.(tidy_exprs) # presumably converts tidy selectors into DataFrames.jl selectors -- confirm in parse_tidy
+  df_expr = quote # the code this macro expands to at the call site
+    if $any_found_n || $any_found_row_number # helper columns get added below, so work on a copy of the input
+      if $(esc(df)) isa GroupedDataFrame
+        local df_copy = transform($(esc(df)); ungroup = false) # NOTE(review): presumably yields a fresh grouped copy -- confirm
+      else
+        local df_copy = copy($(esc(df)))
+      end
+    else
+      local df_copy = $(esc(df)) # not a copy: none of the mutating calls below run in this case
+    end
+
+    if $(esc(df)) isa GroupedDataFrame # grouped input: every step passes ungroup = false
+      if $any_found_n
+        transform!(df_copy, nrow => :TidierData_n; ungroup = false) # temporary helper column
+      end
+      if $any_found_row_number
+        transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false) # temporary helper column
+      end    
+
+      local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) # the function of the same name does the splitting
+
+      if $any_found_n || $any_found_row_number
+        select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false) # drop the helper columns again
+      end
+    else # plain DataFrame: the same steps without the ungroup keyword
+      if $any_found_n
+        transform!(df_copy, nrow => :TidierData_n) # temporary helper column
+      end
+      if $any_found_row_number
+        transform!(df_copy, eachindex => :TidierData_row_number) # temporary helper column
+      end
+
+      local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter) # the function of the same name does the splitting
+
+      if $any_found_n || $any_found_row_number
+        select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$"))) # drop the helper columns again
+      end
+    end
+
+    df_output # value of the expanded expression
+  end
+  if code[] # package-level Ref{Bool} switch: also log the generated code
+    @info MacroTools.prettify(df_expr)
+  end
+  return df_expr
+end
+
+### separate_rows
+# Split the delimited values held in `columns` into one row per piece.
+#
+# Arguments
+# - `df`: a DataFrame or GroupedDataFrame. It is never mutated; a copy of the
+#   underlying table is expanded instead.
+# - `columns`: an iterable of column selectors. Each entry may be anything that
+#   `names(df, selector)` accepts (column name, integer position, integer range,
+#   `Between`, `Not`, `Cols`, ...). A column selected more than once is only
+#   processed once.
+# - `delimiter`: string, character or regular expression handed to `split`.
+#
+# Any string value (every `AbstractString`, not only `String`) is split on
+# `delimiter`; `missing` and non-string values are kept as a single piece.
+# The pieces of all selected columns are flattened together, so within a row
+# the selected columns are expected to produce the same number of pieces.
+#
+# Returns a new DataFrame, regrouped by the original grouping columns when the
+# input was grouped.
+function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns,
+                       delimiter::Union{Regex, AbstractString, AbstractChar})
+  is_grouped = df isa GroupedDataFrame
+  grouping_columns = is_grouped ? groupcols(df) : Symbol[]
+
+  # Work on a copy of the ungrouped table so the caller's data stays untouched
+  temp_df = copy(is_grouped ? parent(df) : df)
+
+  # Resolve every selector to concrete column names
+  column_symbols = Symbol[]
+  for col in columns
+    append!(column_symbols, Symbol.(names(temp_df, col)))
+  end
+  # Overlapping selectors (e.g. `b` and `b:d`) must not wrap a column twice
+  unique!(column_symbols)
+
+  # Replace each selected column by a column of "pieces" vectors.
+  # The container stays `Any`-typed so that `flatten` decides the final
+  # element type exactly as it did before.
+  for column in column_symbols
+    pieces = Any[value isa AbstractString ? split(value, delimiter) : [value]
+                 for value in temp_df[!, column]]
+    temp_df[!, column] = pieces
+  end
+
+  # Flatten the DataFrame only once after all columns have been expanded
+  temp_df = flatten(temp_df, column_symbols)
+
+  return is_grouped ? groupby(temp_df, grouping_columns) : temp_df
+end