From fbd478394561bdb093b6aaeec6cd55c4760c3692 Mon Sep 17 00:00:00 2001
From: Karandeep Singh <kdpsingh@umich.edu>
Date: Tue, 19 Dec 2023 00:53:11 -0500
Subject: [PATCH] Minor cleanup, bump version to 0.14.1.

---
 NEWS.md                              |   4 +
 Project.toml                         |   2 +-
 README.md                            |   2 +-
 docs/examples/UserGuide/sep_unite.jl |  24 ++-
 docs/src/index.md                    |   2 +-
 src/docstrings.jl                    |   9 +-
 src/separate_unite.jl                | 258 +++++++++++++--------------
 7 files changed, 157 insertions(+), 144 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 4233641..edaa349 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # TidierData.jl updates
 
+## v0.14.1 - 2023-12-19
+- `@separate()` now supports regular expressions
+- Adds `@separate_rows()`
+
 ## v0.14.0 - 2023-12-12
 - Update parsing engine so that non-function reserved names from the Base and Core modules (like `missing`, `pi`, and `Real`) are auto-escaped now, with the exception of names in the not_escaped[] array, which are never escaped
 - Add `collect()` to not_vectorized[] array
diff --git a/Project.toml b/Project.toml
index d305da7..db633d7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.14.0"
+version = "0.14.1"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"
diff --git a/README.md b/README.md
index 532b6c9..2fde6cb 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ TidierData.jl currently supports the following top-level macros:
 - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()`
 - `@bind_rows()` and `@bind_cols()`
 - `@pivot_wider()` and `@pivot_longer()`
-- `@separate()` and `@unite()`
+- `@separate()`, `@separate_rows()`, and `@unite()`
 - `@drop_missing()` and `@fill_missing()`
 - `@clean_names()` (as in R's `janitor::clean_names()` function)
 - `@summary()` (as in R's `summary()` function)
diff --git a/docs/examples/UserGuide/sep_unite.jl b/docs/examples/UserGuide/sep_unite.jl
index d60c0f7..f74184d 100644
--- a/docs/examples/UserGuide/sep_unite.jl
+++ b/docs/examples/UserGuide/sep_unite.jl
@@ -4,7 +4,7 @@ using TidierData
 
 df = DataFrame(a = ["1-1", "2-2", "3-3-3"]);
 
-# ## @separate
+# ## `@separate`
 
 # Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter
 
@@ -12,30 +12,38 @@ df = DataFrame(a = ["1-1", "2-2", "3-3-3"]);
     @separate(a, (b, c, d), "-")
 end
 
-# The into columns can also be designated as follows
+# The `into` columns can also be designated as follows:
 
 new_names = ["x$(i)" for i in 1:3]; # or new_names = ["b", "c", "d"], or new_names = [:b, :c, :d]
 
 @separate(df, a, !!new_names, "-")
 
-# ## @unite
+# ## `@unite`
 
 # The `@unite` macro brings together multiple columns into one, separate the characters by a user specified delimiter
 # Here, the `@unite` macro combines the "b", "c", and "d" columns columns into a single new "new_col" column using the "/" delimiter
 
 
-df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);
+df = DataFrame(
+       b = ["1", "2", "3"],
+       c = ["1", "2", "3"],
+       d = [missing, missing, "3"]);
 
 @chain df begin
     @unite(new_col, (b, c, d), "/")
 end
 
 
-# @separate_rows 
+# ## `@separate_rows`
 
-# ## Separate rows into multiple rows based on a chosen delimiter.
+# Separate rows into multiple rows based on a chosen delimiter.
 
-df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"], e = ["11", "22;33;44", "55;66"]);
+df = DataFrame(
+       a = 1:3,
+       b = ["a", "aa;bb;cc", "dd;ee"],
+       c = ["1", "2;3;4", "5;6"],
+       d = ["7", "8;9;10", "11;12"],
+       e = ["11", "22;33;44", "55;66"]);
 
-@separate_rows(df, b:5, ";")
+@separate_rows(df, b:e, ";")
 
diff --git a/docs/src/index.md b/docs/src/index.md
index 4ff05cc..3cb50a3 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -103,7 +103,7 @@ TidierData.jl currently supports the following top-level macros:
     - `@left_join()`, `@right_join()`, `@inner_join()`, `@full_join()`, `@anti_join()`, and `@semi_join()`
     - `@bind_rows()` and `@bind_cols()`
     - `@pivot_wider()` and `@pivot_longer()`
-    - `@separate()` and `@unite()`
+    - `@separate()`, `@separate_rows()`, and `@unite()`
     - `@drop_missing()` and `@fill_missing`
     - `@clean_names()` (as in R's `janitor::clean_names()` function)
     - `@summary()` (as in R's `summary()` function)
diff --git a/src/docstrings.jl b/src/docstrings.jl
index eb72a5e..3d10deb 100644
--- a/src/docstrings.jl
+++ b/src/docstrings.jl
@@ -2912,18 +2912,21 @@ julia> @rename_with(df, str -> str_remove_all(str, "_a"), !term_a)
 
 const docstring_separate_rows =
 """
-    separate_rows(df, column(s), delimiter)
+    @separate_rows(df, columns..., delimiter)
 
 Split the contents of specified columns in a DataFrame into multiple rows based on a given delimiter.
 
 # Arguments
 - `df`: A DataFrame
-- `columns`: A column or collection of columns to be split. Can be a mix of integers  and symbols
+- `columns`: A column or multiple columns to be split. Can be a mix of integers and column names.
 - `delimiter`: The string or character or regular expression used to split the column values.
 
 # Examples
 ```jldoctest
-julia> df = DataFrame(a = 1:3, b = ["a", "aa;bb;cc", "dd;ee"], c = ["1", "2;3;4", "5;6"], d = ["7", "8;9;10", "11;12"])
+julia> df = DataFrame(a = 1:3,
+                      b = ["a", "aa;bb;cc", "dd;ee"],
+                      c = ["1", "2;3;4", "5;6"],
+                      d = ["7", "8;9;10", "11;12"])
 3×4 DataFrame
  Row │ a      b         c       d      
      │ Int64  String    String  String 
diff --git a/src/separate_unite.jl b/src/separate_unite.jl
index f108773..0f265c9 100644
--- a/src/separate_unite.jl
+++ b/src/separate_unite.jl
@@ -6,24 +6,6 @@ function safe_getindex(arr, index, default_value="")
     end
 end
 
-function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
-    new_df = df[:, :]
-    new_cols = map(x -> split(x, sep), new_df[:, col])
-    max_cols = maximum(length.(new_cols))
-
-    if length(into) < max_cols
-        error("Not enough names provided in `into` for all split columns.")
-    end
-
-    for i in 1:max_cols
-        new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
-    end
-
-    new_df = select(new_df, Not(col))
-
-    return new_df
-end
-
 """
 $docstring_separate
 """
@@ -50,11 +32,22 @@ macro separate(df, from, into, sep)
     end
 end
 
+function separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep::Union{Regex, String})
+  new_df = df[:, :]
+  new_cols = map(x -> split(x, sep), new_df[:, col])
+  max_cols = maximum(length.(new_cols))
 
-function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_")
-    new_df = df[:, :]
-    new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])]
-    return new_df
+  if length(into) < max_cols
+      error("Not enough names provided in `into` for all split columns.")
+  end
+
+  for i in 1:max_cols
+      new_df[:, into[i]] = map(x -> safe_getindex(x, i, missing), new_cols)
+  end
+
+  new_df = select(new_df, Not(col))
+
+  return new_df
 end
 
 """
@@ -83,118 +76,123 @@ macro unite(df, new_col, from_cols, sep)
     end
 end
 
+function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep::String="_")
+  new_df = df[:, :]
+  new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[:, cols])]
+  return new_df
+end
 
-### separate_rows
-function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimiter::Union{Regex, String})
-    is_grouped = df isa GroupedDataFrame
-    grouping_columns = is_grouped ? groupcols(df) : Symbol[]
-  
-    # Ungroup if necessary
-    temp_df = copy(is_grouped ? parent(df) : df)
-     # temp_df = copy(df)
-  
-    # Convert all references to column symbols
-    column_symbols = []
-    for col in columns
-        if col isa Integer
-            push!(column_symbols, Symbol(names(temp_df)[col]))
-        elseif col isa AbstractRange
-            append!(column_symbols, Symbol.(names(temp_df)[collect(col)]))
-        elseif typeof(col) <: Between
-            # Get the column indices for the Between range
-            col_indices = DataFrames.index(temp_df)[col]
-            append!(column_symbols, Symbol.(names(temp_df)[col_indices]))
-        else
-            push!(column_symbols, Symbol(col))
-        end
-    end
-  
-    # Initialize an array to hold expanded data for each column
-    expanded_data = Dict{Symbol, Vector{Any}}()
-  
-    for column in column_symbols
-        expanded_data[column] = []
-  
-        for row in eachrow(temp_df)
-            value = row[column]
-            # Handle missing values and non-string types
-            if ismissing(value) || typeof(value) != String
-                push!(expanded_data[column], [value])
-            else
-                push!(expanded_data[column], split(value, delimiter))
-            end
-        end
-    end
-  
-    # Replace the columns with expanded data
-    for column in column_symbols
-        temp_df[!, column] = expanded_data[column]
-    end
-  
-    # Flatten the DataFrame only once after all columns have been expanded
-    temp_df = flatten(temp_df, column_symbols)
-    if is_grouped
-      temp_df = groupby(temp_df, grouping_columns)
-     end
-    return temp_df
-  end
-  
-  """
-  $docstring_separate_rows
-  """
-  macro separate_rows(df, exprs...)
-    delimiter = esc(last(exprs))
-    exprs = Base.front(exprs)
-    interpolated_exprs = parse_interpolation.(exprs)
-  
-    tidy_exprs = [i[1] for i in interpolated_exprs]
-    any_found_n = any([i[2] for i in interpolated_exprs])
-    any_found_row_number = any([i[3] for i in interpolated_exprs])
-  
-    tidy_exprs = parse_tidy.(tidy_exprs)
-    df_expr = quote
-      if $any_found_n || $any_found_row_number
-        if $(esc(df)) isa GroupedDataFrame
-          local df_copy = transform($(esc(df)); ungroup = false)
-        else
-          local df_copy = copy($(esc(df)))
-        end
+"""
+$docstring_separate_rows
+"""
+macro separate_rows(df, exprs...)
+  delimiter = esc(last(exprs)) # extract the delimiter
+  exprs = Base.front(exprs) # select all but the last value
+  interpolated_exprs = parse_interpolation.(exprs)
+
+  tidy_exprs = [i[1] for i in interpolated_exprs]
+  any_found_n = any([i[2] for i in interpolated_exprs])
+  any_found_row_number = any([i[3] for i in interpolated_exprs])
+
+  tidy_exprs = parse_tidy.(tidy_exprs)
+  df_expr = quote
+    if $any_found_n || $any_found_row_number
+      if $(esc(df)) isa GroupedDataFrame
+        local df_copy = transform($(esc(df)); ungroup = false)
       else
-        local df_copy = $(esc(df)) # not a copy
+        local df_copy = copy($(esc(df)))
+      end
+    else
+      local df_copy = $(esc(df)) # not a copy
+    end
+    
+    if $(esc(df)) isa GroupedDataFrame
+      if $any_found_n
+        transform!(df_copy, nrow => :TidierData_n; ungroup = false)
       end
+      if $any_found_row_number
+        transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false)
+      end    
       
-      if $(esc(df)) isa GroupedDataFrame
-        if $any_found_n
-          transform!(df_copy, nrow => :TidierData_n; ungroup = false)
-        end
-        if $any_found_row_number
-          transform!(df_copy, eachindex => :TidierData_row_number; ungroup = false)
-        end    
-        
-        local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter)
+      local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter)
+      
+      if $any_found_n || $any_found_row_number
+        select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false)
+      end
+    else
+      if $any_found_n
+        transform!(df_copy, nrow => :TidierData_n)
+      end
+      if $any_found_row_number
+        transform!(df_copy, eachindex => :TidierData_row_number)
+      end
         
-        if $any_found_n || $any_found_row_number
-          select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")); ungroup = false)
-        end
-      else
-        if $any_found_n
-          transform!(df_copy, nrow => :TidierData_n)
-        end
-        if $any_found_row_number
-          transform!(df_copy, eachindex => :TidierData_row_number)
-        end
-          
-        local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter)
-  
-        if $any_found_n || $any_found_row_number
-          select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")))
-        end
+      local df_output = separate_rows(df_copy, [$(tidy_exprs...)], $delimiter)
+
+      if $any_found_n || $any_found_row_number
+        select!(df_output, Cols(Not(r"^(TidierData_n|TidierData_row_number)$")))
       end
-  
-      df_output
     end
-    if code[]
-      @info MacroTools.prettify(df_expr)
-    end
-    return df_expr
-  end
\ No newline at end of file
+
+    df_output
+  end
+  if code[]
+    @info MacroTools.prettify(df_expr)
+  end
+  return df_expr
+end
+
+### separate_rows
+function separate_rows(df::Union{DataFrame, GroupedDataFrame}, columns, delimiter::Union{Regex, String})
+  is_grouped = df isa GroupedDataFrame
+  grouping_columns = is_grouped ? groupcols(df) : Symbol[]
+
+  # Ungroup if necessary
+  temp_df = copy(is_grouped ? parent(df) : df)
+   # temp_df = copy(df)
+
+  # Convert all references to column symbols
+  column_symbols = []
+  for col in columns
+      if col isa Integer
+          push!(column_symbols, Symbol(names(temp_df)[col]))
+      elseif col isa AbstractRange
+          append!(column_symbols, Symbol.(names(temp_df)[collect(col)]))
+      elseif typeof(col) <: Between
+          # Get the column indices for the Between range
+          col_indices = DataFrames.index(temp_df)[col]
+          append!(column_symbols, Symbol.(names(temp_df)[col_indices]))
+      else
+          push!(column_symbols, Symbol(col))
+      end
+  end
+
+  # Initialize an array to hold expanded data for each column
+  expanded_data = Dict{Symbol, Vector{Any}}()
+
+  for column in column_symbols
+      expanded_data[column] = []
+
+      for row in eachrow(temp_df)
+          value = row[column]
+          # Handle missing values and non-string types
+          if ismissing(value) || typeof(value) != String
+              push!(expanded_data[column], [value])
+          else
+              push!(expanded_data[column], split(value, delimiter))
+          end
+      end
+  end
+
+  # Replace the columns with expanded data
+  for column in column_symbols
+      temp_df[!, column] = expanded_data[column]
+  end
+
+  # Flatten the DataFrame only once after all columns have been expanded
+  temp_df = flatten(temp_df, column_symbols)
+  if is_grouped
+    temp_df = groupby(temp_df, grouping_columns)
+   end
+  return temp_df
+end
\ No newline at end of file