Merge pull request #18 from drizk1/main

TidierOrg · Aug 4, 2023 · 66b8639 · 66b8639 · kdpsingh · Aug 4, 2023
2 parents 0c62580 + 4b38da1
commit 66b8639
Show file tree

Hide file tree

Showing 10 changed files with 257 additions and 4 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,10 @@
 # TidierData.jl updates
 
-## v1.0.0 - 2023-07-28
+## v0.9.0 - 2023-08-04
+- Exposed `not_vectorized[]` as a package global variable so that the user or other packages can modify it
+- Added `@separate`, `@unite`, and `@summary`
+
+## v0.8.0 - 2023-07-28
 - `Tidier.jl` cloned and changed to `TidierData.jl`
 
 ## v0.7.7 - 2023-07-15

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "TidierData"
 uuid = "fe2206b3-d496-4ee9-a338-6a095c4ece80"
 authors = ["Karandeep Singh"]
-version = "0.8.0"
+version = "0.9.0"
 
 [deps]
 Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc"

diff --git a/README.md b/README.md
@@ -89,8 +89,10 @@ TidierData.jl currently supports the following top-level macros:
 - `@left_join()`, `@right_join()`, `@inner_join()`, and `@full_join()`
 - `@bind_rows()` and `@bind_cols()`
 - `@pivot_wider()` and `@pivot_longer()`
+- `@separate()` and `@unite()`
 - `@drop_na()`
 - `@clean_names()` (as in R's `janitor::clean_names()` function)
+- `@summary()` (as in R's `summary()` function)
 
 TidierData.jl also supports the following helper functions:
 

diff --git a/docs/examples/UserGuide/sep_unite.jl b/docs/examples/UserGuide/sep_unite.jl
@@ -0,0 +1,22 @@
+# Following the tidyverse syntax, the `@separate()` macro in `TidierData.jl` separates a single column into multiple columns. This is particularly useful for splitting a column containing delimited values into individual columns.
+
+using TidierData
+
+df = DataFrame(a = ["1-1", "2-2", "3-3-3"]);
+
+# ## Separate the "a" column into "b", "c", and "d" columns based on the dash delimiter
+
+@chain df begin
+    @separate(a, (b, c, d), "-")
+end
+
+# The `@unite` macro brings together multiple columns into one, separating the values with a user-specified delimiter.
+
+# ## Here, the `@unite` macro combines the "b", "c", and "d" columns into a single new "new_col" column using the "/" delimiter
+
+df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);
+
+@chain df begin
+    @unite(new_col, (b, c, d), "/")
+end
+
diff --git a/docs/examples/UserGuide/summary.jl b/docs/examples/UserGuide/summary.jl
@@ -0,0 +1,28 @@
+# The `@summary()` macro in `TidierData.jl` provides a concise way to compute summary statistics on data. Similar to its R counterpart, it will provide the mean, median, Q1, Q3, minimum, maximum, and number of missing values in a numerical column or columns. 
+
+# ## Summary for the whole dataframe 
+
+using TidierData
+
+df = DataFrame( A = [1, 2, 3, 4, 5], B = [missing, 7, 8, 9, 10], C = [11, missing, 13, 14, missing], D = [16, 17, 18, 19, 20]);
+
+@chain df begin
+    @summary()
+end
+
+@summary(df)
+
+# ## You can specify columns for which you want to compute the summary. This is useful if the DataFrame has a large number of columns and you're interested in only a subset of them.
+
+@chain df begin
+    @summary(B)
+end
+
+@summary(df, B)
+
+# ## or for a range of columns
+
+@chain df begin
+    @select(B:D)
+    @summary() # you can also write this @summary(2:4)
+end
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -101,8 +101,10 @@ TidierData.jl currently supports the following top-level macros:
     - `@left_join()`, `@right_join()`, `@inner_join()`, and `@full_join()`
     - `@bind_rows()` and `@bind_cols()`
     - `@pivot_wider()` and `@pivot_longer()`
+    - `@separate()` and `@unite()`
     - `@drop_na()`
     - `@clean_names()` (as in R's `janitor::clean_names()` function)
+    - `@summary()` (as in R's `summary()` function)
 ```
 TidierData.jl also supports the following helper functions:
 

diff --git a/src/TidierData.jl b/src/TidierData.jl
@@ -9,15 +9,16 @@ using Reexport
 
 # Exporting `Cols` because `summarize(!!vars, funs))` with multiple interpolated
 # columns requires `Cols()` to be nested within `Cols()`, so `Cols` needs to be exported.
-@reexport using DataFrames: DataFrame, Cols, describe, nrow, proprow
+@reexport using DataFrames: DataFrame, Cols, describe, nrow, proprow, Not, Between, select
 @reexport using Chain
 @reexport using Statistics
 @reexport using ShiftedArrays: lag, lead
 
 export TidierData_set, across, desc, n, row_number, starts_with, ends_with, matches, if_else, case_when, ntile, 
       as_float, as_integer, as_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter,
       @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join,
-      @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_na, @glimpse
+      @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_na, @glimpse, @separate,
+      @unite, @summary
 
 # Package global variables
 const code = Ref{Bool}(false) # output DataFrames.jl code?
@@ -39,6 +40,8 @@ include("pseudofunctions.jl")
 include("helperfunctions.jl")
 include("ntile.jl")
 include("type_conversions.jl")
+include("separate_unite.jl")
+include("summary.jl")
 
 # Function to set global variables
 """

diff --git a/src/docstrings.jl b/src/docstrings.jl
@@ -1988,4 +1988,93 @@ julia> as_string(1.5)
 julia> as_string(missing)
 missing
 ```
+"""
+
+const docstring_separate = 
+"""
+    @separate(df, From, Into, Separator)
+
+Separate a string column into multiple new columns based on a specified delimiter.
+
+# Arguments
+- `df`: A DataFrame
+- `From`: Column that will be split
+- `Into`: New column names, supports [] or ()
+- `Separator`: the string or character on which to split
+
+# Examples
+```jldoctest
+julia> df = DataFrame(a = ["1-1", "2-2", "3-3-3"]);
+
+julia> @separate(df, a, [b, c, d], "-")
+3×3 DataFrame
+ Row │ b          c          d          
+     │ SubStrin…  SubStrin…  SubStrin…? 
+─────┼──────────────────────────────────
+   1 │ 1          1          missing    
+   2 │ 2          2          missing    
+   3 │ 3          3          3
+
+julia> @chain df begin
+       @separate(a, (b, c, d), "-")
+       end
+3×3 DataFrame
+ Row │ b          c          d          
+     │ SubStrin…  SubStrin…  SubStrin…? 
+─────┼──────────────────────────────────
+   1 │ 1          1          missing    
+   2 │ 2          2          missing    
+   3 │ 3          3          3
+```
+"""
+
+const docstring_unite = 
+"""
+    @unite(df, new_col, from_cols, sep)
+
+Combine multiple columns into one new column using a specific delimiter.
+
+# Arguments
+- `df`: A DataFrame
+- `new_col`: New column that will receive the combination
+- `from_cols`: Column names that it will combine, supports [] or ()
+- `sep`: the string or character that will separate the values in the new column
+
+# Examples
+```jldoctest
+julia> df = DataFrame( b = ["1", "2", "3"], c = ["1", "2", "3"], d = [missing, missing, "3"]);
+
+julia> @unite(df, new_col, (b, c, d), "-")
+3×4 DataFrame
+ Row │ b       c       d        new_col 
+     │ String  String  String?  String  
+─────┼──────────────────────────────────
+   1 │ 1       1       missing  1-1
+   2 │ 2       2       missing  2-2
+   3 │ 3       3       3        3-3-3
+```
+"""
+
+const docstring_summary =
+"""
+    @summary(df, cols...)
+
+For numerical columns, returns a DataFrame with the minimum, Q1, median, mean, Q3, maximum, number of non-missing values, and number of missing values.
+
+# Arguments
+- `df`: A DataFrame
+- `cols`: columns on which summary will be performed. This is an optional argument, without which summary will be performed on all numerical columns
+
+# Examples
+```jldoctest 
+julia> df = DataFrame( A = [1, 2, 3, 4, 5], B = [missing, 7, 8, 9, 10], C = [11, missing, 13, 14, missing], D = [16, 17, 18, 19, 20]);
+
+julia> @summary(df);
+
+julia> @summary(df, (B:D));
+
+julia> @chain df begin
+       @summary(B:D)
+       end;
+```
 """
diff --git a/src/separate_unite.jl b/src/separate_unite.jl
@@ -0,0 +1,65 @@
+"""
+    safe_getindex(arr, index, default_value="")
+
+Return `arr[index]` if `index` lies within the index range of `arr`; otherwise return
+`default_value`.
+
+# Arguments
+- `arr`: an indexable collection (in this file, the vector of pieces produced by `split`)
+- `index`: the position to look up
+- `default_value`: value returned when `index` is out of range (defaults to `""`)
+"""
+function safe_getindex(arr, index, default_value="")
+    # Check both ends of the range so that an index below the first valid one
+    # (e.g. 0) falls back to the default instead of throwing a `BoundsError`.
+    if firstindex(arr) <= index <= lastindex(arr)
+        return arr[index]
+    end
+    return default_value
+end
+
+"""
+    separate(df::DataFrame, col::Symbol, into::Vector{Symbol}, sep)
+
+Return a copy of `df` in which the string column `col` has been split on `sep` into new
+columns named by `into`, and `col` itself has been removed.
+
+Rows that yield fewer pieces than the longest row are padded with `missing`, and a
+`missing` entry in `col` yields `missing` in every new column. Only as many columns as
+the longest row has pieces are created, so trailing names in `into` may go unused.
+
+# Arguments
+- `df`: the source DataFrame (not modified)
+- `col`: name of the column to split
+- `into`: names for the new columns, in order
+- `sep`: the string or character on which to split
+
+# Throws
+- `ErrorException` if `into` has fewer names than the longest row has pieces.
+"""
+function separate(df::DataFrame, col::Symbol, into::Vector{Symbol},
+                  sep::Union{AbstractString, AbstractChar})
+    new_df = df[:, :]
+
+    # A `missing` entry cannot be split; treat it as zero pieces so that every new
+    # column receives `missing` for that row.
+    pieces = [ismissing(x) ? SubString{String}[] : split(x, sep) for x in new_df[:, col]]
+
+    # Guard the reduction: `maximum` throws on an empty collection (zero-row input).
+    max_cols = isempty(pieces) ? 0 : maximum(length, pieces)
+
+    if length(into) < max_cols
+        error("Not enough names provided in `into` for all split columns.")
+    end
+
+    for i in 1:max_cols
+        new_df[:, into[i]] = [safe_getindex(p, i, missing) for p in pieces]
+    end
+
+    return select(new_df, Not(col))
+end
+
+"""
+$docstring_separate
+"""
+macro separate(df, from, into, sep)
+    # Column names are written as bare identifiers; quote them so that they reach
+    # `separate` as Symbols rather than being evaluated as variables.
+    from = QuoteNode(from)
+
+    # `into` must be written as a tuple `(b, c)` or a vector `[b, c]` of bare names.
+    # Fail with a clear message instead of an obscure error further down.
+    matched = @capture(into, (args__,)) || @capture(into, [args__])
+    if !matched
+        throw(ArgumentError("`into` must be a tuple or vector of column names, e.g. `(b, c)` or `[b, c]`."))
+    end
+
+    args = QuoteNode.(args)
+
+    # `df` and `sep` are escaped so that they are evaluated in the caller's scope;
+    # without `esc`, a separator held in a caller variable would not be found.
+    return quote
+        separate($(esc(df)), $from, [$(args...)], $(esc(sep)))
+    end
+end
+
+
+"""
+    unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol}, sep="_")
+
+Return a copy of `df` with an additional column `new_col_name` whose values are the
+row-wise concatenation of the columns `cols`, joined by `sep`. `missing` values are
+skipped when joining. The source columns are kept.
+
+# Arguments
+- `df`: the source DataFrame (not modified)
+- `new_col_name`: name of the column that receives the combined values
+- `cols`: names of the columns to combine, in order
+- `sep`: the string or character placed between values (defaults to `"_"`)
+"""
+function unite(df::DataFrame, new_col_name::Symbol, cols::Vector{Symbol},
+               sep::Union{AbstractString, AbstractChar}="_")
+    new_df = df[:, :]
+    # `df[!, cols]` selects the source columns without copying their data; the rows
+    # are only read here, so no copy is needed.
+    new_df[:, new_col_name] = [join(skipmissing(row), sep) for row in eachrow(df[!, cols])]
+    return new_df
+end
+
+"""
+$docstring_unite
+"""
+macro unite(df, new_col, from_cols, sep)
+    # The new column name is written as a bare identifier; quote it so that it reaches
+    # `unite` as a Symbol.
+    new_col = QuoteNode(new_col)
+
+    # `from_cols` must be written as a tuple `(b, c)` or a vector `[b, c]` of bare names.
+    # Fail with a clear message instead of an obscure error further down.
+    matched = @capture(from_cols, (args__,)) || @capture(from_cols, [args__])
+    if !matched
+        throw(ArgumentError("`from_cols` must be a tuple or vector of column names, e.g. `(b, c)` or `[b, c]`."))
+    end
+
+    args = QuoteNode.(args)
+
+    # `df` and `sep` are escaped so that they are evaluated in the caller's scope;
+    # without `esc`, a separator held in a caller variable would not be found.
+    return quote
+        unite($(esc(df)), $new_col, [$(args...)], $(esc(sep)))
+    end
+end
diff --git a/src/summary.jl b/src/summary.jl
@@ -0,0 +1,38 @@
+"""
+    summary_stats(df::DataFrame)
+
+Build a summary table with one row per numerical column of `df`.
+
+Each row holds the column name, minimum, first quartile, median, mean, third quartile,
+maximum, number of non-missing values, and number of missing values. Statistics are
+computed on the non-missing values only. Columns containing non-numeric values are
+skipped. A column with no non-missing values reports `missing` for every statistic.
+
+# Arguments
+- `df`: the DataFrame to summarize
+
+# Returns
+A `DataFrame` with columns `Column`, `Min`, `Q1`, `Median`, `Mean`, `Q3`, `Max`,
+`Count`, and `Missing_Count`.
+"""
+function summary_stats(df::DataFrame)
+    # Only columns whose non-missing values are all numbers can be summarized; other
+    # columns are skipped rather than raising an error inside `quantile`.
+    colnames = [name for name in names(df)
+                if all(x -> ismissing(x) || x isa Number, df[!, name])]
+    nonmissing = [collect(skipmissing(df[!, name])) for name in colnames]
+
+    # A column without any non-missing value has no statistics; report `missing` for it
+    # instead of failing on a reduction over an empty collection.
+    stat(f) = [isempty(v) ? missing : f(v) for v in nonmissing]
+
+    return DataFrame(
+        Column = colnames,
+        Min = stat(minimum),
+        Q1 = stat(v -> quantile(v, 0.25)),
+        Median = stat(median),
+        Mean = stat(mean),
+        Q3 = stat(v -> quantile(v, 0.75)),
+        Max = stat(maximum),
+        Count = length.(nonmissing),
+        Missing_Count = [count(ismissing, df[!, name]) for name in colnames],
+    )
+end
+
+"""
+$docstring_summary
+"""
+macro summary(df, cols...)
+    # Without column arguments, summarize the data frame as given.
+    isempty(cols) && return quote
+        summary_stats($(esc(df)))
+    end
+
+    # Otherwise translate each tidy column expression and summarize only the
+    # selected columns.
+    tidy_exprs = [parse_tidy(col) for col in cols]
+    return quote
+        _selected_df = select($(esc(df)), $(tidy_exprs...))
+        summary_stats(_selected_df)
+    end
+end
+