From 68adb57226896823af01eb63c0fd28ac3c1d5292 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Mon, 7 Aug 2023 00:49:58 -0400 Subject: [PATCH 1/6] Convert Tidier.jl to a meta-package --- NEWS.md | 3 + Project.toml | 24 +- README.md | 136 +- docs/examples/UserGuide/across.jl | 36 - docs/examples/UserGuide/arrange.jl | 26 - docs/examples/UserGuide/autovec.jl | 44 - docs/examples/UserGuide/binding.jl | 29 - docs/examples/UserGuide/column_names.jl | 46 - docs/examples/UserGuide/conditionals.jl | 75 - docs/examples/UserGuide/dataset_movies.jl | 18 - docs/examples/UserGuide/distinct.jl | 23 - docs/examples/UserGuide/filter.jl | 64 - docs/examples/UserGuide/group_by.jl | 51 - docs/examples/UserGuide/interpolation.jl | 139 -- docs/examples/UserGuide/joins.jl | 31 - docs/examples/UserGuide/mutate_transmute.jl | 61 - docs/examples/UserGuide/pivots.jl | 47 - docs/examples/UserGuide/rename.jl | 25 - docs/examples/UserGuide/select.jl | 71 - docs/examples/UserGuide/slice.jl | 62 - docs/examples/UserGuide/summarize.jl | 44 - docs/mkdocs.yml | 22 +- docs/src/index.md | 151 +- src/Tidier.jl | 693 +------ src/binding.jl | 35 - src/clean_names.jl | 48 - src/compound_verbs.jl | 105 - src/conditionals.jl | 41 - src/docstrings.jl | 1991 ------------------- src/helperfunctions.jl | 8 - src/joins.jl | 107 - src/ntile.jl | 35 - src/parsing.jl | 433 ---- src/pivots.jl | 78 - src/pseudofunctions.jl | 27 - src/type_conversions.jl | 47 - test/Project.toml | 3 - test/runtests.jl | 4 +- 38 files changed, 92 insertions(+), 4791 deletions(-) delete mode 100644 docs/examples/UserGuide/across.jl delete mode 100644 docs/examples/UserGuide/arrange.jl delete mode 100644 docs/examples/UserGuide/autovec.jl delete mode 100644 docs/examples/UserGuide/binding.jl delete mode 100644 docs/examples/UserGuide/column_names.jl delete mode 100644 docs/examples/UserGuide/conditionals.jl delete mode 100644 docs/examples/UserGuide/dataset_movies.jl delete mode 100644 docs/examples/UserGuide/distinct.jl delete mode 
100644 docs/examples/UserGuide/filter.jl delete mode 100644 docs/examples/UserGuide/group_by.jl delete mode 100644 docs/examples/UserGuide/interpolation.jl delete mode 100644 docs/examples/UserGuide/joins.jl delete mode 100644 docs/examples/UserGuide/mutate_transmute.jl delete mode 100644 docs/examples/UserGuide/pivots.jl delete mode 100644 docs/examples/UserGuide/rename.jl delete mode 100644 docs/examples/UserGuide/select.jl delete mode 100644 docs/examples/UserGuide/slice.jl delete mode 100644 docs/examples/UserGuide/summarize.jl delete mode 100644 src/binding.jl delete mode 100644 src/clean_names.jl delete mode 100644 src/compound_verbs.jl delete mode 100644 src/conditionals.jl delete mode 100644 src/docstrings.jl delete mode 100644 src/helperfunctions.jl delete mode 100644 src/joins.jl delete mode 100644 src/ntile.jl delete mode 100644 src/parsing.jl delete mode 100644 src/pivots.jl delete mode 100644 src/pseudofunctions.jl delete mode 100644 src/type_conversions.jl diff --git a/NEWS.md b/NEWS.md index 66aa0bc..55a6af0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # Tidier.jl updates +## v1.0.0 - 2023-08-07 +- Convert Tidier.jl to a meta-package that only re-exports other Tidier packages + ## v0.7.7 - 2023-07-15 - Added documentation on how to interpolate variables inside of `for` loops. Note: `!!` interpolation doesn't work inside of `for` loops because macros are expanded during parsing and not at runtime. - Fixed bug in `parse_pivot_arg()` to enable interpolation inside of pivoting functions when used inside a `for` loop. 
diff --git a/Project.toml b/Project.toml index 44b10d7..9bc0098 100644 --- a/Project.toml +++ b/Project.toml @@ -1,24 +1,24 @@ name = "Tidier" uuid = "f0413319-3358-4bb0-8e7c-0c83523a93bd" authors = ["Karandeep Singh"] -version = "0.7.7" +version = "1.0.0" [deps] -Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" -Cleaner = "caabdcdb-0ab6-47cf-9f62-08858e44f38f" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" -ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +TidierCats = "79ddc9fe-4dbf-4a56-a832-df41fb326d23" +TidierData = "fe2206b3-d496-4ee9-a338-6a095c4ece80" +TidierDates = "20186a3f-b5d3-468e-823e-77aae96fe2d8" +TidierPlots = "337ecbd1-5042-4e2a-ae6f-ca776f97570a" +TidierStrings = "248e6834-d0f8-40ef-8fbb-8e711d883e9c" [compat] -Chain = "0.5" -Cleaner = "0.5.0" -DataFrames = "1.5" -MacroTools = "0.5" Reexport = "0.2, 1" -ShiftedArrays = "2.0.0" +TidierData = ">=0.9.2" +TidierPlots = ">=0.1.0" +TidierCats = ">=0.1.1" +TidierDates = ">=0.1.0" +TidierStrings = ">=0.1.0" + julia = "1.6" [extras] diff --git a/README.md b/README.md index 7f281e9..624464d 100644 --- a/README.md +++ b/README.md @@ -5,41 +5,41 @@ [![Build Status](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/Tidier&label=Downloads)](https://pkgs.genieframework.com?packages=Tidier) - - -## What is Tidier.jl? - -Tidier.jl is a 100% Julia implementation of the R tidyverse -mini-language in Julia. Powered by the DataFrames.jl package and Julia’s -extensive meta-programming capabilities, Tidier.jl is an R user’s love -letter to data analysis in Julia. 
- -`Tidier.jl` has three goals, which differentiate it from other data analysis -meta-packages in Julia: - -1. **Stick as closely to tidyverse syntax as possible:** Whereas other - meta-packages introduce Julia-centric idioms for working with - DataFrames, this package’s goal is to reimplement parts of tidyverse - in Julia. This means that `Tidier.jl` uses *tidy expressions* as opposed - to idiomatic Julia expressions. An example of a tidy expression is - `a = mean(b)`. - -2. **Make broadcasting mostly invisible:** Broadcasting trips up many R - users switching to Julia because R users are used to most functions - being vectorized. `Tidier.jl` currently uses a lookup table to decide - which functions *not* to vectorize; all other functions are - automatically vectorized. Read the documentation page on "Autovectorization" - to read about how this works, and how to override the defaults. - -3. **Make scalars and tuples mostly interchangeable:** In Julia, the function - `across(a, mean)` is dispatched differently than `across((a, b), mean)`. - The first argument in the first instance above is treated as a scalar, - whereas the second instance is treated as a tuple. This can be very confusing - to R users because `1 == c(1)` is `TRUE` in R, whereas in Julia `1 == (1,)` - evaluates to `false`. The design philosophy in `Tidier.jl` is that the user - should feel free to provide a scalar or a tuple as they see fit anytime - multiple values are considered valid for a given argument, such as in - `across()`, and `Tidier.jl` will figure out how to dispatch it. + + +## Tidier.jl + +Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. Similar to the R tidyverse, Tidier.jl re-exports several other packages, each focusing on a specific set of functionalities. + + + +## TidierData.jl + +TidierData.jl is a package dedicated to data transformation and reshaping, powered by DataFrames.jl, ShiftedArrays.jl, and Cleaner.jl. 
It focuses on functionality within the dplyr, tidyr, and janitor R packages. + + + +## TidierPlots.jl + +TidierPlots.jl is a package dedicated to plotting, powered by AlgebraOfGraphics.jl. It focuses on functionality within the ggplot2 R package. + + + +## TidierCats.jl + +TidierCats.jl is a package dedicated to handling categorical variables, powered by CategoricalArrays.jl. It focuses on functionality within the forcats R package. + + + +## TidierDates.jl + +TidierDates.jl is a package dedicated to handling dates and times. It focuses on functionality within the lubridate R package. + + + +## TidierStrings.jl + +TidierStrings.jl is a package dedicated to handling strings. It focuses on functionality within the stringr R package. ## Installation @@ -72,74 +72,10 @@ using Pkg Pkg.add(url="https://github.com/TidierOrg/Tidier.jl") ``` -## What functions does Tidier.jl support? - -To support R-style programming, Tidier.jl is implemented using macros. - -Tidier.jl currently supports the following top-level macros: - -- `@glimpse()` -- `@select()`, `@rename()`, and `@distinct()` -- `@mutate()` and `@transmute()` -- `@summarize()` and `@summarise()` -- `@filter()` and `@slice()` -- `@group_by()` and `@ungroup()` -- `@arrange()` -- `@pull()` -- `@count()` and `@tally()` -- `@left_join()`, `@right_join()`, `@inner_join()`, and `@full_join()` -- `@bind_rows()` and `@bind_cols()` -- `@pivot_wider()` and `@pivot_longer()` -- `@drop_na()` -- `@clean_names()` (as in R's `janitor::clean_names()` function) - -Tidier.jl also supports the following helper functions: - -- `across()` -- `desc()` -- `if_else()` and `case_when()` -- `n()` and `row_number()` -- `ntile()` -- `lag()` and `lead()` -- `starts_with()`, `ends_with()`, `matches()`, and `contains()` -- `as_float()`, `as_integer()`, and `as_string()` - -See the documentation [Home](https://tidierorg.github.io/Tidier.jl/dev/) page for a guide on how to get started, or the 
[Reference](https://tidierorg.github.io/Tidier.jl/dev/reference/) page for a detailed guide to each of the macros and functions. - -## Example - -Let's select the first five movies in our dataset whose budget exceeds the mean budget. Unlike in R, where we pass an `na.rm = TRUE` argument to remove missing values, in Julia we wrap the variable with a `skipmissing()` to remove the missing values before the `mean()` is calculated. - -```julia -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -@chain movies begin - @mutate(Budget = Budget / 1_000_000) - @filter(Budget >= mean(skipmissing(Budget))) - @select(Title, Budget) - @slice(1:5) -end -``` - -``` -5×2 DataFrame - Row │ Title Budget - │ String Float64? -─────┼────────────────────────────────────── - 1 │ 'Til There Was You 23.0 - 2 │ 10 Things I Hate About You 16.0 - 3 │ 102 Dalmatians 85.0 - 4 │ 13 Going On 30 37.0 - 5 │ 13th Warrior, The 85.0 -``` - ## What’s new See [NEWS.md](https://github.com/TidierOrg/Tidier.jl/blob/main/NEWS.md) for the latest updates. ## What's missing -Is there a tidyverse feature missing that you would like to see in Tidier.jl? Please file a GitHub issue. Because Tidier.jl primarily wraps DataFrames.jl, our decision to integrate a new feature will be guided by how well-supported it is within DataFrames.jl and how likely other users are to benefit from it. \ No newline at end of file +Is there a tidyverse feature missing that you would like to see in Tidier.jl? Please file a GitHub issue. \ No newline at end of file diff --git a/docs/examples/UserGuide/across.jl b/docs/examples/UserGuide/across.jl deleted file mode 100644 index 9d75759..0000000 --- a/docs/examples/UserGuide/across.jl +++ /dev/null @@ -1,36 +0,0 @@ -# `across()` is a helper function that is typically used inside `@mutate()` or `@summarize` to operate on multiple columns and/or multiple functions. Notice that `across()` accepts two arguments, a set of variables and a set of functions. 
If providing multiple variables or functions, these should be provided as a tuple -- in other words, wrapped in parentheses and separated by commas. If you want to skip missing values, you can "fuse" the summary function (such as `mean()`) with the `skipmissing()` function by using the fuction fusion operator, which you can type out in Julia by typing `\circ` and then pressing `[Tab]` such that it reads `mean∘skipmissing`. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# ## One variable, one function - -@chain movies begin - @mutate(Budget = Budget / 1_000_000) - @summarize(across(Budget, mean∘skipmissing)) -end - -# ## One variable, one anonymous function - -@chain movies begin - @mutate(Budget = Budget / 1_000_000) - @summarize(across(Budget, (x -> mean(skipmissing(x))))) -end - -# Note: compound functions are not correctly supported inside of anonymous functions. As of right now, the above function works, but `(x -> mean∘skipmissing(x))` does not work. This is a known bug and will be fixed in a future update. - -# ## Multiple variables, multiple functions - -@chain movies begin - @mutate(Budget = Budget / 1_000_000) - @summarize(across((Rating, Budget), (mean∘skipmissing, median∘skipmissing))) -end - -# ## Multiple selection helpers, multiple functions - -@chain movies begin - @mutate(Budget = Budget / 1_000_000) - @summarize(across((starts_with("Bud"), ends_with("ting")), (mean∘skipmissing, median∘skipmissing))) -end \ No newline at end of file diff --git a/docs/examples/UserGuide/arrange.jl b/docs/examples/UserGuide/arrange.jl deleted file mode 100644 index eceea8c..0000000 --- a/docs/examples/UserGuide/arrange.jl +++ /dev/null @@ -1,26 +0,0 @@ -# Arranging is the way to sort a data frame. `@arrange()` can take multiple arguments. Arguments refer to columns that are sorted in ascending order by default. If you want to sort in descending order, make sure to wrap the column name in `desc()` as shown below. 
- -# `DataFrames.jl` does not currently support the `sort()` function on grouped data frames. In order to make this work in `Tidier.jl`, if you apply `@arrange()` to a GroupedDataFrame, `@arrange()` will temporarily ungroup the data, perform the `sort()`, and then re-group by the original grouping variables. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# ## Sort both variables in ascending order - -@chain movies begin - @arrange(Year, Rating) - @select(1:5) - @slice(1:5) -end - -# ## Sort in a mix of ascending and descending order - -# To sort in descending order, make sure to wrap the variable inside of `desc()`. - -@chain movies begin - @arrange(Year, desc(Rating)) - @select(1:5) - @slice(1:5) -end \ No newline at end of file diff --git a/docs/examples/UserGuide/autovec.jl b/docs/examples/UserGuide/autovec.jl deleted file mode 100644 index 724c12a..0000000 --- a/docs/examples/UserGuide/autovec.jl +++ /dev/null @@ -1,44 +0,0 @@ -# In general, Tidier.jl uses a lookup table to decide which functions *not* to vectorize. For example, `mean()` is listed as a function that should never be vectorized. Also, any function used inside of `@summarize()` is also never automatically vectorized. Any function that is not included in this list *and* is used in a context other than `@summarize()` is automatically vectorized. - -# This "auto-vectorization" makes working with Tidier.jl more R-like and convenient. However, if you ever define your own function and try to use it, Tidier.jl may unintentionally vectorize it for you. To prevent auto-vectorization, you can prefix your function with a `~`. - -using Tidier -using RDatasets - -df = DataFrame(a = repeat('a':'e', inner = 2), b = [1,1,1,2,2,2,3,3,3,4], c = 11:20) - -# For example, let's define a function `new_mean()` that calculates a mean. - -new_mean(exprs...) = mean(exprs...) - -# If we try to use `new_mean()` inside of `@mutate()`, it will give us the wrong result. 
This is because `new_mean()` is vectorized, which results in the mean being calculated element-wise, which is almost never what we actually want. - -@chain df begin - @mutate(d = c - new_mean(c)) -end - -# To prevent `new_mean()` from being vectorized, we need to prefix it with a `~` like this: - -@chain df begin - @mutate(d = c - ~new_mean(c)) -end - -# This gives us the correct answer. Notice that adding a `~` is not needed with `mean()` because `mean()` is already included on our look-up table of functions not requiring vectorization. - -@chain df begin - @mutate(d = c - mean(c)) -end - -# If you're not sure if a function is vectorized and want to prevent it from being vectorized, you can always prefix it with a ~ to prevent vectorization. Even though `mean()` is not vectorized anyway, prefixing it with a ~ will not cause any harm. - -@chain df begin - @mutate(d = c - ~mean(c)) -end - -# If for some crazy reason, you *did* want to vectorize `mean()`, you are always allowed to vectorize it, and Tidier.jl won't un-vectorize it. - -@chain df begin - @mutate(d = c - mean.(c)) -end - -# Note: `~` also works with operators, so if you want to *not* vectorize an operator, you can prefix it with `~`, for example, `a ~* b` will perform a matrix multiplication rather than element-wise multiplication. Remember that this is only needed outside of `@summarize()` because `@summarize()` never performs auto-vectorization. \ No newline at end of file diff --git a/docs/examples/UserGuide/binding.jl b/docs/examples/UserGuide/binding.jl deleted file mode 100644 index decbaa1..0000000 --- a/docs/examples/UserGuide/binding.jl +++ /dev/null @@ -1,29 +0,0 @@ -# Whereas joins are useful for combining data frames based on matching keys, another way to combine data frames is to bind them together, which can be done either by rows or by columns. `Tidier.jl` implements these actions using `@bind_rows()` and `@bind_cols()`, respectively. - -# Let's generate three data frames to combine. 
- -using Tidier - -df1 = DataFrame(a=1:3, b=1:3); - -df2 = DataFrame(a=4:6, b=4:6); - -df3 = DataFrame(a=7:9, c=7:9); - -# ## `@bind_rows()` - -@bind_rows(df1, df2) - -# `@bind_rows()` keeps columns that are present in at least one of the provided data frames. Any missing columns will be filled with `missing` values. - -@bind_rows(df1, df3) - -# There is an optional `id` argument to add an identifier for combined data frames. Note that both `@bind_rows` and `@bind_cols` accept multiple (i.e., more than 2) data frames, as in the example below. - -@bind_rows(df1, df2, df3, id = "id") - -# ## `@bind_cols()` - -# `@bind_cols` works similarly to R's `tidyverse` although the `.name_repair` argument is not supported. - -@bind_cols(df1, df2) \ No newline at end of file diff --git a/docs/examples/UserGuide/column_names.jl b/docs/examples/UserGuide/column_names.jl deleted file mode 100644 index 906e9f8..0000000 --- a/docs/examples/UserGuide/column_names.jl +++ /dev/null @@ -1,46 +0,0 @@ -# When referring to column names, Tidier.jl is a bit unusual for a Julia package in that it does not use symbols. This is because Tidier.jl uses *tidy expressions*, which in R lingo equates to a style of programming referred to as "non-standard evaluation." If you are creating a new column `a` containing a value that is the mean of column `b`, you would simply write `a = mean(b)`. - -# However, there may be times when you wish to create or refer to a column containing a space in it. Let's start by creating some column names containing a space in their name. - -using Tidier - -df = DataFrame(var"my name" = ["Ada", "Twist"], - var"my age" = [40, 50]) - -# To create a column name containing a space, we used the `var"column name"` notation. Because `DataFrame()` is a regular Julia function, this is the standard way to refer to a variable containing a space, which is why we need to use this here. - -# This notation *also* works inside of Tidier.jl. 
- -# ## `var"column name"` notation - -# If we want to figure out the age for the people in our dataset a decade from today, we could use this same `var"column name"` notation inside of `@mutate`. - -@chain df begin - @mutate(var"age in 10 years" = var"my age" + 10) -end - -# However, typing out the `var"column name"` can become cumbersome. Tidier.jl also supports another shorthand notation to refer to column names containing spaces or other special characters: backticks. - -# ## Backtick notation - -# This same code could be written more concisely like this: - -@chain df begin - @mutate(`age in 10 years` = `my age` + 10) -end - -# Backticks are an R convention. While they are not specific to tidyverse, they are a convenient way to refer to column names that otherwise would not parse correctly as a single entity. Backticks are supported in *all* Tidier.jl functions where column names may be referenced. - -# ## Cleaning up column names - -# Another option is to clean up the column names so that you do not have spaces to begin with. In R, this is usually accomplished using the `janitor` package. In Julia, the Cleaner.jl package provides this functionality, which we have wrapped inside of Tidier.jl. - -@chain df begin - @clean_names -end - -# Although the default value for the `case` argument is "snake_case", you can also set this to "camelCase". - -@chain df begin - @clean_names(case = "camelCase") -end \ No newline at end of file diff --git a/docs/examples/UserGuide/conditionals.jl b/docs/examples/UserGuide/conditionals.jl deleted file mode 100644 index 303a964..0000000 --- a/docs/examples/UserGuide/conditionals.jl +++ /dev/null @@ -1,75 +0,0 @@ -# Conditional functions are a useful tool to update or create new columns conditional on the values of a column of data. When continuous variables are converted to categories, this is sometimes referred to as "recoding" a column. - -# Tidier.jl provides two functions to recode data: `if_else()` and `case_when()`. 
- -# ## `if_else()` - -# Why do we need another `if_else()` function if base Julia already comes with an `ifelse()` function. Similar to R, the base Julia implementation of `if_else()` does not include a way to designate what value to return if the enclosed vector contains a missing value. Additionally, the base Julia implementation of `ifelse()` produces an error if presented with a `missing` value in the condition. The Tidier.jl `if_else()` can handle missing values and includes an optional 4th argument that is used to designate what to return in the event of a `missing`` value for the condition. Let's take a look at some examples. - -using Tidier - -df = DataFrame(a = [1, 2, missing, 4, 5]) - -# Here, we have created a `DataFrame` containing a single column `a` with 5 values, for which the 3rd value is missing. - -# Now, let's create a new column `b` that contains a "yes" if `a` is greater than or equal to 3, and a "no" otherwise. Notice that when we do this, the `missing` values remains as `missing`. - -@chain df begin - @mutate(b = if_else(a >= 3, "yes", "no")) -end - -# What if we wanted to fill in the missing value with "unknown"? All we need to do is provide an optional 4th argument containing the value to return in the event of a missing condition. When we run this version, `missing` values in `a` are converted to "unknown" in `b`. - -@chain df begin - @mutate(b = if_else(a >= 3, "yes", "no", "unknown")) -end - -# Although both of these examples showed how to return a single value (like "yes" and "no"), you can also return a vector of values, which is useful for updating only a subset of the values of a column. For example, if we wanted to create a column `b` that contains a 3 when `a` is greater than or equal to 3 but otherwise remains unchanged, we could provide a 3 for the `yes` condition and a vector (column) `a` in the `no` condition. If we do not provide the optional 4th argument, `missing` values remain `missing`. 
- -@chain df begin - @mutate(b = if_else(a >= 3, 3, a)) -end - -# ## `case_when()` - -# Although `if_else()` is convenient when evaluating a single condition, it can be cumbersome when evaluating multiple conditions because subsequent conditions need to be nested within the `no` condition for the preceding argument. For situations where multiple conditions need to be evaluated, `case_when()` is more convenient. - -# Let's first consider a similar example from above and recreate it using `case_when()`. The following code creates a column `b` that assigns a value if 3 if `a >= 3` and otherwise leaves the value unchanged. - -@chain df begin - @mutate(b = case_when(a >= 3 => 3, - true => a)) -end - -# What is going on here? `case_when()` uses a `condition => return_value` syntax, which are encoded as pairs in Julia. You can provide a single pair, or multiple pairs separated by commas. Because the pairs operator (`=>`) might be confused with a greater than or equal to sign (`>=`), we have padded two spaces on either side of the `=>` to make sure that the pair remains visually distinct. We do not use a `~` operator in `case_when()` (as is used in R) because the `~` operator is used to denote de-vectorized functions in Tidier.jl. - -# There are 2 other things to note above. First, the `true` condition evaluates to `true` for all remaining values of `a`. The only reason that the `b` contains a `missing` value here is that the `true` condition was met, leading to the value of `a` (in this case, `missing`) to be assigned to `b`. Second, we were able to return a single value (3) in the first condition, and a vector (column) of data (`a`) in the second condition. - -# What if we wanted to fill in the missing values with something else? In this case, we would need to create an explicit condition that checks for missing values and assigns a return value to that condition. 
- -@chain df begin - @mutate(b = case_when(a >= 3 => 3, - ismissing(a) => 0, - true => a)) -end - -# Do our conditions have to be mutually exclusive? No. The return value for the *first* matching condition is assigned to `b` because the conditions are evaluated sequentially from first to last. - -@chain df begin - @mutate(b = case_when(a > 4 => "hi", - a > 2 => "medium", - a > 0 => "low")) -end - -# Again, if we want to fill in remaining values (which in this case are the `missing` ones), we can map the final condition `true` to the value of "unknown". Because the ordering of the conditions matters, the `true` condition should always be listed last if it is included. - -@chain df begin - @mutate(b = case_when(a > 4 => "hi", - a > 2 => "medium", - a > 0 => "low", - true => "unknown")) -end - -# ## Do these functions work outside of Tidier.jl? - -# Yes, both `if_else()` and `case_when()` work outside of Tidier.jl. However, you'll need to remember that if working with vectors, both the functions and conditions will need to be vectorized, and in the case of `case_when()`, the `=>` will need to be written as `.=>`. The reason this is not needed when using these functions inside of Tidier.jl is because they are auto-vectorized. \ No newline at end of file diff --git a/docs/examples/UserGuide/dataset_movies.jl b/docs/examples/UserGuide/dataset_movies.jl deleted file mode 100644 index 03ce894..0000000 --- a/docs/examples/UserGuide/dataset_movies.jl +++ /dev/null @@ -1,18 +0,0 @@ -# To get started, we will load the `movies` dataset from the `RDatasets.jl` package. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# To work with this dataset, we will use the `@chain` macro. This macro initiates a pipe, and every function or macro provided to it between the `begin` and `end` blocks modifies the dataframe mentioned at the beginning of the pipe. 
You don't have to necessarily spread a chain over multiple lines of code, but when working with data frames it's often easiest to do so. Before going further, take a look at the [Chain.jl GitHub page](https://github.com/jkrumbiegel/Chain.jl) to see all the cool things that are possible with this, including mid-chain side effects using `@aside` and mid-chain assignment of variables. - -# Let's take a look at the first 5 rows of the `movies` dataset using `@slice()`. - -@chain movies begin - @slice(1:5) -end - -# Let's use `@glimpse()` to preview the dataset. - -@glimpse(movies) \ No newline at end of file diff --git a/docs/examples/UserGuide/distinct.jl b/docs/examples/UserGuide/distinct.jl deleted file mode 100644 index 5f54592..0000000 --- a/docs/examples/UserGuide/distinct.jl +++ /dev/null @@ -1,23 +0,0 @@ -# The `@distinct()` macro in `Tidier.jl` is useful to select distinct rows. Like it's R counterpart, it can be used with or without arguments. When arguments are provided, it behaves slightly differently than the R version. Whereas the R function only returns the provided columns, the Tidier.jl version returns all columns, where the first match is returned for the non-selected columns. - -using Tidier - -df = DataFrame(a = 1:10, b = repeat('a':'e', inner = 2)) - -# ## Select distinct values overall - -# Since there are no duplicate rows, this will return all rows. - -@chain df begin - @distinct() -end - -# ## Select distinct values based on column `b` - -# Notice that the first matching row for column `a` is returned for every distinct value of column `b`. This is slightly different behavior than R's tidyverse, which would have returned only column `b`. - -@chain df begin - @distinct(b) -end - -# In Tidier.jl, `@distinct()` works with grouped data frames. If grouped, `@distinct()` will ignore the grouping when determining distinct values but will return the data frame in grouped form based on the original groupings. 
\ No newline at end of file diff --git a/docs/examples/UserGuide/filter.jl b/docs/examples/UserGuide/filter.jl deleted file mode 100644 index 2b59d69..0000000 --- a/docs/examples/UserGuide/filter.jl +++ /dev/null @@ -1,64 +0,0 @@ -# Filtering is a mechanism to indicate which rows you want to keep in a dataset based on criteria. This is also referred to as subsetting. Filtering rows is normally a bit tricky in `DataFrames.jl` because comparison operators like `>=` actually need to be vectorized as `.>=`, which can catch new Julia users by surprise. `@filter()` mimics R's `tidyverse` behavior by auto-vectorizing the code and then only selecting those rows that evaluate to `true`. Similar to `dplyr`, rows that evaluate to `missing` are skipped. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# ## Let’s take a look at the movies whose budget was more than average. We will select only the first 5 rows for the sake of brevity. - -@chain movies begin - @mutate(Budget = Budget / 1_000_000) - @filter(Budget >= mean(skipmissing(Budget))) - @select(Title, Budget) - @slice(1:5) -end - -# ## Let's search for movies that have at least 200 votes and a rating of greater than or equal to 8. There are 3 ways you can specify an "and" condition inside of `Tidier.jl`. - -# ### The first option is to use the short-circuiting `&&` operator as shown below. This is the preferred approach because the second expression is only evaluated (per element) if the first one is true. - -@chain movies begin - @filter(Votes >= 200 && Rating >= 8) - @select(Title, Votes, Rating) - @slice(1:5) -end - -# ### The second option is to use the bitwise `&` operator. Note that there is a key difference in syntax between `&` and `&&`. Because the `&` operator takes a higher operator precedence than `>=`, you have to wrap the comparison expressions inside of parentheses to ensure that the overall expression is evaluated correctly. 
- -@chain movies begin - @filter((Votes >= 200) & (Rating >= 8)) - @select(Title, Votes, Rating) - @slice(1:5) -end - -# ### The third option for "and" conditions only is to separate the expressions with commas. This is similar to the behavior of `filter()` in `tidyverse`. - -@chain movies begin - @filter(Votes >= 200, Rating >= 8) - @select(Title, Votes, Rating) - @slice(1:5) -end - -# ## Now let's see how to use `@filter()` with `in`. Here's an example with a tuple. - -@chain movies begin - @filter(Title in ("101 Dalmatians", - "102 Dalmatians")) - @select(1:5) -end - -# ## We can also use `@filter()` with `in` using a vector, denoted by a `[]`. - -@chain movies begin - @filter(Title in ["101 Dalmatians", - "102 Dalmatians"]) - @select(1:5) -end - -# ## Finally, we can combine `@filter` with `row_number()` to retrieve the first 5 rows, which can be used to mimic the functionality provided by `@slice`. - -@chain movies begin - @filter(row_number() <= 5) - @select(1:5) -end \ No newline at end of file diff --git a/docs/examples/UserGuide/group_by.jl b/docs/examples/UserGuide/group_by.jl deleted file mode 100644 index 4906540..0000000 --- a/docs/examples/UserGuide/group_by.jl +++ /dev/null @@ -1,51 +0,0 @@ -# Grouping and ungrouping behavior is one of the nicest parts of using R's tidyverse. Once a data frame is grouped, *all* verbs applied to that data frame respect the grouping, including but not limited to `@mutate()`, `@summarize()`, `@slice()` and `@filter`, which allows for really powerful abstractions. For example, with `@group_by()` followed by `@filter()`, you can limit the rows of a dataset to the maximum or minimum values for each group. - -# Exactly as in R's `tidyverse`, once a data frame is grouped, it remains grouped until either `@summarize()` is called (which "peels off" one layer of grouping) or `@ungroup()` is called, which removes all layers of grouping. Also as in R's `tidyverse`, `@group_by()` sorts the groups in ascending order. 
Unlike in R, there is never any question about whether a data frame is currently grouped because GroupedDataFrames print out in a *very* different form than DataFrames, making them easy to tell apart. - -# When using `@chain`, note that you can write either `@ungroup` or `@ungroup()`. Both are considered valid. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# ## Combining `@group_by()` with `@mutate()` - -@chain movies begin - @group_by(Year) - @mutate(Mean_Yearly_Rating = mean(skipmissing(Rating))) - @select(Year, Rating, Mean_Yearly_Rating) - @ungroup - @slice(1:5) -end - -# ## Combining @group_by() with @summarize() - -@chain movies begin - @group_by(Year) - @summarize(Mean_Yearly_Rating = mean(skipmissing(Rating)), - Median_Yearly_Rating = median(skipmissing(Rating))) - @slice(1:5) -end - -# ## Grouping by multiple columns - -@chain movies begin - @group_by(Year, Comedy) - @summarize(Mean_Yearly_Rating = mean(skipmissing(Rating)), - Median_Yearly_Rating = median(skipmissing(Rating))) - @ungroup # Need to ungroup to peel off grouping by Year - @arrange(desc(Year), Comedy) - @slice(1:5) -end - -# ## Combining @group_by() with @filter() - -@chain movies begin - @group_by(Year) - @filter(Rating == minimum(Rating)) - @ungroup - @select(Year, Rating) - @arrange(desc(Year)) - @slice(1:10) -end \ No newline at end of file diff --git a/docs/examples/UserGuide/interpolation.jl b/docs/examples/UserGuide/interpolation.jl deleted file mode 100644 index 2054a01..0000000 --- a/docs/examples/UserGuide/interpolation.jl +++ /dev/null @@ -1,139 +0,0 @@ -# The `!!` ("bang bang") operator can be used to interpolate values of variables from the global environment into your code. This operator is borrowed from the R `rlang` package. At some point, we may switch to using native Julia interpolation, but for a variety of reasons that introduce some complexity with native interpolation, we plan to continue to support `!!` interpolation. 
- -# To interpolate multiple variables, the `rlang` R package uses the `!!!` "triple bang" operator. However, in `Tidier.jl`, the `!!` "bang bang" operator can be used to interpolate either single or multiple values as shown in the examples below. - -# Since the `!!` operator can only access variables in the global environment, we will set these variables in a somewhat roundabout way for the purposes of documentation. However, in interactive use, you can simply write `myvar = :b` instead of wrapping this code inside of an `@eval()` macro as is done here. - -# Note: `myvar = :b`, `myvar = (:a, :b)`, and `myvar = [:a, :b]` all refer to *columns* with those names. On the other hand, `myvar = "b"`, `myvar = ("a", "b")` and `myvar = ["a", "b"]` will interpolate those *values*. See below for examples. - -using Tidier - -df = DataFrame(a = string.(repeat('a':'e', inner = 2)), - b = [1,1,1,2,2,2,3,3,3,4], - c = 11:20) - -# ## Select the column (because `myvar` contains a symbol) - -@eval(Main, myvar = :b) - -@chain df begin - @select(!!myvar) -end - -# ## Select multiple variables (tuple of symbols) - -@eval(Main, myvars_tuple = (:a, :b)) - -@chain df begin - @select(!!myvars_tuple) -end - -# ## Select multiple variables (vector of symbols) - -@eval(Main, myvars_vector = [:a, :b]) - -@chain df begin - @select(!!myvars_vector) -end - -# ## Filter rows containing the *value* of `myvar_string` (because `myvar_string` does) - -@eval(Main, myvar_string = "b") - -@chain df begin - @filter(a == !!myvar_string) -end - -# ## Filtering rows works similarly using `in`. - -# Note that for `in` to work here, we have to wrap it in `[]` because otherwise, the string will be converted into a collection of characters, which are a different data type. - -@eval(Main, myvar_string = "b") - -@chain df begin - @filter(a in [!!myvar_string]) -end - -# ## You can also use this for a tuple or vector of strings. 
- -@eval(Main, myvars_string = ("a", "b")) - -@chain df begin - @filter(a in !!myvars_string) -end - -# ## Mutate one variable - -@eval(Main, myvar = :b) - -@chain df begin - @mutate(!!myvar = !!myvar + 1) -end - -# ## Summarize across one variable - -@eval(Main, myvar = :b) - -@chain df begin - @summarize(across(!!myvar, mean)) -end - -# ## Summarize across multiple variables - -@eval(Main, myvars_tuple = (:b, :c)) - -@chain df begin - @summarize(across(!!myvars_tuple, (mean, minimum, maximum))) -end - -# ## Group by multiple interpolated variables - -@eval(Main, myvars_tuple = (:a, :b)) - -@chain df begin - @group_by(!!myvars_tuple) - @summarize(c = mean(c)) -end - -# ## Global constants - -# Because global constants like `pi` exist in the `Main` module, they can also be accessed using interpolation. For example, let's calculate the area of circles with a radius of 1 up to 5. - -df = DataFrame(radius = 1:5) - -# We can interpolate `pi` (from the `Main` module) to help with this. - -@chain df begin - @mutate(area = !!pi * radius^2) -end - -# ## Alternative interpolation syntax - -# While interpolation using `!!` is concise and handy, it's not required. You can also access user-defined globals and global constant variables using the following syntax: - -@chain df begin - @mutate(area = Main.pi * radius^2) -end - -# The key lesson with interpolation is that any bare unquoted variable is assumed to refer to a column name in the DataFrame. If you are referring to any variable outside of the DataFrame, you need to either use `!!variable` or `Main.variable` syntax to refer to this variable. - -# ## There's one other situation when `!!` interpolation may not work correctly: inside a `for` loop. - -# This is only a problem if the variable being interpolated using `!!` is the iterator. 
Because macros as expanded during *parsing* of the code (before it is compiled), the expanded code contains the last value of the global variable *before* the loop is run and does not update with each iteration of the loop. - -# To get around this, we can use `@eval(Main, variable)` inside our code, where `variable` refers to the iterator. Let's show a simple example of this where we print out each column one at a time using a `for` loop. - -# We first need to initialize the global variable using `global_col = Symbol()`. - -# ```julia -# global_col = Symbol() -# for col in [:a, :b, :c] -# global global_col = col -# @chain df begin -# @select(@eval(Main, global_col)) -# println -# end -# end -# ``` - -# The reason this works is because the `@eval()` macro inside `@select()` is not evaluated right away (unlike `!!`) but rather is evaluated at a later stage and thus is updated with each iteration. Instead of using the `@eval()` macro, we could instead have instead written `Main.eval(:global_col)`, which is functionally the same. \ No newline at end of file diff --git a/docs/examples/UserGuide/joins.jl b/docs/examples/UserGuide/joins.jl deleted file mode 100644 index ebaadba..0000000 --- a/docs/examples/UserGuide/joins.jl +++ /dev/null @@ -1,31 +0,0 @@ -# One really nice thing about the R `tidyverse` implementation of joins is that they support natural joins. If you don't specify which columns to join on, these column names are inferred from the overlapping columns. While you can override this behavior by specifying which columns to join on, it's convenient that this is not strictly required. We have adopted a similar approach to joins in `Tidier.jl`. - -# Here, we will *only* show examples of natural joins. For additional ways to join, take a look at the examples in the [Reference](https://TidierOrg.github.io/Tidier.jl/dev/reference/). - -using Tidier - -# Let's generate two data frames to join on. Here's the first one. 
- -df1 = DataFrame(a = ["a", "b"], b = 1:2); - -# And here's the second one. - -df2 = DataFrame(a = ["a", "c"], c = 3:4); - -# All the joins work similarly to R's `tidyverse` although the new `join_by` syntax for non-equijoins is not (yet) supported. - -# ## Left join - -@left_join(df1, df2) - -# ## Right join - -@right_join(df1, df2) - -# ## Inner join - -@inner_join(df1, df2) - -# ## Full join - -@full_join(df1, df2) \ No newline at end of file diff --git a/docs/examples/UserGuide/mutate_transmute.jl b/docs/examples/UserGuide/mutate_transmute.jl deleted file mode 100644 index 97f6f83..0000000 --- a/docs/examples/UserGuide/mutate_transmute.jl +++ /dev/null @@ -1,61 +0,0 @@ -# The primary purpose of `@mutate()` is to either create a new column or to update an existing column *without* changing the number of rows in the dataset. If you only plan to select the mutated columns, then you can use `@transmute()` instead of `@mutate()`. However, in `Tidier.jl`, `@select()` can also be used to create and select new columns (unlike R's `tidyverse`), which means that `@transmute()` is a redundant function in that it has the same functionality as `@select()`. `@transmute` is included in `Tidier.jl` for convenience but is not strictly required. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# ## Using `@mutate()` to add a new column - -# Let's create a new column that contains the budget for each movie expressed in millions of dollars, and the select a handful of columns and rows for the sake of brevity. Notice that the underscores in in `1_000_000` are strictly optional and included only for the sake of readability. Underscores within numbers are ignored by Julia, such that `1_000_000` is read by Julia exactly the same as `1000000`. 
- -@chain movies begin - @filter(!ismissing(Budget)) - @mutate(Budget_Millions = Budget/1_000_000) - @select(Title, Budget, Budget_Millions) - @slice(1:5) -end - -# ## Using `@mutate()` to update an existing column - -# Here we will repeat the same exercise, except that we will overwrite the existing `Budget` column. - -@chain movies begin - @filter(!ismissing(Budget)) - @mutate(Budget = Budget/1_000_000) - @select(Title, Budget) - @slice(1:5) -end - -# ## Using `@mutate()` with `in` - -# Here's an example of using `@mutate` with `in`. - -@chain movies begin - @filter(!ismissing(Budget)) - @mutate(Nineties = Year in 1990:1999) - @select(Title, Year, Nineties) - @slice(1:5) -end - -# ## Using `@mutate` with `n()` and `row_number()` - -# Here's an example of using `@mutate` with both `n()` and `row_number()`. Within the context of `mutate()`, `n()` and `row_number()` are created into temporarily columns, which means that they can be used inside of expressions. - -@chain movies begin - @mutate(Row_Num = row_number(), - Total_Rows = n()) - @filter(!ismissing(Budget)) - @select(Title, Year, Row_Num, Total_Rows) - @slice(1:5) -end - -# ## Using `@transmute` to update *and* select columns. - -# If we knew we wanted to select only the `Title` and `Budget` columns, we could have also used`@transmute()`, which (again) is just an alias for `@select()`. - -@chain movies begin - @filter(!ismissing(Budget)) - @transmute(Title = Title, Budget = Budget/1_000_000) - @slice(1:5) -end \ No newline at end of file diff --git a/docs/examples/UserGuide/pivots.jl b/docs/examples/UserGuide/pivots.jl deleted file mode 100644 index d83627c..0000000 --- a/docs/examples/UserGuide/pivots.jl +++ /dev/null @@ -1,47 +0,0 @@ -# Pivoting a dataset is needed when information sitting inside of cell values needs to be converted into column names (to make the dataset wider) or vice verse (to make the dataset longer). 
Either action can be referred to as "reshaping" a dataset, and various frameworks refer to the actions as unstacking/stacking or spreading/gathering. In R's tidyverse, these actions are referred to as pivoting, where the two accompanying actions are `@pivot_wider()` and `@pivot_longer()`. - -# ## `@pivot_wider()` - -# Pivoting a dataset to make it wider is needed when information sitting inside of cell values needs to be converted into column names. The wider format is sometimes required for the purposes of calculating correlations or running statistical tests. - -# Let's start with a "long" DataFrame and make it wide. Why would we want to make it wide? Well, if we wanted to calculate a correlation between `A` and `B` for rows with corresponding `id` numbers, we may need to first make sure that `A` and `B` are represented in adjacent columns. - -using Tidier - -df_long = DataFrame(id = [1, 1, 2, 2], - variable = ["A", "B", "A", "B"], - value = [1, 2, 3, 4]) - -# To make this dataset wider, we can do the following: - -@pivot_wider(df_long, names_from = variable, values_from = value) - -# In `@pivot_wider()`, both the `names_from` and `values_from` arguments are required. `@pivot_wider()` also supports string values for the `names_from` and `values_from` arguments. - -@pivot_wider(df_long, names_from = "variable", values_from = "value") - -# ## `@pivot_longer()` - -# For calculating summary statistics (e.g., mean) by groups, or for plotting purposes, DataFrames often need to be converted to their longer form. For this, we can use `@pivot_longer`. First, let's start with a "wide" DataFrame. - -df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4]) - -# Now, let's transform this wide dataset into the longer form. Unlike `@pivot_wider()`, where providing the `names_from` and `values_from` arguments is required, the only item that's required in `@pivot_wider()` is a set of columns to pivot. 
The `names_to` and `values_to` arguments are optional, and if not provided, they will default to "variable" and "value", respectively. - -# We can recreate the original long dataset by doing the following. Multiple columns must be provided using selection syntax or a selection helper. Tuples containing multiple columns are not yet supported. - -@pivot_longer(df_wide, A:B) - -# Here is another way of providing the same result using a different type of selection syntax. - -@pivot_longer(df_wide, -id) - -# In this example, we set the `names_to` and `values_to` arguments. Either argument can be left out and will revert to the default value. The `names_to` and `values_to` arguments can be provided as strings or as bare unquoted variable names. - -# Here is an example with `names_to` and `values_to` containing strings: - -@pivot_longer(df_wide, A:B, names_to = "letter", values_to = "number") - -# And here is an example with `names_to` and `values_to` containing bare unquoted variables: - -@pivot_longer(df_wide, A:B, names_to = letter, values_to = number) diff --git a/docs/examples/UserGuide/rename.jl b/docs/examples/UserGuide/rename.jl deleted file mode 100644 index 8e376fe..0000000 --- a/docs/examples/UserGuide/rename.jl +++ /dev/null @@ -1,25 +0,0 @@ -# Renaming columns follows the same syntax as in R's `tidyverse`, where the "tidy expression" is `new_name = old_name`. While the main function to rename columns is `@rename()`, you can also use `@select()` if you additionally plan to select only the renamed columns. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# ## Rename using `@rename()` - -# If you only want to rename the columns without selecting them, then this is where `@rename()` comes in handy. For the sake of brevity, we are selecting the first 5 columns and rows after performing the `@rename()`. 
- -@chain movies begin - @rename(title = Title, Minutes = Length) - @select(1:5) - @slice(1:5) -end - -# ## Rename using `@select()` - -# If you plan to only select those columns that you would like to rename, then you can use `@select()` to *both* rename and select the columns of interest. - -@chain movies begin - @select(title = Title, Minutes = Length) - @slice(1:5) -end \ No newline at end of file diff --git a/docs/examples/UserGuide/select.jl b/docs/examples/UserGuide/select.jl deleted file mode 100644 index f115444..0000000 --- a/docs/examples/UserGuide/select.jl +++ /dev/null @@ -1,71 +0,0 @@ -# The `@select()` macro in `Tidier.jl` supports many of the nuances of the R `tidyverse` implementation, including indexing columns individually by name or number, indexing by ranges of columns using the `:` operator between column names or numbers, and negative selection using negated column names or numbers. Selection helpers such as `starts_with()`, `ends_with()`, `matches()`, and `contains()` are also supported. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# ## Select the first 5 columns individually by name - -@chain movies begin - @select(Title, Year, Length, Budget, Rating) - @slice(1:5) -end - -# ## Select the first 5 columns individually by number - -@chain movies begin - @select(1, 2, 3, 4, 5) - @slice(1:5) -end - -# ## Select the first 5 columns by name (using a range) - -@chain movies begin - @select(Title:Rating) - @slice(1:5) -end - -# ## Select the first 5 columns by number (using a range) - -@chain movies begin - @select(1:5) - @slice(1:5) -end - -# ## Select all but the first 5 columns by name - -# Here we will limit the results to the first 5 remaining columns and the first 5 rows for the sake of brevity. - -@chain movies begin - @select(-(Title:Rating)) - @select(1:5) - @slice(1:5) -end - -# We can also use `!` for inverted selection instead of `-`. 
- -@chain movies begin - @select(!(Title:Rating)) - @select(1:5) - @slice(1:5) -end - -# ## Select all but the first 5 columns by number - -# We will again limit the results to the first 5 remaining columns and the first 5 rows for the sake of brevity. - -@chain movies begin - @select(-(1:5)) - @select(1:5) - @slice(1:5) -end - -# ## Mix and match selection - -# Just like in R's `tidyverse`, you can separate multiple selections with commas and mix and match different ways of selecting columns. - -@chain movies begin - @select(1, Budget:Rating) - @slice(1:5) -end \ No newline at end of file diff --git a/docs/examples/UserGuide/slice.jl b/docs/examples/UserGuide/slice.jl deleted file mode 100644 index 90a40e1..0000000 --- a/docs/examples/UserGuide/slice.jl +++ /dev/null @@ -1,62 +0,0 @@ -# Slicing rows is similar to filtering rows, except that slicing is performed based on row numbers rather tha filter criteria. In `Tidier.jl`, slicing works similarly to R's `tidyverse` in that both positive (which rows to keep) and negative (which rows to remove) slicing is supported. For `@slice()`, any valid `UnitRange` of integers is considered valid; this is not the case for `@select()` or `across()`. - -# Remember: Just like every other `Tidier.jl` top-level macro, `@slice()` respects group. This means that in a grouped data frame, `@slice(1:2)` will select the first 2 rows *from each group*. - -using Tidier - -df = DataFrame(row_num = 1:10, - a = string.(repeat('a':'e', inner = 2)), - b = [1,1,1,2,2,2,3,3,3,4]) - -# ## Slicing using a range of numbers - -# This is an easy way of retrieving 5 consecutive rows. - -@chain df begin - @slice(1:5) -end - -# ## Slicing using a more complex UnitRange of numbers - -# How would we obtain every other from 1 to 7 (counting up by 2)? Note that `range()` is similar to `seq()` in R. - -@chain df begin - @slice(range(start = 1, step = 2, stop = 7)) -end - -# This same code can also be written using Julia's shorthand syntax for unit ranges. 
- -@chain df begin - @slice(1:2:7) -end - - -# ## Separate multiple row selections with commas - -# If you have multiple different row selections, you can separate them with commas. - -@chain df begin - @slice(1:5, 10) -end - -# ## Use `n()` as short-hand to indicate the number of rows - -# Select the last 2 rows. - -@chain df begin - @slice(n()-1, n()) -end - -# You can even use `n()` inside of UnitRanges, just like in R. Notice that the order of operations is slightly different in Julia as compared to R, so you don't have to wrap the `n()-1` expression inside of parentheses. - -@chain df begin - @slice(n()-1:n()) -end - -# ## Inverted selection using negative numbers - -# This line selects all rows except the first 5 rows. - -@chain df begin - @slice(-(1:5)) -end \ No newline at end of file diff --git a/docs/examples/UserGuide/summarize.jl b/docs/examples/UserGuide/summarize.jl deleted file mode 100644 index ddc0b27..0000000 --- a/docs/examples/UserGuide/summarize.jl +++ /dev/null @@ -1,44 +0,0 @@ -# Summarizing a dataset involves aggregating multiple rows down to (usually) a single row of data. This can be performed across the entire dataset, or if the dataset is grouped, then for each row in the dataset. This is implemented similarly to R's tidyverse using `@summarize()`. Out of admiration for Hadley Wickham, and to be consistent with the R `tidyverse`, both `@summarize()` and `@summarise()` are supported. - -# Note that summarization is different from other verbs in the `Tidier.jl` in 2 respects: - -# 1. No auto-vectorization is performed when using `@summarize()` -# 2. One layer of grouping is removed after each `@summarize()` function. - -# If you require further changes to grouping beyond the defaults, you can either `@ungroup()` or call `@group_by()` to regroup by a different set of variables. - -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -# ## Using `@summarize()` with `n()` to count the number of movies in the dataset. 
- -# Within the context of `@summarize()` only, `n()` is converted to DataFrames.jl's `nrow()` function. - -@chain movies begin - @summarize(n = n()) -end - -# ## Using `@summarize()` to calculate average budget of movies in the dataset. - -# The median budget in this dataset is $3 million, and the mean budget is $13 million! Making movies must be way more lucrative than making Julia packages. - -@chain movies begin - @mutate(Budget = Budget / 1_000_000) - @summarize(median_budget = median(skipmissing(Budget)), - mean_budget = mean(skipmissing(Budget))) -end - -# ## Combining `@group_by()` with `@summarise()` - -# How many movies came out in each of the last 5 years? - -@chain movies begin - @group_by(Year) - @summarise(n = n()) - @arrange(desc(Year)) - @slice(1:5) -end - -# Notice that there was no need to explicitly `@ungroup()` the dataset after summarizing here. The `@summarise()` function removed one layer of grouping. Since this dataset was only grouped by one variable (`Year`), it was no longer grouped after the `@summarise` was performed. 
\ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 87aa635..339bd76 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -115,24 +115,4 @@ plugins: # - mknotebooks # Jupyter notebooks - mkdocs-video nav: - - "Home": "index.md" - - "Movies dataset" : "examples/generated/UserGuide/dataset_movies.md" - - "@select" : "examples/generated/UserGuide/select.md" - - "@rename" : "examples/generated/UserGuide/rename.md" - - "@mutate" : "examples/generated/UserGuide/mutate_transmute.md" - - "@summarize" : "examples/generated/UserGuide/summarize.md" - - "@filter" : "examples/generated/UserGuide/filter.md" - - "@slice" : "examples/generated/UserGuide/slice.md" - - "@group_by" : "examples/generated/UserGuide/group_by.md" - - "@arrange" : "examples/generated/UserGuide/arrange.md" - - "@distinct" : "examples/generated/UserGuide/distinct.md" - - "across" : "examples/generated/UserGuide/across.md" - - "Conditionals": "examples/generated/UserGuide/conditionals.md" - - "Joins" : "examples/generated/UserGuide/joins.md" - - "Binding" : "examples/generated/UserGuide/binding.md" - - "Pivoting": "examples/generated/UserGuide/pivots.md" - - "Column names": "examples/generated/UserGuide/column_names.md" - - "Interpolation" : "examples/generated/UserGuide/interpolation.md" - - "Auto-vectorization" : "examples/generated/UserGuide/autovec.md" - - "Contribute" : "examples/generated/Contributors/Howto.md" - - "Reference" : "reference.md" \ No newline at end of file + - "Home": "index.md" \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index fbcd68c..30d7cb3 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,54 +1,38 @@ + - - -## What is Tidier.jl? - -Tidier.jl is a 100% Julia implementation of the R tidyverse -mini-language in Julia. Powered by the DataFrames.jl package and Julia’s -extensive meta-programming capabilities, Tidier.jl is an R user’s love -letter to data analysis in Julia. 
- -`Tidier.jl` has three goals, which differentiate it from other data analysis -meta-packages in Julia: - -```@raw html -??? tip "Stick as closely to tidyverse syntax as possible." - Whereas other meta-packages introduce Julia-centric idioms for working with - DataFrames, this package’s goal is to reimplement parts of tidyverse - in Julia. This means that `Tidier.jl` uses *tidy expressions* as opposed - to idiomatic Julia expressions. An example of a tidy expression is - `a = mean(b)`. In Julia, `a` and `b` are variables and are thus "eagerly" - evaluated. This means that if `b` is merely referring to a column in a - data frame and *not* an object in the global namespace, then an error - will be generated because `b` was not found. In idiomatic Julia, `b` - would need to be expressed as a symbol, or `:b`. Even then, - `a = mean(:b)` would generate an error because it's not possible to - calculate the mean value of a symbol. To handle this using idiomatic - Julia, `DataFrames.jl` introduces a mini-language that relies heavily - on the creation of anonymous functions, with explicit directional - pairs syntax using a `source => function => destination` syntax. While - this is quite elegant, it can be verbose. `Tidier.jl` aims to - reduce this complexity by exposing an R-like syntax, which is then - converted into valid `DataFrames.jl` code. The reason that - *tidy expressions* are considered valid by Julia in `Tidier.jl` is - because they are implemented using macros. Macros "capture" the - expressions they are given, and then they can modify those expressions - before evaluating them. For consistency, all top-level `dplyr` functions - are implemented as macros (whether or not a macro is truly needed), and - all "helper" functions (used inside of those top-level functions) are - implemented as functions or pseudo-functions (functions which only exist - through modification of the abstract syntax tree). -``` +## Tidier.jl -```@raw html -??? 
tip "Make broadcasting mostly invisible." - Broadcasting trips up many R users switching to Julia because R users are used to most functions being vectorized. `Tidier.jl` currently uses a lookup table to decide which functions *not* to vectorize; all other functions are automatically vectorized. Read the documentation page on "Autovectorization" to read about how this works, and how to override the defaults. An example of where this issue commonly causes errors is when centering a variable. To create a new column `a` that centers the column `b`, `Tidier.jl` lets you simply write `a = b - mean(b)` exactly as you would in R. This works because `Tidier.jl` knows to *not* vectorize `mean()` while also recognizing that `-` *should* be vectorized such that this expression is rewritten in `DataFrames.jl` as `:b => (b -> b .- mean(b)) => :a`. For any user-defined function that you want to "mark" as being non-vectorized, you can prefix it with a `~`. For example, a function `new_mean()`, if it had the same functionality as `mean()` *would* normally get vectorized by `Tidier.jl` unless you write it as `~new_mean()`. -``` +Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. Similar to the R tidyverse, Tidier.jl re-exports several other packages, each focusing on a specific set of functionalities. -```@raw html -??? tip "Make scalars and tuples mostly interchangeable." - In Julia, the function `across(a, mean)` is dispatched differently than `across((a, b), mean)`. The first argument in the first instance above is treated as a scalar, whereas the second instance is treated as a tuple. This can be very confusing to R users because `1 == c(1)` is `TRUE` in R, whereas in Julia `1 == (1,)` evaluates to `false`. 
The design philosophy in `Tidier.jl` is that the user should feel free to provide a scalar or a tuple as they see fit anytime multiple values are considered valid for a given argument, such as in `across()`, and `Tidier.jl` will figure out how to dispatch it. -``` + + +## TidierData.jl + +TidierData.jl is a package dedicated to data transformation and reshaping, powered by DataFrames.jl, ShiftedArrays.jl, and Cleaner.jl. It focuses on functionality within the dplyr, tidyr, and janitor R packages. + + + +## TidierPlots.jl + +TidierPlots.jl is a package dedicated to plotting, powered by AlgebraOfGraphics.jl. It focuses on functionality within the ggplot2 R package. + + + +## TidierCats.jl + +TidierCats.jl is a package dedicated to handling categorical variables, powered by CategoricalArrays.jl. It focuses on functionality within the forcats R package. + + + +## TidierDates.jl + +TidierDates.jl is a package dedicated to handling dates and times. It focuses on functionality within the lubridate R package. + + + +## TidierStrings.jl + +TidierStrings.jl is a package dedicated to handling strings. It focuses on functionality within the stringr R package. ## Installation @@ -81,79 +65,10 @@ using Pkg Pkg.add(url="https://github.com/TidierOrg/Tidier.jl") ``` -## What macros and functions does Tidier.jl support? - -To support R-style programming, `Tidier.jl` is implemented using macros. This is because macros are able to "capture" the code before executing it, which allows the package to support R-like "tidy expressions" that would otherwise not be considered valid Julia code. - -Tidier.jl currently supports the following top-level macros: - -```@raw html -!!!
example "Top-level macros:" - - `@glimpse()` - - `@select()`, `@rename()`, and `@distinct()` - - `@mutate()` and `@transmute()` - - `@summarize()` and `@summarise()` - - `@filter()` and `@slice()` - - `@group_by()` and `@ungroup()` - - `@arrange()` - - `@pull()` - - `@count()` and `@tally()` - - `@left_join()`, `@right_join()`, `@inner_join()`, and `@full_join()` - - `@bind_rows()` and `@bind_cols()` - - `@pivot_wider()` and `@pivot_longer()` - - `@drop_na()` - - `@clean_names()` (as in R's `janitor::clean_names()` function) -``` -Tidier.jl also supports the following helper functions: - -```@raw html -!!! example "Helper functions:" - - `across()` - - `desc()` - - `if_else()` and `case_when()` - - `n()` and `row_number()` - - `ntile()` - - `lag()` and `lead()` - - `starts_with()`, `ends_with()`, `matches()`, and `contains()` - - `as_float()`, `as_integer()`, and `as_string()` -``` - -See the [Reference](https://tidierorg.github.io/Tidier.jl/dev/reference/) page for a detailed guide to each of the macros and functions. - -## Example - -Let's select the first five movies in our dataset whose budget exceeds the mean budget. Unlike in R, where we pass an `na.rm = TRUE` argument to remove missing values, in Julia we wrap the variable with a `skipmissing()` to remove the missing values before the `mean()` is calculated. - -```julia -using Tidier -using RDatasets - -movies = dataset("ggplot2", "movies"); - -@chain movies begin - @mutate(Budget = Budget / 1_000_000) - @filter(Budget >= mean(skipmissing(Budget))) - @select(Title, Budget) - @slice(1:5) -end -``` - -``` -5×2 DataFrame - Row │ Title Budget - │ String Float64? -─────┼────────────────────────────────────── - 1 │ 'Til There Was You 23.0 - 2 │ 10 Things I Hate About You 16.0 - 3 │ 102 Dalmatians 85.0 - 4 │ 13 Going On 30 37.0 - 5 │ 13th Warrior, The 85.0 -``` - ## What’s new See [NEWS.md](https://github.com/TidierOrg/Tidier.jl/blob/main/NEWS.md) for the latest updates. 
## What's missing -Is there a tidyverse feature missing that you would like to see in Tidier.jl? Please file a GitHub issue. Because Tidier.jl primarily wraps DataFrames.jl, our decision to integrate a new feature will be guided by how well-supported it is within DataFrames.jl and how likely other users are to benefit from it. \ No newline at end of file +Is there a tidyverse feature missing that you would like to see in Tidier.jl? Please file a GitHub issue. \ No newline at end of file diff --git a/src/Tidier.jl b/src/Tidier.jl index 83b8f57..ef07be9 100644 --- a/src/Tidier.jl +++ b/src/Tidier.jl @@ -1,694 +1,11 @@ module Tidier -using DataFrames -using MacroTools -using Chain -using Statistics -using Cleaner using Reexport -# Exporting `Cols` because `summarize(!!vars, funs))` with multiple interpolated -# columns requires `Cols()` to be nested within `Cols()`, so `Cols` needs to be exported. -@reexport using DataFrames: DataFrame, Cols, describe, nrow, proprow -@reexport using Chain -@reexport using Statistics -@reexport using ShiftedArrays: lag, lead - -export Tidier_set, across, desc, n, row_number, starts_with, ends_with, matches, if_else, case_when, ntile, - as_float, as_integer, as_string, @select, @transmute, @rename, @mutate, @summarize, @summarise, @filter, - @group_by, @ungroup, @slice, @arrange, @distinct, @pull, @left_join, @right_join, @inner_join, @full_join, - @pivot_wider, @pivot_longer, @bind_rows, @bind_cols, @clean_names, @count, @tally, @drop_na, @glimpse - -# Package global variables -const code = Ref{Bool}(false) # output DataFrames.jl code? -const log = Ref{Bool}(false) # output tidylog output? 
(not yet implemented) - -# Includes -include("docstrings.jl") -include("parsing.jl") -include("joins.jl") -include("binding.jl") -include("pivots.jl") -include("compound_verbs.jl") -include("clean_names.jl") -include("conditionals.jl") -include("pseudofunctions.jl") -include("helperfunctions.jl") -include("ntile.jl") -include("type_conversions.jl") - -# Function to set global variables -""" -$docstring_Tidier_set -""" -function Tidier_set(option::AbstractString, value::Bool) - if option == "code" - code[] = value - elseif option == "log" - throw("Logging is not enabled yet") - else - throw("That is not a valid option.") - end -end - -""" -$docstring_select -""" -macro select(df, exprs...) - interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number; ungroup = false) - else - _ - end - end - select($(tidy_exprs...); ungroup = false) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$")); ungroup = false) - end - else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number) - else - _ - end - end - select($(tidy_exprs...)) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_transmute -""" -macro transmute(df, exprs...) 
- interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number; ungroup = false) - else - _ - end - end - select($(tidy_exprs...); ungroup = false) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$")); ungroup = false) - end - else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number) - else - _ - end - end - select($(tidy_exprs...)) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_rename -""" -macro rename(df, exprs...) 
- interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number; ungroup = false) - else - _ - end - end - rename($(tidy_exprs...); ungroup = false) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$")); ungroup = false) - end - else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number) - else - _ - end - end - rename($(tidy_exprs...)) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_mutate -""" -macro mutate(df, exprs...) 
- interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number; ungroup = false) - else - _ - end - end - transform($(tidy_exprs...); ungroup = false) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$")); ungroup = false) - end - else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number) - else - _ - end - end - transform($(tidy_exprs...)) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_summarize -""" -macro summarize(df, exprs...) 
- interpolated_exprs = parse_interpolation.(exprs; summarize = true) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs; autovec=false) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - local col_names = groupcols($(esc(df))) - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number; ungroup = false) - else - _ - end - end - @chain _ begin - if length(col_names) == 1 - @chain _ begin - combine(_, $(tidy_exprs...); ungroup = true) - select(_, Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - end - else - @chain _ begin - combine(_, $(tidy_exprs...); ungroup = true) - select(_, Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - groupby(_, col_names[1:end-1]; sort = true) - end - end - end - end - else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number; ungroup = false) - else - _ - end - end - combine($(tidy_exprs...)) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_summarize -""" -macro summarise(df, exprs...) - :(@summarize($(esc(df)), $(exprs...))) -end - -""" -$docstring_filter -""" -macro filter(df, exprs...) 
- interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs; subset=true) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n; ungroup = false) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number; ungroup = false) - else - _ - end - end - subset($(tidy_exprs...); skipmissing = true, ungroup = false) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$")); ungroup = false) - end - else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number) - else - _ - end - end - subset($(tidy_exprs...); skipmissing = true) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_group_by -""" -macro group_by(df, exprs...) 
- interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs) - grouping_exprs = parse_group_by.(exprs) - - df_expr = quote - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number) - else - _ - end - end - transform($(tidy_exprs...)) - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - groupby(Cols($(grouping_exprs...)); sort = true) - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_ungroup -""" -macro ungroup(df) - :(DataFrame($(esc(df)))) -end - -""" -$docstring_slice -""" -macro slice(df, exprs...) - df_expr = quote - local interpolated_indices = parse_slice_n.($exprs, nrow(DataFrame($(esc(df))))) - local original_indices = [eval.(interpolated_indices)...] - local clean_indices = Int64[] - for index in original_indices - if index isa Number - push!(clean_indices, index) - else - append!(clean_indices, collect(index)) - end - end - - if all(clean_indices .> 0) - if $(esc(df)) isa GroupedDataFrame - combine($(esc(df)); ungroup = false) do sdf - sdf[clean_indices, :] - end - else - combine($(esc(df))) do sdf - sdf[clean_indices, :] - end - end - elseif all(clean_indices .< 0) - clean_indices = -clean_indices - if $(esc(df)) isa GroupedDataFrame - combine($(esc(df)); ungroup = true) do sdf - sdf[Not(clean_indices), :] - end - else - combine($(esc(df))) do sdf - sdf[Not(clean_indices), :] - end - end - else - throw("@slice() indices must either be all positive or all negative.") - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_arrange -""" -macro arrange(df, exprs...) 
- arrange_exprs = parse_desc.(exprs) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - local col_names = groupcols($(esc(df))) - - @chain $(esc(df)) begin - DataFrame # remove grouping - sort([$(arrange_exprs...)]) # Must use [] instead of Cols() here - groupby(col_names; sort = true) # regroup - end - else - sort($(esc(df)), [$(arrange_exprs...)]) # Must use [] instead of Cols() here - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_distinct -""" -macro distinct(df, exprs...) - interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - any_found_n = any([i[2] for i in interpolated_exprs]) - any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_tidy.(tidy_exprs) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - local col_names = groupcols($(esc(df))) - @chain $(esc(df)) begin - DataFrame # remove grouping because `unique()` does not work on GroupDataFrames - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number) - else - _ - end - end - @chain _ begin - if length([$tidy_exprs...]) == 0 - unique(_) - else - unique(_, Cols($(tidy_exprs...))) - end - end - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - groupby(col_names; sort = true) # regroup - end - else - @chain $(esc(df)) begin - @chain _ begin - if $any_found_n - transform(_, nrow => :Tidier_n) - else - _ - end - end - @chain _ begin - if $any_found_row_number - transform(_, eachindex => :Tidier_row_number) - else - _ - end - end - @chain _ begin - if length([$tidy_exprs...]) == 0 - unique(_) - else - unique(_, Cols($(tidy_exprs...))) - end - end - select(Cols(Not(r"^(Tidier_n|Tidier_row_number)$"))) - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_pull -""" -macro 
pull(df, column) - column, found_n, found_row_number = parse_interpolation(column) - column = parse_tidy(column) - vec_expr = quote - $(esc(df))[:, $column] - end - if code[] - @info MacroTools.prettify(vec_expr) - end - return vec_expr -end - -""" -$docstring_drop_na -""" -macro drop_na(df, exprs...) - interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - - tidy_exprs = parse_tidy.(tidy_exprs) - num_exprs = length(exprs) - df_expr = quote - if $(esc(df)) isa GroupedDataFrame - local col_names = groupcols($(esc(df))) - @chain $(esc(df)) begin - DataFrame # remove grouping because `dropmissing()` does not work on GroupDataFrames - @chain _ begin - if $num_exprs == 0 - dropmissing(_) - else - dropmissing(_, Cols($(tidy_exprs...))) - end - end - groupby(col_names; sort = true) # regroup - end - else - @chain $(esc(df)) begin - @chain _ begin - if $num_exprs == 0 - dropmissing(_) - else - dropmissing(_, Cols($(tidy_exprs...))) - end - end - end - end - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_glimpse -""" -macro glimpse(df, width = 80) - df_expr = quote - # DataFrame() needed to handle grouped data frames - println("Rows: ", nrow(DataFrame($(esc(df))))) - println("Columns: ", ncol(DataFrame($(esc(df))))) - - if $(esc(df)) isa GroupedDataFrame - println("Groups: ", join(string.(groupcols($(esc(df)))), ", "), " [", length(keys($(esc(df)))), "]") - end - - for (name, col) in pairs(eachcol(DataFrame($(esc(df))))) - rpad("." 
* string(name), 15) * - rpad(eltype(col), 15) * - join(col, ", ") |> - x -> first(x, $width) |> # show the first $width number of characters - println - end - end - return df_expr -end +@reexport using TidierData +@reexport using TidierPlots +@reexport using TidierCats +@reexport using TidierDates +@reexport using TidierStrings end \ No newline at end of file diff --git a/src/binding.jl b/src/binding.jl deleted file mode 100644 index 929917d..0000000 --- a/src/binding.jl +++ /dev/null @@ -1,35 +0,0 @@ -""" -$docstring_bind_rows -""" -macro bind_rows(df, exprs...) - tidy_exprs = parse_bind_args.(exprs) - locate_id = findfirst(i -> i[2], tidy_exprs) - if locate_id isa Nothing - df_vec = [i[1] for i in tidy_exprs] - id_expr = nothing - else - df_vec = deleteat!([tidy_exprs...], locate_id) - df_vec = [i[1] for i in df_vec] - id_expr = tidy_exprs[locate_id][1] - end - - df_expr = quote - vcat( DataFrame($(esc(df))), $(df_vec...), cols = :union, source = $id_expr) - end - return df_expr -end - -""" -$docstring_bind_cols -""" -macro bind_cols(df, exprs...) 
- tidy_exprs = parse_bind_args.(exprs) - df_vec = [i[1] for i in tidy_exprs] - - df_expr = quote - hcat( DataFrame($(esc(df))), $(df_vec...), makeunique = true) - end - return df_expr -end - - diff --git a/src/clean_names.jl b/src/clean_names.jl deleted file mode 100644 index c79d4ec..0000000 --- a/src/clean_names.jl +++ /dev/null @@ -1,48 +0,0 @@ -macro clean_names(df, case) - df_expr = quote - if $case != "snake_case" && $case != "camelCase" - throw("`case` must be either \"snake_case\" or \"camelCase\".") - end - - local style = Symbol($case) - - if $(esc(df)) isa GroupedDataFrame - local col_names = groupcols($(esc(df))) - - @chain $(esc(df)) begin - DataFrame # remove grouping - polish_names(_; style = style) - DataFrame # convert back to DataFrame - groupby(col_names; sort = true) # regroup - end - else - @chain $(esc(df)) begin - polish_names(_; style = style) - DataFrame # convert back to DataFrame - end - end - end - return df_expr -end - -macro clean_names(df) - df_expr = quote - - if $(esc(df)) isa GroupedDataFrame - local col_names = groupcols($(esc(df))) - - @chain $(esc(df)) begin - DataFrame # remove grouping - polish_names - DataFrame # convert back to DataFrame - groupby(col_names; sort = true) # regroup - end - else - @chain $(esc(df)) begin - polish_names - DataFrame # convert back to DataFrame - end - end - end - return df_expr -end \ No newline at end of file diff --git a/src/compound_verbs.jl b/src/compound_verbs.jl deleted file mode 100644 index 447b73e..0000000 --- a/src/compound_verbs.jl +++ /dev/null @@ -1,105 +0,0 @@ -# Compound verbs refer to macros that primarily wrap other core macros in this package. -# This includes verbs like `@count()` and `@tally`. For compound verbs, any relevant parsing -# functions should be bundled after the macro instead of being placed in parsing.jl. - -""" -$docstring_tally -""" -macro tally(df, exprs...) - wt, sort = parse_tally_args(exprs...) 
- - wt_quoted = QuoteNode(wt) - - df_expr = quote - @chain $(esc(df)) begin - @chain _ begin - if isnothing($wt_quoted) - @summarize(_, n = n()) - else - @summarize(_, n = sum(skipmissing($wt))) - end - end - @chain _ begin - if $sort == true - @arrange(_, desc(n)) - else - _ - end - end - end - end - return df_expr -end - -function parse_tally_args(tidy_exprs::Union{Expr,Symbol}...) - wt = nothing - sort = false - - for tally_expr in tidy_exprs - if @capture(tally_expr, wt = arg_) - wt = arg - elseif @capture(tally_expr, sort = arg_) - sort = arg - else - throw("The only supported arguments are `wt` and `sort`, and both must be named.") - end - end - return wt, sort -end - -""" -$docstring_count -""" -macro count(df, exprs...) - col_names, wt, sort = parse_count_args(exprs...) - - col_names_quoted = QuoteNode(col_names) - wt_quoted = QuoteNode(wt) - - df_expr = quote - @chain $(esc(df)) begin - @chain _ begin - if length($col_names_quoted) > 0 - @group_by(_, $(col_names...)) - else - _ - end - end - @chain _ begin - if isnothing($wt_quoted) - @summarize(_, n = n()) - else - @summarize(_, n = sum(skipmissing($wt))) - end - end - @chain _ begin - if $sort == true - @arrange(_, desc(n)) - else - _ - end - end - @ungroup - end - end - return df_expr -end - -function parse_count_args(tidy_exprs::Union{Expr,Symbol}...) 
- col_names = Union{Expr,Symbol}[] - wt = nothing - sort = false - - for count_expr in tidy_exprs - if @capture(count_expr, wt = arg_) - wt = arg - elseif @capture(count_expr, sort = arg_) - sort = arg - elseif @capture(count_expr, lhs_ = rhs_) - throw("The only supported arguments are `wt` and `sort`, and both must be named.") - else - push!(col_names, count_expr) - end - end - return col_names, wt, sort -end \ No newline at end of file diff --git a/src/conditionals.jl b/src/conditionals.jl deleted file mode 100644 index 01d48ab..0000000 --- a/src/conditionals.jl +++ /dev/null @@ -1,41 +0,0 @@ -""" -$docstring_if_else -""" -function if_else(condition::Union{Bool, Missing}, yes, no, miss) - if ismissing(condition) - return miss - elseif condition == true - return yes - elseif condition == false - return no - else - throw("condition must be a Boolean (true/false/missing).") - end -end - -function if_else(condition::Union{Bool, Missing}, yes, no) - if ismissing(condition) - return missing - elseif condition == true - return yes - elseif condition == false - return no - else - throw("condition must be a Boolean (true/false/missing).") - end -end - -""" -$docstring_case_when -""" -function case_when(conditions...) - for condition in conditions - if ismissing(condition[1]) - continue - elseif condition[1] - return condition[2] - end - end - return missing -end - diff --git a/src/docstrings.jl b/src/docstrings.jl deleted file mode 100644 index d167dfb..0000000 --- a/src/docstrings.jl +++ /dev/null @@ -1,1991 +0,0 @@ -const docstring_Tidier_set = -""" - Tidier_set(option::AbstractString, value::Bool) - -Set package options. - -Here are the supported options and what they do: - -- "code": Defaults to `false`. If set to `true`, this option displays the DataFrames.jl code generated by the Tidier.jl package. It is useful for debugging whether errors are introduced by Tidier.jl's generated code. 
- -# Arguments -- `option`: "code" -- `value`: `true` or `false` -""" - - -const docstring_across = -""" - across(variable[s], function[s]) - -Apply functions to multiple variables. If specifying multiple variables or functions, surround them with parentheses so that they are recognized as a tuple. - -This function should only be called inside of Tidier.jl macros. - -# Arguments -- `variable[s]`: An unquoted variable, or if multiple, an unquoted tuple of variables. -- `function[s]`: A function, or if multiple, a tuple of functions. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @summarize(across(b, minimum)) - end -1×1 DataFrame - Row │ b_minimum - │ Int64 -─────┼─────────── - 1 │ 1 - -julia> @chain df begin - @summarize(across((b,c), (minimum, maximum))) - end -1×4 DataFrame - Row │ b_minimum c_minimum b_maximum c_maximum - │ Int64 Int64 Int64 Int64 -─────┼──────────────────────────────────────────── - 1 │ 1 11 5 15 - -julia> @chain df begin - @mutate(across((b,c), (minimum, maximum))) - end -5×7 DataFrame - Row │ a b c b_minimum c_minimum b_maximum c_maximum - │ Char Int64 Int64 Int64 Int64 Int64 Int64 -─────┼──────────────────────────────────────────────────────────────── - 1 │ a 1 11 1 11 5 15 - 2 │ b 2 12 1 11 5 15 - 3 │ c 3 13 1 11 5 15 - 4 │ d 4 14 1 11 5 15 - 5 │ e 5 15 1 11 5 15 - -julia> @chain df begin - @mutate(across((b, starts_with("c")), (minimum, maximum))) - end -5×7 DataFrame - Row │ a b c b_minimum c_minimum b_maximum c_maximum - │ Char Int64 Int64 Int64 Int64 Int64 Int64 -─────┼──────────────────────────────────────────────────────────────── - 1 │ a 1 11 1 11 5 15 - 2 │ b 2 12 1 11 5 15 - 3 │ c 3 13 1 11 5 15 - 4 │ d 4 14 1 11 5 15 - 5 │ e 5 15 1 11 5 15 - -``` -""" - -const docstring_desc = -""" - desc(col) - -Orders the rows of a DataFrame column in descending order when used inside of `@arrange()`. This function should only be called inside of `@arrange()``. 
- -# Arguments -- `col`: An unquoted column name. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20); - -julia> @chain df begin - @arrange(a, desc(b)) - end -10×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 2 12 - 2 │ a 1 11 - 3 │ b 4 14 - 4 │ b 3 13 - 5 │ c 6 16 - 6 │ c 5 15 - 7 │ d 8 18 - 8 │ d 7 17 - 9 │ e 10 20 - 10 │ e 9 19 -``` -""" - -const docstring_select = -""" - @select(df, exprs...) - -Select variables in a DataFrame. - -# Arguments -- `df`: A DataFrame. -- `exprs...`: One or more unquoted variable names separated by commas. Variable names - can also be used as their positions in the data, like `x:y`, to select - a range of variables. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @select(a, b, c) - end -5×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ b 2 12 - 3 │ c 3 13 - 4 │ d 4 14 - 5 │ e 5 15 - -julia> @chain df begin - @select(a:b) - end -5×2 DataFrame - Row │ a b - │ Char Int64 -─────┼───────────── - 1 │ a 1 - 2 │ b 2 - 3 │ c 3 - 4 │ d 4 - 5 │ e 5 - -julia> @chain df begin - @select(1:2) - end -5×2 DataFrame - Row │ a b - │ Char Int64 -─────┼───────────── - 1 │ a 1 - 2 │ b 2 - 3 │ c 3 - 4 │ d 4 - 5 │ e 5 - -julia> @chain df begin - @select(-(a:b)) - end -5×1 DataFrame - Row │ c - │ Int64 -─────┼─────── - 1 │ 11 - 2 │ 12 - 3 │ 13 - 4 │ 14 - 5 │ 15 - -julia> @chain df begin - @select(!(a:b)) - end -5×1 DataFrame - Row │ c - │ Int64 -─────┼─────── - 1 │ 11 - 2 │ 12 - 3 │ 13 - 4 │ 14 - 5 │ 15 - -julia> @chain df begin - @select(contains("b"), starts_with("c")) - end -5×2 DataFrame - Row │ b c - │ Int64 Int64 -─────┼────────────── - 1 │ 1 11 - 2 │ 2 12 - 3 │ 3 13 - 4 │ 4 14 - 5 │ 5 15 - -julia> @chain df begin - @select(-(1:2)) - end -5×1 DataFrame - Row │ c - │ Int64 -─────┼─────── - 1 │ 11 - 2 │ 12 - 3 │ 13 - 4 │ 14 - 5 │ 15 - -julia> @chain 
df begin - @select(!(1:2)) - end -5×1 DataFrame - Row │ c - │ Int64 -─────┼─────── - 1 │ 11 - 2 │ 12 - 3 │ 13 - 4 │ 14 - 5 │ 15 - -julia> @chain df begin - @select(-c) - end -5×2 DataFrame - Row │ a b - │ Char Int64 -─────┼───────────── - 1 │ a 1 - 2 │ b 2 - 3 │ c 3 - 4 │ d 4 - 5 │ e 5 - -julia> @chain df begin - @select(-contains("a")) - end -5×2 DataFrame - Row │ b c - │ Int64 Int64 -─────┼────────────── - 1 │ 1 11 - 2 │ 2 12 - 3 │ 3 13 - 4 │ 4 14 - 5 │ 5 15 - -julia> @chain df begin - @select(!contains("a")) - end -5×2 DataFrame - Row │ b c - │ Int64 Int64 -─────┼────────────── - 1 │ 1 11 - 2 │ 2 12 - 3 │ 3 13 - 4 │ 4 14 - 5 │ 5 15 -``` -""" - -const docstring_transmute = -""" - @transmute(df, exprs...) - -Create a new DataFrame with only computed columns. - -# Arguments -- `df`: A DataFrame. -- `exprs...`: add new columns or replace values of existed columns using - `new_variable = values` syntax. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @transmute(d = b + c) - end -5×1 DataFrame - Row │ d - │ Int64 -─────┼─────── - 1 │ 12 - 2 │ 14 - 3 │ 16 - 4 │ 18 - 5 │ 20 -``` -""" - -const docstring_rename = -""" - @rename(df, exprs...) - -Change the names of individual column names in a DataFrame. Users can also use `@select()` -to rename and select columns. - -# Arguments -- `df`: A DataFrame. -- `exprs...`: Use `new_name = old_name` syntax to rename selected columns. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @rename(d = b, e = c) - end -5×3 DataFrame - Row │ a d e - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ b 2 12 - 3 │ c 3 13 - 4 │ d 4 14 - 5 │ e 5 15 -``` -""" - -const docstring_mutate = -""" - @mutate(df, exprs...) - -Create new columns as functions of existing columns. The results have the same number of -rows as `df`. - -# Arguments -- `df`: A DataFrame. 
-- `exprs...`: add new columns or replace values of existed columns using - `new_variable = values` syntax. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @mutate(d = b + c, b_minus_mean_b = b - mean(b)) - end -5×5 DataFrame - Row │ a b c d b_minus_mean_b - │ Char Int64 Int64 Int64 Float64 -─────┼─────────────────────────────────────────── - 1 │ a 1 11 12 -2.0 - 2 │ b 2 12 14 -1.0 - 3 │ c 3 13 16 0.0 - 4 │ d 4 14 18 1.0 - 5 │ e 5 15 20 2.0 - -julia> @chain df begin - @mutate(d = b in (1,3)) - end -5×4 DataFrame - Row │ a b c d - │ Char Int64 Int64 Bool -─────┼─────────────────────────── - 1 │ a 1 11 true - 2 │ b 2 12 false - 3 │ c 3 13 true - 4 │ d 4 14 false - 5 │ e 5 15 false - -julia> @chain df begin - @mutate(across((b, c), mean)) - end -5×5 DataFrame - Row │ a b c b_mean c_mean - │ Char Int64 Int64 Float64 Float64 -─────┼────────────────────────────────────── - 1 │ a 1 11 3.0 13.0 - 2 │ b 2 12 3.0 13.0 - 3 │ c 3 13 3.0 13.0 - 4 │ d 4 14 3.0 13.0 - 5 │ e 5 15 3.0 13.0 - -julia> @chain df begin - @summarize(across(contains("b"), mean)) - end -1×1 DataFrame - Row │ b_mean - │ Float64 -─────┼───────── - 1 │ 3.0 - -julia> @chain df begin - @summarize(across(-contains("a"), mean)) - end -1×2 DataFrame - Row │ b_mean c_mean - │ Float64 Float64 -─────┼────────────────── - 1 │ 3.0 13.0 -``` -""" - -const docstring_summarize = -""" - @summarize(df, exprs...) - @summarise(df, exprs...) - -Create a new DataFrame with one row that aggregating all observations from the input DataFrame or GroupedDataFrame. - -# Arguments -- `df`: A DataFrame. -- `exprs...`: a `new_variable = function(old_variable)` pair. `function()` should be an aggregate function that returns a single value. 
- -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @summarize(mean_b = mean(b), median_b = median(b)) - end -1×2 DataFrame - Row │ mean_b median_b - │ Float64 Float64 -─────┼─────────────────── - 1 │ 3.0 3.0 - -julia> @chain df begin - @summarise(mean_b = mean(b), median_b = median(b)) - end -1×2 DataFrame - Row │ mean_b median_b - │ Float64 Float64 -─────┼─────────────────── - 1 │ 3.0 3.0 - -julia> @chain df begin - @summarize(across((b,c), (minimum, maximum))) - end -1×4 DataFrame - Row │ b_minimum c_minimum b_maximum c_maximum - │ Int64 Int64 Int64 Int64 -─────┼──────────────────────────────────────────── - 1 │ 1 11 5 15 -``` -""" - -const docstring_filter = -""" - @filter(df, exprs...) - -Subset a DataFrame and return a copy of DataFrame where specified conditions are satisfied. - -# Arguments -- `df`: A DataFrame. -- `exprs...`: transformation(s) that produce vectors containing `true` or `false`. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @filter(b >= mean(b)) - end -3×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ c 3 13 - 2 │ d 4 14 - 3 │ e 5 15 - -julia> @chain df begin - @filter(b >= 3 && c >= 14) - end -2×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ d 4 14 - 2 │ e 5 15 - -julia> @chain df begin - @filter(b in (1, 3)) - end -2×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ c 3 13 -``` -""" - -const docstring_group_by = -""" - @group_by(df, exprs...) - -Return a `GroupedDataFrame` where operations are performed by groups specified by unique -sets of `cols`. - -# Arguments -- `df`: A DataFrame. -- `exprs...`: DataFrame columns to group by or tidy expressions. Can be a single tidy expression or multiple expressions separated by commas. 
- -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @group_by(a) - @summarize(b = mean(b)) - end -5×2 DataFrame - Row │ a b - │ Char Float64 -─────┼─────────────── - 1 │ a 1.0 - 2 │ b 2.0 - 3 │ c 3.0 - 4 │ d 4.0 - 5 │ e 5.0 - -julia> @chain df begin - @group_by(d = uppercase(a)) - @summarize(b = mean(b)) - end -5×2 DataFrame - Row │ d b - │ Char Float64 -─────┼─────────────── - 1 │ A 1.0 - 2 │ B 2.0 - 3 │ C 3.0 - 4 │ D 4.0 - 5 │ E 5.0 -``` -""" - -const docstring_ungroup = -""" - @ungroup(df) - -Return a `DataFrame` with all groups removed. - -If this is applied to a `GroupedDataFrame`, then it removes the grouping. If this is applied to a `DataFrame` (without any groups), then it returns the `DataFrame` unchanged. - -# Arguments -- `df`: A `GroupedDataFrame` or `DataFrame``. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @group_by(a) - end -GroupedDataFrame with 5 groups based on key: a -First Group (1 row): a = 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase) - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 -⋮ -Last Group (1 row): a = 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase) - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ e 5 15 - -julia> @chain df begin - @group_by(a) - @ungroup - end -5×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ b 2 12 - 3 │ c 3 13 - 4 │ d 4 14 - 5 │ e 5 15 -``` -""" - -const docstring_slice = -""" - @slice(df, exprs...) - -Select, remove or duplicate rows by indexing their integer positions. - -# Arguments -- `df`: A DataFrame. -- `exprs...`: integer row values. Use positive values to keep the rows, or negative values to drop. Values provided must be either all positive or all negative, and they must be within the range of DataFrames' row numbers. 
- -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e'), b = 1:5, c = 11:15); - -julia> @chain df begin - @slice(1:5) - end -5×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ b 2 12 - 3 │ c 3 13 - 4 │ d 4 14 - 5 │ e 5 15 - -julia> @chain df begin - @slice(-(1:2)) - end -3×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ c 3 13 - 2 │ d 4 14 - 3 │ e 5 15 - -julia> @chain df begin - @group_by(a) - @slice(1) - @ungroup - end -5×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ b 2 12 - 3 │ c 3 13 - 4 │ d 4 14 - 5 │ e 5 15 -``` -""" - -const docstring_arrange = -""" - @arrange(df, exprs...) - -Order the rows of a DataFrame by the values of specified columns. - -# Arguments -- `df`: A DataFrame. -- `exprs...`: Variables from the input DataFrame. Use `desc()` to sort in descending order. Multiple variables can be specified, separated by commas. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20); - -julia> @chain df begin - @arrange(a) - end -10×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ a 2 12 - 3 │ b 3 13 - 4 │ b 4 14 - 5 │ c 5 15 - 6 │ c 6 16 - 7 │ d 7 17 - 8 │ d 8 18 - 9 │ e 9 19 - 10 │ e 10 20 - -julia> @chain df begin - @arrange(a, desc(b)) - end -10×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 2 12 - 2 │ a 1 11 - 3 │ b 4 14 - 4 │ b 3 13 - 5 │ c 6 16 - 6 │ c 5 15 - 7 │ d 8 18 - 8 │ d 7 17 - 9 │ e 10 20 - 10 │ e 9 19 -``` -""" - -const docstring_distinct = -""" - distinct(df, exprs...) - -Return distinct rows of a DataFrame. - -If no columns or expressions are provided, then unique rows across all columns are returned. Otherwise, unique rows are determined based on the columns or expressions provided, and then all columns are returned. - -# Arguments -- `df`: A DataFrame. 
-- `exprs...`: One or more unquoted variable names separated by commas. Variable names - can also be used as their positions in the data, like `x:y`, to select - a range of variables. - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = repeat(1:5, 2), c = 11:20); - -julia> @chain df begin - @distinct() - end -10×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ a 2 12 - 3 │ b 3 13 - 4 │ b 4 14 - 5 │ c 5 15 - 6 │ c 1 16 - 7 │ d 2 17 - 8 │ d 3 18 - 9 │ e 4 19 - 10 │ e 5 20 - -julia> @chain df begin - @distinct(a) - end -5×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ b 3 13 - 3 │ c 5 15 - 4 │ d 2 17 - 5 │ e 4 19 - -julia> @chain df begin - @distinct(starts_with("a")) - end -5×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ b 3 13 - 3 │ c 5 15 - 4 │ d 2 17 - 5 │ e 4 19 - -julia> @chain df begin - @distinct(a, b) - end -10×3 DataFrame - Row │ a b c - │ Char Int64 Int64 -─────┼──────────────────── - 1 │ a 1 11 - 2 │ a 2 12 - 3 │ b 3 13 - 4 │ b 4 14 - 5 │ c 5 15 - 6 │ c 1 16 - 7 │ d 2 17 - 8 │ d 3 18 - 9 │ e 4 19 - 10 │ e 5 20 -``` -""" - -const docstring_pull = -""" - @pull(df, column) - -Pull (or extract) a column as a vector. - -# Arguments -- `df`: A DataFrame. -- `column`: A single column, referred to either by its name or number. 
- -# Examples -```jldoctest -julia> df = DataFrame(a = 'a':'e', b = 1:5, c = 11:15); - -julia> @chain df begin - @pull(a) - end -5-element Vector{Char}: - 'a': ASCII/Unicode U+0061 (category Ll: Letter, lowercase) - 'b': ASCII/Unicode U+0062 (category Ll: Letter, lowercase) - 'c': ASCII/Unicode U+0063 (category Ll: Letter, lowercase) - 'd': ASCII/Unicode U+0064 (category Ll: Letter, lowercase) - 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase) - -julia> @chain df begin - @pull(2) - end -5-element Vector{Int64}: - 1 - 2 - 3 - 4 - 5 -``` -""" - -const docstring_left_join = -""" - @left_join(df1, df2, [by]) - -Perform a left join on `df1` and `df` with an optional `by`. - -# Arguments -- `df1`: A DataFrame. -- `df2`: A DataFrame. -- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. - -# Examples -```jldoctest -julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); - -julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - -julia> @left_join(df1, df2) -2×3 DataFrame - Row │ a b c - │ String Int64 Int64? -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - -julia> @left_join(df1, df2, a) -2×3 DataFrame - Row │ a b c - │ String Int64 Int64? -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - -julia> @left_join(df1, df2, a = a) -2×3 DataFrame - Row │ a b c - │ String Int64 Int64? -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - -julia> @left_join(df1, df2, "a") -2×3 DataFrame - Row │ a b c - │ String Int64 Int64? -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - -julia> @left_join(df1, df2, "a" = "a") -2×3 DataFrame - Row │ a b c - │ String Int64 Int64? -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing -``` -""" - -const docstring_right_join = -""" - @right_join(df1, df2, [by]) - -Perform a right join on `df1` and `df` with an optional `by`. 
- -# Arguments -- `df1`: A DataFrame. -- `df2`: A DataFrame. -- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. - -# Examples -```jldoctest -julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); - -julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - -julia> @right_join(df1, df2) -2×3 DataFrame - Row │ a b c - │ String Int64? Int64 -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ c missing 4 - -julia> @right_join(df1, df2, a) -2×3 DataFrame - Row │ a b c - │ String Int64? Int64 -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ c missing 4 - -julia> @right_join(df1, df2, a = a) -2×3 DataFrame - Row │ a b c - │ String Int64? Int64 -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ c missing 4 - -julia> @right_join(df1, df2, "a") -2×3 DataFrame - Row │ a b c - │ String Int64? Int64 -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ c missing 4 - -julia> @right_join(df1, df2, "a" = "a") -2×3 DataFrame - Row │ a b c - │ String Int64? Int64 -─────┼──────────────────────── - 1 │ a 1 3 - 2 │ c missing 4 -``` -""" - -const docstring_inner_join = -""" - @inner_join(df1, df2, [by]) - -Perform a inner join on `df1` and `df` with an optional `by`. - -# Arguments -- `df1`: A DataFrame. -- `df2`: A DataFrame. -- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. 
- -# Examples -```jldoctest -julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); - -julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - -julia> @inner_join(df1, df2) -1×3 DataFrame - Row │ a b c - │ String Int64 Int64 -─────┼────────────────────── - 1 │ a 1 3 - -julia> @inner_join(df1, df2, a) -1×3 DataFrame - Row │ a b c - │ String Int64 Int64 -─────┼────────────────────── - 1 │ a 1 3 - -julia> @inner_join(df1, df2, a = a) -1×3 DataFrame - Row │ a b c - │ String Int64 Int64 -─────┼────────────────────── - 1 │ a 1 3 - -julia> @inner_join(df1, df2, "a") -1×3 DataFrame - Row │ a b c - │ String Int64 Int64 -─────┼────────────────────── - 1 │ a 1 3 - -julia> @inner_join(df1, df2, "a" = "a") -1×3 DataFrame - Row │ a b c - │ String Int64 Int64 -─────┼────────────────────── - 1 │ a 1 3 -``` -""" - -const docstring_full_join = -""" - @full_join(df1, df2, [by]) - -Perform a full join on `df1` and `df` with an optional `by`. - -# Arguments -- `df1`: A DataFrame. -- `df2`: A DataFrame. -- `by`: An optional column or tuple of columns. `by` supports interpolation of individual columns. If `by` is not supplied, then it will be inferred from shared names of columns between `df1` and `df2`. - -# Examples -```jldoctest -julia> df1 = DataFrame(a = ["a", "b"], b = 1:2); - -julia> df2 = DataFrame(a = ["a", "c"], c = 3:4); - -julia> @full_join(df1, df2) -3×3 DataFrame - Row │ a b c - │ String Int64? Int64? -─────┼────────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - 3 │ c missing 4 - -julia> @full_join(df1, df2, a) -3×3 DataFrame - Row │ a b c - │ String Int64? Int64? -─────┼────────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - 3 │ c missing 4 - -julia> @full_join(df1, df2, a = a) -3×3 DataFrame - Row │ a b c - │ String Int64? Int64? -─────┼────────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - 3 │ c missing 4 - -julia> @full_join(df1, df2, "a") -3×3 DataFrame - Row │ a b c - │ String Int64? Int64? 
-─────┼────────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - 3 │ c missing 4 - -julia> @full_join(df1, df2, "a" = "a") -3×3 DataFrame - Row │ a b c - │ String Int64? Int64? -─────┼────────────────────────── - 1 │ a 1 3 - 2 │ b 2 missing - 3 │ c missing 4 -``` -""" - -const docstring_pivot_wider = -""" - @pivot_wider(df, names_from, values_from) - -Reshapes the DataFrame to make it wider, increasing the number of columns and reducing the number of rows. - -# Arguments -- `df`: A DataFrame. -- `names_from`: The name of the column to get the name of the output columns from. -- `values_from`: The name of the column to get the cell values from. - -# Examples -```jldoctest -julia> df_long = DataFrame(id = [1, 1, 2, 2], - variable = ["A", "B", "A", "B"], - value = [1, 2, 3, 4]); - -julia> df_long_missing = DataFrame(id = [1, 1, 2], - variable = ["A", "B", "B"], - value = [1, 2, 4]); - -julia> @pivot_wider(df_long, names_from = variable, values_from = value) -2×3 DataFrame - Row │ id A B - │ Int64 Int64? Int64? -─────┼─────────────────────── - 1 │ 1 1 2 - 2 │ 2 3 4 - -julia> @pivot_wider(df_long, names_from = "variable", values_from = "value") -2×3 DataFrame - Row │ id A B - │ Int64 Int64? Int64? -─────┼─────────────────────── - 1 │ 1 1 2 - 2 │ 2 3 4 - -julia> @pivot_wider(df_long_missing, names_from = variable, values_from = value, values_fill = 0) -2×3 DataFrame - Row │ id A B - │ Int64 Int64 Int64 -─────┼───────────────────── - 1 │ 1 1 2 - 2 │ 2 0 4 -``` -""" - -const docstring_pivot_longer = -""" - @pivot_longer(df, cols, [names_to], [values_to]) - -Reshapes the DataFrame to make it longer, increasing the number of rows and reducing the number of columns. - -# Arguments -- `df`: A DataFrame. -- `cols`: Columns to pivot into longer format. Multiple columns can be selected but providing tuples of columns is not yet supported. -- `names_to`: Optional, defaults to `variable`. 
The name of the newly created column whose values will contain the input DataFrame's column names. -- `values_to`: Optional, defaults to `value`. The name of the newly created column containing the input DataFrame's cell values. - -# Examples -```jldoctest -julia> df_wide = DataFrame(id = [1, 2], A = [1, 3], B = [2, 4]); - -julia> @pivot_longer(df_wide, A:B) -4×3 DataFrame - Row │ id variable value - │ Int64 String Int64 -─────┼──────────────────────── - 1 │ 1 A 1 - 2 │ 2 A 3 - 3 │ 1 B 2 - 4 │ 2 B 4 - -julia> @pivot_longer(df_wide, -id) -4×3 DataFrame - Row │ id variable value - │ Int64 String Int64 -─────┼──────────────────────── - 1 │ 1 A 1 - 2 │ 2 A 3 - 3 │ 1 B 2 - 4 │ 2 B 4 - -julia> @pivot_longer(df_wide, A:B, names_to = "letter", values_to = "number") -4×3 DataFrame - Row │ id letter number - │ Int64 String Int64 -─────┼─────────────────────── - 1 │ 1 A 1 - 2 │ 2 A 3 - 3 │ 1 B 2 - 4 │ 2 B 4 - -julia> @pivot_longer(df_wide, A:B, names_to = letter, values_to = number) -4×3 DataFrame - Row │ id letter number - │ Int64 String Int64 -─────┼─────────────────────── - 1 │ 1 A 1 - 2 │ 2 A 3 - 3 │ 1 B 2 - 4 │ 2 B 4 - -julia> @pivot_longer(df_wide, A:B, names_to = "letter") -4×3 DataFrame - Row │ id letter value - │ Int64 String Int64 -─────┼────────────────────── - 1 │ 1 A 1 - 2 │ 2 A 3 - 3 │ 1 B 2 - 4 │ 2 B 4 - -``` -""" - -const docstring_if_else = -""" - if_else(condition, yes, no, [miss]) - -Return the `yes` value if the `condition` is `true` and the `no` value if the `condition` is `false`. If `miss` is specified, then the provided `miss` value is returned when the `condition` contains a `missing` value. If `miss` is not specified, then the returned value is an explicit `missing` value. - -# Arguments -- `condition`: A condition that evaluates to `true`, `false`, or `missing`. -- `yes`: Value to return if the condition is `true`. -- `no`: Value to return if the condition is `false`. -- `miss`: Optional. Value to return if the condition is `missing`. 
- -# Examples -```jldoctest -julia> df = DataFrame(a = [1, 2, missing, 4, 5]); - -julia> @chain df begin - @mutate(b = if_else(a >= 3, "yes", "no")) - end -5×2 DataFrame - Row │ a b - │ Int64? String? -─────┼────────────────── - 1 │ 1 no - 2 │ 2 no - 3 │ missing missing - 4 │ 4 yes - 5 │ 5 yes - -julia> @chain df begin - @mutate(b = if_else(a >= 3, "yes", "no", "unknown")) - end -5×2 DataFrame - Row │ a b - │ Int64? String -─────┼────────────────── - 1 │ 1 no - 2 │ 2 no - 3 │ missing unknown - 4 │ 4 yes - 5 │ 5 yes - -julia> @chain df begin - @mutate(b = if_else(a >= 3, 3, a)) - end -5×2 DataFrame - Row │ a b - │ Int64? Int64? -─────┼────────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ missing missing - 4 │ 4 3 - 5 │ 5 3 - -julia> @chain df begin - @mutate(b = if_else(a >= 3, 3, a, 0)) - end -5×2 DataFrame - Row │ a b - │ Int64? Int64 -─────┼──────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ missing 0 - 4 │ 4 3 - 5 │ 5 3 -``` -""" - -const docstring_case_when = -""" - case_when(condition => return_value) - case_when(condition_1 => return_value_1, condition_2 => return_value_2, ...) - -Return the corresponding `return_value` for the first `condition` that evaluates to `true`. - -The most specific condition should be listed first and most general condition should be listed last. If none of the conditions evaluate to `true`, then a `missing` value is returned. - -# Arguments -- `condition`: A condition that evaluates to `true`, `false`, or `missing`. -- `return_value`: The value to return if the condition is `true`. - -# Examples -```jldoctest -julia> df = DataFrame(a = [1, 2, missing, 4, 5]); - -julia> @chain df begin - @mutate(b = case_when(a > 4 => "hi", - a > 2 => "medium", - a > 0 => "low")) - end -5×2 DataFrame - Row │ a b - │ Int64? String? 
-─────┼────────────────── - 1 │ 1 low - 2 │ 2 low - 3 │ missing missing - 4 │ 4 medium - 5 │ 5 hi - -julia> @chain df begin - @mutate(b = case_when(a > 4 => "hi", - a > 2 => "medium", - a > 0 => "low", - true => "unknown")) - end -5×2 DataFrame - Row │ a b - │ Int64? String -─────┼────────────────── - 1 │ 1 low - 2 │ 2 low - 3 │ missing unknown - 4 │ 4 medium - 5 │ 5 hi - -julia> @chain df begin - @mutate(b = case_when(a >= 3 => 3, - true => a)) - end -5×2 DataFrame - Row │ a b - │ Int64? Int64? -─────┼────────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ missing missing - 4 │ 4 3 - 5 │ 5 3 - -julia> @chain df begin - @mutate(b = case_when(a >= 3 => 3, - ismissing(a) => 0, - true => a)) - end -5×2 DataFrame - Row │ a b - │ Int64? Int64 -─────┼──────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ missing 0 - 4 │ 4 3 - 5 │ 5 3 -``` -""" - -const docstring_n = -""" - n() - -Return the number of rows in the DataFrame or in the group if used in the context of a GroupedDataFrame. - -# Arguments -- None - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e', inner = 2), b = 1:10, c = 11:20); - -julia> @chain df begin - @summarize(n = n()) - end -1×1 DataFrame - Row │ n - │ Int64 -─────┼─────── - 1 │ 10 - -julia> @chain df begin - @group_by(a) - @summarize(n = n()) - end -5×2 DataFrame - Row │ a n - │ Char Int64 -─────┼───────────── - 1 │ a 2 - 2 │ b 2 - 3 │ c 2 - 4 │ d 2 - 5 │ e 2 -``` -""" - -const docstring_row_number = -""" - row_number() - -Return each row's number in a DataFrame or in the group if used in the context of a GroupedDataFrame. 
- -# Arguments -- None - -# Examples -```jldoctest -julia> df = DataFrame(a = repeat('a':'e', inner = 2)); - -julia> @chain df begin - @mutate(row_num = row_number()) - end -10×2 DataFrame - Row │ a row_num - │ Char Int64 -─────┼─────────────── - 1 │ a 1 - 2 │ a 2 - 3 │ b 3 - 4 │ b 4 - 5 │ c 5 - 6 │ c 6 - 7 │ d 7 - 8 │ d 8 - 9 │ e 9 - 10 │ e 10 - -julia> @chain df begin - @mutate(row_num = row_number() + 1) - end -10×2 DataFrame - Row │ a row_num - │ Char Int64 -─────┼─────────────── - 1 │ a 2 - 2 │ a 3 - 3 │ b 4 - 4 │ b 5 - 5 │ c 6 - 6 │ c 7 - 7 │ d 8 - 8 │ d 9 - 9 │ e 10 - 10 │ e 11 - -julia> @chain df begin - @filter(row_number() <= 5) - end -5×1 DataFrame - Row │ a - │ Char -─────┼────── - 1 │ a - 2 │ a - 3 │ b - 4 │ b - 5 │ c -``` -""" - -const docstring_bind_rows = -""" - @bind_rows(dfs..., id) - -Bind many DataFrames into one by row. - -Columns present in at least one of the provided DataFrames are kept. Columns not present in some DataFrames are filled with missing values where necessary. - -# Arguments -- `dfs...`: DataFrames to combine. -- `id`: string DataFrame identifier. When id is supplied, a new column of numeric identifiers is created to link each row to its original DataFrame. - -# Examples -```jldoctest bind_rows -julia> df1 = DataFrame(a=1:3, b=1:3); - -julia> df2 = DataFrame(a=4:6, b=4:6); - -julia> df3 = DataFrame(a=7:9, c=7:9); - -julia> @chain df1 begin - @bind_rows(df2) - end -6×2 DataFrame - Row │ a b - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 3 - 4 │ 4 4 - 5 │ 5 5 - 6 │ 6 6 -``` -When columns are not present in some DataFrames, they are filled with missing values. - -```jldoctest bind_rows -julia> @chain df1 begin - @bind_rows(df2, df3) - end -9×3 DataFrame - Row │ a b c - │ Int64 Int64? Int64? 
-─────┼───────────────────────── - 1 │ 1 1 missing - 2 │ 2 2 missing - 3 │ 3 3 missing - 4 │ 4 4 missing - 5 │ 5 5 missing - 6 │ 6 6 missing - 7 │ 7 missing 7 - 8 │ 8 missing 8 - 9 │ 9 missing 9 - -julia> @chain df1 begin - @bind_rows(df2, df3, id = "id") - end -9×4 DataFrame - Row │ a b c id - │ Int64 Int64? Int64? Int64 -─────┼──────────────────────────────── - 1 │ 1 1 missing 1 - 2 │ 2 2 missing 1 - 3 │ 3 3 missing 1 - 4 │ 4 4 missing 2 - 5 │ 5 5 missing 2 - 6 │ 6 6 missing 2 - 7 │ 7 missing 7 3 - 8 │ 8 missing 8 3 - 9 │ 9 missing 9 3 -``` -""" - -const docstring_bind_cols = -""" - @bind_cols(dfs...) - -Bind many DataFrames into one by column. - -# Arguments -- `dfs...`: DataFrames to combine. - -# Examples -```jldoctest -julia> df1 = DataFrame(a=1:3, b=1:3); - -julia> df2 = DataFrame(a=4:6, b=4:6); - -julia> df3 = DataFrame(a=7:9, c=7:9); - -julia> @chain df1 begin - @bind_cols(df2, df3) - end -3×6 DataFrame - Row │ a b a_1 b_1 a_2 c - │ Int64 Int64 Int64 Int64 Int64 Int64 -─────┼────────────────────────────────────────── - 1 │ 1 1 4 4 7 7 - 2 │ 2 2 5 5 8 8 - 3 │ 3 3 6 6 9 9 -``` -""" - -const docstring_clean_names = -""" - @clean_names(df, [case]) - -Clean column names using Cleaner.jl. - -# Arguments -- `df`: DataFrame or GroupedDataFrame. -- `case`: Optional string argument taking the value of either "snake_case" or "camelCase". Defaults to "snake_case". 
- -# Examples -```jldoctest -julia> df = DataFrame(var" A bad column name " = 1:5) -5×1 DataFrame - Row │ A bad column name - │ Int64 -─────┼───────────────────── - 1 │ 1 - 2 │ 2 - 3 │ 3 - 4 │ 4 - 5 │ 5 - -julia> @chain df begin - @clean_names - end -5×1 DataFrame - Row │ a_bad_column_name - │ Int64 -─────┼─────────────────── - 1 │ 1 - 2 │ 2 - 3 │ 3 - 4 │ 4 - 5 │ 5 - -julia> @chain df begin - @clean_names(case = "camelCase") - end -5×1 DataFrame - Row │ aBadColumnName - │ Int64 -─────┼──────────────── - 1 │ 1 - 2 │ 2 - 3 │ 3 - 4 │ 4 - 5 │ 5 -``` -""" - -const docstring_ntile = -""" - ntile(x, n::Integer) - -Break the input vector into `n` equal-sized buckets. - -`ntile()` is a rough rank that breaks the input vector into `n` buckets. If `length(x)` is not an integer multiple of `n`, the size of the buckets will differ by up to one, with larger buckets coming first. - -Unlike other ranking functions, `ntile()` ignores ties: it will create evenly sized buckets even if the same value of `x` ends up in different buckets. - -# Arguments -- `x`: A vector to rank. By default, the smallest values will get the smallest ranks. Missing values will be given rank `missing`. -- `n`: Number of groups to bucket into. 
- -# Examples -```jldoctest -julia> x = [5,1,3,2,2, missing] -6-element Vector{Union{Missing, Int64}}: - 5 - 1 - 3 - 2 - 2 - missing - -julia> ntile(x, 2) -6-element Vector{Union{Missing, Int64}}: - 2 - 1 - 2 - 1 - 1 - missing - -julia> ntile(x, 4) -6-element Vector{Union{Missing, Int64}}: - 4 - 1 - 3 - 1 - 2 - missing - -julia> ntile(1:8, 3) -8-element Vector{Int64}: - 1 - 1 - 1 - 2 - 2 - 2 - 3 - 3 - -julia> df = DataFrame(a = 1:8); - -julia> @chain df begin - @mutate(buckets = ntile(a, 3)) - end -8×2 DataFrame - Row │ a buckets - │ Int64 Int64 -─────┼──────────────── - 1 │ 1 1 - 2 │ 2 1 - 3 │ 3 1 - 4 │ 4 2 - 5 │ 5 2 - 6 │ 6 2 - 7 │ 7 3 - 8 │ 8 3 -``` -""" - -const docstring_count = -""" - @count(df, exprs..., [wt], [sort]) - -Count the unique values of one or more variables, with an optional weighting. - -`@chain df @count(a, b)` is roughly equivalent to `@chain df @group_by(a, b) @summarize(n = n())`. Supply `wt` to perform weighted counts, switching the summary from `n = n()` to `n = sum(wt)`. Note that if grouping columns are provided, the result will be an ungrouped data frame, which is slightly different behavior than R's `tidyverse`. - -# Arguments -- `df`: A DataFrame or GroupedDataFrame. -- `exprs...`: Column names, separated by commas. -- `wt`: Optional parameter. Used to calculate a sum over the provided `wt` variable instead of counting the rows. -- `sort`: Defaults to `false`. Whether the result should be sorted from highest to lowest `n`. - -# Examples -```jldoctest -julia> df = DataFrame(a = vcat(repeat(["a"], inner = 3), - repeat(["b"], inner = 3), - repeat(["c"], inner = 1), - missing), - b = 1:8) -8×2 DataFrame - Row │ a b - │ String? Int64 -─────┼──────────────── - 1 │ a 1 - 2 │ a 2 - 3 │ a 3 - 4 │ b 4 - 5 │ b 5 - 6 │ b 6 - 7 │ c 7 - 8 │ missing 8 - -julia> @chain df begin - @count() - end -1×1 DataFrame - Row │ n - │ Int64 -─────┼─────── - 1 │ 8 - -julia> @chain df begin - @count(a) - end -4×2 DataFrame - Row │ a n - │ String? 
Int64 -─────┼──────────────── - 1 │ a 3 - 2 │ b 3 - 3 │ c 1 - 4 │ missing 1 - -julia> @chain df begin - @count(a, wt = b) - end -4×2 DataFrame - Row │ a n - │ String? Int64 -─────┼──────────────── - 1 │ a 6 - 2 │ b 15 - 3 │ c 7 - 4 │ missing 8 - -julia> @chain df begin - @count(a, wt = b, sort = true) - end -4×2 DataFrame - Row │ a n - │ String? Int64 -─────┼──────────────── - 1 │ b 15 - 2 │ missing 8 - 3 │ c 7 - 4 │ a 6 -``` -""" - -const docstring_tally = -""" - @tally(df, [wt], [sort]) - -Tally the unique values of one or more variables, with an optional weighting. - -`@tally()` is a low-level helper macro for `@count()` that assumes that any grouping has already been performed. `@chain @tally()` is roughly equivalent to `@chain df @summarize(n = n())`. Supply `wt` to perform weighted counts, switching the summary from `n = n()` to `n = sum(wt)`. - -# Arguments -- `df`: A DataFrame or GroupedDataFrame. -- `wt`: Optional parameter. Used to calculate a sum over the provided `wt` variable instead of counting the rows. -- `sort`: Defaults to `false`. Whether the result should be sorted from highest to lowest `n`. - -# Examples -```jldoctest -julia> df = DataFrame(a = vcat(repeat(["a"], inner = 3), - repeat(["b"], inner = 3), - repeat(["c"], inner = 1), - missing), - b = 1:8) -8×2 DataFrame - Row │ a b - │ String? Int64 -─────┼──────────────── - 1 │ a 1 - 2 │ a 2 - 3 │ a 3 - 4 │ b 4 - 5 │ b 5 - 6 │ b 6 - 7 │ c 7 - 8 │ missing 8 - -julia> @chain df begin - @tally() - end -1×1 DataFrame - Row │ n - │ Int64 -─────┼─────── - 1 │ 8 - -julia> @chain df begin - @group_by(a) - @tally() - end -4×2 DataFrame - Row │ a n - │ String? Int64 -─────┼──────────────── - 1 │ a 3 - 2 │ b 3 - 3 │ c 1 - 4 │ missing 1 - -julia> @chain df begin - @group_by(a) - @tally(wt = b) - end -4×2 DataFrame - Row │ a n - │ String? 
Int64 -─────┼──────────────── - 1 │ a 6 - 2 │ b 15 - 3 │ c 7 - 4 │ missing 8 - -julia> @chain df begin - @group_by(a) - @tally(wt = b, sort = true) - end -4×2 DataFrame - Row │ a n - │ String? Int64 -─────┼──────────────── - 1 │ b 15 - 2 │ missing 8 - 3 │ c 7 - 4 │ a 6 -``` -""" - -const docstring_drop_na = -""" - @drop_na(df, [cols...]) - -Drop all rows with missing values. - -When called without arguments, `@drop_na()` drops all rows with missing values in any column. If columns are provided as an optional argument, only missing values from named columns are considered when dropping rows. - -# Arguments -- `df`: A DataFrame or GroupedDataFrame. -- `cols...`: An optional column, or multiple columns separated by commas or specified using selection helpers. - -# Examples -```jldoctest -julia> df = DataFrame( - a = [1, 2, missing, 4], - b = [1, missing, 3, 4] - ) -4×2 DataFrame - Row │ a b - │ Int64? Int64? -─────┼────────────────── - 1 │ 1 1 - 2 │ 2 missing - 3 │ missing 3 - 4 │ 4 4 - -julia> @chain df @drop_na() -2×2 DataFrame - Row │ a b - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 4 4 - -julia> @chain df @drop_na(a) -3×2 DataFrame - Row │ a b - │ Int64 Int64? -─────┼──────────────── - 1 │ 1 1 - 2 │ 2 missing - 3 │ 4 4 - -julia> @chain df @drop_na(a, b) -2×2 DataFrame - Row │ a b - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 4 4 - -julia> @chain df @drop_na(starts_with("a")) -3×2 DataFrame - Row │ a b - │ Int64 Int64? -─────┼──────────────── - 1 │ 1 1 - 2 │ 2 missing - 3 │ 4 4 -``` -""" - -const docstring_glimpse = -""" - @glimpse(df, width = 80) - -Preview a DataFrame (or GroupedDataFrame). - -The `@glimpse` macro is used to preview a DataFrame or GroupedDataFrame. Each column is printed on a separate row, along with its data type and first few elements, with the output truncated based on the `width`. - -# Arguments -- `df`: A DataFrame or GroupedDataFrame. -- `width`: The width of the output, measured in the number of characters. Defaults to 80. 
- -# Examples -```jldoctest -julia> df = DataFrame( - a = 1:100, - b = 1:100, - c = repeat(["a"], 100) - ); - -julia> @chain df @glimpse -Rows: 100 -Columns: 3 -.a Int64 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -.b Int64 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -.c String a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, - -julia> @chain df begin - @group_by(a) - @glimpse() - end -Rows: 100 -Columns: 3 -Groups: a [100] -.a Int64 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -.b Int64 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -.c String a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, -``` -""" - -const docstring_as_float = -""" - as_float(value) - -Convert a number or string to a Float64 data type. - -This is a useful helper for type conversions. Missing values are propagated. - -# Arguments -- `value`: An `AbstractString`, `Number`, or `missing` value. - -# Examples -```jldoctest -julia> as_float(1) -1.0 - -julia> as_float("1.5") -1.5 - -julia> as_float(missing) -missing -``` -""" - -const docstring_as_integer = -""" - as_integer(value) - -Convert a number or string to an Int64 data type. - -This is a useful helper for type conversions. Missing values are propagated. Any values after the decimal point are removed. - -# Arguments -- `value`: An `AbstractString`, `Number`, or `missing` value. - -# Examples -```jldoctest -julia> as_integer(1) -1 - -julia> as_integer(1.5) -1 - -julia> as_integer("2") -2 - -julia> as_integer("2.5") -2 - -julia> as_integer(missing) -missing -``` -""" - -const docstring_as_string = -""" - as_string(value) - -Convert a number or string to a String data type. - -This is a useful helper for type conversions. Missing values are propagated. - -# Arguments -- `value`: An `AbstractString`, `Number`, or `missing` value. 
- -# Examples -```jldoctest -julia> as_string(1) -"1" - -julia> as_string(1.5) -"1.5" - -julia> as_string(missing) -missing -``` -""" \ No newline at end of file diff --git a/src/helperfunctions.jl b/src/helperfunctions.jl deleted file mode 100644 index f2c4712..0000000 --- a/src/helperfunctions.jl +++ /dev/null @@ -1,8 +0,0 @@ -# This file is intended for any catch-all helper functions that don't deserve -# their own documentation page and don't have any outside licenses. - -# Need to expand with docs -# These are just aliases -starts_with(args...) = startswith(args...) -ends_with(args...) = endswith(args...) -matches(pattern, flags...) = Regex(pattern, flags...) \ No newline at end of file diff --git a/src/joins.jl b/src/joins.jl deleted file mode 100644 index 057b4c4..0000000 --- a/src/joins.jl +++ /dev/null @@ -1,107 +0,0 @@ -""" -$docstring_left_join -""" -macro left_join(df1, df2, by) - by = parse_join_by(by) - - df_expr = quote - leftjoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $by) - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -macro left_join(df1, df2) - by = :(intersect(names(DataFrame($(esc(df1)))), names(DataFrame($(esc(df2)))))) - - df_expr = quote - leftjoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $by) - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_right_join -""" -macro right_join(df1, df2, by) - by = parse_join_by(by) - - df_expr = quote - rightjoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $by) - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -macro right_join(df1, df2) - by = :(intersect(names(DataFrame($(esc(df1)))), names(DataFrame($(esc(df2)))))) - - df_expr = quote - rightjoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $by) - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_inner_join -""" -macro inner_join(df1, df2, 
by) - by = parse_join_by(by) - - df_expr = quote - innerjoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $by) - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -macro inner_join(df1, df2) - by = :(intersect(names(DataFrame($(esc(df1)))), names(DataFrame($(esc(df2)))))) - - df_expr = quote - innerjoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $by) - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -""" -$docstring_full_join -""" -macro full_join(df1, df2, by) - by = parse_join_by(by) - - df_expr = quote - outerjoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $by) - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end - -macro full_join(df1, df2) - by = :(intersect(names(DataFrame($(esc(df1)))), names(DataFrame($(esc(df2)))))) - - df_expr = quote - outerjoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $by) - end - if code[] - @info MacroTools.prettify(df_expr) - end - return df_expr -end diff --git a/src/ntile.jl b/src/ntile.jl deleted file mode 100644 index a75e333..0000000 --- a/src/ntile.jl +++ /dev/null @@ -1,35 +0,0 @@ -# The `ntile()` function is a line-by-line R-to-Julia translation of the -# `dplyr::ntile()` function. We have reproduced the `dplyr` MIT License below. - -# MIT License -# Copyright (c) 2023 dplyr authors -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -""" -$docstring_ntile -""" -function ntile(x, n::Integer) - - x = if_else.(ismissing.(x), missing, invperm(sortperm(x))) - x_length = length(x) - sum(ismissing.(x)) - if n <= 0 - throw("`n` must be a positive number.") - end - - if x_length == 0 - return repeat([missing], length(x)) - else - n_larger = x_length % n - n_smaller = n - n_larger - size = x_length / n - larger_size = ceil(size) - smaller_size = floor(size) - larger_threshold = larger_size * n_larger - bins = if_else.(x .<= larger_threshold, - (x .+ (larger_size - 1)) / larger_size, - (x .+ (-larger_threshold + smaller_size - 1)) / smaller_size .+ n_larger) - return passmissing(convert).(Int, floor.(bins)) - end -end \ No newline at end of file diff --git a/src/parsing.jl b/src/parsing.jl deleted file mode 100644 index 3cc3b9b..0000000 --- a/src/parsing.jl +++ /dev/null @@ -1,433 +0,0 @@ -# Not exported -function parse_tidy(tidy_expr::Union{Expr,Symbol,Number}; autovec::Bool=true, subset::Bool=false, from_across::Bool=false) # Can be symbol or expression - if @capture(tidy_expr, across(vars_, funcs_)) - return parse_across(vars, funcs) - elseif @capture(tidy_expr, -(startindex_:endindex_) | !(startindex_:endindex_)) - if startindex isa Symbol - startindex = QuoteNode(startindex) - end - if endindex isa Symbol - endindex = QuoteNode(endindex) - end - return :(Not(Between($startindex, $endindex))) - elseif @capture(tidy_expr, startindex_:endindex_) - if startindex isa Symbol - startindex = QuoteNode(startindex) - end - if endindex isa Symbol - 
endindex = QuoteNode(endindex) - end - return :(Between($startindex, $endindex)) - elseif @capture(tidy_expr, (lhs_ = fn_(args__)) | (lhs_ = fn_.(args__))) - if length(args) == 0 - lhs = QuoteNode(lhs) - return :($fn => $lhs) - else - @capture(tidy_expr, lhs_ = rhs_) - return parse_function(lhs, rhs; autovec, subset) - end - elseif @capture(tidy_expr, lhs_ = rhs_) - if rhs isa Symbol - lhs = QuoteNode(lhs) - rhs = QuoteNode(rhs) - return :($rhs => $lhs) - else # handles @mutate(b = 10) - return parse_function(lhs, :(identity($rhs)); autovec, subset) - end - elseif @capture(tidy_expr, -var_Symbol) - var = QuoteNode(var) - return :(Not($var)) - elseif @capture(tidy_expr, !var_Symbol) - var = QuoteNode(var) - return :(Not($var)) - elseif @capture(tidy_expr, var_Symbol) - return QuoteNode(var) - elseif @capture(tidy_expr, var_Number) - if var > 0 - return var - elseif var < 0 - var = -var - return :(Not($var)) - else - throw("Numeric selections cannot be zero.") - end - elseif @capture(tidy_expr, !var_Number) - return :(Not($var)) - elseif !subset & @capture(tidy_expr, -fn_(args__)) # negated selection helpers - return :(Cols(!($(esc(fn))($(args...))))) # change the `-` to a `!` and return - elseif !subset & @capture(tidy_expr, fn_(args__)) # selection helpers - if from_across || fn == :Cols # fn == :Cols is to deal with interpolated columns - return tidy_expr - else - return :(Cols($(esc(tidy_expr)))) - end - elseif subset - return parse_function(:ignore, tidy_expr; autovec, subset) - else - return tidy_expr - # return :($(esc(tidy_expr))) - # Do not throw error because multiple functions within across() where some are anonymous require this condition - # throw("Expression not recognized by parse_tidy()") - end -end - -# Not exported -function parse_pivot_arg(tidy_expr::Union{Expr,Symbol,Number}) - if @capture(tidy_expr, lhs_ = rhs_Symbol) - lhs = QuoteNode(lhs) - rhs = QuoteNode(rhs) - return :($lhs => $rhs) - elseif @capture(tidy_expr, lhs_ = rhs_String) - lhs = 
QuoteNode(lhs) - rhs = QuoteNode(rhs) - return :($lhs => $rhs) - - # Need to avoid QuoteNode-ing rhs when rhs is an expression. - # You can't use !! interpolation inside of for-loops because - # macros are expanded at parse-time, so you instead need to do - # Main.eval(:globalvar) or @eval(Main, globalvar) where globalvar - # is assigned to equal the iterator instead of using !!globalvar, - # which gets expanded before the for-loop is run. - elseif @capture(tidy_expr, lhs_ = rhs_) - lhs = QuoteNode(lhs) - return :($lhs => $rhs) - - else - tidy_expr = parse_tidy(tidy_expr) - return :(:cols => $(tidy_expr)) - end -end - -# Not exported -function parse_function(lhs::Symbol, rhs::Expr; autovec::Bool=true, subset::Bool=false) - - lhs = QuoteNode(lhs) - - src = Symbol[] - MacroTools.postwalk(rhs) do x - if @capture(x, (fn_(args__)) | (fn_.(args__))) - args = args[isa.(args, Symbol)] - push!(src, args...) - end - return x - end - - src = unique(src) - func_left = :($(src...),) - - if autovec - rhs = parse_autovec(rhs) - end - - rhs = parse_escape_function(rhs) # ensure that functions in user space are available - - if subset - return :($src => ($func_left -> $rhs)) # to ensure that missings are replace by false - else - return :($src => ($func_left -> $rhs) => $lhs) - end -end - -# Not exported -# Note: `parse_across` currently does not support the use of numbers for selecting columns -function parse_across(vars::Union{Expr,Symbol}, funcs::Union{Expr,Symbol}) - - src = Union{Expr,QuoteNode}[] # type can be either a QuoteNode or a expression containing a selection helper function - - if @capture(vars, (args__,)) - for arg in args - push!(src, parse_tidy(arg)) - end - else - push!(src, parse_tidy(vars)) - end - - func_array = Union{Expr,Symbol}[] # expression containing functions - - if funcs isa Symbol - push!(func_array, esc(funcs)) # fixes bug where single function is used inside across - elseif @capture(funcs, (args__,)) - for arg in args - if arg isa Symbol - 
push!(func_array, esc(arg)) - else - push!(func_array, esc(parse_tidy(arg; from_across=true))) # fixes bug with compound and anonymous functions getting wrapped in Cols() - end - end - else # for compound functions like mean or anonymous functions - push!(func_array, esc(funcs)) - end - - num_funcs = length(func_array) - - return :(Cols($(src...)) .=> reshape([$(func_array...)], 1, $num_funcs)) -end - -# Not exported -function parse_desc(tidy_expr::Union{Expr,Symbol}) - tidy_expr, found_n, found_row_number = parse_interpolation(tidy_expr) - if @capture(tidy_expr, Cols(args__)) # from parse_interpolation - return :(Cols($(args...),)) - elseif @capture(tidy_expr, desc(var_)) - var = QuoteNode(var) - return :(order($var, rev=true)) - else - return QuoteNode(tidy_expr) - end -end - -# Not exported -function parse_join_by(tidy_expr::Union{Expr,Symbol,String}) - tidy_expr, found_n, found_row_number = parse_interpolation(tidy_expr) - - src = Union{Expr,QuoteNode}[] # type can be either a QuoteNode or a expression containing a selection helper function - - if @capture(tidy_expr, expr_Symbol) - push!(src, QuoteNode(expr)) - elseif @capture(tidy_expr, expr_String) - push!(src, QuoteNode(Symbol(expr))) - elseif @capture(tidy_expr, lhs_Symbol = rhs_Symbol) - lhs = QuoteNode(lhs) - rhs = QuoteNode(rhs) - push!(src, :($lhs => $rhs)) - elseif @capture(tidy_expr, lhs_String = rhs_String) - lhs = QuoteNode(Symbol(lhs)) - rhs = QuoteNode(Symbol(rhs)) - push!(src, :($lhs => $rhs)) - else - @capture(tidy_expr, (args__,)) - for arg in args - if @capture(arg, expr_Symbol) - push!(src, QuoteNode(expr)) - elseif @capture(arg, expr_String) - push!(src, QuoteNode(Symbol(expr))) - elseif @capture(arg, lhs_Symbol = rhs_Symbol) - lhs = QuoteNode(lhs) - rhs = QuoteNode(rhs) - push!(src, :($lhs => $rhs)) - elseif @capture(arg, lhs_String = rhs_String) - lhs = QuoteNode(Symbol(lhs)) - rhs = QuoteNode(Symbol(rhs)) - push!(src, :($lhs => $rhs)) - else - push!(src, QuoteNode(arg)) - end - end - end 
- - return :([$(src...)]) -end - -# Not exported -function parse_group_by(tidy_expr::Union{Expr,Symbol}) - tidy_expr, found_n, found_row_number = parse_interpolation(tidy_expr) - - if @capture(tidy_expr, Cols(args__)) # from parse_interpolation - return :(Cols($(args...),)) - elseif @capture(tidy_expr, lhs_ = rhs_) - return QuoteNode(lhs) - else - return QuoteNode(tidy_expr) - end -end - -# Not exported -function parse_autovec(tidy_expr::Union{Expr,Symbol}) - - # Use postwalk so that we capture smallest expressions first. - # In the future, may want to consider switching to prewalk() so that we can - # capture the largest expression first and functions haven't already been vectorized first. - # Because prewalk() can result in infinite loops, would require lots of careful testing. - autovec_expr = MacroTools.postwalk(tidy_expr) do x - - # don't vectorize if starts with ~ (compound function) - # The reason we have a . is that bc this is postwalk, the function will first have been - # vectorized, and we need to unvectorize it. - # Adding the non-vectorized condition in case a non-vectorized function like mean is accidentally - # prefixed with a ~. - if @capture(x, (~fn1_ ∘ fn2_.(args__)) | (~fn1_ ∘ fn2_(args__))) - return :($fn1 ∘ $fn2($(args...))) - - # Don't vectorize if starts with ~ (regular function) - # The reason we have a . is that bc this is postwalk, the function will first have been - # vectorized, and we need to unvectorize it. - # Adding the non-vectorized condition in case a non-vectorized function like mean is accidentally - # prefixed with a ~. - elseif @capture(x, (~fn_.(args__)) | (~fn_(args__))) - return :($fn($(args...))) - - # Don't vectorize if starts with ~ (operator) e.g., a ~+ b - elseif @capture(x, args1_ ~ fn_(args2_)) - # We need to remove the . 
from the start bc was already vectorized and we need to - # unvectorize it - fn_new = Symbol(string(fn)[2:end]) - return :($fn_new($args1, $args2)) - - # If user already does Ref(Set(arg2)), then vectorize and leave as-is - elseif @capture(x, in(arg1_, Ref(Set(arg2_)))) - return :(in.($arg1, Ref(Set($arg2)))) - - # If user already does Ref(arg2), then wrap arg2 inside of a Set(). - # Set requires additional allocation but is much faster. - # See: https://bkamins.github.io/julialang/2023/02/10/in.html - elseif @capture(x, in(arg1_, Ref(arg2_))) - return :(in.($arg1, Ref(Set($arg2)))) - - # If user already does Set(arg2), then wrap this inside of Ref(). - # This is required to prevent vectorization of arg2. - elseif @capture(x, in(arg1_, Set(arg2_))) - return :(in.($arg1, Ref(Set($arg2)))) - - # If user did provides bare vector or tuple for arg2, then wrap - # arg2 inside of a Ref(Set(arg2)) - # This is required to prevent vectorization of arg2. - elseif @capture(x, in(arg1_, arg2_)) - return :(in.($arg1, Ref(Set($arg2)))) - - # Handle ∈ - elseif @capture(x, ∈(arg1_, Ref(Set(arg2_)))) - return :((∈).($arg1, Ref(Set($arg2)))) - elseif @capture(x, ∈(arg1_, Ref(arg2_))) - return :((∈).($arg1, Ref(Set($arg2)))) - elseif @capture(x, ∈(arg1_, Set(arg2_))) - return :((∈).($arg1, Ref(Set($arg2)))) - elseif @capture(x, ∈(arg1_, arg2_)) - return :((∈).($arg1, Ref(Set($arg2)))) - - # Handle ∉ - elseif @capture(x, ∉(arg1_, Ref(Set(arg2_)))) - return :((∉).($arg1, Ref(Set($arg2)))) - elseif @capture(x, ∉(arg1_, Ref(arg2_))) - return :((∉).($arg1, Ref(Set($arg2)))) - elseif @capture(x, ∉(arg1_, Set(arg2_))) - return :((∉).($arg1, Ref(Set($arg2)))) - elseif @capture(x, ∉(arg1_, arg2_)) - return :((∉).($arg1, Ref(Set($arg2)))) - - elseif @capture(x, fn_(args__)) - # This is the do-not-vectorize "list" - # `in` should be vectorized so do not add to this exclusion list - if fn in [:Ref :Set :Cols :(:) :∘ :lag :lead :ntile :repeat :across :desc :mean :std :var :median :first :last 
:minimum :maximum :sum :length :skipmissing :quantile :passmissing :cumsum :cumprod :accumulate] - return x - elseif contains(string(fn), r"[^\W0-9]\w*$") # valid variable name - return :($fn.($(args...))) - elseif startswith(string(fn), ".") # already vectorized operator - return x - else # operator - fn_new = Symbol("." * string(fn)) - return :($fn_new($(args...))) - end - elseif hasproperty(x, :head) && (x.head == :&& || x.head == :||) - x.head = Symbol("." * string(x.head)) - return x - end - return x - end - return autovec_expr -end - -# Not exported -function parse_escape_function(rhs_expr::Union{Expr,Symbol}) - rhs_expr = MacroTools.postwalk(rhs_expr) do x - if @capture(x, fn_(args__)) - - # `in`, `∈`, and `∉` should be vectorized in auto-vec but not escaped - if fn in [:in :∈ :∉ :Ref :Set :Cols :(:) :∘ :across :desc :mean :std :var :median :first :last :minimum :maximum :sum :length :skipmissing :quantile :passmissing :startswith :contains :endswith] - return x - elseif contains(string(fn), r"[^\W0-9]\w*$") # valid variable name - return :($(esc(fn))($(args...))) - else - return x - end - elseif @capture(x, fn_.(args__)) - if fn in [:in :∈ :∉ :Ref :Set :Cols :(:) :∘ :across :desc :mean :std :var :median :first :last :minimum :maximum :sum :length :skipmissing :quantile :passmissing :startswith :contains :endswith] - return x - elseif contains(string(fn), r"[^\W0-9]\w*$") # valid variable name - return :($(esc(fn)).($(args...))) - else - return x - end - end - return x - end - return rhs_expr -end - -# Not exported -# String is for parse_join_by -function parse_interpolation(var_expr::Union{Expr,Symbol,Number,String}; summarize::Bool = false) - found_n = false - found_row_number = false - - var_expr = MacroTools.postwalk(var_expr) do x - if @capture(x, !!variable_Symbol) - variable = Main.eval(variable) - if variable isa AbstractString - return variable # Strings are now treated as Strings and not columns - elseif variable isa Symbol - return variable - else 
# Tuple or Vector of columns - if variable[1] isa Symbol - variable = QuoteNode.(variable) - return :(Cols($(variable...),)) - else - return variable - end - end - # `hello` in Julia is converted to Core.@cmd("hello") - # Since MacroTools is unable to match this pattern, we can directly - # evaluate the expression to see if it matches. If it does, the 3rd argument - # contains the string containing the values inside the backticks. - elseif hasproperty(x, :head) && x.head == :macrocall && - hasproperty(x.args[1], :mod) && hasproperty(x.args[1], :name) && - x.args[1].mod == Core && x.args[1].name == Symbol("@cmd") - return Symbol(x.args[3]) - elseif @capture(x, fn_()) - if fn == :n - if summarize - return :(nrow()) - else - found_n = true - return :Tidier_n - end - elseif fn == :row_number - found_row_number = true - return :Tidier_row_number - else - return :($fn()) - end - end - return x - end - return var_expr, found_n, found_row_number -end - -# Simply to convert n() to a number -function parse_slice_n(var_expr::Union{Expr,Symbol,Number,String}, n::Integer) - var_expr = MacroTools.postwalk(var_expr) do x - if @capture(x, fn_()) - if fn == :n - return n - else - return :($fn()) - end - end - return x - end - return var_expr -end - -# Not export -# parse DataFrame and Expr -function parse_bind_args(tidy_expr::Union{Expr,Symbol}) - found_id = false - if @capture(tidy_expr, lhs_ = rhs_) - if lhs != :id - throw("$(String(lhs)) is not implemented") - else - found_id = true - return rhs, found_id - end - end - return esc(tidy_expr), found_id -end \ No newline at end of file diff --git a/src/pivots.jl b/src/pivots.jl deleted file mode 100644 index 877af10..0000000 --- a/src/pivots.jl +++ /dev/null @@ -1,78 +0,0 @@ -""" -$docstring_pivot_wider -""" -macro pivot_wider(df, exprs...) 
- # take the expressions and return arg => value dictionary - interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - # commented out because not needed here - # any_found_n = any([i[2] for i in interpolated_exprs]) - # any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_pivot_arg.(tidy_exprs) - expr_dict = Dict(x.args[2] => x.args[3] for x in tidy_exprs) - - # we need to define a dictionary - # to hold arguments in the format expected by unstack() - arg_dict = Dict{Symbol, Any}() - - if haskey(expr_dict, QuoteNode(:values_fill)) - arg_dict[:fill] = eval(expr_dict[QuoteNode(:values_fill)]) - end - - df_expr = quote - unstack(DataFrame($(esc(df))), - $(expr_dict[QuoteNode(:names_from)]), - $(expr_dict[QuoteNode(:values_from)]); - $(arg_dict)...) - end - - if code[] - @info MacroTools.prettify(df_expr) - end - - return(df_expr) -end - -""" -$docstring_pivot_longer -""" -macro pivot_longer(df, exprs...) - # take the expressions and return arg => value dictionary - interpolated_exprs = parse_interpolation.(exprs) - - tidy_exprs = [i[1] for i in interpolated_exprs] - # commented out because not needed here - # any_found_n = any([i[2] for i in interpolated_exprs]) - # any_found_row_number = any([i[3] for i in interpolated_exprs]) - - tidy_exprs = parse_pivot_arg.(tidy_exprs) - expr_dict = Dict(x.args[2] => x.args[3] for x in tidy_exprs) - - # we need to define a dictionary - # to hold arguments in the format expected by stack() - arg_dict = Dict{Symbol, Any}() - - # if names_to was specified, pass that argument to variable_name - if haskey(expr_dict, QuoteNode(:names_to)) - arg_dict[:variable_name] = (expr_dict[QuoteNode(:names_to)]).value - end - - # if values_to was specified, pass that argument to value_name - if haskey(expr_dict, QuoteNode(:values_to)) - arg_dict[:value_name] = (expr_dict[QuoteNode(:values_to)]).value - end - - # splat any specified arguments in to stack() - df_expr = 
quote - stack(DataFrame($(esc(df))), $(expr_dict[QuoteNode(:cols)]); $(arg_dict)...) - end - - if code[] - @info MacroTools.prettify(df_expr) - end - - return df_expr -end - diff --git a/src/pseudofunctions.jl b/src/pseudofunctions.jl deleted file mode 100644 index 64c5c77..0000000 --- a/src/pseudofunctions.jl +++ /dev/null @@ -1,27 +0,0 @@ -""" -$docstring_across -""" -function across(args...) - throw("This function should only be called inside of Tidier.jl macros.") -end - -""" -$docstring_desc -""" -function desc(args...) - throw("This function should only be called inside of @arrange().") -end - -""" -$docstring_n -""" -function n() - throw("This function should only be called inside of Tidier.jl macros.") -end - -""" -$docstring_row_number -""" -function row_number() - throw("This function should only be called inside of Tidier.jl macros.") -end \ No newline at end of file diff --git a/src/type_conversions.jl b/src/type_conversions.jl deleted file mode 100644 index d77f402..0000000 --- a/src/type_conversions.jl +++ /dev/null @@ -1,47 +0,0 @@ -""" -$docstring_as_float -""" -function as_float(value) - try - passmissing(convert)(Float64, value) - catch - missing # if parsing failure - end -end - -function as_float(value::AbstractString) - try - passmissing(parse)(Float64, value) - catch - missing # if parsing failure - end -end - -""" -$docstring_as_integer -""" -function as_integer(value) - try - passmissing(floor)(value) |> - x -> passmissing(convert)(Int64, x) - catch - missing # if parsing failure - end -end - -function as_integer(value::AbstractString) - try - passmissing(parse)(Float64, value) |> - x -> passmissing(floor)(x) |> - x -> passmissing(convert)(Int64, x) - catch - missing # if parsing failure - end -end - -""" -$docstring_as_string -""" -function as_string(value) - passmissing(string)(value) -end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index a0b6b4a..079e32f 100644 --- a/test/Project.toml +++ 
b/test/Project.toml @@ -1,7 +1,4 @@ [deps] Tidier = "f0413319-3358-4bb0-8e7c-0c83523a93bd" -Chain = "8be319e6-bccf-4806-a6f7-6fae938471bc" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/test/runtests.jl b/test/runtests.jl index 8a1eda5..1f8b734 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,8 +4,8 @@ using Tidier using Test using Documenter -DocMeta.setdocmeta!(Tidier, :DocTestSetup, :(using Tidier); recursive=true) +# DocMeta.setdocmeta!(Tidier, :DocTestSetup, :(using Tidier); recursive=true) -doctest(Tidier) +# doctest(Tidier) end \ No newline at end of file From de882d528a7a7130a4cc80ed5606302a73b7394b Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Mon, 7 Aug 2023 00:56:12 -0400 Subject: [PATCH 2/6] Fix README links, turn off caching of registries for building the docs --- .github/workflows/Documenter.yml | 2 +- README.md | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/Documenter.yml b/.github/workflows/Documenter.yml index 7bd6425..f2c268e 100644 --- a/.github/workflows/Documenter.yml +++ b/.github/workflows/Documenter.yml @@ -21,7 +21,7 @@ jobs: - uses: julia-actions/setup-julia@v1 - uses: julia-actions/cache@v1 with: - cache-registries: "true" + cache-registries: "false" - name: Install documentation dependencies run: julia --project=docs -e 'using Pkg; pkg"dev ."; Pkg.instantiate()' - name: Build and deploy diff --git a/README.md b/README.md index 624464d..ebd6b27 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,13 @@ [![Build Status](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml?query=branch%3Amain) 
[![Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/Tidier&label=Downloads)](https://pkgs.genieframework.com?packages=Tidier) - + ## Tidier.jl Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. Similar to the R tidyverse, Tidier.jl re-exports several other packages, each focusing on a specific set of functionalities. - + ## TidierData.jl @@ -23,19 +23,19 @@ TidierData.jl is package dedicated to data transformation and reshaping, powered TidierPlots.jl is a package dedicated to plotting, powered by AlgebraOfGraphics.jl. It focuses on functionality within the ggplot2 R package. - + ## TidierCats.jl TidierCats.jl is a package dedicated to handling categorical variables, powered by CategoricalArrays.jl. It focuses on functionality within the forcats R package. - + ## TidierDates.jl TidierDates.jl is a package dedicated to handling dates and times. It focuses on functionality within the lubridate R package. - + ## TidierStrings.jl From f961f22dbd11b961654b0be9b1701af80dc2a3df Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Mon, 7 Aug 2023 01:23:37 -0400 Subject: [PATCH 3/6] Cleaned up links in README and docs --- README.md | 12 +++++++++++- docs/src/index.md | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ebd6b27..e129741 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Build Status](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/Tidier&label=Downloads)](https://pkgs.genieframework.com?packages=Tidier) - + ## Tidier.jl @@ -17,30 +17,40 @@ Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. 
Simila TidierData.jl is package dedicated to data transformation and reshaping, powered by DataFrames.jl, ShiftedArrays.jl, and Cleaner.jl. It focuses on functionality within the dplyr, tidyr, and janitor R packages. +

+ ## TidierPlots.jl TidierPlots.jl is a package dedicated to plotting, powered by AlgebraOfGraphics.jl. It focuses on functionality within the ggplot2 R package. +

+ ## TidierCats.jl TidierCats.jl is a package dedicated to handling categorical variables, powered by CategoricalArrays.jl. It focuses on functionality within the forcats R package. +

+ ## TidierDates.jl TidierDates.jl is a package dedicated to handling dates and times. It focuses on functionality within the lubridate R package. +

+ ## TidierStrings.jl TidierStrings.jl is a package dedicated to handling strings. It focuses on functionality within the stringr R package. +

+ ## Installation For the stable version: diff --git a/docs/src/index.md b/docs/src/index.md index 30d7cb3..bab804d 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,4 +1,4 @@ - + ## Tidier.jl @@ -10,30 +10,40 @@ Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. Simila TidierData.jl is package dedicated to data transformation and reshaping, powered by DataFrames.jl, ShiftedArrays.jl, and Cleaner.jl. It focuses on functionality within the dplyr, tidyr, and janitor R packages. +

+ ## TidierPlots.jl TidierPlots.jl is a package dedicated to plotting, powered by AlgebraOfGraphics.jl. It focuses on functionality within the ggplot2 R package. +

+ ## TidierCats.jl TidierCats.jl is a package dedicated to handling categorical variables, powered by CategoricalArrays.jl. It focuses on functionality within the forcats R package. +

+ ## TidierDates.jl TidierDates.jl is a package dedicated to handling dates and times. It focuses on functionality within the lubridate R package. +

+ ## TidierStrings.jl TidierStrings.jl is a package dedicated to handling strings. It focuses on functionality within the stringr R package. +

+ ## Installation For the stable version: From 4b9ae3ed5c66cc1d7c0ce60daff45d9ee70f5d2e Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Mon, 7 Aug 2023 01:30:49 -0400 Subject: [PATCH 4/6] Fix semi-colon in img tags --- README.md | 13 +++++++------ docs/src/index.md | 12 ++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index e129741..5b85324 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,14 @@ [![Build Status](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/Tidier&label=Downloads)](https://pkgs.genieframework.com?packages=Tidier) - + + ## Tidier.jl Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. Similar to the R tidyverse, Tidier.jl re-exports several other packages, each focusing on a specific set of functionalities. - + ## TidierData.jl @@ -19,7 +20,7 @@ TidierData.jl is package dedicated to data transformation and reshaping, powered

- + ## TidierPlots.jl @@ -27,7 +28,7 @@ TidierPlots.jl is a package dedicated to plotting, powered by AlgebraOfGraphics.

- + ## TidierCats.jl @@ -35,7 +36,7 @@ TidierCats.jl is a package dedicated to handling categorical variables, powered

- + ## TidierDates.jl @@ -43,7 +44,7 @@ TidierDates.jl is a package dedicated to handling dates and times. It focuses on

- + ## TidierStrings.jl diff --git a/docs/src/index.md b/docs/src/index.md index bab804d..e345272 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,10 +1,10 @@ - + ## Tidier.jl Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. Similar to the R tidyverse, Tidier.jl re-exports several other packages, each focusing on a specific set of functionalities. - + ## TidierData.jl @@ -12,7 +12,7 @@ TidierData.jl is package dedicated to data transformation and reshaping, powered

- + ## TidierPlots.jl @@ -20,7 +20,7 @@ TidierPlots.jl is a package dedicated to plotting, powered by AlgebraOfGraphics.

- + ## TidierCats.jl @@ -28,7 +28,7 @@ TidierCats.jl is a package dedicated to handling categorical variables, powered

- + ## TidierDates.jl @@ -36,7 +36,7 @@ TidierDates.jl is a package dedicated to handling dates and times. It focuses on

- + ## TidierStrings.jl From 9678e8b90027d3a331fb7f6f170dbf0dc39ab208 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Mon, 7 Aug 2023 01:33:15 -0400 Subject: [PATCH 5/6] insert space in README.md and index.md --- README.md | 2 ++ docs/src/index.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 5b85324..366873c 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. Similar to the R tidyverse, Tidier.jl re-exports several other packages, each focusing on a specific set of functionalities. +

+ ## TidierData.jl diff --git a/docs/src/index.md b/docs/src/index.md index e345272..ebd1e40 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,6 +4,8 @@ Tidier.jl is a 100% Julia implementation of the R tidyverse meta-package. Similar to the R tidyverse, Tidier.jl re-exports several other packages, each focusing on a specific set of functionalities. +

+ ## TidierData.jl From d3480c567f3181f180c113baf616216040ae86f8 Mon Sep 17 00:00:00 2001 From: Karandeep Singh Date: Mon, 7 Aug 2023 01:34:59 -0400 Subject: [PATCH 6/6] Left-align Tidier.jl in README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 366873c..d4d7823 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,7 @@ [![Build Status](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TidierOrg/Tidier.jl/actions/workflows/CI.yml?query=branch%3Amain) [![Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/Tidier&label=Downloads)](https://pkgs.genieframework.com?packages=Tidier) - - + ## Tidier.jl