In [11]:
using DataFrames 
using CategoricalArrays
using Chain

## Constructors

In [12]:
x = categorical(["A", "B", "B", "C"]) # unordered


4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"

In [13]:
y = categorical(["A", "B", "B", "C"], ordered=true) # ordered, by default order is sorting order


4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"

In [14]:
z = categorical(["A","B","B","C", missing]) # unordered with missings

5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "A"
 "B"
 "B"
 "C"
 missing

In [15]:
c = cut(1:10, 5) # ordered, into equal counts, possible to rename labels and give custom breaks

10-element CategoricalArray{String,1,UInt32}:
 "Q1: [1.0, 2.8)"
 "Q1: [1.0, 2.8)"
 "Q2: [2.8, 4.6)"
 "Q2: [2.8, 4.6)"
 "Q3: [4.6, 6.4)"
 "Q3: [4.6, 6.4)"
 "Q4: [6.4, 8.2)"
 "Q4: [6.4, 8.2)"
 "Q5: [8.2, 10.0]"
 "Q5: [8.2, 10.0]"

In [16]:
@chain DataFrame(x=cut(randn(100000), 10)) begin
      groupby(:x)
      combine(nrow) # just to make sure cut works right
end

Unnamed: 0_level_0,x,nrow
Unnamed: 0_level_1,Cat…,Int64
1,"Q1: [-4.885584813499916, -1.2821395095707495)",10000
2,"Q2: [-1.2821395095707495, -0.8346515537182392)",10000
3,"Q3: [-0.8346515537182392, -0.5207600560995914)",10000
4,"Q4: [-0.5207600560995914, -0.24806337801963377)",10000
5,"Q5: [-0.24806337801963377, 0.0066712286013243405)",10000
6,"Q6: [0.0066712286013243405, 0.26051090857630654)",10000
7,"Q7: [0.26051090857630654, 0.5328862044883617)",10000
8,"Q8: [0.5328862044883617, 0.8484329745741447)",10000
9,"Q9: [0.8484329745741447, 1.2947720997099736)",10000
10,"Q10: [1.2947720997099736, 3.8548032344547867]",10000


In [17]:
v = categorical([1,2,2,3,3]) # contains integers not strings

5-element CategoricalArray{Int64,1,UInt32}:
 1
 2
 2
 3
 3

In [18]:
Vector{Union{String, Missing}}(z) # sometimes you need to convert back to a standard vector

5-element Vector{Union{Missing, String}}:
 "A"
 "B"
 "B"
 "C"
 missing

## Managing levels

In [19]:
arr = [x,y,z,c,v]


5-element Vector{CategoricalVector{T, UInt32, V, C, U} where {T, V, C, U}}:
 CategoricalValue{String, UInt32}["A", "B", "B", "C"]
 CategoricalValue{String, UInt32}["A", "B", "B", "C"]
 Union{Missing, CategoricalValue{String, UInt32}}["A", "B", "B", "C", missing]
 CategoricalValue{String, UInt32}["Q1: [1.0, 2.8)", "Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]", "Q5: [8.2, 10.0]"]
 CategoricalValue{Int64, UInt32}[1, 2, 2, 3, 3]

In [20]:
isordered.(arr) # check if categorical array is ordered


5-element BitVector:
 0
 1
 0
 1
 0

In [21]:
ordered!(x, true), isordered(x) # make x ordered


(CategoricalValue{String, UInt32}["A", "B", "B", "C"], true)

In [22]:
ordered!(x, false), isordered(x) # and unordered again


(CategoricalValue{String, UInt32}["A", "B", "B", "C"], false)

In [23]:
levels.(arr) # list levels

5-element Vector{Vector{T} where T}:
 ["A", "B", "C"]
 ["A", "B", "C"]
 ["A", "B", "C"]
 ["Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]"]
 [1, 2, 3]

In [24]:
unique.(arr) # missing will be included

5-element Vector{Vector{T} where T}:
 ["A", "B", "C"]
 ["A", "B", "C"]
 Union{Missing, String}["A", "B", "C", missing]
 ["Q1: [1.0, 2.8)", "Q2: [2.8, 4.6)", "Q3: [4.6, 6.4)", "Q4: [6.4, 8.2)", "Q5: [8.2, 10.0]"]
 [1, 2, 3]

In [25]:
y[1] < y[2] # can compare as y is ordered

true

In [26]:
v[1] < v[2] # not comparable, v is unordered although it contains integers


LoadError: ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this

In [27]:
y[2] < "A" # comparison against type underlying categorical value is not allowed


LoadError: ArgumentError: cannot compare a `CategoricalValue` to value `v` of type `String`: wrap `v` using `CategoricalValue(v, catvalue)` or `CategoricalValue(v, catarray)` first

In [28]:
y[2] < CategoricalValue("A", y) # you need to explicitly convert a value to a level


false

In [29]:
y[2] < CategoricalValue("Z", y) # but it is treated as a level, and thus only valid levels are allowed


LoadError: ArgumentError: level Z not found in source pool

In [30]:
levels!(y, ["C", "B", "A"]) # you can reorder levels, mostly useful for ordered CategoricalArrays


4-element CategoricalArray{String,1,UInt32}:
 "A"
 "B"
 "B"
 "C"

In [31]:
y[1] < y[2] # observe that the order is changed


false

In [32]:
levels!(z, ["A", "B"]) # you have to specify all levels that are present

LoadError: ArgumentError: cannot remove level "C" as it is used at position 4 and allowmissing=false.

In [33]:
levels!(z, ["A", "B"], allowmissing=true) # unless the underlying array allows for missings and you force removal of levels with allowmissing=true

5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "A"
 "B"
 "B"
 missing
 missing

In [34]:
z[1] = "B"
z # now z has only "B" entries

5-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "B"
 "B"
 "B"
 missing
 missing

In [35]:
levels(z) # but it remembers the levels it had (the reason is mostly performance)

2-element Vector{String}:
 "A"
 "B"

In [36]:
droplevels!(z) # this way we can clean it up
levels(z)

1-element Vector{String}:
 "B"

## Data manipulation

In [37]:
x, levels(x)

(CategoricalValue{String, UInt32}["A", "B", "B", "C"], ["A", "B", "C"])

In [38]:
x[2] = "0"
x, levels(x) # new level added at the end (works only for unordered)

(CategoricalValue{String, UInt32}["A", "0", "B", "C"], ["A", "B", "C", "0"])

In [39]:
v, levels(v)

(CategoricalValue{Int64, UInt32}[1, 2, 2, 3, 3], [1, 2, 3])

In [40]:
v[1] + v[2] # even though the underlying data is Int, we cannot operate on it

LoadError: MethodError: no method matching +(::CategoricalValue{Int64, UInt32}, ::CategoricalValue{Int64, UInt32})
Closest candidates are:
  +(::Any, ::Any, ::Any, ::Any...) at operators.jl:560

In [41]:
Vector{Int}(v) # you have either to retrieve the data by conversion (may be expensive)

5-element Vector{Int64}:
 1
 2
 2
 3
 3

In [42]:
unwrap(v[1]) + unwrap(v[2]) # or get a single value

3

In [43]:
unwrap.(v) # this will work for arrays without missings

5-element Vector{Int64}:
 1
 2
 2
 3
 3

In [44]:
unwrap.(z) # also works on missing values

5-element Vector{Union{Missing, String}}:
 "B"
 "B"
 "B"
 missing
 missing

In [45]:
Vector{Union{String, Missing}}(z) # or do the conversion


5-element Vector{Union{Missing, String}}:
 "B"
 "B"
 "B"
 missing
 missing

In [46]:
recode([1,2,3,4,5,missing], 1=>10) # recode some values in an array; there is also an in-place recode! equivalent

6-element Vector{Union{Missing, Int64}}:
 10
  2
  3
  4
  5
   missing

In [47]:
recode([1,2,3,4,5,missing], "a", 1=>10, 2=>20) # here we provided a default value for not mapped recodings

6-element Vector{Union{Missing, Int64, String}}:
 10
 20
   "a"
   "a"
   "a"
   missing

In [48]:
recode([1,2,3,4,5,missing], 1=>10, missing=>"missing") # to recode Missing you have to do it explicitly

6-element Vector{Union{Int64, String}}:
 10
  2
  3
  4
  5
   "missing"

In [49]:
t = categorical([1:5; missing])
t, levels(t)

(Union{Missing, CategoricalValue{Int64, UInt32}}[1, 2, 3, 4, 5, missing], [1, 2, 3, 4, 5])

In [50]:
recode!(t, [1,3]=>2)
t, levels(t) # note that the levels are dropped after recode

(Union{Missing, CategoricalValue{Int64, UInt32}}[2, 2, 2, 4, 5, missing], [2, 4, 5])

In [51]:
t = categorical([1,2,3], ordered=true)
levels(recode(t, 2=>0, 1=>-1)) # and if you introduce new levels, they are added at the end in the order of appearance

3-element Vector{Int64}:
  3
  0
 -1

In [52]:
t = categorical([1,2,3,4,5], ordered=true) # when using default it becomes the last level
levels(recode(t, 300, [1,2]=>100, 3=>200))

3-element Vector{Int64}:
 100
 200
 300

## Comparisons

In [53]:
x = categorical([1,2,3])
xs = [x, categorical(x), categorical(x, ordered=true), categorical(x, ordered=true)]
levels!(xs[2], [3,2,1])
levels!(xs[4], [2,3,1])
[a == b for a in xs, b in xs] # all are equal - comparison only by contents


4×4 Matrix{Bool}:
 1  1  1  1
 1  1  1  1
 1  1  1  1
 1  1  1  1

In [54]:
# The full "signature" of a CategoricalArray is the triple of its contents,
# its levels (in their current order), and its ordered flag.
function signature(x::CategoricalArray)
    return (x, levels(x), isordered(x))
end
# All four signatures differ; notice that xs[1] and xs[2] are both unordered
# but have a different order of levels.
[signature(lhs) == signature(rhs) for lhs in xs, rhs in xs]

4×4 Matrix{Bool}:
 1  0  0  0
 0  1  0  0
 0  0  1  0
 0  0  0  1

In [55]:
x[1] < x[2] # you cannot compare elements of unordered CategoricalArray

LoadError: ArgumentError: Unordered CategoricalValue objects cannot be tested for order using <. Use isless instead, or call the ordered! function on the parent array to change this

In [56]:
t[1] < t[2] # but you can do it for an ordered one

true

In [57]:
isless(x[1], x[2]) # isless works within the same CategoricalArray even if it is not ordered

true

In [58]:
y = deepcopy(x) # this also works across categorical arrays here, since y has the same levels as x
isless(x[1], y[2])

true

In [60]:
isless(unwrap(x[1]), unwrap(y[2])) # you can use unwrap to compare the underlying contents of CategoricalArrays

true

In [61]:
x[1] == y[2] # equality tests work OK across CategoricalArrays

false

## Categorical columns in a DataFrame

In [62]:
df = DataFrame(x = 1:3, y = 'a':'c', z = ["a","b","c"])

Unnamed: 0_level_0,x,y,z
Unnamed: 0_level_1,Int64,Char,String
1,1,a,a
2,2,b,b
3,3,c,c


In [63]:
# Convert all String columns to categorical in-place

transform!(df, names(df, String) => categorical, renamecols=false)

Unnamed: 0_level_0,x,y,z
Unnamed: 0_level_1,Int64,Char,Cat…
1,1,a,a
2,2,b,b
3,3,c,c


In [64]:
describe(df)

Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,x,2.0,1,2.0,3,0,Int64
2,y,,a,,c,0,Char
3,z,,a,,c,0,"CategoricalValue{String, UInt32}"
