# Handling missing values
The singleton type `Missing` (whose only instance is `missing`) allows us to deal with missing values.

In [40]:
using DataFrames

In [None]:
# Show the `missing` value alongside its type.
missing, typeof(missing)

(missing, Missing)

In [None]:
# A vector literal that mixes integers with `missing`; the output below shows
# that its element type becomes a Union with Missing.
x = [1, 2, missing, 3]

4-element Vector{Union{Missing, Int64}}:
 1
 2
  missing
 3

In [None]:
# `ismissing` tests a single value (the vector `x` as a whole is not missing);
# the dotted form `ismissing.(x)` tests each element of `x`.
ismissing(1), ismissing(missing), ismissing(x), ismissing.(x)

(false, true, false, Bool[0, 0, 1, 0])

We can extract the type that is combined with `Missing` in a `Union` via `nonmissingtype`.

In [None]:
# Element type of `x`, followed by that type with `Missing` stripped from it.
eltype(x), nonmissingtype(eltype(x))

(Union{Missing, Int64}, Int64)

In [None]:
# ==, != and < evaluate to `missing` (not a Bool) when both operands are `missing`.
missing == missing, missing != missing, missing < missing

(missing, missing, missing)

In [None]:
# Comparing a number with `missing` via ==, != or < evaluates to `missing`.
1 == missing, 1 != missing, 1 < missing

(missing, missing, missing)

In [None]:
# isequal, isless, and === always produce results of type Bool, even when an
# argument is missing. Notice that missing is considered greater than any
# numeric value, which is why isless(1, missing) is true.
isequal(missing, missing), missing === missing, isequal(1, missing), isless(1, missing)

(true, true, false, true)

In [None]:
# Call several one-argument functions on `missing`; as the output shows,
# every call returns `missing`.
map(x -> x(missing), [sin, cos, zero, sqrt]) # part 1

4-element Vector{Missing}:
 missing
 missing
 missing
 missing

In [None]:
# Call several two-argument operators with `missing` as the first operand and
# 1 as the second; as the output shows, every call returns `missing`.
map(x -> x(missing, 1), [+, - , *, /, div]) # part 2

5-element Vector{Missing}:
 missing
 missing
 missing
 missing
 missing

In [32]:
using Statistics # needed for mean
# Apply each function to a vector that contains `missing`. Per the output,
# minimum, maximum, extrema and mean all yield missing results, while `float`
# converts the numbers element-wise and keeps `missing` in place.
map(x -> x([1,2,missing]), [minimum, maximum, extrema, mean, float]) # part 3

5-element Vector{Any}:
 missing
 missing
 (missing, missing)
 missing
 Union{Missing, Float64}[1.0, 2.0, missing]

In [33]:
# `skipmissing` iterates over the non-missing elements only; `collect` gathers
# them into a vector whose element type no longer includes Missing (see output).
collect(skipmissing([1, missing, 2, missing]))

2-element Vector{Int64}:
 1
 2

In [34]:
# Here we use replace to create a new array in which every missing value is
# replaced by some other value (NaN in this case).
replace([1.0, missing, 2.0, missing], missing=>NaN)

4-element Vector{Float64}:
   1.0
 NaN
   2.0
 NaN

In [35]:
# Another way to do this: broadcast coalesce over the vector, so each element
# that is missing is substituted by the fallback value NaN.
coalesce.([1.0, missing, 2.0, missing], NaN)

4-element Vector{Float64}:
   1.0
 NaN
   2.0
 NaN

In [41]:
# You can also use recode from CategoricalArrays.jl if you have a default output value.
# Here `false` is the default result and the pair `missing=>true` overrides it
# for missing entries, which gives the Bool vector shown in the output.

using CategoricalArrays
recode([1.0, missing, 2.0, missing], false, missing=>true)


4-element Vector{Bool}:
 0
 1
 0
 1

In [42]:
# A data frame whose two columns each contain a missing value; the output
# marks such columns with a trailing `?` on the element type (Int64?, String?).
df = DataFrame(a=[1,2,missing], b=["a", "b", missing])

Unnamed: 0_level_0,a,b
Unnamed: 0_level_1,Int64?,String?
1,1,a
2,2,b
3,missing,missing


In [43]:
# Replace missing entries of the vector returned by `df.a` in place with 100.
# As the output shows, the element type still allows Missing afterwards; only
# the values change.
replace!(df.a, missing=>100)


3-element Vector{Union{Missing, Int64}}:
   1
   2
 100

In [44]:
# Broadcast coalesce over column :b and assign the result back to the column.
# Because the replacement 100 is an integer while the other entries are strings,
# the resulting vector has element type Any (see output).
df.b = coalesce.(df.b, 100)


3-element Vector{Any}:
    "a"
    "b"
 100

In [45]:
# Compare the two results in the output: `unique` keeps `missing` as one of the
# distinct values, whereas `levels` leaves it out.
unique([1, missing, 2, missing]), levels([1, missing, 2, missing])


(Union{Missing, Int64}[1, missing, 2], [1, 2])

In [46]:
# Start from a plain integer vector, then use allowmissing to obtain a vector
# with the same values whose element type also accepts Missing (see output).
x = [1,2,3]
y = allowmissing(x)

3-element Vector{Union{Missing, Int64}}:
 1
 2
 3

In [47]:
# disallowmissing goes the other way: `z` has the same values as `y` but an
# element type without Missing. Show all three vectors side by side.
z = disallowmissing(y)
x,y,z

([1, 2, 3], Union{Missing, Int64}[1, 2, 3], [1, 2, 3])

In [48]:
# allowmissing also accepts a whole data frame: here every column of a 2x3
# frame of ones (auto-named x1, x2, x3) is made to accept missing values.
df = allowmissing(DataFrame(ones(2,3), :auto))


Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64?,Float64?,Float64?
1,1.0,1.0,1.0
2,1.0,1.0,1.0


In [49]:
# Store `missing` in row 1 of the first column; this is possible because the
# column's element type allows Missing.
df[1,1] = missing


missing

In [50]:
# Display the data frame; row 1 of column x1 now holds `missing`.
df


Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64?,Float64?,Float64?
1,missing,1.0,1.0
2,1.0,1.0,1.0


In [51]:
# Column :x1 contains a missing value, so it cannot be converted to a type
# without Missing; the call raises the ArgumentError shown in the output.
disallowmissing(df) # an error is thrown

LoadError: ArgumentError: Missing value found in column :x1 in row 1

In [52]:
# With error=false no exception is raised: per the output, x2 and x3 are
# converted to Float64 while x1 keeps its Float64? type.
disallowmissing(df, error=false) # column :x1 is left untouched as it contains missing


Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64?,Float64,Float64
1,missing,1.0,1.0
2,1.0,1.0,1.0


In [53]:
# Build a 2x3 data frame of random Int values and print each column's element
# type before and after allowing missing values in selected columns. Columns
# can be selected by position (1) or by name (:x3); x2 is left unchanged.
x = DataFrame(rand(Int, 2,3), :auto)
println("Before: ", eltype.(eachcol(x)))
allowmissing!(x, 1) # make first column accept missings
allowmissing!(x, :x3) # make :x3 column accept missings
println("After: ", eltype.(eachcol(x)))

Before: DataType[Int64, Int64, Int64]
After: Type[Union{Missing, Int64}, Int64, Union{Missing, Int64}]


In [54]:
# Example data frame with one missing value per column: row 2 of :A and
# row 3 of :B.
x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"])


Unnamed: 0_level_0,A,B
Unnamed: 0_level_1,Int64?,String?
1,1,A
2,missing,B
3,3,missing
4,4,C


In [55]:
# completecases yields one Bool per row; per the output, rows 1 and 4 (the
# rows without any missing value) are flagged true.
println("Complete cases:\n", completecases(x))

Complete cases:
Bool[1, 0, 0, 1]


In [56]:
# dropmissing returns a data frame without the rows that contain missing
# values (kept in `y`); dropmissing! removes those rows from `x` itself.
# The displayed result is that of dropmissing!(x): the column types no longer
# carry the `?` marker, i.e. they no longer allow missing.
y = dropmissing(x)
dropmissing!(x)

Unnamed: 0_level_0,A,B
Unnamed: 0_level_1,Int64,String
1,1,A
2,4,C


In [57]:
# Per-column summary of `x`; the nmissing column reports 0 for both columns
# now that the rows with missing values have been dropped.
describe(x)

Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,A,2.5,1,2.5,4,0,Int64
2,B,,A,,C,0,String


In [58]:
# The disallowmissing=false keyword asks dropmissing! to drop rows with missing
# values without narrowing the column element types.
# NOTE(review): at this point `x` has already been through dropmissing!(x), so
# it has no missing rows and its columns are already Int64/String (see the
# output below) — the keyword has no visible effect here. Re-creating `x` with
# missing values before this call would presumably be needed to demonstrate
# it; confirm against the intended tutorial flow.
dropmissing!(x, disallowmissing=false)


Unnamed: 0_level_0,A,B
Unnamed: 0_level_1,Int64,String
1,1,A
2,4,C
