In [18]:
using DataFrames
using CSV
using Random

# Fetch the CSV file published at a fixed OpenML URL and return it as a DataFrame.
# (Presumably the Titanic passenger data, going by the function name -- the URL
# itself does not say.) Cells containing "?" are parsed as `missing`.
function download_titantic()
    source = "https://www.openml.org/data/get_csv/16826755/phpMYEkMl"
    # `download` saves the remote file locally and returns the path to it.
    localpath = download(source)
    table = CSV.File(localpath; missingstring = "?")
    return DataFrame(table)
end


download_titantic (generic function with 1 method)

In [19]:
# Build a 3-row data frame from keyword arguments: a range, random floats and
# random 3-character strings. The scalar `fixed=1` is repeated in every row.
df = DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3]), fixed=1)


Unnamed: 0_level_0,A,B,C,fixed
Unnamed: 0_level_1,Int64,Float64,String,Int64
1,1,0.672143,JkX,1
2,2,0.41687,imP,1
3,3,0.77911,fcy,1


In [20]:
# Build a data frame from a Dict that maps column names to column vectors.
# `Ref([1,1])` makes the whole vector a single value that is placed in every row
# of `fixed` (shown as "[1, 1]" in both rows below) instead of a column of Ints.
x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b'], "fixed" => Ref([1,1]))
df_dict = DataFrame(x)

Unnamed: 0_level_0,A,B,C,fixed
Unnamed: 0_level_1,Int64,Bool,Char,Array…
1,1,1,a,"[1, 1]"
2,2,0,b,"[1, 1]"


In [21]:
# A vector of column vectors; `:auto` generates the column names x1, x2, x3.
DataFrame([rand(3) for i in 1:3], :auto)

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64,Float64,Float64
1,0.365407,0.539256,0.530756
2,0.0223058,0.603033,0.20372
3,0.142739,0.686355,0.14341


In [22]:
# Same vector-of-columns layout, with the column names supplied explicitly.
DataFrame([rand(3) for i in 1:3], [:x1, :x2, :x3])

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Float64,Float64,Float64
1,0.562076,0.166719,0.997541
2,0.694819,0.992491,0.46887
3,0.216673,0.100662,0.289852


In [23]:

# A plain vector of Ints is not accepted as a table: this call throws the
# ArgumentError shown below.
DataFrame([1, 2, 3])

LoadError: ArgumentError: 'Vector{Int64}' iterates 'Int64' values, which doesn't satisfy the Tables.jl `AbstractRow` interface

In [24]:
# `permutedims` turns the vector into a 1×3 row matrix, which gives a single
# row with auto-generated column names.
DataFrame(permutedims([1, 2, 3]), :auto)

Unnamed: 0_level_0,x1,x2,x3
Unnamed: 0_level_1,Int64,Int64,Int64
1,1,2,3


In [25]:
# A vector of NamedTuples is read row by row; the field names become the
# column names.
v = [(a=1, b=2), (a=3, b=4)]
DataFrame(v)

Unnamed: 0_level_0,a,b
Unnamed: 0_level_1,Int64,Int64
1,1,2
2,3,4


In [26]:
# Typed empty vectors give a zero-row data frame whose column element types
# are already fixed (Int64, Float64, String).
DataFrame(A=Int[], B=Float64[], C=String[])

Unnamed: 0_level_0,A,B,C
Unnamed: 0_level_1,Int64,Float64,String


In [27]:
# Columns with different element types (Int64 and String) convert to a
# Matrix{Any}.
x = DataFrame(x=1:2, y=["A", "B"])
Matrix(x)


2×2 Matrix{Any}:
 1  "A"
 2  "B"

In [28]:
# `Array(x)` produces the same 2×2 Matrix{Any} as `Matrix(x)`.
Array(x)

2×2 Matrix{Any}:
 1  "A"
 2  "B"

In [29]:
# A `missing` entry makes the column's element type Union{Missing, String},
# which the display abbreviates as `String?`.
x = DataFrame(x=1:2, y=[missing,"B"])


Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String?
1,1,missing
2,2,B


### Iterating data frame by rows or columns
Sometimes it is useful to create a wrapper around a DataFrame that produces its rows or columns.

To iterate over columns, use the `eachcol` function.

In [30]:
# `ec` wraps the columns of `x`; it displays like the data frame itself.
ec = eachcol(x)


Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String?
1,1,missing
2,2,B


In [31]:
# The `eachcol` wrapper is not an AbstractVector (this evaluates to false).
ec isa AbstractVector


false

In [32]:
# A column can be fetched from the wrapper by position.
ec[1]


2-element Vector{Int64}:
 1
 2

In [33]:
# A column can also be fetched from the wrapper by name.
ec["x"]


2-element Vector{Int64}:
 1
 2

In [34]:
# `er` wraps the rows of `x`; it displays like the data frame itself.
er = eachrow(x)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String?
1,1,missing
2,2,B


In [35]:
# Unlike the `eachcol` wrapper, the `eachrow` wrapper is an AbstractVector
# (this evaluates to true).
er isa AbstractVector


true

In [36]:
# `end` indexing works on the row wrapper; this returns the last row of `x`.
er[end]


Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String?
2,2,B


In [37]:
# Iterator that yields each row of `x` as a NamedTuple with fields `x` and `y`.
nti = Tables.namedtupleiterator(x)

Tables.NamedTupleIterator{Tables.Schema{(:x, :y), Tuple{Int64, Union{Missing, String}}}, Tables.RowIterator{NamedTuple{(:x, :y), Tuple{Vector{Int64}, Vector{Union{Missing, String}}}}}}(Tables.RowIterator{NamedTuple{(:x, :y), Tuple{Vector{Int64}, Vector{Union{Missing, String}}}}}((x = [1, 2], y = Union{Missing, String}[missing, "B"]), 2))

In [38]:
# `enumerate` pairs every NamedTuple with its 1-based position, so `row` here
# is an (index, namedtuple) tuple rather than the bare row.
for row in enumerate(nti)
    @show row
end

row = (1, NamedTuple{(:x, :y), Tuple{Int64, Union{Missing, String}}}((1, missing)))
row = (2, NamedTuple{(:x, :y), Tuple{Int64, Union{Missing, String}}}((2, "B")))


In [39]:
# The NamedTuple iterator can be materialised back into a data frame.
DataFrame(nti)

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String?
1,1,missing
2,2,B


### Handling of duplicate column names
We can pass the `makeunique=true` keyword argument to allow duplicate column names; the duplicates are made unique automatically by appending a numeric suffix (e.g. `a_2`).

In [40]:
# With `makeunique=true` the second `:a` is renamed to `a_2`, because `a_1` is
# already taken by the third column.
df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true)

Unnamed: 0_level_0,a,a_2,a_1
Unnamed: 0_level_1,Int64,Int64,Int64
1,1,2,3


In [41]:
# Without `makeunique=true` the duplicate `:a` raises the ArgumentError shown
# below.
df = DataFrame(:a=>1, :a=>2, :a_1=>3)


LoadError: ArgumentError: Duplicate variable names: :a. Pass makeunique=true to make them unique using a suffix automatically.

In [None]:
# Mixing `nothing` with values gives Union element types for `x` and `y`;
# `z` contains a `missing` and is displayed as `String?`.
df = DataFrame(x=[1, nothing], y=[nothing, "a"], z=[missing, "c"])


Unnamed: 0_level_0,x,y,z
Unnamed: 0_level_1,Union…,Union…,String?
1,1.0,,missing
2,,a,c


In [None]:
# `empty(df)` keeps the column names and element types but has no rows.
empty(df)


Unnamed: 0_level_0,x,y,z
Unnamed: 0_level_1,Union…,Union…,String?


In [42]:
# Display `x` again: the 2-row frame whose `y` column holds a `missing`.
x

Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String?
1,1,missing
2,2,B


In [43]:
# Overall dimensions, then the row count and the column count separately.
size(x), size(x, 1), size(x, 2)


((2, 2), 2, 2)

In [44]:
# Row and column counts through the dedicated helper functions.
nrow(x), ncol(x)


(2, 2)

In [45]:
# Per-column summary: mean, min, median, max, number of missing values and
# element type.
describe(x)


Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,x,1.5,1,1.5,2,0,Int64
2,y,,B,,B,1,"Union{Missing, String}"


In [46]:
# Column names as a vector of strings.
names(x)


2-element Vector{String}:
 "x"
 "y"

In [47]:
# Names of the columns whose element type is a subtype of String. The result is
# empty here: `x` holds Int64 and `y` is Union{Missing, String}, not String.
names(x, String)


String[]

In [48]:
# Column names as a vector of Symbols.
propertynames(x)


2-element Vector{Symbol}:
 :x
 :y

In [51]:
# Asking for more rows than exist simply returns all of them (2 rows here).
first(x, 5)


Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String?
1,1,missing
2,2,B


In [52]:
# Asking for more trailing rows than exist returns all of them (2 rows here).
last(x, 3)


Unnamed: 0_level_0,x,y
Unnamed: 0_level_1,Int64,String?
1,1,missing
2,2,B


## Load and save DataFrames


In [56]:
using Arrow
using CSV
using Serialization
using JLSO
using JSONTables
using CodecZlib
using ZipFile
using JDF
using StatsPlots # for charts
using Mmap # memory-mapped file I/O (used in the compression examples)

LoadError: ArgumentError: Package Arrow not found in current path:
- Run `import Pkg; Pkg.add("Arrow")` to install the Arrow package.


In [57]:
# Install the Arrow package, as suggested by the "Package Arrow not found"
# error raised by the previous cell.
using Pkg; Pkg.add("Arrow")

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General`
┌ Error: curl_easy_setopt: 48
└ @ Downloads.Curl /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.6/Downloads/src/Curl/utils.jl:36
┌ Error: curl_easy_setopt: 48
└ @ Downloads.Curl /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.6/Downloads/src/Curl/utils.jl:36
┌ Error: curl_easy_setopt: 48
└ @ Downloads.Curl /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.6/Downloads/src/Curl/utils.jl:36
┌ Error: curl_easy_setopt: 48
└ @ Downloads.Curl /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.6/Downloads/src/Curl/utils.jl:36
┌ Error: curl_easy_setopt: 48
└ @ Downloads.Curl /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.6/Downloads/src/Curl/utils.jl:36
┌ Error: curl_easy_setopt: 48
└ @ Downloads.Curl /Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.6/Dow

LoadError: "/var/folders/7v/qz7vjqss5556pm4pc6hg3r8c0000gn/T/jl_WkN4Je/Registry.toml": No such file