Merge pull request #45 from JuliaDiffEq/sampling

Sampling Methods for Data
SciML · Feb 11, 2020 · b4fa3f6 · b4fa3f6
2 parents de617a8 + 83703ea
commit b4fa3f6
Show file tree

Hide file tree

Showing 4 changed files with 110 additions and 2 deletions.
diff --git a/Project.toml b/Project.toml
@@ -10,13 +10,15 @@ ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78"
 ProximalOperators = "a725b495-10eb-56fe-b38b-717eba820537"
 QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
 Compat = "2.2, 3.0"
-ModelingToolkit = "1.1.3"
+ModelingToolkit = "1.2.5"
 ProximalOperators = "0.10"
 QuadGK = "2.3.1"
+StatsBase = "0.32.0"
 julia = "1"
 
 [extras]

diff --git a/src/DataDrivenDiffEq.jl b/src/DataDrivenDiffEq.jl
@@ -2,7 +2,8 @@ module DataDrivenDiffEq
 
 using LinearAlgebra
 using ModelingToolkit
-using QuadGK, Statistics
+using QuadGK
+using Statistics
 using Compat
 
 abstract type abstractBasis end;
@@ -46,5 +47,6 @@ export ISInDy
 include("./utils.jl")
 export AIC, AICC, BIC
 export hankel, optimal_shrinkage, optimal_shrinkage!
+export burst_sampling, subsample
 
 end # module
diff --git a/src/utils.jl b/src/utils.jl
@@ -1,3 +1,5 @@
+import StatsBase: sample
+
 # Model selection
 
 # Taken from https://royalsocietypublishing.org/doi/pdf/10.1098/rspa.2017.0009
@@ -100,3 +102,72 @@ function optimal_shrinkage!(X::AbstractArray{T, 2}) where T <: Number
     X .= U*Diagonal(S)*V'
     return
 end
+
+
+"""
+    burst_sampling(x, samplesize, bursts)
+
+Select `bursts` randomly placed windows of consecutive samples from `x` and return
+them in their original order.
+
+Sampling acts on the last dimension of `x`. Window start indices are drawn without
+replacement from `1:size(x)[end]-samplesize`; a window starting at `i` covers the
+indices `i:i+samplesize`, i.e. `samplesize + 1` samples. Overlapping windows are
+merged, so the result may hold fewer than `bursts * (samplesize + 1)` samples.
+
+Throws an `AssertionError` if `bursts` is not positive or if `x` is too short.
+"""
+@inline function burst_sampling(x::AbstractArray, samplesize::Int64, bursts::Int64)
+    n = size(x)[end]
+    @assert bursts >= 1 "Number of bursts has to be positive."
+    @assert(n >= samplesize * bursts,
+            "Length of data array too small for subsampling of size $samplesize!")
+    starts = sample(1:n-samplesize, bursts, replace = false)
+    # Collect the indices of every window, then merge overlaps and restore order.
+    inds = Int64[]
+    for i in starts
+        append!(inds, i:i+samplesize)
+    end
+    sort!(unique!(inds))
+    return resample(x, inds)
+end
+
+
+"""
+    burst_sampling(x, y, samplesize, bursts)
+
+Select the same `bursts` randomly placed windows of consecutive samples from both `x`
+and `y` and return the two selections as a tuple, each in its original order.
+
+Sampling acts on the last dimension, which must have the same length for `x` and `y`.
+Window start indices are drawn without replacement from `1:size(x)[end]-samplesize`;
+a window starting at `i` covers the indices `i:i+samplesize`, i.e. `samplesize + 1`
+samples. Overlapping windows are merged.
+
+Throws an `AssertionError` if `bursts` is not positive, if `x` is too short or if the
+last dimensions of `x` and `y` differ.
+"""
+@inline function burst_sampling(x::AbstractArray, y::AbstractArray, samplesize::Int64, bursts::Int64)
+    n = size(x)[end]
+    @assert bursts >= 1 "Number of bursts has to be positive."
+    @assert(n >= samplesize * bursts,
+            "Length of data array too small for subsampling of size $samplesize!")
+    @assert n == size(y)[end] "Provide consistent data."
+    starts = sample(1:n-samplesize, bursts, replace = false)
+    # Collect the indices of every window, then merge overlaps and restore order.
+    inds = Int64[]
+    for i in starts
+        append!(inds, i:i+samplesize)
+    end
+    sort!(unique!(inds))
+    return resample(x, inds), resample(y, inds)
+end
+
+
+"""
+    burst_sampling(x, t, period, bursts)
+
+Select `bursts` randomly placed windows from the data `x` and the matching time
+stamps `t`, and return `(x_sampled, t_sampled)` in the original order.
+
+The number of samples per window is derived from `period` as
+`floor(period / (t[end] - t[1]) * length(t))`; a window starting at index `i` covers
+`i:i+samplesize`. Start indices are drawn without replacement from all `i` that
+satisfy `0 <= t[i] - period <= t[end] - 2 * period`. Overlapping windows are merged.
+
+Throws an `AssertionError` for a non-positive `period` or `bursts`, for inconsistent
+lengths of `x` and `t`, or when the time window is shorter than `period * bursts`.
+"""
+@inline function burst_sampling(x::AbstractArray, t::AbstractVector, period::T,
+                                bursts::Int64) where T <: AbstractFloat
+    @assert period > zero(T) "Sampling period has to be positive."
+    @assert size(x)[end] == size(t)[end] "Provide consistent data."
+    @assert bursts >= 1 "Number of bursts has to be positive."
+    span = t[end] - t[1]
+    @assert span >= period*bursts "Bursting impossible. Please provide more data or reduce bursts."
+    # NOTE(review): the lower bound is measured from zero, not from t[1] — confirm
+    # this is intended for time vectors that do not start at zero.
+    shifted = t .- period
+    latest = t[end] - 2*period
+    admissible = (zero(eltype(t)) .<= shifted) .& (shifted .<= latest)
+    samplesize = floor(Int64, period/span*length(t))
+    candidates = (1:length(t))[admissible]
+    starts = sample(candidates, bursts, replace = false)
+    # Collect the indices of every window, then merge overlaps and restore order.
+    inds = Int64[]
+    for i in starts
+        append!(inds, i:i+samplesize)
+    end
+    sort!(unique!(inds))
+    return resample(x, inds), resample(t, inds)
+end
+
+
+"""
+    subsample(x::AbstractVector, frequency)
+
+Return every `frequency`-th entry of `x`, starting with the entry at index 1.
+
+Throws an `AssertionError` unless `frequency > 1`.
+"""
+@inline function subsample(x::AbstractVector, frequency::Int64)
+    @assert frequency > 1
+    picked = 1:frequency:lastindex(x)
+    return x[picked]
+end
+
+
+"""
+    subsample(x::AbstractArray, frequency)
+
+Return every `frequency`-th column of `x`, starting with the first column.
+
+The array is indexed as `x[:, cols]`, i.e. with two indices.
+Throws an `AssertionError` unless `frequency > 1`.
+"""
+@inline function subsample(x::AbstractArray, frequency::Int64)
+    @assert frequency > 1
+    cols = 1:frequency:lastindex(x, 2)
+    return x[:, cols]
+end
+
+"""
+    subsample(x, t, period)
+
+Thin out the data `x` and its time stamps `t` so that consecutive retained samples
+are at least `period` apart in time, and return `(x_sampled, t_sampled)`.
+
+The first sample is always kept. Walking through `t` in order, a sample is kept as
+soon as its time stamp is at least `period` later than the last kept time stamp.
+
+Throws an `AssertionError` for a non-positive `period`, for inconsistent lengths of
+`x` and `t`, or when `period` exceeds the time window `t[end] - t[1]`.
+"""
+@inline function subsample(x::AbstractArray, t::AbstractVector, period::T) where T <: AbstractFloat
+    @assert period > zero(T) "Sampling period has to be positive."
+    @assert size(x)[end] == size(t)[end] "Provide consistent data."
+    @assert t[end]-t[1]>= period "Subsampling impossible. Sampling period exceeds time window."
+    kept = Int64[1]
+    t_ref = t[1]
+    for (k, tk) in zip(Iterators.countfrom(1), t)
+        # Skip samples that are still within `period` of the last kept one.
+        tk - t_ref >= period || continue
+        push!(kept, k)
+        t_ref = tk
+    end
+    return resample(x, kept), resample(t, kept)
+end
+
+"""
+    resample(x::AbstractVector, indx)
+
+Return the entries of the vector `x` at the positions listed in `indx`.
+
+An empty `indx` yields an empty selection. Throws an `AssertionError` if any index
+lies outside `1:length(x)`.
+"""
+@inline function resample(x::AbstractArray{T,1}, indx::AbstractArray{Int64}) where T <: Number
+    n = length(x)
+    # `all` is true for an empty `indx`, so no reduction over an empty set is needed.
+    @assert all(i -> 1 <= i <= n, indx) "Sample indices have to lie within 1:$n."
+    return x[indx]
+end
+
+"""
+    resample(x::AbstractMatrix, indx)
+
+Return the columns of the matrix `x` at the positions listed in `indx`.
+
+An empty `indx` yields a matrix with zero columns. Throws an `AssertionError` if any
+index lies outside `1:size(x, 2)`.
+"""
+@inline function resample(x::AbstractArray{T,2}, indx::AbstractArray{Int64}) where T <: Number
+    n = size(x, 2)
+    # `all` is true for an empty `indx`, so no reduction over an empty set is needed.
+    @assert all(i -> 1 <= i <= n, indx) "Sample indices have to lie within 1:$n."
+    return x[:, indx]
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -291,4 +291,37 @@ end
     @test BIC(k, X, Y) == -2*log(sum(abs2, X -Y)) + k*log(size(X)[2])
     @test AICC(k, X, Y, likelyhood = (X,Y)->sum(abs, X-Y)) == AIC(k, X, Y, likelyhood = (X,Y)->sum(abs, X-Y))+ 2*(k+1)*(k+2)/(size(X)[2]-k-2)
 
+
+    # Sampling
+    X = randn(Float64, 2, 100)
+    t = collect(0:0.1:9.99)
+    Y = randn(size(X))
+    xt = burst_sampling(X, 5, 10)
+    @test 10 <= size(xt)[end] <= 60
+    @test all([any(xi .≈ X) for xi in eachcol(xt)])
+    xt, tt = burst_sampling(X, t, 5, 10)
+    @test all(diff(tt) .> 0.0)
+    @test size(xt)[end] == size(tt)[end]
+    @test all([any(xi .≈ X) for xi in eachcol(xt)])
+    @test !all([any(xi .≈ Y) for xi in eachcol(xt)])
+    xs, ts = burst_sampling(X, t, 2.0, 1)
+    @test all([any(xi .≈ X) for xi in eachcol(xs)])
+    @test size(xs)[end] == size(ts)[end]
+    @test ts[end]-ts[1] ≈ 2.0
+    X2n = subsample(X, 2)
+    t2n = subsample(t, 2)
+    @test size(X2n)[end] == size(t2n)[end]
+    @test size(X2n)[end] == Int(round(size(X)[end]/2))
+    @test X2n[:, 1] == X[:, 1]
+    @test X2n[:, end] == X[:, end-1]
+    @test all([any(xi .≈ X) for xi in eachcol(X2n)])
+    xs, ts = subsample(X, t, 0.5)
+    @test size(xs)[end] == size(ts)[end]
+    @test size(xs)[1] == size(X)[1]
+    @test all(diff(ts) .≈ 0.5)
+    # Loop this a few times to be sure it's right
+    @test_nowarn for i in 1:20
+        xs, ts = burst_sampling(X, t, 2.0, 1)
+        xs, ts = subsample(X, t, 0.5)
+    end
 end