From f0fbf361a59fbbd5671c289cf4b6a6ffc711e068 Mon Sep 17 00:00:00 2001 From: Julius Martensen Date: Mon, 10 Feb 2020 19:58:07 +0100 Subject: [PATCH 1/8] Add subsampling and burst_sampling with indices --- src/utils.jl | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 20 +++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/src/utils.jl b/src/utils.jl index 3f28e703c..11cb5d84b 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -100,3 +100,55 @@ function optimal_shrinkage!(X::AbstractArray{T, 2}) where T <: Number X .= U*Diagonal(S)*V' return end + +function hankel(a::AbstractArray, b::AbstractArray) where T <: Number + p = vcat([a; b[2:end]]) + n = length(b)-1 + m = length(p)-n+1 + H = Array{eltype(p)}(undef, n, m) + @inbounds for i in 1:n + for j in 1:m + H[i,j] = p[i+j-1] + end + end + return H +end + +@inline function burst_sampling(x::AbstractArray, samplesize::Int64, bursts::Int64) + @assert size(x)[end] >= samplesize*bursts "Length of data array too small for subsampling of size $size!" + inds = collect(rand(1:size(x)[end]-samplesize, bursts)) # Starting points + inds = sort(unique(vcat([collect(i:i+samplesize) for i in inds]...))) + return resample(x, inds) +end + + +@inline function burst_sampling(x::AbstractArray, y::AbstractArray, samplesize::Int64, bursts::Int64) + @assert size(x)[end] >= samplesize*bursts "Length of data array too small for subsampling of size $size!" + @assert size(x)[end] == size(y)[end] + inds = collect(rand(1:size(x)[end]-samplesize, bursts)) # Starting points + inds = sort(unique(vcat([collect(i:i+samplesize) for i in inds]...))) + return resample(x, inds), resample(y, inds) +end + +@inline function subsample(x::AbstractVector, frequency::Int64) + @assert frequency > 1 + return x[1:frequency:end] +end + + +@inline function subsample(x::AbstractArray, frequency::Int64) + @assert frequency > 1 + return x[:, 1:frequency:end] +end + +@inline function resample(x::AbstractArray{T,1}, indx::AbstractArray{Int64}) where T <: Number + @assert maximum(indx) <= length(x) + @assert minimum(indx) >= 1 + return x[indx] +end + +@inline function resample(x::AbstractArray{T,2}, indx::AbstractArray{Int64}) where T <: Number + @assert maximum(indx) <= size(x, 2) + @assert minimum(indx) >= 1 + return x[:, indx] +end diff --git a/test/runtests.jl b/test/runtests.jl index 66f8ca48b..e10f9c3cc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -291,4 +291,24 @@ end @test BIC(k, X, Y) == -2*log(sum(abs2, X -Y)) + k*log(size(X)[2]) @test AICC(k, X, Y, likelyhood = (X,Y)->sum(abs, X-Y)) == AIC(k, X, Y, likelyhood = (X,Y)->sum(abs, X-Y))+ 2*(k+1)*(k+2)/(size(X)[2]-k-2) + + # Sampling + X = randn(Float64, 2, 100) + t = collect(0:0.1:9.99) + Y = randn(size(X)) + xt = burst_sampling(X, 5, 10) + @test 10 <= size(xt)[end] <= 60 + @test all([any(xi .≈ X) for xi in eachcol(xt)]) + xt, tt = burst_sampling(X, t, 5, 10) + @test all(diff(tt) .> 0.0) + @test size(xt)[end] == size(tt)[end] + @test all([any(xi .≈ X) for xi in eachcol(xt)]) + @test !all([any(xi .≈ Y) for xi in eachcol(xt)]) + X2n = subsample(X, 2) + t2n = subsample(t, 2) + @test size(X2n)[end] == size(t2n)[end] + @test size(X2n)[end] == Int(round(size(X)[end]/2)) + @test X2n[:, 1] == X[:, 1] + @test X2n[:, end] == X[:, end-1] + @test all([any(xi .≈ X) for xi in eachcol(X2n)]) end From 724bb99d3fb3956a0b1d06501adc3999f1b5566b Mon Sep 17 00:00:00 2001 From: Julius Martensen Date: Mon, 10 Feb 2020 21:31:48 +0100 Subject: [PATCH 2/8] Added burst sampling with time period --- src/utils.jl | 26 ++++++++++++++++++++++++-- test/runtests.jl | 4 ++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 11cb5d84b..719d73ee5 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,3 +1,5 @@ +import StatsBase: sample + # Model selection # Taken from https://royalsocietypublishing.org/doi/pdf/10.1098/rspa.2017.0009 @@ -116,7 +118,7 @@ end @inline function burst_sampling(x::AbstractArray, samplesize::Int64, bursts::Int64) @assert size(x)[end] >= samplesize*bursts "Length of data array too small for subsampling of size $size!" - inds = collect(rand(1:size(x)[end]-samplesize, bursts)) # Starting points + inds = sample(1:size(x)[end]-samplesize, bursts, replace = false) # Starting points inds = sort(unique(vcat([collect(i:i+samplesize) for i in inds]...))) return resample(x, inds) end @@ -125,11 +127,31 @@ end @inline function burst_sampling(x::AbstractArray, y::AbstractArray, samplesize::Int64, bursts::Int64) @assert size(x)[end] >= samplesize*bursts "Length of data array too small for subsampling of size $size!" @assert size(x)[end] == size(y)[end] - inds = collect(rand(1:size(x)[end]-samplesize, bursts)) # Starting points + inds = sample(1:size(x)[end]-samplesize, bursts, replace = false) # Starting points inds = sort(unique(vcat([collect(i:i+samplesize) for i in inds]...))) return resample(x, inds), resample(y, inds) end + +function burst_sampling(x::AbstractArray, t::AbstractVector, period::T, bursts::Int64) where T <: AbstractFloat + @assert size(x)[end] == size(t)[end] + @assert bursts >= 1 + @assert t[end] >= period*bursts "Length of data array too small for subsampling of size $size !" + tstop = findlast((t[end] .- t) .>= period) + @assert bursts < tstop "Bursting impossible. Please provide more data or reduce bursts." + + inds = collect(sample(1:tstop-1, bursts, replace = false)) # Starting points + idx = Int64[] + for i in inds + _stop = findlast(t[i:tstop-1] .- t[i] .<= period) + isnothing(_stop) ? _stop = 1 : nothing + push!(idx, collect(i:i+_stop-1)...) + end + sort!(unique!(idx)) + return resample(x, idx), resample(t, idx) +end + + @inline function subsample(x::AbstractVector, frequency::Int64) @assert frequency > 1 return x[1:frequency:end] diff --git a/test/runtests.jl b/test/runtests.jl index e10f9c3cc..e4635e4cf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -304,6 +304,10 @@ end @test size(xt)[end] == size(tt)[end] @test all([any(xi .≈ X) for xi in eachcol(xt)]) @test !all([any(xi .≈ Y) for xi in eachcol(xt)]) + xs, ts = burst_sampling(X, t, 2.0, 1) + @test all([any(xi .≈ X) for xi in eachcol(xs)]) + @test size(xs)[end] == size(ts)[end] + @test ts[end]-ts[1] == 2.0 X2n = subsample(X, 2) t2n = subsample(t, 2) @test size(X2n)[end] == size(t2n)[end] From 74e48dbc6b7ea010275a4fc1c98459294bc424d5 Mon Sep 17 00:00:00 2001 From: Julius Martensen Date: Mon, 10 Feb 2020 21:32:03 +0100 Subject: [PATCH 3/8] Update MTK and add StatsBase --- Project.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f02b74e14..2107dc6d7 100644 --- a/Project.toml +++ b/Project.toml @@ -10,13 +10,15 @@ ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78" ProximalOperators = "a725b495-10eb-56fe-b38b-717eba820537" QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] Compat = "2.2, 3.0" -ModelingToolkit = "1.1.3" +ModelingToolkit = "1.2.5" ProximalOperators = "0.10" QuadGK = "2.3.1" +StatsBase = "0.32.0" julia = "1" [extras] From 94300e3b8fab46c7591f83ab05f1c6a7fa6db84e Mon Sep 17 00:00:00 2001 From: Julius Martensen Date: Mon, 10 Feb 2020 21:40:12 +0100 Subject: [PATCH 4/8] Export functions and correct assert msg --- src/DataDrivenDiffEq.jl | 4 +++- src/utils.jl | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/DataDrivenDiffEq.jl b/src/DataDrivenDiffEq.jl index a7d29be0d..f0b67ceca 100644 --- a/src/DataDrivenDiffEq.jl +++ b/src/DataDrivenDiffEq.jl @@ -2,7 +2,8 @@ module DataDrivenDiffEq using LinearAlgebra using ModelingToolkit -using QuadGK, Statistics +using QuadGK +using Statistics using Compat abstract type abstractBasis end; @@ -46,5 +47,6 @@ export ISInDy include("./utils.jl") export AIC, AICC, BIC export hankel, optimal_shrinkage, optimal_shrinkage! +export burst_sampling, subsample end # module diff --git a/src/utils.jl b/src/utils.jl index 719d73ee5..1bcdfde04 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -133,10 +133,10 @@ end end -function burst_sampling(x::AbstractArray, t::AbstractVector, period::T, bursts::Int64) where T <: AbstractFloat +@inline function burst_sampling(x::AbstractArray, t::AbstractVector, period::T, bursts::Int64) where T <: AbstractFloat @assert size(x)[end] == size(t)[end] @assert bursts >= 1 - @assert t[end] >= period*bursts "Length of data array too small for subsampling of size $size !" + @assert t[end]-t[1]>= period*bursts "Bursting impossible. Please provide more data or reduce bursts." tstop = findlast((t[end] .- t) .>= period) @assert bursts < tstop "Bursting impossible. Please provide more data or reduce bursts." From 16a2be3f2d58b5cc2d961bd14031ea71ecf6c09b Mon Sep 17 00:00:00 2001 From: Julius Martensen Date: Tue, 11 Feb 2020 15:46:56 +0100 Subject: [PATCH 5/8] Corrected time based bursting --- src/utils.jl | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 1bcdfde04..7311efd69 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -137,18 +137,11 @@ end @assert size(x)[end] == size(t)[end] @assert bursts >= 1 @assert t[end]-t[1]>= period*bursts "Bursting impossible. Please provide more data or reduce bursts." - tstop = findlast((t[end] .- t) .>= period) - @assert bursts < tstop "Bursting impossible. Please provide more data or reduce bursts." - - inds = collect(sample(1:tstop-1, bursts, replace = false)) # Starting points - idx = Int64[] - for i in inds - _stop = findlast(t[i:tstop-1] .- t[i] .<= period) - isnothing(_stop) ? _stop = 1 : nothing - push!(idx, collect(i:i+_stop-1)...) - end - sort!(unique!(idx)) - return resample(x, idx), resample(t, idx) + t_ids = zero(eltype(t)) .<= t .- period .<= t[end] .- 2*period + samplesize = Int64(round(period/(t[end]-t[1])*length(t))) + inds = sample(t_ids, bursts, replace = false) + inds = sort(unique(vcat([collect(i:i+samplesize) for i in inds]...))) + return resample(x, inds), resample(t, inds) end From b200b808dfed625fcf03b38c3b1eaac818850551 Mon Sep 17 00:00:00 2001 From: Julius Martensen Date: Tue, 11 Feb 2020 15:57:09 +0100 Subject: [PATCH 6/8] Added subsampling with time series --- src/utils.jl | 16 ++++++++++++++++ test/runtests.jl | 5 +++++ 2 files changed, 21 insertions(+) diff --git a/src/utils.jl b/src/utils.jl index 7311efd69..7766e0871 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -134,6 +134,7 @@ end @inline function burst_sampling(x::AbstractArray, t::AbstractVector, period::T, bursts::Int64) where T <: AbstractFloat + @assert period > zero(typeof(period)) @assert size(x)[end] == size(t)[end] @assert bursts >= 1 @assert t[end]-t[1]>= period*bursts "Bursting impossible. Please provide more data or reduce bursts." @@ -156,6 +157,21 @@ end return x[:, 1:frequency:end] end +@inline function subsample(x::AbstractArray, t::AbstractVector, period::T) where T <: AbstractFloat + @assert period > zero(typeof(period)) + @assert size(x)[end] == size(t)[end] + @assert t[end]-t[1]>= period "Subsampling impossible. Time window too short.." + idx = Int64[1] + t_now = t[1] + @inbounds for (i, t_current) in enumerate(t) + if t_current - t_now >= period + push!(idx, i) + t_now = t_current + end + end + return resample(x, idx), resample(t, idx) +end + @inline function resample(x::AbstractArray{T,1}, indx::AbstractArray{Int64}) where T <: Number @assert maximum(indx) <= length(x) @assert minimum(indx) >= 1 diff --git a/test/runtests.jl b/test/runtests.jl index e4635e4cf..7eb9b6a42 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -315,4 +315,9 @@ end @test X2n[:, 1] == X[:, 1] @test X2n[:, end] == X[:, end-1] @test all([any(xi .≈ X) for xi in eachcol(X2n)]) + xs, ts = subsample(X, t, 0.5) + @test size(xs)[end] == size(ts)[end] + @test size(xs)[1] == size(X)[1] + @test all(diff(ts) .≈ 0.5) + end From 37ce43c5859121d884afe20d7e6d71c88117e948 Mon Sep 17 00:00:00 2001 From: Julius Martensen Date: Tue, 11 Feb 2020 20:13:14 +0100 Subject: [PATCH 7/8] Correct error in sampling ids from burst sampling --- src/utils.jl | 32 ++++++++++---------------------- test/runtests.jl | 6 +++++- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/src/utils.jl b/src/utils.jl index 7766e0871..b5c9240f7 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -103,22 +103,10 @@ function optimal_shrinkage!(X::AbstractArray{T, 2}) where T <: Number return end -function hankel(a::AbstractArray, b::AbstractArray) where T <: Number - p = vcat([a; b[2:end]]) - n = length(b)-1 - m = length(p)-n+1 - H = Array{eltype(p)}(undef, n, m) - @inbounds for i in 1:n - for j in 1:m - H[i,j] = p[i+j-1] - end - end - return H -end @inline function burst_sampling(x::AbstractArray, samplesize::Int64, bursts::Int64) @assert size(x)[end] >= samplesize*bursts "Length of data array too small for subsampling of size $size!" - inds = sample(1:size(x)[end]-samplesize, bursts, replace = false) # Starting points + inds = sample(1:size(x)[end]-samplesize, bursts, replace = false) inds = sort(unique(vcat([collect(i:i+samplesize) for i in inds]...))) return resample(x, inds) end @@ -127,20 +115,20 @@ end @inline function burst_sampling(x::AbstractArray, y::AbstractArray, samplesize::Int64, bursts::Int64) @assert size(x)[end] >= samplesize*bursts "Length of data array too small for subsampling of size $size!" @assert size(x)[end] == size(y)[end] - inds = sample(1:size(x)[end]-samplesize, bursts, replace = false) # Starting points + inds = sample(1:size(x)[end]-samplesize, bursts, replace = false) inds = sort(unique(vcat([collect(i:i+samplesize) for i in inds]...))) return resample(x, inds), resample(y, inds) end @inline function burst_sampling(x::AbstractArray, t::AbstractVector, period::T, bursts::Int64) where T <: AbstractFloat - @assert period > zero(typeof(period)) - @assert size(x)[end] == size(t)[end] - @assert bursts >= 1 + @assert period > zero(typeof(period)) "Sampling period has to be positive." + @assert size(x)[end] == size(t)[end] "Provide consistent data." + @assert bursts >= 1 "Number of bursts has to be positive." @assert t[end]-t[1]>= period*bursts "Bursting impossible. Please provide more data or reduce bursts." t_ids = zero(eltype(t)) .<= t .- period .<= t[end] .- 2*period - samplesize = Int64(round(period/(t[end]-t[1])*length(t))) - inds = sample(t_ids, bursts, replace = false) + samplesize = Int64(floor(period/(t[end]-t[1])*length(t))) + inds = sample(collect(1:length(t))[t_ids], bursts, replace = false) inds = sort(unique(vcat([collect(i:i+samplesize) for i in inds]...))) return resample(x, inds), resample(t, inds) end @@ -158,9 +146,9 @@ end end @inline function subsample(x::AbstractArray, t::AbstractVector, period::T) where T <: AbstractFloat - @assert period > zero(typeof(period)) - @assert size(x)[end] == size(t)[end] - @assert t[end]-t[1]>= period "Subsampling impossible. Time window too short.." + @assert period > zero(typeof(period)) "Sampling period has to be positive." + @assert size(x)[end] == size(t)[end] "Provide consistent data." + @assert t[end]-t[1]>= period "Subsampling impossible. Sampling period exceeds time window." idx = Int64[1] t_now = t[1] @inbounds for (i, t_current) in enumerate(t) diff --git a/test/runtests.jl b/test/runtests.jl index 7eb9b6a42..8c4055fcc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -319,5 +319,9 @@ end @test size(xs)[end] == size(ts)[end] @test size(xs)[1] == size(X)[1] @test all(diff(ts) .≈ 0.5) - + # Loop this a few times to be sure its right + @test_nowarn for i in 1:20 + xs, ts = burst_sampling(X, t, 2.0, 1) + xs, ts = subsample(X, t, 0.5) + end end From 83703ea64d0c1083ab847ac5c34ea877017f6992 Mon Sep 17 00:00:00 2001 From: Julius Martensen Date: Tue, 11 Feb 2020 20:28:03 +0100 Subject: [PATCH 8/8] Getting the tests green --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 8c4055fcc..7f353924f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -307,7 +307,7 @@ end xs, ts = burst_sampling(X, t, 2.0, 1) @test all([any(xi .≈ X) for xi in eachcol(xs)]) @test size(xs)[end] == size(ts)[end] - @test ts[end]-ts[1] == 2.0 + @test ts[end]-ts[1] ≈ 2.0 X2n = subsample(X, 2) t2n = subsample(t, 2) @test size(X2n)[end] == size(t2n)[end]