diff --git a/Project.toml b/Project.toml index 1bd335f19..b0e421f21 100644 --- a/Project.toml +++ b/Project.toml @@ -34,6 +34,7 @@ NLSolversBase = "7" NLopt = "0.6, 1" Optim = "1" PrettyTables = "2" +ProximalAlgorithms = "0.5" StatsBase = "0.33, 0.34" Symbolics = "4, 5, 6" SymbolicUtils = "1.4 - 1.5, 1.7, 2, 3" @@ -48,7 +49,7 @@ test = ["Test"] NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd" ProximalAlgorithms = "140ffc9f-1907-541a-a177-7475e0a401e9" ProximalCore = "dc4f5ac2-75d1-4f31-931e-60435d74994b" -ProximalOperators = "f3b72e0c-5f3e-4b3e-8f3e-3f4f3e3e3e3e" +ProximalOperators = "a725b495-10eb-56fe-b38b-717eba820537" [extensions] SEMNLOptExt = "NLopt" diff --git a/docs/src/developer/observed.md b/docs/src/developer/observed.md index 2b695e597..93eca6ed9 100644 --- a/docs/src/developer/observed.md +++ b/docs/src/developer/observed.md @@ -22,10 +22,10 @@ end To compute some fit indices, you need to provide methods for ```julia -# Number of observed datapoints -n_obs(observed::MyObserved) = ... -# Number of manifest variables -n_man(observed::MyObserved) = ... +# Number of samples (observations) in the dataset +nsamples(observed::MyObserved) = ... +# Number of observed variables +nobserved_vars(observed::MyObserved) = ... ``` As always, you can add additional methods for properties that imply types and loss function want to access, for example (from the `SemObservedCommon` implementation): diff --git a/docs/src/tutorials/inspection/inspection.md b/docs/src/tutorials/inspection/inspection.md index b2eefadb2..88caf5812 100644 --- a/docs/src/tutorials/inspection/inspection.md +++ b/docs/src/tutorials/inspection/inspection.md @@ -1,7 +1,7 @@ # Model inspection ```@setup colored -using StructuralEquationModels +using StructuralEquationModels observed_vars = [:x1, :x2, :x3, :y1, :y2, :y3, :y4, :y5, :y6, :y7, :y8] latent_vars = [:ind60, :dem60, :dem65] @@ -32,7 +32,7 @@ end partable = ParameterTable( graph, - latent_vars = latent_vars, + latent_vars = latent_vars, observed_vars = observed_vars) data = example_data("political_democracy") @@ -128,8 +128,8 @@ BIC χ² df minus2ll -n_man -n_obs +nobserved_vars +nsamples nparams p_value RMSEA diff --git a/src/StructuralEquationModels.jl b/src/StructuralEquationModels.jl index 9e0fc3669..a6677a4ed 100644 --- a/src/StructuralEquationModels.jl +++ b/src/StructuralEquationModels.jl @@ -41,8 +41,9 @@ include("frontend/fit/summary.jl") include("frontend/pretty_printing.jl") # observed include("observed/abstract.jl") -include("observed/covariance.jl") include("observed/data.jl") +include("observed/covariance.jl") +include("observed/missing_pattern.jl") include("observed/missing.jl") include("observed/EM.jl") # constructor diff --git a/src/additional_functions/helper.jl b/src/additional_functions/helper.jl index be559b0d9..71b2559a8 100644 --- a/src/additional_functions/helper.jl +++ b/src/additional_functions/helper.jl @@ -98,11 +98,6 @@ function sparse_outer_mul!(C, A, B::Vector, ind) #computes A*S*B -> C, where ind end end -function cov_and_mean(rows; corrected = false) - obs_mean, obs_cov = StatsBase.mean_and_cov(reduce(hcat, rows), 2, corrected = corrected) - return obs_cov, vec(obs_mean) -end - # n²×(n(n+1)/2) matrix to transform a vector of lower # triangular entries into a vectorized form of a n×n symmetric matrix, # opposite of elimination_matrix() diff --git a/src/frontend/fit/fitmeasures/minus2ll.jl b/src/frontend/fit/fitmeasures/minus2ll.jl index 54a4ce12d..1cddee71d 100644 --- a/src/frontend/fit/fitmeasures/minus2ll.jl +++ b/src/frontend/fit/fitmeasures/minus2ll.jl @@ -31,74 +31,33 @@ minus2ll(minimum::Number, obs, imp::Union{RAM, RAMSymbolic}, loss_ml::SemWLS) = # compute likelihood for missing data - H0 ------------------------------------------------- # -2ll = (∑ log(2π)*(nᵢ + mᵢ)) + F*n function minus2ll(minimum::Number, observed, imp::Union{RAM, RAMSymbolic}, loss_ml::SemFIML) - F = minimum - F *= nsamples(observed) - F += sum(log(2π) * observed.pattern_nsamples .* observed.pattern_nobs_vars) + F = minimum * nsamples(observed) + F += log(2π) * sum(pat -> nsamples(pat) * nmeasured_vars(pat), observed.patterns) return F end # compute likelihood for missing data - H1 ------------------------------------------------- # -2ll = ∑ log(2π)*(nᵢ + mᵢ) + ln(Σᵢ) + (mᵢ - μᵢ)ᵀ Σᵢ⁻¹ (mᵢ - μᵢ)) + tr(SᵢΣᵢ) function minus2ll(observed::SemObservedMissing) - if observed.em_model.fitted - minus2ll( - observed.em_model.μ, - observed.em_model.Σ, - nsamples(observed), - pattern_rows(observed), - observed.patterns, - observed.obs_mean, - observed.obs_cov, - observed.pattern_nsamples, - observed.pattern_nobs_vars, - ) - else - em_mvn(observed) - minus2ll( - observed.em_model.μ, - observed.em_model.Σ, - nsamples(observed), - pattern_rows(observed), - observed.patterns, - observed.obs_mean, - observed.obs_cov, - observed.pattern_nsamples, - observed.pattern_nobs_vars, - ) - end -end - -function minus2ll( - μ, - Σ, - N, - rows, - patterns, - obs_mean, - obs_cov, - pattern_nsamples, - pattern_nobs_vars, -) - F = 0.0 + # fit EM-based mean and cov if not yet fitted + # FIXME EM could be very computationally expensive + observed.em_model.fitted || em_mvn(observed) - for i in 1:length(rows) - nᵢ = pattern_nsamples[i] - # missing pattern - pattern = patterns[i] - # observed data - Sᵢ = obs_cov[i] + Σ = observed.em_model.Σ + μ = observed.em_model.μ + F = sum(observed.patterns) do pat # implied covariance/mean - Σᵢ = Σ[pattern, pattern] - ld = logdet(Σᵢ) - Σᵢ⁻¹ = inv(cholesky(Σᵢ)) - meandiffᵢ = obs_mean[i] - μ[pattern] + Σᵢ = Σ[pat.measured_mask, pat.measured_mask] + Σᵢ_chol = cholesky!(Σᵢ) + ld = logdet(Σᵢ_chol) + Σᵢ⁻¹ = LinearAlgebra.inv!(Σᵢ_chol) + meandiffᵢ = pat.measured_mean - μ[pat.measured_mask] - F += F_one_pattern(meandiffᵢ, Σᵢ⁻¹, Sᵢ, ld, nᵢ) + F_one_pattern(meandiffᵢ, Σᵢ⁻¹, pat.measured_cov, ld, nsamples(pat)) end - F += sum(log(2π) * pattern_nsamples .* pattern_nobs_vars) - #F *= N + F += log(2π) * sum(pat -> nsamples(pat) * nmeasured_vars(pat), observed.patterns) return F end diff --git a/src/frontend/specification/ParameterTable.jl b/src/frontend/specification/ParameterTable.jl index 8b7cc0973..07c24e46e 100644 --- a/src/frontend/specification/ParameterTable.jl +++ b/src/frontend/specification/ParameterTable.jl @@ -27,9 +27,9 @@ empty_partable_columns(nrows::Integer = 0) = Dict{Symbol, Vector}( :param => fill(Symbol(), nrows), ) -# construct using the provided columns data or create and empty table +# construct using the provided columns data or create an empty table function ParameterTable( - columns::Dict{Symbol, Vector} = empty_partable_columns(); + columns::Dict{Symbol, Vector}; observed_vars::Union{AbstractVector{Symbol}, Nothing} = nothing, latent_vars::Union{AbstractVector{Symbol}, Nothing} = nothing, params::Union{AbstractVector{Symbol}, Nothing} = nothing, diff --git a/src/frontend/specification/Sem.jl b/src/frontend/specification/Sem.jl index 741d5f3c6..28984dbe9 100644 --- a/src/frontend/specification/Sem.jl +++ b/src/frontend/specification/Sem.jl @@ -3,6 +3,7 @@ ############################################################################################ function Sem(; + specification = ParameterTable, observed::O = SemObservedData, imply::I = RAM, loss::L = SemML, @@ -12,7 +13,7 @@ function Sem(; set_field_type_kwargs!(kwdict, observed, imply, loss, O, I) - observed, imply, loss = get_fields!(kwdict, observed, imply, loss) + observed, imply, loss = get_fields!(kwdict, specification, observed, imply, loss) sem = Sem(observed, imply, loss) @@ -59,6 +60,7 @@ Returns the loss part of a model. loss(model::AbstractSemSingle) = model.loss function SemFiniteDiff(; + specification = ParameterTable, observed::O = SemObservedData, imply::I = RAM, loss::L = SemML, @@ -68,7 +70,7 @@ function SemFiniteDiff(; set_field_type_kwargs!(kwdict, observed, imply, loss, O, I) - observed, imply, loss = get_fields!(kwdict, observed, imply, loss) + observed, imply, loss = get_fields!(kwdict, specification, observed, imply, loss) sem = SemFiniteDiff(observed, imply, loss) @@ -96,23 +98,27 @@ function set_field_type_kwargs!(kwargs, observed, imply, loss, O, I) end # construct Sem fields -function get_fields!(kwargs, observed, imply, loss) +function get_fields!(kwargs, specification, observed, imply, loss) + if !isa(specification, SemSpecification) + specification = specification(; kwargs...) + end + # observed if !isa(observed, SemObserved) - observed = observed(; kwargs...) + observed = observed(; specification, kwargs...) end kwargs[:observed] = observed # imply if !isa(imply, SemImply) - imply = imply(; kwargs...) + imply = imply(; specification, kwargs...) end kwargs[:imply] = imply kwargs[:nparams] = nparams(imply) # loss - loss = get_SemLoss(loss; kwargs...) + loss = get_SemLoss(loss; specification, kwargs...) kwargs[:loss] = loss return observed, imply, loss diff --git a/src/frontend/specification/StenoGraphs.jl b/src/frontend/specification/StenoGraphs.jl index 5cf87c07a..65bace302 100644 --- a/src/frontend/specification/StenoGraphs.jl +++ b/src/frontend/specification/StenoGraphs.jl @@ -129,6 +129,17 @@ function ParameterTable( return ParameterTable(columns; latent_vars, observed_vars, params) end +############################################################################################ +### keyword only constructor (for call in `Sem` constructor) +############################################################################################ + +# FIXME: this kw-only ctor conflicts with the empty ParTable constructor; +# it is left here for compatibility with the current Sem construction API, +# the proper fix would be to move away from kw-only ctors in general +ParameterTable(; graph::Union{AbstractStenoGraph, Nothing} = nothing, kwargs...) = + !isnothing(graph) ? ParameterTable(graph; kwargs...) : + ParameterTable(empty_partable_columns(); kwargs...) + ############################################################################################ ### constructor for EnsembleParameterTable from graph ############################################################################################ diff --git a/src/loss/ML/FIML.jl b/src/loss/ML/FIML.jl index 20c81b831..bf020d561 100644 --- a/src/loss/ML/FIML.jl +++ b/src/loss/ML/FIML.jl @@ -47,23 +47,27 @@ end ### Constructors ############################################################################################ -function SemFIML(; observed, specification, kwargs...) - inverses = broadcast(x -> zeros(x, x), pattern_nobs_vars(observed)) +function SemFIML(; observed::SemObservedMissing, specification, kwargs...) + inverses = + [zeros(nmeasured_vars(pat), nmeasured_vars(pat)) for pat in observed.patterns] choleskys = Array{Cholesky{Float64, Array{Float64, 2}}, 1}(undef, length(inverses)) - n_patterns = size(pattern_rows(observed), 1) + n_patterns = length(observed.patterns) logdets = zeros(n_patterns) - imp_mean = zeros.(pattern_nobs_vars(observed)) - meandiff = zeros.(pattern_nobs_vars(observed)) + imp_mean = [zeros(nmeasured_vars(pat)) for pat in observed.patterns] + meandiff = [zeros(nmeasured_vars(pat)) for pat in observed.patterns] nobs_vars = nobserved_vars(observed) imp_inv = zeros(nobs_vars, nobs_vars) mult = similar.(inverses) - ∇ind = vec(CartesianIndices(Array{Float64}(undef, nobs_vars, nobs_vars))) - ∇ind = - [findall(x -> !(x[1] ∈ ind || x[2] ∈ ind), ∇ind) for ind in patterns_not(observed)] + # generate linear indicies of co-observed variable pairs for each pattern + Σ_linind = LinearIndices((nobs_vars, nobs_vars)) + ∇ind = map(observed.patterns) do pat + pat_vars = findall(pat.measured_mask) + vec(Σ_linind[pat_vars, pat_vars]) + end return SemFIML( ExactHessian(), @@ -104,10 +108,10 @@ function evaluate!( prepare_SemFIML!(semfiml, model) scale = inv(nsamples(observed(model))) - obs_rows = pattern_rows(observed(model)) - isnothing(objective) || (objective = scale * F_FIML(obs_rows, semfiml, model, params)) + isnothing(objective) || + (objective = scale * F_FIML(observed(model), semfiml, model, params)) isnothing(gradient) || - (∇F_FIML!(gradient, obs_rows, semfiml, model); gradient .*= scale) + (∇F_FIML!(gradient, observed(model), semfiml, model); gradient .*= scale) return objective end @@ -131,16 +135,16 @@ function F_one_pattern(meandiff, inverse, obs_cov, logdet, N) return F * N end -function ∇F_one_pattern(μ_diff, Σ⁻¹, S, pattern, ∇ind, N, Jμ, JΣ, model) +function ∇F_one_pattern(μ_diff, Σ⁻¹, S, obs_mask, ∇ind, N, Jμ, JΣ, model) diff⨉inv = μ_diff' * Σ⁻¹ if N > one(N) JΣ[∇ind] .+= N * vec(Σ⁻¹ * (I - S * Σ⁻¹ - μ_diff * diff⨉inv)) - @. Jμ[pattern] += (N * 2 * diff⨉inv)' + @. Jμ[obs_mask] += (N * 2 * diff⨉inv)' else JΣ[∇ind] .+= vec(Σ⁻¹ * (I - μ_diff * diff⨉inv)) - @. Jμ[pattern] += (2 * diff⨉inv)' + @. Jμ[obs_mask] += (2 * diff⨉inv)' end end @@ -163,32 +167,32 @@ function ∇F_fiml_outer!(G, JΣ, Jμ, imply, model, semfiml) mul!(G, ∇μ', Jμ, -1, 1) end -function F_FIML(rows, semfiml, model, params) +function F_FIML(observed::SemObservedMissing, semfiml, model, params) F = zero(eltype(params)) - for i in 1:size(rows, 1) + for (i, pat) in enumerate(observed.patterns) F += F_one_pattern( semfiml.meandiff[i], semfiml.inverses[i], - obs_cov(observed(model))[i], + pat.measured_cov, semfiml.logdets[i], - pattern_nsamples(observed(model))[i], + nsamples(pat), ) end return F end -function ∇F_FIML!(G, rows, semfiml, model) +function ∇F_FIML!(G, observed::SemObservedMissing, semfiml, model) Jμ = zeros(nobserved_vars(model)) JΣ = zeros(nobserved_vars(model)^2) - for i in 1:size(rows, 1) + for (i, pat) in enumerate(observed.patterns) ∇F_one_pattern( semfiml.meandiff[i], semfiml.inverses[i], - obs_cov(observed(model))[i], - patterns(observed(model))[i], + pat.measured_cov, + pat.measured_mask, semfiml.∇ind[i], - pattern_nsamples(observed(model))[i], + nsamples(pat), Jμ, JΣ, model, @@ -202,29 +206,21 @@ function prepare_SemFIML!(semfiml, model) batch_cholesky!(semfiml, model) #batch_sym_inv_update!(semfiml, model) batch_inv!(semfiml, model) - for i in 1:size(pattern_nsamples(observed(model)), 1) - semfiml.meandiff[i] .= obs_mean(observed(model))[i] - semfiml.imp_mean[i] + for (i, pat) in enumerate(observed(model).patterns) + semfiml.meandiff[i] .= pat.measured_mean .- semfiml.imp_mean[i] end end -function copy_per_pattern!(inverses, source_inverses, means, source_means, patterns) - @views for i in 1:size(patterns, 1) - inverses[i] .= source_inverses[patterns[i], patterns[i]] - end - - @views for i in 1:size(patterns, 1) - means[i] .= source_means[patterns[i]] +function copy_per_pattern!(fiml::SemFIML, model::AbstractSem) + Σ = imply(model).Σ + μ = imply(model).μ + data = observed(model) + @inbounds @views for (i, pat) in enumerate(data.patterns) + fiml.inverses[i] .= Σ[pat.measured_mask, pat.measured_mask] + fiml.imp_mean[i] .= μ[pat.measured_mask] end end -copy_per_pattern!(semfiml, model::M where {M <: AbstractSem}) = copy_per_pattern!( - semfiml.inverses, - imply(model).Σ, - semfiml.imp_mean, - imply(model).μ, - patterns(observed(model)), -) - function batch_cholesky!(semfiml, model) for i in 1:size(semfiml.inverses, 1) semfiml.choleskys[i] = cholesky!(Symmetric(semfiml.inverses[i])) diff --git a/src/observed/EM.jl b/src/observed/EM.jl index ef5da317d..beac45ca8 100644 --- a/src/observed/EM.jl +++ b/src/observed/EM.jl @@ -37,9 +37,9 @@ function em_mvn( 𝔼xxᵀ_pre = zeros(nvars, nvars) ### precompute for full cases - if length(observed.patterns[1]) == nvars - for row in pattern_rows(observed)[1] - row = observed.data_rowwise[row] + fullpat = observed.patterns[1] + if nmissed_vars(fullpat) == 0 + for row in eachrow(fullpat.data) 𝔼x_pre += row 𝔼xxᵀ_pre += row * row' end @@ -97,21 +97,27 @@ function em_mvn_Estep!(𝔼x, 𝔼xxᵀ, em_model, observed, 𝔼x_pre, 𝔼xx Σ = em_model.Σ # Compute the expected sufficient statistics - for i in 2:length(observed.pattern_nsamples) + for pat in observed.patterns + (nmissed_vars(pat) == 0) && continue # skip full cases # observed and unobserved vars - u = observed.patterns_not[i] - o = observed.patterns[i] + u = pat.miss_mask + o = pat.measured_mask # precompute for pattern - V = Σ[u, u] - Σ[u, o] * (Σ[o, o] \ Σ[o, u]) + Σoo = Σ[o, o] + Σuo = Σ[u, o] + μu = μ[u] + μo = μ[o] + + V = Σ[u, u] - Σuo * (Σoo \ Σ[o, u]) # loop trough data - for row in pattern_rows(observed)[i] - m = μ[u] + Σ[u, o] * (Σ[o, o] \ (observed.data_rowwise[row] - μ[o])) + for rowdata in eachrow(pat.data) + m = μu + Σuo * (Σoo \ (rowdata - μo)) 𝔼xᵢ[u] = m - 𝔼xᵢ[o] = observed.data_rowwise[row] + 𝔼xᵢ[o] = rowdata 𝔼xxᵀᵢ[u, u] = 𝔼xᵢ[u] * 𝔼xᵢ[u]' + V 𝔼xxᵀᵢ[o, o] = 𝔼xᵢ[o] * 𝔼xᵢ[o]' 𝔼xxᵀᵢ[o, u] = 𝔼xᵢ[o] * 𝔼xᵢ[u]' @@ -153,10 +159,10 @@ end # use μ and Σ of full cases function start_em_observed(observed::SemObservedMissing; kwargs...) - if (length(observed.patterns[1]) == nobserved_vars(observed)) & - (observed.pattern_nsamples[1] > 1) - μ = copy(observed.obs_mean[1]) - Σ = copy(Symmetric(observed.obs_cov[1])) + fullpat = observed.patterns[1] + if (nmissed_vars(fullpat) == 0) && (nobserved_vars(fullpat) > 1) + μ = copy(fullpat.measured_mean) + Σ = copy(Symmetric(fullpat.measured_cov)) if !isposdef(Σ) Σ = Matrix(Diagonal(Σ)) end diff --git a/src/observed/abstract.jl b/src/observed/abstract.jl index 90de8b5a6..bb92ea12e 100644 --- a/src/observed/abstract.jl +++ b/src/observed/abstract.jl @@ -8,3 +8,146 @@ Rows are samples, columns are observed variables. [`nsamples`](@ref), [`observed_vars`](@ref). """ samples(observed::SemObserved) = observed.data +nsamples(observed::SemObserved) = observed.nsamples + +observed_vars(observed::SemObserved) = observed.observed_vars + +############################################################################################ +### Additional functions +############################################################################################ + +# generate default observed variable names if none provided +default_observed_vars(nobserved_vars::Integer, prefix::Union{Symbol, AbstractString}) = + Symbol.(prefix, 1:nobserved_vars) + +# compute the permutation that subsets and reorders source elements +# to match the destination order. +# if multiple identical elements are present in the source, the last one is used. +# if one_to_one is true, checks that the source and destination have the same length. +function source_to_dest_perm( + src::AbstractVector, + dest::AbstractVector; + one_to_one::Bool = false, + entities::String = "elements", +) + if dest == src # exact match + return eachindex(dest) + else + one_to_one && + length(dest) != length(src) && + throw( + DimensionMismatch( + "The length of the new $entities order ($(length(dest))) " * + "does not match the number of $entities ($(length(src)))", + ), + ) + src_inds = Dict(el => i for (i, el) in enumerate(src)) + return [src_inds[el] for el in dest] + end +end + +# function to prepare input data shared by SemObserved implementations +# returns tuple of +# 1) the matrix of data +# 2) the observed variable symbols that match matrix columns +# 3) the permutation of the original observed_vars (nothing if no reordering) +# If observed_vars is not specified, the vars order is taken from the specification. +# If both observed_vars and specification are provided, the observed_vars are used to match +# the column of the user-provided data matrix, and observed_vars(specification) is used to +# reorder the columns of the data to match the speciation. +# If no variable names are provided at all, generates the symbols in the form +# Symbol(observed_var_prefix, i) for i=1:nobserved_vars. +function prepare_data( + data::Union{AbstractDataFrame, AbstractMatrix, NTuple{2, Integer}, Nothing}, + observed_vars::Union{AbstractVector, Nothing}, + spec::Union{SemSpecification, Nothing}; + observed_var_prefix::Union{Symbol, AbstractString}, +) + obs_vars = nothing + obs_vars_perm = nothing + if !isnothing(observed_vars) + obs_vars = Symbol.(observed_vars) + if !isnothing(spec) + obs_vars_spec = SEM.observed_vars(spec) + try + obs_vars_perm = source_to_dest_perm( + obs_vars, + obs_vars_spec, + one_to_one = false, + entities = "observed_vars", + ) + catch err + if isa(err, KeyError) + throw( + ArgumentError( + "observed_var \"$(err.key)\" from SEM specification is not listed in observed_vars argument", + ), + ) + else + rethrow(err) + end + end + # ignore trivial reorder + if obs_vars_perm == eachindex(obs_vars) + obs_vars_perm = nothing + end + end + elseif !isnothing(spec) + obs_vars = SEM.observed_vars(spec) + end + # observed vars in the order that matches the specification + obs_vars_reordered = isnothing(obs_vars_perm) ? obs_vars : obs_vars[obs_vars_perm] + + # subset the data, check that obs_vars matches data or guess the obs_vars + if data isa AbstractDataFrame + if !isnothing(obs_vars_reordered) # subset/reorder columns + data = data[:, obs_vars_reordered] + if obs_vars_reordered != obs_vars + @warn "The order of variables in observed_vars argument does not match the order of observed_vars(specification). The specification order is used." + end + else # default symbol names + obs_vars = obs_vars_reordered = Symbol.(names(data)) + end + data_mtx = Matrix(data) + elseif data isa AbstractMatrix + if !isnothing(obs_vars) + size(data, 2) == length(obs_vars) || DimensionMismatch( + "The number of columns in the data matrix ($(size(data, 2))) does not match the length of observed_vars ($(length(obs_vars))).", + ) + # reorder columns to match the spec + data_ordered = !isnothing(obs_vars_perm) ? data[:, obs_vars_perm] : data + else + obs_vars = + obs_vars_reordered = + default_observed_vars(size(data, 2), observed_var_prefix) + data_ordered = data + end + # make sure data_mtx is a dense matrix (required for methods like mean_and_cov()) + data_mtx = convert(Matrix, data_ordered) + elseif data isa NTuple{2, Integer} # given the dimensions of the data matrix, but no data itself + data_mtx = nothing + nobs_vars = data[2] + if isnothing(obs_vars) + obs_vars = + obs_vars_reordered = default_observed_vars(nobs_vars, observed_var_prefix) + elseif length(obs_vars) != nobs_vars + throw( + DimensionMismatch( + "The length of observed_vars ($(length(obs_vars))) does not match the data matrix columns ($(nobs_vars)).", + ), + ) + end + elseif isnothing(data) + data_mtx = nothing + if isnothing(obs_vars) + throw( + ArgumentError( + "No data, specification or observed_vars provided. Cannot infer observed_vars from provided inputs", + ), + ) + end + else + throw(ArgumentError("Unsupported data type: $(typeof(data))")) + end + return data_mtx, obs_vars_reordered, obs_vars_perm +end diff --git a/src/observed/covariance.jl b/src/observed/covariance.jl index b78f41833..221ef5ca3 100644 --- a/src/observed/covariance.jl +++ b/src/observed/covariance.jl @@ -1,128 +1,80 @@ """ -For observed covariance matrices and means. +Type alias for [`SemObservedData`](@ref) that has mean and covariance, but no actual data. -# Constructor +For instances of `SemObservedCovariance` [`samples`](@ref) returns `nothing`. +""" +const SemObservedCovariance{S} = SemObservedData{Nothing, S} +""" SemObservedCovariance(; specification, obs_cov, obs_colnames = nothing, meanstructure = false, obs_mean = nothing, - nsamples = nothing, + nsamples::Integer, kwargs...) -# Arguments -- `specification`: either a `RAMMatrices` or `ParameterTable` object (1) -- `obs_cov`: observed covariance matrix -- `obs_colnames::Vector{Symbol}`: column names of the covariance matrix -- `meanstructure::Bool`: does the model have a meanstructure? -- `obs_mean`: observed mean vector -- `nsamples::Number`: number of samples (observed data points); necessary for fit statistics - -# Extended help -## Interfaces -- `nsamples(::SemObservedCovariance)`: number of samples (observed data points) -- `n_man(::SemObservedCovariance)` -> number of manifest variables +Construct [`SemObserved`](@ref) without providing the observations data, +but with the covariations (`obs_cov`) and the means (`obs_means`) of the observed variables. -- `obs_cov(::SemObservedCovariance)` -> observed covariance matrix -- `obs_mean(::SemObservedCovariance)` -> observed means +Returns [`SemObservedCovariance`](@ref) object. -## Implementation -Subtype of `SemObserved` - -## Remarks -(1) the `specification` argument can also be `nothing`, but this turns of checking whether -the observed data/covariance columns are in the correct order! As a result, you should only -use this if you are sure your covariance matrix is in the right format. - -## Additional keyword arguments: -- `spec_colnames::Vector{Symbol} = nothing`: overwrites column names of the specification object +# Arguments +- `obs_cov`: pre-computed covariations of the observed variables +- `obs_mean`: optional pre-computed means of the observed variables +- `observed_vars::AbstractVector`: IDs of the observed variables (rows and columns of the `obs_cov` matrix) +- `specification`: optional SEM specification ([`SemSpecification`](@ref)) +- `nsamples::Number`: number of samples (observed data points) used to compute `obs_cov` and `obs_means` + necessary for calculating fit statistics """ -struct SemObservedCovariance{B, C} <: SemObserved - obs_cov::B - obs_mean::C - nobs_vars::Int - nsamples::Int -end - function SemObservedCovariance(; + obs_cov::AbstractMatrix, + obs_mean::Union{AbstractVector, Nothing} = nothing, + observed_vars::Union{AbstractVector, Nothing} = nothing, specification::Union{SemSpecification, Nothing} = nothing, - obs_cov, - obs_colnames = nothing, - spec_colnames = nothing, - obs_mean = nothing, - meanstructure = false, nsamples::Integer, + observed_var_prefix::Union{Symbol, AbstractString} = :obs, kwargs..., ) - if !meanstructure & !isnothing(obs_mean) - throw(ArgumentError("observed means were passed, but `meanstructure = false`")) - - elseif meanstructure & isnothing(obs_mean) - throw(ArgumentError("`meanstructure = true`, but no observed means were passed")) - end - - if isnothing(spec_colnames) && !isnothing(specification) - spec_colnames = observed_vars(specification) + nvars = size(obs_cov, 1) + size(obs_cov, 2) == nvars || throw( + DimensionMismatch( + "The covariance matrix should be square, $(size(obs_cov)) was found.", + ), + ) + S = eltype(obs_cov) + + if isnothing(obs_mean) + obs_mean = zeros(S, nvars) + else + length(obs_mean) == nvars || throw( + DimensionMismatch( + "The length of the mean vector $(length(obs_mean)) does not match the size of the covariance matrix $(size(obs_cov))", + ), + ) + S = promote_type(S, eltype(obs_mean)) end - if !isnothing(spec_colnames) & isnothing(obs_colnames) - throw(ArgumentError("no `obs_colnames` were specified")) + obs_cov = convert(Matrix{S}, obs_cov) + obs_mean = convert(Vector{S}, obs_mean) - elseif !isnothing(spec_colnames) & !(eltype(obs_colnames) <: Symbol) - throw(ArgumentError("please specify `obs_colnames` as a vector of Symbols")) + if !isnothing(observed_vars) + length(observed_vars) == nvars || throw( + DimensionMismatch( + "The length of the observed_vars $(length(observed_vars)) does not match the size of the covariance matrix $(size(obs_cov))", + ), + ) end - if !isnothing(spec_colnames) - obs_cov = reorder_obs_cov(obs_cov, spec_colnames, obs_colnames) - isnothing(obs_mean) || - (obs_mean = reorder_obs_mean(obs_mean, spec_colnames, obs_colnames)) - end + _, obs_vars, obs_vars_perm = + prepare_data((nsamples, nvars), observed_vars, specification; observed_var_prefix) - return SemObservedCovariance(obs_cov, obs_mean, size(obs_cov, 1), nsamples) -end - -############################################################################################ -### Recommended methods -############################################################################################ - -nsamples(observed::SemObservedCovariance) = observed.nsamples -nobserved_vars(observed::SemObservedCovariance) = observed.nobs_vars - -samples(observed::SemObservedCovariance) = - error("$(typeof(observed)) does not store data samples") - -############################################################################################ -### additional methods -############################################################################################ - -obs_cov(observed::SemObservedCovariance) = observed.obs_cov -obs_mean(observed::SemObservedCovariance) = observed.obs_mean - -############################################################################################ -### Additional functions -############################################################################################ - -# reorder covariance matrices -------------------------------------------------------------- -function reorder_obs_cov(obs_cov, spec_colnames, obs_colnames) - if spec_colnames == obs_colnames - return obs_cov - else - new_position = [findfirst(==(x), obs_colnames) for x in spec_colnames] - obs_cov = obs_cov[new_position, new_position] - return obs_cov + # reorder to match the specification + if !isnothing(obs_vars_perm) + obs_cov = obs_cov[obs_vars_perm, obs_vars_perm] + obs_mean = obs_mean[obs_vars_perm] end -end -# reorder means ---------------------------------------------------------------------------- - -function reorder_obs_mean(obs_mean, spec_colnames, obs_colnames) - if spec_colnames == obs_colnames - return obs_mean - else - new_position = [findfirst(==(x), obs_colnames) for x in spec_colnames] - obs_mean = obs_mean[new_position] - return obs_mean - end + return SemObservedData(nothing, obs_vars, obs_cov, obs_mean, nsamples) end diff --git a/src/observed/data.jl b/src/observed/data.jl index c9b50e597..b6ddaa43d 100644 --- a/src/observed/data.jl +++ b/src/observed/data.jl @@ -4,17 +4,15 @@ For observed data without missings. # Constructor SemObservedData(; - specification, data, - meanstructure = false, - obs_colnames = nothing, + observed_vars = nothing, + specification = nothing, kwargs...) # Arguments -- `specification`: either a `RAMMatrices` or `ParameterTable` object (1) -- `data`: observed data -- `meanstructure::Bool`: does the model have a meanstructure? -- `obs_colnames::Vector{Symbol}`: column names of the data (if the object passed as data does not have column names, i.e. is not a data frame) +- `specification`: optional SEM specification ([`SemSpecification`](@ref)) +- `data`: observed data -- *DataFrame* or *Matrix* +- `observed_vars::Vector{Symbol}`: column names of the data (if the object passed as data does not have column names, i.e. is not a data frame) # Extended help ## Interfaces @@ -27,112 +25,36 @@ For observed data without missings. ## Implementation Subtype of `SemObserved` - -## Remarks -(1) the `specification` argument can also be `nothing`, but this turns of checking whether -the observed data/covariance columns are in the correct order! As a result, you should only -use this if you are sure your observed data is in the right format. - -## Additional keyword arguments: -- `spec_colnames::Vector{Symbol} = nothing`: overwrites column names of the specification object -- `compute_covariance::Bool ) = true`: should the covariance of `data` be computed and stored? """ -struct SemObservedData{A, B, C} <: SemObserved - data::A - obs_cov::B - obs_mean::C - nobs_vars::Int +struct SemObservedData{D <: Union{Nothing, AbstractMatrix}, S <: Number} <: SemObserved + data::D + observed_vars::Vector{Symbol} + obs_cov::Matrix{S} + obs_mean::Vector{S} nsamples::Int end -# error checks -function check_arguments_SemObservedData(kwargs...) - # data is a data frame, - -end - function SemObservedData(; - specification::Union{SemSpecification, Nothing}, data, - obs_colnames = nothing, - spec_colnames = nothing, - meanstructure = false, - compute_covariance = true, + observed_vars::Union{AbstractVector, Nothing} = nothing, + specification::Union{SemSpecification, Nothing} = nothing, + observed_var_prefix::Union{Symbol, AbstractString} = :obs, kwargs..., ) - if isnothing(spec_colnames) && !isnothing(specification) - spec_colnames = observed_vars(specification) - end - - if !isnothing(spec_colnames) - if isnothing(obs_colnames) - try - data = data[:, spec_colnames] - catch - throw( - ArgumentError( - "Your `data` can not be indexed by symbols. " * - "Maybe you forgot to provide column names via the `obs_colnames = ...` argument.", - ), - ) - end - else - if data isa DataFrame - throw( - ArgumentError( - "You passed your data as a `DataFrame`, but also specified `obs_colnames`. " * - "Please make sure the column names of your data frame indicate the correct variables " * - "or pass your data in a different format.", - ), - ) - end - - if !(eltype(obs_colnames) <: Symbol) - throw(ArgumentError("please specify `obs_colnames` as a vector of Symbols")) - end - - data = reorder_data(data, spec_colnames, obs_colnames) - end - end + data, obs_vars, _ = + prepare_data(data, observed_vars, specification; observed_var_prefix) + obs_mean, obs_cov = mean_and_cov(data, 1) - if data isa DataFrame - data = Matrix(data) - end - - return SemObservedData( - data, - compute_covariance ? Statistics.cov(data) : nothing, - meanstructure ? vec(Statistics.mean(data, dims = 1)) : nothing, - size(data, 2), - size(data, 1), - ) + return SemObservedData(data, obs_vars, obs_cov, vec(obs_mean), size(data, 1)) end ############################################################################################ ### Recommended methods ############################################################################################ -nsamples(observed::SemObservedData) = observed.nsamples -nobserved_vars(observed::SemObservedData) = observed.nobs_vars - ############################################################################################ ### additional methods ############################################################################################ obs_cov(observed::SemObservedData) = observed.obs_cov obs_mean(observed::SemObservedData) = observed.obs_mean - -############################################################################################ -### Additional functions -############################################################################################ - -# reorder data ----------------------------------------------------------------------------- -function reorder_data(data::AbstractArray, spec_colnames, obs_colnames) - if spec_colnames == obs_colnames - return data - else - obs_positions = Dict(col => i for (i, col) in enumerate(obs_colnames)) - new_positions = [obs_positions[col] for col in spec_colnames] - return data[:, new_positions] - end -end diff --git a/src/observed/missing.jl b/src/observed/missing.jl index b628a313b..cf699252e 100644 --- a/src/observed/missing.jl +++ b/src/observed/missing.jl @@ -9,76 +9,44 @@ mutable struct EmMVNModel{A, b, B} fitted::B end +# FIXME type unstable +obs_mean(em::EmMVNModel) = ifelse(em.fitted, em.μ, nothing) +obs_cov(em::EmMVNModel) = ifelse(em.fitted, em.Σ, nothing) + """ For observed data with missing values. # Constructor SemObservedMissing(; - specification, data, - obs_colnames = nothing, + observed_vars = nothing, + specification = nothing, kwargs...) # Arguments -- `specification`: either a `RAMMatrices` or `ParameterTable` object (1) +- `specification`: optional SEM model specification ([`SemSpecification`](@ref)) - `data`: observed data -- `obs_colnames::Vector{Symbol}`: column names of the data (if the object passed as data does not have column names, i.e. is not a data frame) +- `observed_vars::Vector{Symbol}`: column names of the data (if the object passed as data does not have column names, i.e. is not a data frame) # Extended help ## Interfaces -- `nsamples(::SemObservedMissing)` -> number of observed data points -- `nobserved_vars(::SemObservedMissing)` -> number of manifest variables - -- `samples(::SemObservedMissing)` -> observed data -- `data_rowwise(::SemObservedMissing)` -> observed data as vector per observation, with missing values deleted +- `nsamples(::SemObservedMissing)` -> number of samples (data points) +- `nobserved_vars(::SemObservedMissing)` -> number of observed variables -- `patterns(::SemObservedMissing)` -> indices of non-missing variables per missing patterns -- `patterns_not(::SemObservedMissing)` -> indices of missing variables per missing pattern -- `pattern_rows(::SemObservedMissing)` -> row indices of observed data points that belong to each pattern -- `pattern_nsamples(::SemObservedMissing)` -> number of data points per pattern -- `pattern_nobs_vars(::SemObservedMissing)` -> number of non-missing observed variables per pattern -- `obs_mean(::SemObservedMissing)` -> observed mean per pattern -- `obs_cov(::SemObservedMissing)` -> observed covariance per pattern -- `em_model(::SemObservedMissing)` -> `EmMVNModel` that contains the covariance matrix and mean vector found via optimization maximization +- `samples(::SemObservedMissing)` -> data matrix (contains both measured and missing values) +- `em_model(::SemObservedMissing)` -> `EmMVNModel` that contains the covariance matrix and mean vector found via expectation maximization ## Implementation Subtype of `SemObserved` - -## Remarks -(1) the `specification` argument can also be `nothing`, but this turns of checking whether -the observed data/covariance columns are in the correct order! As a result, you should only -use this if you are sure your observed data is in the right format. - -## Additional keyword arguments: -- `spec_colnames::Vector{Symbol} = nothing`: overwrites column names of the specification object """ -mutable struct SemObservedMissing{ - A <: AbstractArray, - D <: Number, - O <: Number, - P <: Vector, - P2 <: Vector, - R <: Vector, - PD <: AbstractArray, - PO <: AbstractArray, - PVO <: AbstractArray, - A2 <: AbstractArray, - A3 <: AbstractArray, - S <: EmMVNModel, -} <: SemObserved - data::A - nobs_vars::D - nsamples::O - patterns::P # missing patterns - patterns_not::P2 - pattern_rows::R # coresponding rows in data_rowwise - data_rowwise::PD # list of data - pattern_nsamples::PO # observed rows per pattern - pattern_nobs_vars::PVO # number of non-missing variables per pattern - obs_mean::A2 - obs_cov::A3 - em_model::S +struct SemObservedMissing{T <: Real, S <: Real, E <: EmMVNModel} <: SemObserved + data::Matrix{Union{T, Missing}} + observed_vars::Vector{Symbol} + nsamples::Int + patterns::Vector{SemObservedMissingPattern{T, S}} + + em_model::E end ############################################################################################ @@ -86,116 +54,39 @@ end ############################################################################################ function SemObservedMissing(; - specification::Union{SemSpecification, Nothing}, data, - obs_colnames = nothing, - spec_colnames = nothing, + observed_vars::Union{AbstractVector, Nothing} = nothing, + specification::Union{SemSpecification, Nothing} = nothing, + observed_var_prefix::Union{Symbol, AbstractString} = :obs, kwargs..., ) - if isnothing(spec_colnames) && !isnothing(specification) - spec_colnames = observed_vars(specification) - end - - if !isnothing(spec_colnames) - if isnothing(obs_colnames) - try - data = data[:, spec_colnames] - catch - throw( - ArgumentError( - "Your `data` can not be indexed by symbols. " * - "Maybe you forgot to provide column names via the `obs_colnames = ...` argument.", - ), - ) - end - else - if data isa DataFrame - throw( - ArgumentError( - "You passed your data as a `DataFrame`, but also specified `obs_colnames`. " * - "Please make sure the column names of your data frame indicate the correct variables " * - "or pass your data in a different format.", - ), - ) - end - - if !(eltype(obs_colnames) <: Symbol) - throw(ArgumentError("please specify `obs_colnames` as a vector of Symbols")) - end - - data = reorder_data(data, spec_colnames, obs_colnames) - end - end - - if data isa DataFrame - data = Matrix(data) - end - - # remove persons with only missings - keep = Vector{Int64}() - for i in 1:size(data, 1) - if any(.!ismissing.(data[i, :])) - push!(keep, i) - end - end - data = data[keep, :] - + data, obs_vars, _ = + prepare_data(data, observed_vars, specification; observed_var_prefix) nsamples, nobs_vars = size(data) - # compute and store the different missing patterns with their rowindices - missings = ismissing.(data) - patterns = [missings[i, :] for i in 1:size(missings, 1)] - - patterns_cart = findall.(!, patterns) - data_rowwise = [data[i, patterns_cart[i]] for i in 1:nsamples] - data_rowwise = convert.(Array{Float64}, data_rowwise) - - remember = Vector{BitArray{1}}() - rows = [Vector{Int64}(undef, 0) for i in 1:size(patterns, 1)] - for i in 1:size(patterns, 1) - unknown = true - for j in 1:size(remember, 1) - if patterns[i] == remember[j] - push!(rows[j], i) - unknown = false - end - end - if unknown - push!(remember, patterns[i]) - push!(rows[size(remember, 1)], i) + # detect all different missing patterns with their row indices + pattern_to_rows = Dict{BitVector, Vector{Int}}() + for (i, datarow) in zip(axes(data, 1), eachrow(data)) + pattern = BitVector(.!ismissing.(datarow)) + if sum(pattern) > 0 # skip all-missing rows + pattern_rows = get!(() -> Vector{Int}(), pattern_to_rows, pattern) + push!(pattern_rows, i) end end - rows = rows[1:length(remember)] - n_patterns = size(rows, 1) - - # sort by number of missings - sort_n_miss = sortperm(sum.(remember)) - remember = remember[sort_n_miss] - remember_cart = findall.(!, remember) - remember_cart_not = findall.(remember) - rows = rows[sort_n_miss] - - pattern_nsamples = size.(rows, 1) - pattern_nobs_vars = length.(remember_cart) - - cov_mean = [cov_and_mean(data_rowwise[rows]) for rows in rows] - obs_cov = [cov_mean[1] for cov_mean in cov_mean] - obs_mean = [cov_mean[2] for cov_mean in cov_mean] + # process each pattern and sort from most to least number of observed vars + patterns = [ + SemObservedMissingPattern(pat, rows, data) for (pat, rows) in pairs(pattern_to_rows) + ] + sort!(patterns, by = nmissed_vars) + # allocate EM model (but don't fit) em_model = EmMVNModel(zeros(nobs_vars, nobs_vars), zeros(nobs_vars), false) return SemObservedMissing( - data, - nobs_vars, + convert(Matrix{Union{nonmissingtype(eltype(data)), Missing}}, data), + obs_vars, nsamples, - remember_cart, - remember_cart_not, - rows, - data_rowwise, - pattern_nsamples, - pattern_nobs_vars, - obs_mean, - obs_cov, + patterns, em_model, ) end @@ -204,19 +95,10 @@ end ### Recommended methods ############################################################################################ -nsamples(observed::SemObservedMissing) = observed.nsamples -nobserved_vars(observed::SemObservedMissing) = observed.nobs_vars - ############################################################################################ ### Additional methods ############################################################################################ -patterns(observed::SemObservedMissing) = observed.patterns -patterns_not(observed::SemObservedMissing) = observed.patterns_not -pattern_rows(observed::SemObservedMissing) = observed.pattern_rows -data_rowwise(observed::SemObservedMissing) = observed.data_rowwise -pattern_nsamples(observed::SemObservedMissing) = observed.pattern_nsamples -pattern_nobs_vars(observed::SemObservedMissing) = observed.pattern_nobs_vars -obs_mean(observed::SemObservedMissing) = observed.obs_mean -obs_cov(observed::SemObservedMissing) = observed.obs_cov em_model(observed::SemObservedMissing) = observed.em_model +obs_mean(observed::SemObservedMissing) = obs_mean(em_model(observed)) +obs_cov(observed::SemObservedMissing) = obs_cov(em_model(observed)) diff --git a/src/observed/missing_pattern.jl b/src/observed/missing_pattern.jl new file mode 100644 index 000000000..6ac6a360b --- /dev/null +++ b/src/observed/missing_pattern.jl @@ -0,0 +1,45 @@ +# data associated with the observed variables that all share the same missingness pattern +# variables that have values within that pattern are termed "measured" +# variables that have no measurements are termed "missing" +struct SemObservedMissingPattern{T, S} + measured_mask::BitVector # measured vars mask + miss_mask::BitVector # missing vars mask + rows::Vector{Int} # rows in original data + data::Matrix{T} # non-missing submatrix of data + + measured_mean::Vector{S} # means of measured vars + measured_cov::Matrix{S} # covariance of measured vars +end + +function SemObservedMissingPattern( + measured_mask::BitVector, + rows::AbstractVector{<:Integer}, + data::AbstractMatrix, +) + T = nonmissingtype(eltype(data)) + + pat_data = convert(Matrix{T}, view(data, rows, measured_mask)) + if size(pat_data, 1) > 1 + pat_mean, pat_cov = mean_and_cov(pat_data, 1, corrected = false) + @assert size(pat_cov) == (size(pat_data, 2), size(pat_data, 2)) + else + pat_mean = reshape(pat_data[1, :], 1, :) + # 1x1 covariance matrix since it is not meant to be used + pat_cov = fill(zero(T), 1, 1) + end + + return SemObservedMissingPattern{T, eltype(pat_mean)}( + measured_mask, + .!measured_mask, + rows, + pat_data, + dropdims(pat_mean, dims = 1), + pat_cov, + ) +end + +nobserved_vars(pat::SemObservedMissingPattern) = length(pat.measured_mask) +nsamples(pat::SemObservedMissingPattern) = length(pat.rows) + +nmeasured_vars(pat::SemObservedMissingPattern) = length(pat.measured_mean) +nmissed_vars(pat::SemObservedMissingPattern) = nobserved_vars(pat) - nmeasured_vars(pat) diff --git a/test/unit_tests/data_input_formats.jl b/test/unit_tests/data_input_formats.jl index 3fc255b84..cc72673a6 100644 --- a/test/unit_tests/data_input_formats.jl +++ b/test/unit_tests/data_input_formats.jl @@ -7,11 +7,19 @@ spec = ParameterTable( latent_vars = [:ind60, :dem60, :dem65], ) +# specification with non-existent observed var z1 +wrong_spec = ParameterTable( + observed_vars = [:x1, :x2, :x3, :y1, :y2, :y3, :y4, :y5, :y6, :y7, :y8, :z1], + latent_vars = [:ind60, :dem60, :dem65], +) + ### data ----------------------------------------------------------------------------------- dat = example_data("political_democracy") dat_missing = example_data("political_democracy_missing")[:, names(dat)] +@assert Symbol.(names(dat)) == observed_vars(spec) + dat_matrix = Matrix(dat) dat_missing_matrix = Matrix(dat_missing) @@ -21,7 +29,12 @@ dat_mean = vcat(Statistics.mean(dat_matrix, dims = 1)...) # shuffle variables new_order = [3, 2, 7, 8, 5, 6, 9, 11, 1, 10, 4] -shuffle_names = Symbol.(names(dat))[new_order] +shuffle_names = names(dat)[new_order] + +shuffle_spec = ParameterTable( + observed_vars = Symbol.(shuffle_names), + latent_vars = [:ind60, :dem60, :dem65], +) shuffle_dat = dat[:, new_order] shuffle_dat_missing = dat_missing[:, new_order] @@ -29,8 +42,8 @@ shuffle_dat_missing = dat_missing[:, new_order] shuffle_dat_matrix = dat_matrix[:, new_order] shuffle_dat_missing_matrix = dat_missing_matrix[:, new_order] -shuffle_dat_cov = Statistics.cov(shuffle_dat_matrix) -shuffle_dat_mean = vcat(Statistics.mean(shuffle_dat_matrix, dims = 1)...) +shuffle_dat_cov = cov(shuffle_dat_matrix) +shuffle_dat_mean = vec(mean(shuffle_dat_matrix, dims = 1)) # common tests for SemObserved subtypes function test_observed( @@ -42,17 +55,16 @@ function test_observed( meanstructure::Bool, approx_cov::Bool = false, ) - @test @inferred(nobserved_vars(observed)) == size(dat, 2) - # FIXME observed should provide names of observed variables - @test @inferred(observed_vars(observed)) == names(dat) broken = true - @test @inferred(nsamples(observed)) == size(dat, 1) - - hasmissing = - !isnothing(dat_matrix) && any(ismissing, dat_matrix) || - !isnothing(dat_cov) && any(ismissing, dat_cov) + if !isnothing(dat) + @test @inferred(nsamples(observed)) == size(dat, 1) + @test @inferred(nobserved_vars(observed)) == size(dat, 2) + @test @inferred(observed_vars(observed)) == Symbol.(names(dat)) + end if !isnothing(dat_matrix) - if hasmissing + @test @inferred(nsamples(observed)) == size(dat_matrix, 1) + + if any(ismissing, dat_matrix) @test isequal(@inferred(samples(observed)), dat_matrix) else @test @inferred(samples(observed)) == dat_matrix @@ -60,7 +72,7 @@ function test_observed( end if !isnothing(dat_cov) - if hasmissing + if any(ismissing, dat_cov) @test isequal(@inferred(obs_cov(observed)), dat_cov) else if approx_cov @@ -72,17 +84,17 @@ function test_observed( end # FIXME actually, SemObserved should not use meanstructure and always provide obs_mean() - # meanstructure is a part of SEM model + # since meanstructure belongs to the implied part of a SEM model if meanstructure if !isnothing(dat_mean) - if hasmissing + if any(ismissing, dat_mean) @test isequal(@inferred(obs_mean(observed)), dat_mean) else @test @inferred(obs_mean(observed)) == dat_mean end else - # FIXME if meanstructure is present, obs_mean() should provide something (currently Missing don't support it) - @test (@inferred(obs_mean(observed)) isa AbstractVector{Float64}) broken = true + # FIXME @inferred is broken for EM cov/mean since it may return nothing if EM was not run + @test @inferred(obs_mean(observed)) isa AbstractVector{Float64} broken = true # EM-based means end else @test @inferred(obs_mean(observed)) === nothing skip = true @@ -93,32 +105,29 @@ end @testset "SemObservedData" begin # errors - @test_throws ArgumentError( - "You passed your data as a `DataFrame`, but also specified `obs_colnames`. " * - "Please make sure the column names of your data frame indicate the correct variables " * - "or pass your data in a different format.", - ) begin - SemObservedData( - specification = spec, - data = dat, - obs_colnames = Symbol.(names(dat)), - ) - end + obs_data_redundant = SemObservedData( + specification = spec, + data = dat, + observed_vars = Symbol.(names(dat)), + ) + @test observed_vars(obs_data_redundant) == Symbol.(names(dat)) + @test observed_vars(obs_data_redundant) == observed_vars(spec) - @test_throws ArgumentError( - "Your `data` can not be indexed by symbols. " * - "Maybe you forgot to provide column names via the `obs_colnames = ...` argument.", - ) begin - SemObservedData(specification = spec, data = dat_matrix) - end + obs_data_spec = SemObservedData(specification = spec, data = dat_matrix) + @test observed_vars(obs_data_spec) == observed_vars(spec) - @test_throws ArgumentError("please specify `obs_colnames` as a vector of Symbols") begin - SemObservedData(specification = spec, data = dat_matrix, obs_colnames = names(dat)) - end + obs_data_strnames = + SemObservedData(specification = spec, data = dat_matrix, observed_vars = names(dat)) + @test observed_vars(obs_data_strnames) == Symbol.(names(dat)) @test_throws UndefKeywordError(:data) SemObservedData(specification = spec) - @test_throws UndefKeywordError(:specification) SemObservedData(data = dat_matrix) + obs_data_nonames = SemObservedData(data = dat_matrix) + @test observed_vars(obs_data_nonames) == Symbol.("obs", 1:size(dat_matrix, 2)) + + obs_data_nonames2 = + SemObservedData(data = dat_matrix, observed_var_prefix = "observed_") + @test observed_vars(obs_data_nonames2) == Symbol.("observed_", 1:size(dat_matrix, 2)) @testset "meanstructure=$meanstructure" for meanstructure in (false, true) observed = SemObservedData(specification = spec, data = dat; meanstructure) @@ -128,35 +137,98 @@ end observed_nospec = SemObservedData(specification = nothing, data = dat_matrix; meanstructure) - test_observed(observed_nospec, dat, dat_matrix, dat_cov, dat_mean; meanstructure) + test_observed( + observed_nospec, + nothing, + dat_matrix, + dat_cov, + dat_mean; + meanstructure, + ) observed_matrix = SemObservedData( specification = spec, data = dat_matrix, - obs_colnames = Symbol.(names(dat)), - meanstructure = meanstructure, + observed_vars = Symbol.(names(dat)); + meanstructure, ) test_observed(observed_matrix, dat, dat_matrix, dat_cov, dat_mean; meanstructure) + # detect non-existing column + @test_throws "ArgumentError: column name \"z1\"" SemObservedData( + specification = wrong_spec, + data = shuffle_dat, + ) + + # detect non-existing observed_var + @test_throws "ArgumentError: observed_var \"z1\"" SemObservedData( + specification = wrong_spec, + data = shuffle_dat_matrix, + observed_vars = shuffle_names, + ) + + # cannot infer observed_vars + @test_throws "No data, specification or observed_vars provided" SemObservedData( + data = nothing, + ) + + if false # FIXME data = nothing is for simulation studies + # no data, just observed_vars + observed_nodata = + SemObservedData(data = nothing, observed_vars = Symbol.(names(dat))) + @test observed_nodata isa SemObservedData + @test @inferred(samples(observed_nodata)) === nothing + @test observed_vars(observed_nodata) == Symbol.(names(dat)) + end + + @test_warn "The order of variables in observed_vars" SemObservedData( + specification = spec, + data = shuffle_dat, + observed_vars = shuffle_names, + ) + + # spec takes precedence in obs_vars order + observed_spec = SemObservedData( + specification = spec, + data = shuffle_dat, + observed_vars = shuffle_names, + ) + + test_observed( + observed_spec, + dat, + dat_matrix, + dat_cov, + meanstructure ? dat_mean : nothing; + meanstructure, + ) + observed_shuffle = - SemObservedData(specification = spec, data = shuffle_dat; meanstructure) + SemObservedData(specification = shuffle_spec, data = shuffle_dat; meanstructure) - test_observed(observed_shuffle, dat, dat_matrix, dat_cov, dat_mean; meanstructure) + test_observed( + observed_shuffle, + shuffle_dat, + shuffle_dat_matrix, + shuffle_dat_cov, + meanstructure ? shuffle_dat_mean : nothing; + meanstructure, + ) observed_matrix_shuffle = SemObservedData( - specification = spec, + specification = shuffle_spec, data = shuffle_dat_matrix, - obs_colnames = shuffle_names; + observed_vars = shuffle_names; meanstructure, ) test_observed( observed_matrix_shuffle, - dat, - dat_matrix, - dat_cov, - dat_mean; + shuffle_dat, + shuffle_dat_matrix, + shuffle_dat_cov, + meanstructure ? shuffle_dat_mean : nothing; meanstructure, ) end # meanstructure @@ -170,43 +242,6 @@ end # SemObservedData @test_throws UndefKeywordError(:nsamples) SemObservedCovariance(obs_cov = dat_cov) - @test_throws ArgumentError("no `obs_colnames` were specified") begin - SemObservedCovariance( - specification = spec, - obs_cov = dat_cov, - nsamples = size(dat, 1), - ) - end - - @test_throws ArgumentError("observed means were passed, but `meanstructure = false`") begin - SemObservedCovariance( - specification = nothing, - obs_cov = dat_cov, - obs_mean = dat_mean, - nsamples = size(dat, 1), - ) - end - - @test_throws ArgumentError("please specify `obs_colnames` as a vector of Symbols") begin - SemObservedCovariance( - specification = spec, - obs_cov = dat_cov, - obs_colnames = names(dat), - nsamples = size(dat, 1), - meanstructure = false, - ) - end - - @test_throws ArgumentError("`meanstructure = true`, but no observed means were passed") begin - SemObservedCovariance( - specification = spec, - obs_cov = dat_cov, - obs_colnames = Symbol.(names(dat)), - meanstructure = true, - nsamples = size(dat, 1), - ) - end - @testset "meanstructure=$meanstructure" for meanstructure in (false, true) # errors @@ -220,12 +255,25 @@ end # SemObservedData meanstructure, ) - # should work + # default vars + observed_nonames = SemObservedCovariance( + obs_cov = dat_cov, + obs_mean = meanstructure ? dat_mean : nothing, + nsamples = size(dat, 1), + ) + @test observed_vars(observed_nonames) == Symbol.("obs", 1:size(dat_cov, 2)) + + @test_throws DimensionMismatch SemObservedCovariance( + obs_cov = dat_cov, + observed_vars = Symbol.("obs", 1:(size(dat_cov, 2)+1)), + nsamples = size(dat, 1), + ) + observed = SemObservedCovariance( specification = spec, obs_cov = dat_cov, obs_mean = meanstructure ? dat_mean : nothing, - obs_colnames = obs_colnames = Symbol.(names(dat)), + observed_vars = Symbol.(names(dat)), nsamples = size(dat, 1), meanstructure = meanstructure, ) @@ -240,7 +288,7 @@ end # SemObservedData approx_cov = true, ) - @test_throws ErrorException samples(observed) + @test @inferred(samples(observed)) === nothing observed_nospec = SemObservedCovariance( specification = nothing, @@ -252,7 +300,7 @@ end # SemObservedData test_observed( observed_nospec, - dat, + nothing, nothing, dat_cov, dat_mean; @@ -260,32 +308,53 @@ end # SemObservedData approx_cov = true, ) - @test_throws ErrorException samples(observed_nospec) + @test @inferred(samples(observed_nospec)) === nothing - observed_shuffle = SemObservedCovariance( + # detect non-existing observed_var + @test_throws "ArgumentError: observed_var \"z1\"" SemObservedCovariance( + specification = wrong_spec, + obs_cov = shuffle_dat_cov, + observed_vars = shuffle_names, + nsamples = size(dat, 1), + ) + + # spec takes precedence in obs_vars order + observed_spec = SemObservedCovariance( specification = spec, obs_cov = shuffle_dat_cov, - obs_mean = meanstructure ? dat_mean[new_order] : nothing, - obs_colnames = shuffle_names, - nsamples = size(dat, 1); - meanstructure, + obs_mean = meanstructure ? shuffle_dat_mean : nothing, + observed_vars = shuffle_names, + nsamples = size(dat, 1), ) test_observed( - observed_shuffle, + observed_spec, dat, nothing, dat_cov, - dat_mean; + meanstructure ? dat_mean : nothing; meanstructure, approx_cov = true, ) - @test_throws ErrorException samples(observed_shuffle) + observed_shuffle = SemObservedCovariance( + specification = shuffle_spec, + obs_cov = shuffle_dat_cov, + obs_mean = meanstructure ? shuffle_dat_mean : nothing, + observed_vars = shuffle_names, + nsamples = size(dat, 1); + meanstructure, + ) - # respect specification order - @test @inferred(obs_cov(observed_shuffle)) ≈ obs_cov(observed) - @test @inferred(observed_vars(observed_shuffle)) == shuffle_names broken = true + test_observed( + observed_shuffle, + shuffle_dat, + nothing, + shuffle_dat_cov, + meanstructure ? shuffle_dat_mean : nothing; + meanstructure, + approx_cov = true, + ) end # meanstructure end # SemObservedCovariance @@ -294,38 +363,31 @@ end # SemObservedCovariance @testset "SemObservedMissing" begin # errors - @test_throws ArgumentError( - "You passed your data as a `DataFrame`, but also specified `obs_colnames`. " * - "Please make sure the column names of your data frame indicate the correct variables " * - "or pass your data in a different format.", - ) begin - SemObservedMissing( - specification = spec, - data = dat_missing, - obs_colnames = Symbol.(names(dat)), - ) - end + observed_redundant_names = SemObservedMissing( + specification = spec, + data = dat_missing, + observed_vars = Symbol.(names(dat)), + ) + @test observed_vars(observed_redundant_names) == Symbol.(names(dat)) - @test_throws ArgumentError( - "Your `data` can not be indexed by symbols. " * - "Maybe you forgot to provide column names via the `obs_colnames = ...` argument.", - ) begin - SemObservedMissing(specification = spec, data = dat_missing_matrix) - end + observed_spec_only = SemObservedMissing(specification = spec, data = dat_missing_matrix) + @test observed_vars(observed_spec_only) == observed_vars(spec) - @test_throws ArgumentError("please specify `obs_colnames` as a vector of Symbols") begin - SemObservedMissing( - specification = spec, - data = dat_missing_matrix, - obs_colnames = names(dat), - ) - end + observed_str_colnames = SemObservedMissing( + specification = spec, + data = dat_missing_matrix, + observed_vars = names(dat), + ) + @test observed_vars(observed_str_colnames) == Symbol.(names(dat)) @test_throws UndefKeywordError(:data) SemObservedMissing(specification = spec) - @test_throws UndefKeywordError(:specification) SemObservedMissing( - data = dat_missing_matrix, - ) + observed_no_names = SemObservedMissing(data = dat_missing_matrix) + @test observed_vars(observed_no_names) == Symbol.(:obs, 1:size(dat_missing_matrix, 2)) + + observed_no_names2 = + SemObservedMissing(data = dat_missing_matrix, observed_var_prefix = "observed_") + @test observed_vars(observed_no_names2) == Symbol.("observed_", 1:size(dat_matrix, 2)) @testset "meanstructure=$meanstructure" for meanstructure in (false, true) observed = @@ -340,13 +402,10 @@ end # SemObservedCovariance meanstructure, ) - @test @inferred(length(StructuralEquationModels.patterns(observed))) == 55 - @test sum(@inferred(StructuralEquationModels.pattern_nsamples(observed))) == + @test @inferred(length(observed.patterns)) == 55 + @test sum(@inferred(nsamples(pat)) for pat in observed.patterns) == size(dat_missing, 1) - @test all( - <=(size(dat_missing, 2)), - @inferred(StructuralEquationModels.pattern_nsamples(observed)) - ) + @test all(nsamples(pat) <= size(dat_missing, 2) for pat in observed.patterns) observed_nospec = SemObservedMissing( specification = nothing, @@ -356,7 +415,7 @@ end # SemObservedCovariance test_observed( observed_nospec, - dat_missing, + nothing, dat_missing_matrix, nothing, nothing; @@ -366,7 +425,7 @@ end # SemObservedCovariance observed_matrix = SemObservedMissing( specification = spec, data = dat_missing_matrix, - obs_colnames = Symbol.(names(dat)), + observed_vars = Symbol.(names(dat)), ) test_observed( @@ -378,11 +437,28 @@ end # SemObservedCovariance meanstructure, ) - observed_shuffle = - SemObservedMissing(specification = spec, data = shuffle_dat_missing) + # detect non-existing column + @test_throws "ArgumentError: column name \"z1\"" SemObservedMissing( + specification = wrong_spec, + data = shuffle_dat, + ) + + # detect non-existing observed_var + @test_throws "ArgumentError: observed_var \"z1\"" SemObservedMissing( + specification = wrong_spec, + data = shuffle_dat_missing_matrix, + observed_vars = shuffle_names, + ) + + # spec takes precedence in obs_vars order + observed_spec = SemObservedMissing( + specification = spec, + observed_vars = shuffle_names, + data = shuffle_dat_missing, + ) test_observed( - observed_shuffle, + observed_spec, dat_missing, dat_missing_matrix, nothing, @@ -390,16 +466,28 @@ end # SemObservedCovariance meanstructure, ) + observed_shuffle = + SemObservedMissing(specification = shuffle_spec, data = shuffle_dat_missing) + + test_observed( + observed_shuffle, + shuffle_dat_missing, + shuffle_dat_missing_matrix, + nothing, + nothing; + meanstructure, + ) + observed_matrix_shuffle = SemObservedMissing( - specification = spec, + specification = shuffle_spec, data = shuffle_dat_missing_matrix, - obs_colnames = shuffle_names, + observed_vars = shuffle_names, ) test_observed( observed_matrix_shuffle, - dat_missing, - dat_missing_matrix, + shuffle_dat_missing, + shuffle_dat_missing_matrix, nothing, nothing; meanstructure,