% rmisstastic_biblio.bib
% Encoding: UTF-8
@Article{abayomi_etal_JRSSC2008,
  Title = {Diagnostics for multivariate imputations},
  Author = {Abayomi, K. and Gelman, A. and Levy, M.},
  Journal = {Journal of the Royal Statistical Society, Series C (Applied Statistics)},
  Year = {2008},
  Number = {3},
  Pages = {273--291},
  Volume = {57},
  Abstract = {We consider three sorts of diagnostics for random imputations: displays of the completed data, which are intended to reveal unusual patterns that might suggest problems with the imputations, comparisons of the distributions of observed and imputed data values and checks of the fit of observed data to the model that is used to create the imputations. We formulate these methods in terms of sequential regression multivariate imputation, which is an iterative procedure in which the missing values of each variable are randomly imputed conditionally on all the other variables in the completed data matrix. We also consider a recalibration procedure for sequential regression imputations. We apply these methods to the 2002 environmental sustainability index, which is a linear aggregation of 64 environmental variables on 142 countries.},
  Doi = {10.1111/j.1467-9876.2007.00613.x},
  ISSN = {1467-9876},
  Keywords = {missing values; multiple imputation; multivariate statistics; sustainability; environmental statistics},
  Owner = {alyssa},
  Publisher = {Blackwell Publishing Ltd},
  Timestamp = {2017.11.08},
  Topics = {mi}
}
@Article{albert_follmann_B2000,
  Title = {Modeling repeated count data subject to informative dropout},
  Author = {Albert, P. S. and Follmann, D. A.},
  Journal = {Biometrics},
  Year = {2000},
  Number = {3},
  Pages = {667--677},
  Volume = {56},
  Abstract = {In certain diseases, outcome is the number of morbid events over the course of follow-up. In epilepsy, e.g., daily seizure counts are often used to reflect disease severity. Follow-up of patients in clinical trials of such diseases is often subject to censoring due to patients dying or dropping out. If the sicker patients tend to be censored in such trials, estimates of the treatment effect that do not incorporate the censoring process may be misleading. We extend the shared random effects approach of Wu and Carroll (1988, Biometrics 44, 175-188) to the setting of repeated counts of events. Three strategies are developed. The first is a likelihood-based approach for jointly modeling the count and censoring processes. A shared random effect is incorporated to introduce dependence between the two processes. The second is a likelihood-based approach that conditions on the dropout times in adjusting for informative dropout. The third is a generalized estimating equations (GEE) approach, which also conditions on the dropout times but makes fewer assumptions about the distribution of the count process. Estimation procedures for each of the approaches are discussed, and the approaches are applied to data from an epilepsy clinical trial. A simulation study is also conducted to compare the various approaches. Through analyses and simulations, we demonstrate the flexibility of the likelihood-based conditional model for analyzing data from the epilepsy trial.},
  Doi = {10.1111/j.0006-341X.2000.00667.x},
  ISSN = {0006341X, 15410420},
  Owner = {alyssa},
  Publisher = {[Wiley, International Biometric Society]},
  Timestamp = {2017.10.25},
  Topics = {mnar}
}
@Book{allison_MD2001,
  Title = {Missing Data},
  Author = {Allison, P. D.},
  Publisher = {Sage Publications},
  Year = {2001},
  Address = {Thousand Oaks, CA, USA},
  Series = {Quantitative Applications in the Social Sciences},
  Doi = {10.1136/bmj.38977.682025.2C},
  ISBN = {9780761916727},
  ISSN = {0959-8138},
  Internal-note = {NOTE(review): the Doi (10.1136/bmj...) and ISSN (0959-8138) are BMJ identifiers and appear to belong to a BMJ review of this book, not to the book itself -- verify and remove if so},
  Mendeley-groups = {missing data},
  Owner = {nathalie},
  Timestamp = {2017.03.06},
  Topics = {general}
}
@Article{andridge_little_ISR2010,
  Title = {A review of hot deck imputation for survey non-response},
  Author = {Andridge, R. and Little, R. J. A.},
  Journal = {International Statistical Review},
  Year = {2010},
  Number = {1},
  Pages = {40--64},
  Volume = {78},
  Abstract = {Hot deck imputation is a method for handling missing data in which each missing value is replaced with an observed response from a ``similar'' unit. Despite being used extensively in practice, the theory is not as well developed as that of other imputation methods. We have found that no consensus exists as to the best way to apply the hot deck and obtain inferences from the completed data set. Here we review different forms of the hot deck and existing research on its statistical properties. We describe applications of the hot deck currently in use, including the U.S. Census Bureau's hot deck for the Current Population Survey (CPS). We also provide an extended example of variations of the hot deck applied to the third National Health and Nutrition Examination Survey (NHANES III). Some potential areas for future research are highlighted.},
  Annote = {A review of Hot deck imputation for survey Non-response},
  Doi = {10.1111/j.1751-5823.2010.00103.x},
  Keywords = {item non-response; missing data; multiple imputation; variance estimation},
  Mendeley-groups = {missing data},
  Owner = {alyssa},
  Timestamp = {2016.09.27},
  Topics = {hot-deck}
}
@Article{audigier_etal_ADAC2016,
  Title = {A principal component method to impute missing values for mixed data},
  Author = {Audigier, V. and Husson, F. and Josse, J.},
  Journal = {Advances in Data Analysis and Classification},
  Year = {2016},
  Number = {1},
  Pages = {5--26},
  Volume = {10},
  Abstract = {We propose a new method to impute missing values in mixed data sets. It is based on a principal component method, the factorial analysis for mixed data, which balances the influence of all the variables that are continuous and categorical in the construction of the principal components. Because the imputation uses the principal axes and components, the prediction of the missing values is based on the similarity between individuals and on the relationships between variables. The properties of the method are illustrated via simulations and the quality of the imputation is assessed using real data sets. The method is compared to a recent method (Stekhoven and Buhlmann Bioinformatics 28:113_118, 2011) based on random forest and shows better performance especially for the imputation of categorical variables and situations with highly linear relationships between continuous variables.},
  Doi = {10.1007/s11634-014-0195-1},
  Keywords = {missing values; mixed data; imputation; principal component method; factorial analysis of mixed data},
  Owner = {alyssa},
  Timestamp = {2017.02.22},
  Topics = {factorial data analysis; imputation}
}
@Article{audigier_etal_JSCS2015,
  Title = {Multiple imputation for continuous variables using a {B}ayesian principal component analysis},
  Author = {Audigier, V. and Husson, F. and Josse, J.},
  Journal = {Journal of Statistical Computation and Simulation},
  Year = {2015},
  Number = {11},
  Pages = {2140--2156},
  Volume = {86},
  Abstract = {We propose a multiple imputation method based on principal component analysis (PCA) to deal with incomplete continuous data. To reflect the uncertainty of the parameters from one imputation to the next, we use a Bayesian treatment of the PCA model. Using a simulation study and real data sets, the method is compared to two classical approaches: multiple imputation based on joint modelling and on fully conditional modelling. Contrary to the others, the proposed method can be easily used on data sets where the number of individuals is less than the number of variables and when the variables are highly correlated. In addition, it provides unbiased point estimates of quantities of interest, such as an expectation, a regression coefficient or a correlation coefficient, with a smaller mean squared error. Furthermore, the widths of the confidence intervals built for the quantities of interest are often smaller whilst ensuring a valid coverage.},
  Doi = {10.1080/00949655.2015.1104683},
  Keywords = {missing values; continuous data; multiple imputation; bayesian principal component analysis; data augmentation},
  Owner = {alyssa},
  Timestamp = {2017.02.23},
  Topics = {factorial data analysis; multiple imputation}
}
@Article{audigier_etal_SC2016,
  Title = {{MIMCA}: multiple imputation for categorical variables with multiple correspondence analysis},
  Author = {Audigier, V. and Husson, F. and Josse, J.},
  Journal = {Statistics and Computing},
  Year = {2016},
  Number = {2},
  Pages = {1--18},
  Volume = {27},
  Abstract = {We propose a multiple imputation method to deal with incomplete categorical data. This method imputes the missing entries using the principal components method dedicated to categorical data: multiple correspondence analysis (MCA). The uncertainty concerning the parameters of the imputation model is reflected using a non-parametric bootstrap. Multiple imputation using MCA (MIMCA) requires estimating a small number of parameters due to the dimensionality reduction property of MCA. It allows the user to impute a large range of data sets. In particular, a high number of categories per variable, a high number of variables or a small number of individuals are not an issue for MIMCA. Through a simulation study based on real data sets, the method is assessed and compared to the reference methods (multiple imputation using the loglinear model, multiple imputation by logistic regressions) as well to the latest works on the topic (multiple imputation by random forests or by the Dirichlet process mixture of products of multinomial distributions model). The proposed method shows good performances in terms of bias and coverage for an analysis model such as a main effects logistic regression model. In addition, MIMCA has the great advantage that it is substantially less time consuming on data sets of high dimensions than the other multiple imputation methods.},
  Archiveprefix = {arXiv},
  Arxivid = {1505.08116},
  Doi = {10.1007/s11222-016-9635-4},
  Eprint = {1505.08116},
  ISSN = {15731375},
  Keywords = {bootstrap; categorical data; missing values; multiple correspondence analysis; multiple imputation},
  Owner = {alyssa},
  Publisher = {Springer US},
  Timestamp = {2017.07.06},
  Topics = {factorial data analysis; multiple imputation}
}
@Article{bang_robins_B2005,
  Title = {Doubly robust estimation in missing data and causal inference models},
  Author = {Bang, H. and Robins, J. M.},
  Journal = {Biometrics},
  Year = {2005},
  Number = {4},
  Pages = {962--973},
  Volume = {61},
  Abstract = {The goal of this article is to construct doubly robust (DR) estimators in ignorable missing data and causal inference models. In a missing data model, an estimator is DR if it remains consistent when either (but not necessarily both) a model for the missingness mechanism or a model for the distribution of the complete data is correctly specified. Because with observational data one can never be sure that either a missingness model or a complete data model is correct, perhaps the best that can be hoped for is to find a DR estimator. DR estimators, in contrast to standard likelihood-based or (nonaugmented) inverse probability-weighted estimators, give the analyst two chances, instead of only one, to make a valid inference. In a causal inference model, an estimator is DR if it remains consistent when either a model for the treatment assignment mechanism or a model for the distribution of the counterfactual data is correctly specified. Because with observational data one can never be sure that a model for the treatment assignment mechanism or a model for the counterfactual data is correct, inference based on DR estimators should improve upon previous approaches. Indeed, we present the results of simulation studies which demonstrate that the finite sample performance of DR estimators is as impressive as theory would predict. The proposed method is applied to a cardiovascular clinical trial.},
  Doi = {10.1111/j.1541-0420.2005.00377.x},
  ISSN = {0006341X},
  Keywords = {causal inference; doubly robust estimation; longitudinal data; marginal structural model; missing data; semiparametrics},
  Owner = {alyssa},
  Pmid = {16401269},
  Timestamp = {2017.05.29},
  Topics = {causal inference}
}
@Article{baraldi_enders_JSP2010,
  Title = {An introduction to modern missing data analysis},
  Author = {Baraldi, A. N. and Enders, C. K.},
  Journal = {Journal of School Psychology},
  Year = {2010},
  Number = {1},
  Pages = {5--37},
  Volume = {48},
  Abstract = {A great deal of recent methodological research has focused on two modern missing data analysis methods: maximum likelihood and multiple imputation. These approaches are advantageous to traditional techniques (e.g. deletion and mean imputation techniques) because they require less stringent assumptions and mitigate the pitfalls of traditional techniques. This article explains the theoretical underpinnings of missing data analyses, gives an overview of traditional missing data techniques, and provides accessible descriptions of maximum likelihood and multiple imputation. In particular, this article focuses on maximum likelihood estimation and presents two analysis examples from the Longitudinal Study of American Youth data. One of these examples includes a description of the use of auxiliary variables. Finally, the paper illustrates ways that researchers can use intentional, or planned, missing data to enhance their research designs.},
  Doi = {10.1016/j.jsp.2009.10.001},
  Keywords = {missing data; multiple imputation; maximum likelihood; planned missingness},
  Owner = {alyssa},
  Timestamp = {2017.02.21},
  Topics = {general_informal}
}
@Article{baretta_santaniello_BMCMIDM2016,
  Title     = {Nearest neighbor imputation algorithms: a critical evaluation},
  Author    = {Baretta, L. and Santaniello, A.},
  Journal   = {BMC Medical Informatics and Decision Making},
  Year      = {2016},
  Number    = {Supp. 3},
  Pages     = {74},
  Volume    = {16},
  Abstract  = {Background Nearest neighbor (NN) imputation algorithms are efficient methods to fill in missing data where each missing value on some records is replaced by a value obtained from related cases in the whole set of records. Besides the capability to substitute the missing data with plausible values that are as close as possible to the true value, imputation algorithms should preserve the original data structure and avoid to distort the distribution of the imputed variable. Despite the efficiency of NN algorithms little is known about the effect of these methods on data structure. Methods Simulation on synthetic datasets with different patterns and degrees of missingness were conducted to evaluate the performance of NN with one single neighbor (1NN) and with k neighbors without (kNN) or with weighting (wkNN) in the context of different learning frameworks: plain set, reduced set after ReliefF filtering, bagging, random choice of attributes, bagging combined with random choice of attributes (Random-Forest-like method). Results Whatever the framework, kNN usually outperformed 1NN in terms of precision of imputation and reduced errors in inferential statistics, 1NN was however the only method capable of preserving the data structure and data were distorted even when small values of k neighbors were considered; distortion was more severe for resampling schemas. Conclusions The use of three neighbors in conjunction with ReliefF seems to provide the best trade-off between imputation error and preservation of the data structure. The very same conclusions can be drawn when imputation experiments were conducted on the single proton emission computed tomography (SPECTF) heart dataset after introduction of missing data completely at random.},
  Doi       = {10.1186/s12911-016-0318-z},
  Keywords  = {near neighbour; imputation method; imputation algorithm; near neighbour algorithm; Minkowski norm},
  Owner     = {nathalie},
  Series    = {Proceedings of the 5th Translational Bioinformatics Conference (TBC 2015): medical informatics and decision making},
  Timestamp = {2018.05.17},
  Topics    = {knn}
}
@Article{bartlett_etal_2015,
  Title = {Asymptotically unbiased estimation of exposure odds ratios in complete records logistic regression},
  Author = {Bartlett, Jonathan W and Harel, Ofer and Carpenter, James R},
  Journal = {American Journal of Epidemiology},
  Year = {2015},
  Number = {8},
  Pages = {730--736},
  Volume = {182},
  Abstract = {Missing data are a commonly occurring threat to the validity and efficiency of epidemiologic studies. Perhaps the most common approach to handling missing data is to simply drop those records with 1 or more missing values, in so-called “complete records” or “complete case” analysis. In this paper, we bring together earlier-derived yet perhaps now somewhat neglected results which show that a logistic regression complete records analysis can provide asymptotically unbiased estimates of the association of an exposure of interest with an outcome, adjusted for a number of confounders, under a surprisingly wide range of missing-data assumptions. We give detailed guidance describing how the observed data can be used to judge the plausibility of these assumptions. The results mean that in large epidemiologic studies which are affected by missing data and analyzed by logistic regression, exposure associations may be estimated without bias in a number of settings where researchers might otherwise assume that bias would occur.},
  Doi = {10.1093/aje/kwv114},
  Keywords = {complete case analysis; logistic regression; missing data; odds ratio},
  Owner = {imke},
  Publisher = {Oxford University Press},
  Timestamp = {2019.04.01},
  Topics = {causal inference}
}
@Article{beaulac_rosenthal_2018,
  Title = {{BEST}: A decision tree algorithm that handles missing values},
  Author = {Beaulac, C{\'e}dric and Rosenthal, Jeffrey S},
  Journal = {arXiv preprint},
  Archiveprefix = {arXiv},
  Eprint = {1804.10168},
  Year = {2018},
  Url = {https://arxiv.org/pdf/1804.10168.pdf},
  Abstract = {The main contribution of this paper is the development of a new decision tree algorithm. The proposed approach allows users to guide the algorithm through the data partitioning process. We believe this feature has many applications but in this paper we demonstrate how to utilize this algorithm to analyse data sets containing missing values. We tested our algorithm against simulated data sets with various missing data structures and a real data set. The results demonstrate that this new classification procedure efficiently handles missing values and produces results that are slightly more accurate and more interpretable than most common procedures without any imputations or pre-processing.},
  Keywords = {cart; machine learning; variable importance analysis},
  Owner = {imke},
  Timestamp = {2019.12.12},
  Topics = {random forests; regression trees; variable selection}
}
@InProceedings{bengio_gingras_1995,
  Title = {Recurrent neural networks for missing or asynchronous data},
  Author = {Bengio, Y. and Gingras, F.},
  Booktitle = {Proceedings of the 8th International Conference on Neural Information Processing Systems},
  Pages = {395--401},
  Year = {1995},
  Address = {Cambridge, MA, USA},
  Eventdate = {1995-11-27/1995-12-02},
  Publisher = {MIT Press},
  Abstract = {In this paper we propose recurrent neural networks with feedback into the input units for handling two types of data analysis problems. On the one hand, this scheme can be used for static data when some of the input variables are missing. On the other hand, it can also be used for sequential data, when some of the input variables are missing or are available at different frequencies. Unlike in the case of probabilistic models (e.g. Gaussian) of the missing variables, the network does not attempt to model the distribution of the missing variables given the observed variables. Instead it is a more ``discriminant'' approach that fills in the missing variables for the sole purpose of minimizing a learning criterion (e.g., to minimize an output error).},
  Url = {http://papers.nips.cc/paper/1126-recurrent-neural-networks-for-missing-or-asynchronous-data.pdf},
  Owner = {imke},
  Timestamp = {2018.11.08},
  Keywords = {machine learning; deep learning; rnn; sequential data},
  Topics = {deep learning; rnn}
}
@Article{bertsimas_etal_2017,
  Title     = {From predictive methods to missing data imputation: an optimization approach},
  Author    = {Bertsimas, Dimitris and Pawlowski, Colin and Zhuo, Ying Daisy},
  Journal   = {The Journal of Machine Learning Research},
  Year      = {2017},
  Number    = {1},
  Pages     = {7133--7171},
  Volume    = {18},
  Abstract  = {Missing data is a common problem in real-world settings and for this reason has attracted significant attention in the statistical literature. We propose a flexible framework based on formal optimization to impute missing data with mixed continuous and categorical variables. This framework can readily incorporate various predictive models including K nearest neighbors, support vector machines, and decision tree based methods, and can be adapted for multiple imputation. We derive fast first-order methods that obtain high quality solutions in seconds following a general imputation algorithm opt.impute presented in this paper. We demonstrate that our proposed method improves out-of-sample accuracy in large-scale computational experiments across a sample of 84 data sets taken from the UCI Machine Learning Repository. In all scenarios of missing at random mechanisms and various missing percentages, opt.impute produces the best overall imputation in most data sets benchmarked against five other methods: mean impute, K-nearest neighbors, iterative knn, Bayesian PCA, and predictive-mean matching, with an average reduction in mean absolute error of 8.3\% against the best cross-validated benchmark method. Moreover, opt.impute leads to improved out-of-sample performance of learning algorithms trained using the imputed data, demonstrated by computational experiments on 10 downstream tasks. For models trained using opt.impute single imputations with 50\% data missing, the average out-of-sample R2 is 0.339 in the regression tasks and the average out-of-sample accuracy is 86.1\% in the classification tasks, compared to 0.315 and 84.4\% for the best cross-validated benchmark method. In the multiple imputation setting, downstream models trained using opt.impute obtain a statistically significant improvement over models trained using multivariate imputation by chained equations (mice) in 8/10 missing data scenarios considered.},
  Keywords  = {missing data imputation; K-NN; SVM; optimal decision trees},
  Owner     = {imke},
  Publisher = {JMLR.org},
  Timestamp = {2019.12.12},
  Topics    = {imputation; knn; decision trees}
}
@Article{beunckens_etal_2008,
  Title = {A latent-class mixture model for incomplete longitudinal {Gaussian} data},
  Author = {Beunckens, Caroline and Molenberghs, Geert and Verbeke, Geert and Mallinckrodt, Craig},
  Journal = {Biometrics},
  Year = {2008},
  Number = {1},
  Pages = {96--105},
  Volume = {64},
  Abstract = {In the analyses of incomplete longitudinal clinical trial data, there has been a shift, away from simple methods that are valid only if the data are missing completely at random, to more principled ignorable analyses, which are valid under the less restrictive missing at random assumption. The availability of the necessary standard statistical software nowadays allows for such analyses in practice. While the possibility of data missing not at random (MNAR) cannot be ruled out, it is argued that analyses valid under MNAR are not well suited for the primary analysis in clinical trials. Rather than either forgetting about or blindly shifting to an MNAR framework, the optimal place for MNAR analyses is within a sensitivity‐analysis context. One such route for sensitivity analysis is to consider, next to selection models, pattern‐mixture models or shared‐parameter models. The latter can also be extended to a latent‐class mixture model, the approach taken in this article. The performance of the so‐obtained flexible model is assessed through simulations and the model is applied to data from a depression trial.},
  Doi = {10.1111/j.1541-0420.2007.00837.x},
  Publisher = {Wiley Online Library},
  Topics = {mnar},
  Owner = {aude},
  Timestamp = {2021.01.20}
}
@Article{bianchi_etal_2019,
  Title     = {Learning representations of multivariate time series with missing data},
  Author    = {Bianchi, Filippo Maria and Livi, Lorenzo and Mikalsen, Karl {\O}yvind and Kampffmeyer, Michael and Jenssen, Robert},
  Journal   = {Pattern Recognition},
  Year      = {2019},
  Pages     = {106973},
  Volume    = {96},
  Abstract  = {Learning compressed representations of multivariate time series (MTS) facilitates data analysis in the presence of noise and redundant information, and for a large number of variates and time steps. However, classical dimensionality reduction approaches are designed for vectorial data and cannot deal explicitly with missing values. In this work, we propose a novel autoencoder architecture based on recurrent neural networks to generate compressed representations of MTS. The proposed model can process inputs characterized by variable lengths and it is specifically designed to handle missing data. Our autoencoder learns fixed-length vectorial representations, whose pairwise similarities are aligned to a kernel function that operates in input space and that handles missing values. This allows to learn good representations, even in the presence of a significant amount of missing data. To show the effectiveness of the proposed approach, we evaluate the quality of the learned representations in several classification tasks, including those involving medical data, and we compare to other methods for dimensionality reduction. Successively, we design two frameworks based on the proposed architecture: one for imputing missing data and another for one-class classification. Finally, we analyze under what circumstances an autoencoder with recurrent layers can learn better compressed representations of MTS than feed-forward architectures.},
  Doi       = {10.1016/j.patcog.2019.106973},
  Keywords  = {Representation learning; Multivariate time series; Autoencoders; Recurrent neural networks; Kernel methods},
  Owner     = {imke},
  Publisher = {Elsevier},
  Timestamp = {2019.12.12},
  Topics    = {time series; deep learning; neural network}
}
@InProceedings{biessmann_CIKM2018,
  Title = {{``Deep''} Learning for Missing Value Imputation in Tables with Non-Numerical Data},
  Author = {Biessmann, F. and Salinas, D. and Schelter, S. and Schmidt, P. and Lange, D.},
  Booktitle = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management},
  Series = {CIKM '18},
  Year = {2018},
  ISBN = {978-1-4503-6014-2},
  Location = {Torino, Italy},
  Pages = {2017--2025},
  Url = {http://doi.acm.org/10.1145/3269206.3272005},
  Doi = {10.1145/3269206.3272005},
  Publisher = {ACM},
  Address = {New York, NY, USA},
  Abstract = {The success of applications that process data critically depends on the quality of the ingested data. Completeness of a data source is essential in many cases. Yet, most missing value imputation approaches suffer from severe limitations. They are almost exclusively restricted to numerical data, and they either offer only simple imputation methods or are difficult to scale and maintain in production. Here we present a robust and scalable approach to imputation that extends to tables with non-numerical values, including unstructured text data in diverse languages. Experiments on public data sets as well as data sets sampled from a large product catalog in different languages (English and Japanese) demonstrate that the proposed approach is both scalable and yields more accurate imputations than previous approaches. Training on data sets with several million rows is a matter of minutes on a single machine. With a median imputation F1 score of 0.93 across a broad selection of data sets our approach achieves on average a 23-fold improvement compared to mode imputation. While our system allows users to apply state-of-the-art deep learning models if needed, we find that often simple linear n-gram models perform on par with deep learning methods at a much lower operational cost. The proposed method learns all parameters of the entire imputation pipeline automatically in an end-to-end fashion, rendering it attractive as a generic plugin both for engineers in charge of data pipelines where data completeness is relevant, as well as for practitioners without expertise in machine learning who need to impute missing values in tables with non-numerical data.},
  Owner = {imke},
  Timestamp = {2018.12.18},
  Keywords = {data cleaning; missing value imputation},
  Topics = {deep learning; neural networks}
}
@Article{blake_etal_2019,
Title = {Propensity scores using missingness pattern information: a practical guide},
Author = {Blake, Helen A. and Leyrat, Cl{\'e}mence and Mansfield, Kate and Seaman, Shaun and Tomlinson, Laurie and Carpenter, James and Williamson, Elizabeth},
Year = {2019},
Journal = {arXiv preprint},
archivePrefix = {arXiv},
eprint = {1901.03981},
primaryClass = {stat.ME},
Abstract = {Electronic health records are a valuable data source for investigating health-related questions, and propensity score analysis has become an increasingly popular approach to address confounding bias in such investigations. However, because electronic health records are typically routinely recorded as part of standard clinical care, there are often missing values, particularly for potential confounders. In our motivating study -- using electronic health records to investigate the effect of renin-angiotensin system blockers on the risk of acute kidney injury -- two key confounders, ethnicity and chronic kidney disease stage, have 59% and 53% missing data, respectively.
The missingness pattern approach (MPA), a variant of the missing indicator approach, has been proposed as a method for handling partially observed confounders in propensity score analysis. In the MPA, propensity scores are estimated separately for each missingness pattern present in the data. Although the assumptions underlying the validity of the MPA are stated in the literature, it can be difficult in practice to assess their plausibility.
In this paper, we explore the MPA's underlying assumptions by using causal diagrams to assess their plausibility in a range of simple scenarios, drawing general conclusions about situations in which they are likely to be violated. We present a framework providing practical guidance for assessing whether the MPA's assumptions are plausible in a particular setting and thus deciding when the MPA is appropriate. We apply our framework to our motivating study, showing that the MPA's underlying assumptions appear reasonable, and we demonstrate the application of MPA to this study.},
Keywords = {Electronic health records; Missing confounder data; Missing indicator; Missingness pattern; Propensity score analysis},
Url = {https://researchonline.lshtm.ac.uk/4651159/1/1901.03981v1.pdf},
Owner = {imke},
Timestamp = {2019.02.13},
Topics = {causal inference}
}
@Article{brinis_etal_2019,
Title = {Hollow-tree: a metric access method for data with missing values},
Author = {Brinis, Safia and Traina, Caetano and Traina, Agma JM},
Journal = {Journal of Intelligent Information Systems},
Pages = {1--28},
Year = {2019},
Publisher = {Springer},
DOI = {10.1007/s10844-019-00567-8},
Abstract = {Similarity search is fundamental to store and retrieve large volumes of complex data required by many real world applications. A useful mechanism for such concept is the query-by-similarity. Based on their topological properties, metric similarity functions can be used to index sets of data which can be queried effectively and efficiently by the so-called metric access methods. However, data produced by various application domains and the varying data types handled often lead to missing data, hence, they do not follow the metric similarity requirements. As a consequence, missing data cause distortions in the index structure and yield bias in the query answer. In this paper, we propose the Hollow-tree, a novel access method aimed at successfully retrieving data with missing attribute values. It employs new strategies for indexing and searching data elements, capable of handling the missing data issues when the cause of missingness is ignorable. The indexing strategy is based on a family of distance functions that allow measuring the distance between elements with missing values, along with a set of policies able to organize the elements in the index without causing distortions to its internal structure. The searching strategy employs fractal dimension property of the data to achieve accurate query answer while considering data with missing values part of the response. Results from experiments performed on a variety of real and synthetic data sets showed that, while other metric access methods deteriorate with small amounts of missing values, the Hollow-tree maintains a remarkable performance with almost 100\% of precision and recall for range queries and more than 90\% for k-nearest neighbor queries, for up to 40\% of missing values.},
Keywords = {Missing at random; Similarity search; Fractal dimension},
Owner = {imke},
Timestamp = {2019.12.12},
Topics = {classification; knn; clustering}
}
@Article{buck_JRSSB1960,
Title = {A method of estimation of missing values in multivariate data suitable for use with an electronic computer},
Author = {Buck, S. F.},
Journal = {Journal of the Royal Statistical Society, Series B},
Year = {1960},
Pages = {302--306},
Volume = {22},
Number = {2},
Doi = {10.1111/j.2517-6161.1960.tb00375.x},
Owner = {nathalie},
Timestamp = {2016.09.28},
Topics = {survey}
}
@InProceedings{burns_ARC1990,
Title = {Multiple and replicate item imputation in a complex sample survey},
Author = {Burns, R. M.},
Booktitle = {Proceedings of the 6th Annual Research Conference},
Year = {1990},
Address = {Washington DC, USA},
Editor = {{Bureau of the Census}},
Pages = {655--665},
Owner = {nathalie},
Timestamp = {2018.06.06}
}
@Article{candes_etal_IEEETSP2013,
Title = {Unbiased risk estimates for singular value thresholding and spectral estimators},
Author = {Cand{\`e}s, E. J. and Sing-Long, C. A. and Trzasko, J. D.},
Journal = {IEEE Transactions on Signal Processing},
Year = {2013},
Number = {19},
Pages = {4643--4657},
Volume = {61},
Abstract = {In an increasing number of applications, it is of interest to recover an approximately low-rank data matrix from noisy observations. This paper develops an unbiased risk estimate -- holding in a Gaussian model -- for any spectral estimator obeying some mild regularity assumptions. In particular, we give an unbiased risk estimate formula for singular value thresholding (SVT), a popular estimation strategy that applies a soft-thresholding rule to the singular values of the noisy observations. Among other things, our formulas offer a principled and automated way of selecting regularization parameters in a variety of problems. In particular, we demonstrate the utility of the unbiased risk estimation for SVT-based denoising of real clinical cardiac MRI series data. We also give new results concerning the differentiability of certain matrix-valued functions.},
Doi = {10.1109/TSP.2013.2270464},
Owner = {nathalie},
Timestamp = {2018.05.09},
Topics = {factorial data analysis; misc}
}
@article{carpenter_etal_JRSS2006,
Title = {A comparison of multiple imputation and doubly robust estimation for analyses with missing data},
Author = {Carpenter, James R. and Kenward, Michael G. and Vansteelandt, Stijn},
Journal = {Journal of the Royal Statistical Society: Series A (Statistics in Society)},
Volume = {169},
Number = {3},
Pages = {571--584},
Year = {2006},
Abstract = {Multiple imputation is now a well-established technique for analysing data sets where some units have incomplete observations. Provided that the imputation model is correct, the resulting estimates are consistent. An alternative, weighting by the inverse probability of observing complete data on a unit, is conceptually simple and involves fewer modelling assumptions, but it is known to be both inefficient (relative to a fully parametric approach) and sensitive to the choice of weighting model. Over the last decade, there has been a considerable body of theoretical work to improve the performance of inverse probability weighting, leading to the development of ‘doubly robust’ or ‘doubly protected’ estimators. We present an intuitive review of these developments and contrast these estimators with multiple imputation from both a theoretical and a practical viewpoint.},
Keywords = {Double robustness; Inverse probability weighting; Missing at random; Multiple imputation},
Doi = {10.1111/j.1467-985X.2006.00407.x},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {ipw; mi}
}
@Book{carpenter_kenward_MIA2013,
Title = {Multiple Imputation and its Application},
Author = {Carpenter, J. and Kenward, M.},
Publisher = {Wiley},
Year = {2013},
Address = {Chichester, West Sussex, UK},
Abstract = {A practical guide to analysing partially observed data. Collecting, analysing and drawing inferences from data is central to research in the medical and social sciences. Unfortunately, it is rarely possible to collect all the intended data. The literature on inference from the resulting incomplete data is now huge, and continues to grow both as methods are developed for large and complex data structures, and as increasing computer power and suitable software enable researchers to apply these methods. This book focuses on a particular statistical method for analysing and drawing inferences from incomplete data, called Multiple Imputation (MI). MI is attractive because it is both practical and widely applicable. The authors aim is to clarify the issues raised by missing data, describing the rationale for MI, the relationship between the various imputation models and associated algorithms and its application to increasingly complex data structures.},
Doi = {10.1002/9781119942283},
ISBN = {9780470740521},
Owner = {alyssa},
Timestamp = {2017.04.11},
Topics = {multiple imputation; general}
}
@InProceedings{chen_guestrin_2016,
Title = {{XGBoost}: A Scalable Tree Boosting System},
Author = {Chen, T. and Guestrin, C.},
Booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
Year = {2016},
Editor = {-},
Address = {New York, NY, USA},
Pages = {785--794},
Publisher = {ACM},
Abstract = {Tree boosting is a highly effective and widely used machine learning method. In this paper, we describe a scalable end-to-end tree boosting system called XGBoost, which is used widely by data scientists to achieve state-of-the-art results on many machine learning challenges. We propose a novel sparsity-aware algorithm for sparse data and weighted quantile sketch for approximate tree learning. More importantly, we provide insights on cache access patterns, data compression and sharding to build a scalable tree boosting system. By combining these insights, XGBoost scales beyond billions of examples using far fewer resources than existing systems.},
Doi = {10.1145/2939672.2939785},
Eventdate = {2016-08-13/2016-08-17},
ISBN = {978-1-4503-4232-2},
Owner = {imke},
Timestamp = {2018.10.30},
Keywords = {large-scale machine learning},
Topics = {random forests}
}
@Article{chen_reiter_2019,
Title = {Nonparametric Pattern-Mixture Models for Inference with Missing Data},
Author = {Chen, Yen-Chi and Sadinle, Mauricio},
Journal = {arXiv preprint},
archivePrefix = {arXiv},
eprint = {1904.11085},
Year = {2019},
primaryClass = {stat.ME},
Url = {https://arxiv.org/pdf/1904.11085.pdf},
Abstract = {Pattern-mixture models provide a transparent approach for handling missing data, where the full-data distribution is factorized in a way that explicitly shows the parts that can be estimated from observed data alone, and the parts that require identifying restrictions. We introduce a nonparametric estimator of the full-data distribution based on the pattern-mixture model factorization. Our approach uses the empirical observed-data distribution and augments it with a nonparametric estimator of the missing-data distributions under a given identifying restriction. Our results apply to a large class of donor-based identifying restrictions that encompasses commonly used ones and can handle
both monotone and nonmonotone missingness. We propose a Monte Carlo procedure to derive point estimates of functionals of interest, and the bootstrap to construct confidence intervals.},
Keywords = {Bootstrap; Missingness mechanism; Nonignorable nonresponse; Nonparametric identification; Nonparametric inference},
Owner = {imke},
Timestamp = {2019.12.12},
Topics = {mnar}
}
@Article{chen_shao_JOS2000,
Title = {Nearest neighbor imputation for survey data},
Author = {Chen, J. and Shao, J.},
Journal = {Journal of Official Statistics},
Year = {2000},
Number = {2},
Pages = {113--131},
Volume = {16},
Abstract = {Nearest neighbor imputation is one of the hot deck methods used to compensate for nonresponse in sample surveys. Although it has a long history of application, few theoretical properties of the nearest neighbor imputation method are known prior to the current article. We show that under some conditions, the nearest neighbor imputation method provides asymptotically unbiased and consistent estimators of functions of population means (or totals), population distributions, and population quantiles. We also derive the asymptotic variances for estimators based on nearest neighbor imputation and consistent estimators of these asymptotic variances. Some simulation results show that the estimators based on nearest neighbor imputation and the proposed variance estimators have good performances.},
ISSN = {0282-423X},
Keywords = {biases; hot deck; quantiles; sample means; variance estimation},
Mendeley-groups = {missing data},
Owner = {alyssa},
Timestamp = {2016.09.27},
Topics = {knn},
Url = {http://www.jos.nu/Articles/abstract.asp?article=162113}
}
@Article{collins_etal_PM2007,
Title = {A comparison of inclusive and restrictive strategies in modern missing data procedures},
Author = {Collins, L. M. and Schafer, J. L. and Kam, C.-M.},
Journal = {Psychological Methods},
Year = {2001},
Number = {4},
Pages = {330--351},
Volume = {6},
Abstract = {Two classes of modem missing data procedures, maximum likelihood (ML) and multiple imputation (MI), tend to yield similar results when implemented in comparable ways. In either approach, it is possible to include auxiliary variables solely for the purpose of improving the missing data procedure. A simulation was presented to assess the potential costs and benefits of a restrictive strategy, which makes minimal use of auxiliary variables, versus an inclusive strategy, which makes liberal use of such variables. The simulation showed that the inclusive strategy is to be greatly preferred. With an inclusive strategy not only is there a reduced chance of inadvertently omitting an important cause of missingness, there is also the possibility of noticeable gains in terms of increased efficiency and reduced bias, with only minor costs. As implemented in currently available software, the ML approach tends to encourage the use of a restrictive strategy, whereas the MI approach makes it relatively simple to use an inclusive strategy.},
Doi = {10.1037/1082-989X.6.4.330},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {multiple imputation; ml}
}
@Article{cranmer_gill_BJPS2012,
Title = {We have to be discrete about this: a non-parametric imputation technique for missing categorical data},
Author = {Cranmer, S. J. and Gill, J.},
Journal = {British Journal of Political Science},
Year = {2012},
Pages = {425--449},
Volume = {43},
Abstract = {Missing values are a frequent problem in empirical political science research. Surprisingly, the match between the measurement of the missing values and the correcting algorithms applied is seldom studied. While multiple imputation is a vast improvement over the deletion of cases with missing values, it is often unsuitable for imputing highly non-granular discrete data. We develop a simple technique for imputing missing values in such situations, which is a variant of hot deck imputation, drawing from the conditional distribution of the variable with missing values to preserve the discrete measure of the variable. This method is tested against existing techniques using Monte Carlo analysis and then applied to real data on democratization and modernization theory. Software for our imputation technique is provided in a free, easy-to-use package for the R statistical environment.},
Doi = {10.1017/S0007123412000312},
Owner = {nathalie},
Timestamp = {2016.02.15},
Topics = {knn; imputation}
}
@Article{crookston_finley_JSS2008,
Title = {{yaImpute}: an {R} package for {kNN} imputation},
Author = {Crookston, N. L. and Finley, A. O.},
Journal = {Journal of Statistical Software},
Year = {2008},
Number = {10},
Volume = {23},
Abstract = {This article introduces yaImpute, an R package for nearest neighbor search and imputation. Although nearest neighbor imputation is used in a host of disciplines, the methods implemented in the yaImpute package are tailored to imputation-based forest attribute estimation and mapping. The impetus to writing the yaImpute is a growing interest in nearest neighbor imputation methods for spatially explicit forest inventory, and a need within this research community for software that facilitates comparison among different nearest neighbor search algorithms and subsequent imputation techniques. yaImpute provides directives for defining the search space, subsequent distance calculation, and imputation rules for a given number of nearest neighbors. Further, the package offers a suite of diagnostics for comparison among results generated from different imputation analyses and a set of functions for mapping imputation results.},
Doi = {10.18637/jss.v023.i10},
Owner = {nathalie},
Timestamp = {2017.10.09},
Topics = {knn; imputation}
}
@Article{dax_2014,
Title = {Imputing Missing Entries of a Data Matrix: A review},
Author = {Dax, A.},
Journal = {Journal of Advanced Computing},
Year = {2014},
Pages = {98--222},
Volume = {3},
Number = {3},
Abstract = {This review presents a practical summary of the missing data literature, including a sketch of missing data theory and descriptions of normal-model multiple imputation (MI) and maximum likelihood methods. Practical missing data analysis issues are discussed, most notably the inclusion of auxiliary variables for improving power and reducing bias. Solutions are given for missing data challenges such as handling longitudinal, categorical, and clustered data with normal-model MI; including interactions in the missing data model; and handling large numbers of variables. The discussion of attrition and nonignorable missingness emphasizes the need for longitudinal diagnostics and for reducing the uncertainty about the missing data mechanism under attrition. Strategies suggested for reducing attrition bias include using auxiliary variables, collecting follow-up data on a sample of those initially missing, and collecting data on intent to drop out. Suggestions are given for moving forward with research on missing data and attrition.},
Doi = {10.7726/jac.2014.1007},
Keywords = {imputation; missing data; matrix completion problems; low-rank approximations; nearest neighbors; iterative SVD; least squares methods; rank minimization; nuclear norm minimization; error assessment; training set; probe set; cross-validation; rank determination},
Owner = {imke},
Timestamp = {2018.11.07},
Topics = {general_informal; knn; imputation}
}
@Article{dempster_etal_JRSSB1977,
Title = {Maximum likelihood from incomplete data via the {EM} algorithm},
Author = {Dempster, A. P. and Laird, N. M. and Rubin, D. B.},
Journal = {Journal of the Royal Statistical Society, Series B (Methodological)},
Year = {1977},
Number = {1},
Pages = {1--38},
Volume = {39},
Keywords = {maximum likelihood estimation; statistical variance; statism; factor analysis; algorithms; estimation methods; missing data; censored data; perceptron convergence procedure},
Owner = {nathalie},
Timestamp = {2018.05.11},
Topics = {ML},
Url = {http://www.jstor.org/stable/2984875}
}
@Article{diggle_kenward_AP1994,
Title = {Informative drop-out in longitudinal data analysis},
Author = {Diggle, P. and Kenward, M. G.},
Journal = {Journal of the Royal Statistical Society, Series C (Applied Statistics)},
Year = {1994},
Number = {1},
Pages = {49--93},
Volume = {43},
Abstract = {A model is proposed for continuous longitudinal data with non-ignorable or informative drop-out (ID). The model combines a multivariate linear model for the underlying response with a logistic regression model for the drop-out process. The latter incorporates dependence of the probability of drop-out on unobserved, or missing, observations. Parameters in the model are estimated by using maximum likelihood (ML) and inferences drawn through conventional likelihood procedures. In particular, likelihood ratio tests can be used to assess the informativeness of the drop-out process through comparison of the full model with reduced models corresponding to random drop-out (RD) and completely random processes. A simulation study is used to assess the procedure in two settings: the comparison of time trends under a linear regression model with autocorrelated errors and the estimation of period means and treatment differences from a four-period four-treatment crossover trial. It is seen in both settings that, when data are generated under an ID process, the ML estimators from the ID model do not suffer from the bias that is present in the ordinary least squares and RD ML estimators. The approach is then applied to three examples. These derive from a milk protein trial involving three groups of cows, milk yield data from a study of mastitis in dairy cattle and data from a multicentre clinical trial on the study of depression. All three examples provide evidence of an underlying ID process, two with some strength. It is seen that the assumption of an ID rather than an RD process has practical implications for the interpretation of the data.},
Doi = {10.2307/2986113},
ISSN = {0035-9254},
Keywords = {longitudinal methods; missing data},
Mendeley-groups = {missing data},
Owner = {alyssa},
Pmid = {6121453},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Article{ding_li_SS2018,
Title = {Causal Inference: A Missing Data Perspective},
Author = {Ding, P. and Li, F.},
Journal = {Statistical Science},
Year = {2018},
Volume = {33},
Number = {2},
Pages = {214--237},
Abstract = {Inferring causal effects of treatments is a central goal in many disciplines. The potential outcomes framework is a main statistical approach to causal inference, in which a causal effect is defined as a comparison of the potential outcomes of the same units under different treatment conditions. Because for each unit at most one of the potential outcomes is observed and the rest are missing, causal inference is inherently a missing data problem. Indeed, there is a close analogy in the terminology and the inferential framework between causal inference and missing data. Despite the intrinsic connection between the two subjects, statistical analyses of causal inference and missing data also have marked differences in aims, settings and methods. This article provides a systematic review of causal inference from the missing data perspective. Focusing on ignorable treatment assignment mechanisms, we discuss a wide range of causal inference methods that have analogues in missing data analysis, such as imputation, inverse probability weighting and doubly robust methods. Under each of the three modes of inference—Frequentist, Bayesian and Fisherian randomization—we present the general structure of inference for both finite-sample and super-population estimands, and illustrate via specific examples. We identify open questions to motivate more research to bridge the two fields.},
Doi = {10.1214/18-STS645},
Keywords = {assignment mechanism; ignorability; imputation; missing data mechanism; observational studies; potential outcome; propensity score; randomizatoin; weighting},
Owner = {imke},
Timestamp = {2018.12.11},
Topics = {causal inference}
}
@Article{ding_simonoff_JMLR2010,
Title = {An investigation of missing data methods for classification trees applied to binary response data},
Author = {Ding, Y. and Simonoff, J. S.},
Journal = {Journal of Machine Learning Research},
Year = {2010},
Pages = {131--170},
Volume = {11},
Number = {1},
Abstract = {There are many different methods used by classification tree algorithms when missing data occur in the predictors, but few studies have been done comparing their appropriateness and performance. This paper provides both analytic and Monte Carlo evidence regarding the effectiveness of six popular missing data methods for classification trees applied to binary response data. We show that in the context of classification trees, the relationship between the missingness and the dependent variable, as well as the existence or non-existence of missing values in the testing data, are the most helpful criteria to distinguish different missing data methods. In particular, separate class is clearly the best method to use when the testing set has missing values and the missingness is related to the response variable. A real data set related to modeling bankruptcy of a firm is then analyzed. The paper concludes with discussion of adaptation of these results to logistic regression, and other potential generalizations.},
Keywords = {classification tree; missing data; separate class; rpart; C4.5; cart},
Owner = {nathalie},
Timestamp = {2016.11.30},
Topics = {imputation; surrogate variables; classification trees},
Url = {http://www.jmlr.org/papers/v11/ding10a.html}
}
@Article{dong_peng_SP2013,
Title = {Principled missing data methods for researchers},
Author = {Dong, Yiran and Peng, Chao-Ying Joanne},
Journal = {SpringerPlus},
Year = {2013},
Pages = {222},
Volume = {2},
Abstract = {The impact of missing data on quantitative research can be serious, leading to biased estimates of parameters, loss of information, decreased statistical power, increased standard errors, and weakened generalizability of findings. In this paper, we discussed and demonstrated three principled missing data methods: multiple imputation, full information maximum likelihood, and expectation-maximization algorithm, applied to a real-world data set. Results were contrasted with those obtained from the complete data set and from the listwise deletion method. The relative merits of each method are noted, along with common features they share. The paper concludes with an emphasis on the importance of statistical assumptions, and recommendations for researchers. Quality of research will be enhanced if (a) researchers explicitly acknowledge missing data problems and the conditions under which they occurred, (b) principled methods are employed to handle missing data, and (c) the appropriate treatment of missing data is incorporated into review standards of manuscripts submitted for publication.},
Doi = {10.1186/2193-1801-2-222},
Keywords = {missing data; listwise deletion; mi; fiml; em; mar; mcar; mnar},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {general_informal}
}
@Book{enders_AMDA2010,
Title = {Applied Missing Data Analysis},
Author = {Enders, C. K.},
Publisher = {Guilford Press},
Year = {2010},
Abstract = {Walking readers step by step through complex concepts, this book translates missing data techniques into something that applied researchers and graduate students can understand and utilize in their own research. Enders explains the rationale and procedural details for maximum likelihood estimation, Bayesian estimation, multiple imputation, and models for handling missing not at random (MNAR) data. Easy-to-follow examples and small simulated data sets illustrate the techniques and clarify the underlying principles. The companion website includes data files and syntax for the examples in the book as well as up-to-date information on software. The book is accessible to substantive researchers while providing a level of detail that will satisfy quantitative specialists.},
ISBN = {9781606236390},
Owner = {alyssa},
Pages = {401},
Timestamp = {2016.09.27},
Topics = {general}
}
@Article{enders_SEM2001,
Title = {A primer on maximum likelihood algorithms available for use with missing data},
Author = {Enders, C. K.},
Journal = {Structural Equation Modeling},
Year = {2001},
Number = {1},
Pages = {128--141},
Volume = {8},
Abstract = {Maximum likelihood algorithms for use with missing data are becoming commonplace in microcomputer packages. Specifically, 3 maximum likelihood algorithms are currently available in existing software packages: the multiple-group approach, full information maximum likelihood estimation, and the EM algorithm. Although they belong to the same family of estimator, confusion appears to exist over the differences among the 3 algorithms. This article provides a comprehensive, nontechnical overview of the 3 maximum likelihood algorithms. Multiple imputation, which is frequently used in conjunction with the EM algorithm, is also discussed.},
Doi = {10.1207/S15328007SEM0801_7},
Owner = {alyssa},
Timestamp = {2017.07.07},
Topics = {ml}
}
@Article{erler_etal_2019,
author = {Erler, Nicole S and Rizopoulos, Dimitris and Lesaffre, Emmanuel MEH},
journal = {arXiv preprint},
title = {{JointAI}: joint analysis and imputation of incomplete data in {R}},
year = {2019},
abstract = {Missing data occur in many types of studies and typically complicate the analysis. Multiple imputation, either using joint modelling or the more flexible fully conditional specification approach, are popular and work well in standard settings. In settings involving non-linear associations or interactions, however, incompatibility of the imputation model with the analysis model is an issue often resulting in bias. Similarly, complex outcomes such as longitudinal or survival outcomes cannot be adequately handled by standard implementations. In this paper, we introduce the R package JointAI, which utilizes the Bayesian framework to perform simultaneous analysis and imputation in regression models with incomplete covariates. Using a fully Bayesian joint modelling approach it overcomes the issue of uncongeniality while retaining the attractive flexibility of fully conditional specification multiple imputation by specifying the joint distribution of analysis and imputation models as a sequence of univariate models that can be adapted to the type of variable. JointAI provides functions for Bayesian inference with generalized linear and generalized linear mixed models and extensions thereof as well as survival models and joint models for longitudinal and survival data, that take arguments analogous to corresponding well known functions for the analysis of complete data from base R and other packages. Usage and features of JointAI are described and illustrated using various examples and the theoretical background is outlined.},
archiveprefix = {arXiv},
arxivid = {1907.10867v3},
keywords = {multiple imputation; Bayesian inference; R},
owner = {aude},
timestamp = {2021.01.12},
topics = {multiple imputation},
url = {https://arxiv.org/abs/1907.10867},
}
@article{fang_etal_2018,
Title = {Imputation-based adjusted score equations in generalized linear models with nonignorable missing covariate values},
Author = {Fang, F. and Zhao, J. and Shao, J.},
Journal = {Statistica Sinica},
Volume = {28},
Year = {2018},
Number = {4},
Pages = {1677--1701},
Publisher = {Institute of Statistical Science},
Abstract = {We consider the estimation of unknown parameters in a generalized linear model when some covariates have nonignorable missing values. When an instrument, a covariate that helps identifying parameters under nonignorable missingness, is appropriately specified, a pseudo likelihood approach similar to that in Tang, Little and Raghunathan (2003) or Zhao and Shao (2015) can be applied. However, this approach does not work well when the instrument is a weak predictor of the response given other covariates. We show that the asymptotic variances of the pseudo likelihood estimators for the regression coefficients of covariates other than the instrument diverge to infinity as the regression coefficient of the instrument goes to 0. By an imputation-based adjustment for the score equations, we propose a new estimator for the regression coefficients of the covariates other than the instrument. This works well even if the instrument is a weak predictor. It is semiparametric since the propensity of missing covariate data is completely unspecified. To solve the adjusted score equation, we develop an iterative algorithm that can be applied by using standard softwares at each iteration. We establish some theoretical results on the convergence of the proposed iterative algorithm and asymptotic normality of the resulting estimators. A variance estimation formula is also derived. Some simulation results and a data example are presented for illustration.},
Doi = {10.5705/ss.202015.0437},
Keywords = {Adjusted likelihood; Identifiability; Nonignorable missing covariate data; Pseudo-likelihood; Semiparametric},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar}
}
@Article{fay_JASA1996,
Title = {Alternative paradigms for the analysis of imputed survey data},
Author = {Fay, R. E.},
Journal = {Journal of the American Statistical Association},
Year = {1996},
Number = {434},
Pages = {490--498},
Volume = {91},
Abstract = {Rubin has offered multiple imputation as a general approach to inference from survey data sets with missing values filled in through imputation. In many situations the multiple imputation variance estimator is consistent. In turn, this observation has lent support to a number of complex applications. In fact, however, the multiple imputation variance estimator is inconsistent under some simple conditions. This article extends previous work of Rao and Shao and of Fay directed toward consistent variance estimation under wider conditions. Extensions of Rao and Shao's results to fractionally weighted imputation combines the estimation efficiency of multiple imputation and the consistency of the Rao-Shao variance estimator.},
Doi = {10.1080/01621459.1996.10476909},
Keywords = {fractionally weighted imputation; missing data; multiple imputation; Rao-Shao variance estimator},
Owner = {nathalie},
Timestamp = {2018.05.16},
Topics = {multiple imputation}
}
@Article{fellegi_holt_JASA1976,
Title = {A systematic approach to automatic edit and imputation},
Author = {Fellegi, I. P. and Holt, D.},
Journal = {Journal of the American Statistical Association},
Year = {1976},
Number = {353},
Pages = {17--35},
Volume = {71},
Doi = {10.2307/2285726},
Owner = {nathalie},
Timestamp = {2018.05.23},
Topics = {imputation}
}
@Article{ferrari_etal_CSDA2011,
Title = {An imputation method for categorical variables with application to nonlinear principal component analysis},
Author = {Ferrari, Pier Alda and Annoni, Paola and Barbiero, Alessandro and Manzi, Giancarlo},
Journal = {Computational Statistics \& Data Analysis},
Year = {2011},
Number = {7},
Pages = {2410--2420},
Volume = {55},
Abstract = {The problem of missing data in building multidimensional composite indicators is a delicate problem which is often underrated. An imputation method particularly suitable for categorical data is proposed. This method is discussed in detail in the framework of nonlinear principal component analysis and compared to other missing data treatments which are commonly used in this analysis. Its performance vs. these other methods is evaluated throughout a simulation procedure performed on both an artificial case, varying the experimental conditions, and a real case. The proposed procedure is implemented using R.},
Doi = {10.1016/j.csda.2011.02.007},
Keywords = {composite indicators; forward imputation; imputation procedure; listwise deletion; nearest neighbor; ordinal data; passive treatment},
Owner = {nathalie},
Timestamp = {2018.06.07},
Topics = {imputation; knn; factorial data analysis}
}
@Article{finkbeiner_P1979,
Title = {Estimation for the multiple factor model when data are missing},
Author = {Finkbeiner, C.},
Journal = {Psychometrika},
Year = {1979},
Number = {4},
Pages = {409--420},
Volume = {44},
Abstract = {A maximum likelihood method of estimating the parameters of the multiple factor model when data are missing from the sample is presented. A Monte Carlo study compares the method with 5 heuristic methods of dealing with the problem. The present method shows some advantage in accuracy of estimation over the heuristic methods but is considerably more costly computationally.},
Doi = {10.1007/BF02296204},
Keywords = {factor analysis; missing data},
Owner = {nathalie},
Timestamp = {2018.05.11},
Topics = {imputation; ml}
}
@Article{fitzmorice_etal_JRSS1995,
Title = {Regression Models for Longitudinal Binary Responses with Informative Drop-Outs},
Author = {Fitzmaurice, Garrett M. and Molenberghs, Geert and Lipsitz, Stuart R.},
Journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
Year = {1995},
Number = {4},
Pages = {691--704},
Volume = {57},
Publisher = {Royal Statistical Society, Wiley},
Abstract = {This paper reviews both likelihood-based and non-likelihood (generalized estimating equations) regression models for longitudinal binary responses when there are drop-outs. Throughout, it is assumed that the regression parameters for the marginal expectations of the binary responses are of primary scientific interest. The association or time dependence between the responses is largely regarded as a nuisance characteristic of the data. The performance of the methods is compared, in terms of asymptotic bias, under misspecification of the association between the responses and the missing data mechanism or drop-out process.},
ISSN = {0035-9246},
Url = {http://www.jstor.org/stable/2345937},
Keywords = {Generalized Estimating Equations; Maximum Likelihood Estimation; Missing Data; Repeated Measures},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {survey}
}
@Article{follman_wu_B1995,
Title = {An approximate generalized linear model with random effects for informative missing data},
Author = {Follmann, D. and Wu, M.},
Journal = {Biometrics},
Year = {1995},
Number = {1},
Pages = {151--168},
Volume = {51},
Abstract = {This paper develops a class of models to deal with missing data from longitudinal studies. We assume that separate models for the primary response and missingness (e.g., number of missed visits) are linked by a common random parameter. Such models have been developed in the econometrics (Heckman, 1979, Econometrica 47, 153-161) and biostatistics (Wu and Carroll, 1988, Biometrics 44, 175-188) literature for a Gaussian primary response. We allow the primary response, conditional on the random parameter, to follow a generalized linear model and approximate the generalized linear model by conditioning on the data that describes missingness. The resultant approximation is a mixed generalized linear model with possibly heterogeneous random effects. An example is given to illustrate the approximate approach, and simulations are performed to critique the adequacy of the approximation for repeated binary data.},
Doi = {10.2307/2533322},
ISSN = {0006-341X, 1541-0420},
Owner = {alyssa},
Publisher = {Wiley, International Biometric Society},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Article{gad_darwish_AJAMS2013,
Title = {A shared parameter model for longitudinal data with missing values},
Author = {Gad, A. M. and Darwish, N. M. M.},
Journal = {American Journal of Applied Mathematics and Statistics},
Year = {2013},
Number = {2},
Pages = {30--35},
Volume = {1},
Abstract = {Longitudinal studies represent one of the principal research strategies employed in medical and social research. These studies are the most appropriate for studying individual change over time. The prematurely withdrawal of some subjects from the study (dropout) is termed nonrandom when the probability of missingness depends on the missing value. Nonrandom dropout is common phenomenon associated with longitudinal data and it complicates statistical inference. The shared parameter model is used to fit longitudinal data in the presence of nonrandom dropout. The stochastic EM algorithm is developed to obtain the model parameter estimates. Also, parameter estimates of the dropout model have been obtained. Standard errors of estimates have been calculated using the developed Monte Carlo method. The proposed approach performance is evaluated through a simulation study. Also, the proposed approach is applied to a real data set.},
Owner = {alyssa},
Timestamp = {2017.08.07},
Topics = {mnar},
Url = {http://pubs.sciepub.com/ajams/1/2/3}
}
@Article{gelman_etal_1998,
Title = {Not asked and not answered: Multiple imputation for multiple surveys},
Author = {Gelman, A. and King, G. and Liu, C.},
Journal = {Journal of the American Statistical Association},
Year = {1998},
Number = {443},
Pages = {846--857},
Volume = {93},
Publisher = {Taylor \& Francis Group},
Abstract = {We present a method of analyzing a series of independent cross-sectional surveys in which some questions are not answered in some surveys and some respondents do not answer some of the questions posed. The method is also applicable to a single survey in which different questions are asked or different sampling methods are used in different strata or clusters. Our method involves multiply imputing the missing items and questions by adding to existing methods of imputation designed for single surveys a hierarchical regression model that allows covariates at the individual and survey levels. Information from survey weights is exploited by including in the analysis the variables on which the weights were based, and then reweighting individual responses (observed and imputed) to estimate population quantities. We also develop diagnostics for checking the fit of the imputation model based on comparing imputed data to nonimputed data. We illustrate with the example that motivated this project: a study of pre-election public opinion polls in which not all the questions of interest are asked in all the surveys, so that it is infeasible to impute within each survey separately.},
Doi = {10.1080/01621459.1998.10473737},
Keywords = {Bayesian inference; cluster sampling; diagnostics; hierarchical models; ignorable nonresponse; missing data; political science; sample surveys; stratified sampling},
Owner = {imke},
Timestamp = {2018.11.19},
Topics = {mi; survey}
}
@Article{gelman_etal_2005,
Title = {Multiple Imputation for Model Checking: Completed-Data Plots with Missing and Latent Data},
Author = {Gelman, A. and van Mechelen, I. and Verbeke, G. and Heitjan, D. F. and Meulders, M.},
Journal = {Biometrics},
Volume = {61},
Number = {1},
Pages = {74--85},
Year = {2005},
Publisher = {Wiley Online Library},
Abstract = {In problems with missing or latent data, a standard approach is to first impute the unobserved data, then perform all statistical analyses on the completed dataset -- corresponding to the observed data and imputed unobserved data -- using standard procedures for complete-data inference. Here, we extend this approach to model checking by demonstrating the advantages of the use of completed-data model diagnostics on imputed completed datasets. The approach is set in the theoretical framework of Bayesian posterior predictive checks (but, as with missing-data imputation, our methods of missing-data model checking can also be interpreted as ``predictive inference'' in a non-Bayesian context). We consider the graphical diagnostics within this framework. Advantages of the completed-data approach include: (1) One can often check model fit in terms of quantities that are of key substantive interest in a natural way, which is not always possible using observed data alone. (2) In problems with missing data, checks may be devised that do not require to model the missingness or inclusion mechanism; the latter is useful for the analysis of ignorable but unknown data collection mechanisms, such as are often assumed in the analysis of sample surveys and observational studies. (3) In many problems with latent data, it is possible to check qualitative features of the model (for example, independence of two variables) that can be naturally formalized with the help of the latent data. We illustrate with several applied examples.},
Keywords = {Bayesian model checking; exploratory data analysis; multiple imputation; nonresponse; posterior predictive checks; realized discrepancies; residuals},
Doi = {10.1111/j.0006-341X.2005.031010.x},
Owner = {imke},
Timestamp = {2018.11.19},
Topics = {mi}
}
@InProceedings{gill_etal_1997,
Title = {Coarsening at random: Characterizations, conjectures, counter-examples},
Author = {Gill, Richard D. and van der Laan, Mark J. and Robins, James M.},
Booktitle = {Proceedings of the First Seattle Symposium in Biostatistics},
Pages = {255--294},
Year = {1997},
Organization = {Springer},
Abstract = {The notion of coarsening at random (CAR) was introduced by Heitjan and Rubin (1991) to describe the most general form of randomly grouped, censored, or missing data, for which the coarsening mechanism can be ignored when making likelihood-based inference about the parameters of the distribution of the variable of interest. The CAR assumption is popular, and applications abound. However the full implications of the assumption have not been realized. Moreover a satisfactory theory of CAR for continuously distributed data -- which is needed in many applications, particularly in survival analysis -- hardly exists as yet. This paper gives a detailed study of CAR. We show that grouped data from a finite sample space always fit a CAR model: a nonparametric model for the variable of interest together with the assumption of an arbitrary CAR mechanism puts no restriction at all on the distribution of the observed data. In a slogan, CAR is everything. We describe what would seem to be the most general way CAR data could occur in practice, a sequential procedure called randomized monotone coarsening. We show that CAR mechanisms exist which are not of this type. Such a coarsening mechanism uses information about the underlying data which is not revealed to the observer, without this affecting the observer's conclusions. In a second slogan, CAR is more than it seems. This implies that if the analyst can argue from subject-matter considerations that coarsened data is CAR, he or she has knowledge about the structure of the coarsening mechanism which can be put to good use in non-likelihood-based inference procedures. We argue that this is a valuable option in multivariate survival analysis. We give a new definition of CAR in general sample spaces, criticising earlier proposals, and we establish parallel results to the discrete case. The new definition focusses on the distribution rather than the density of the data. It allows us to generalise the theory of CAR to the important situation where coarsening variables (e.g., censoring times) are partially observed as well as the variables of interest.},
Keywords = {coarsening at random; CAR; missingness mechanisms; survival analysis},
Doi = {10.1007/978-1-4684-6316-3_14},
Owner = {imke},
Timestamp = {2019.08.02},
Topics = {mnar; mechanisms}
}
@Article{golden_etal_2019,
Title = {Consequences of model misspecification for maximum likelihood estimation with missing data},
Author = {Golden, Richard M. and Henley, Steven S. and White, Halbert and Kashner, T. Michael},
Journal = {Econometrics},
Volume = {7},
Number = {3},
Pages = {37},
Year = {2019},
Publisher = {Multidisciplinary Digital Publishing Institute},
Doi = {10.3390/econometrics7030037},
Abstract = {Researchers are often faced with the challenge of developing statistical models with incomplete data. Exacerbating this situation is the possibility that either the researcher's complete-data model or the model of the missing-data mechanism is misspecified. In this article, we create a formal theoretical framework for developing statistical models and detecting model misspecification in the presence of incomplete data where maximum likelihood estimates are obtained by maximizing the observable-data likelihood function when the missing-data mechanism is assumed ignorable. First, we provide sufficient regularity conditions on the researcher's complete-data model to characterize the asymptotic behavior of maximum likelihood estimates in the simultaneous presence of both missing data and model misspecification. These results are then used to derive robust hypothesis testing methods for possibly misspecified models in the presence of Missing at Random (MAR) or Missing Not at Random (MNAR) missing data. Second, we introduce a method for the detection of model misspecification in missing data problems using recently developed Generalized Information Matrix Tests (GIMT). Third, we identify regularity conditions for the Missing Information Principle (MIP) to hold in the presence of model misspecification so as to provide useful computational covariance matrix estimation formulas. Fourth, we provide regularity conditions that ensure the observable-data expected negative log-likelihood function is convex in the presence of partially observable data when the amount of missingness is sufficiently small and the complete-data likelihood is convex. Fifth, we show that when the researcher has correctly specified a complete-data model with a convex negative likelihood function and an ignorable missing-data mechanism, then its strict local minimizer is the true parameter value for the complete-data model when the amount of missingness is sufficiently small. Our results thus provide new robust estimation, inference, and specification analysis methods for developing statistical models with incomplete data.},
Keywords = {asymptotic theory; ignorable; Generalized Information Matrix Test; misspecification; missing data; nonignorable; sandwich estimator; specification analysis},
Owner = {imke},
Timestamp = {2019.12.12},
Topics = {ml; regression}
}
@InProceedings{gondara_wang_2018,
Title = {{MIDA}: Multiple Imputation using Denoising Autoencoders},
Author = {Gondara, L. and Wang, K.},
Booktitle = {Proceedings of the 22nd Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 2018)},
Series = {Lecture Notes in Computer Science},
Year = {2018},
Editor = {Phung, D. and Tseng, V. and Webb, G. and Ho, B. and Ganji, M. and Rashidi, L.},
Pages = {260--272},
Publisher = {Springer International Publishing},
Eventdate = {2018-06-03/2018-06-06},
ISBN = {3319930404},
Abstract = {Missing data is a significant problem impacting all domains. State-of-the-art framework for minimizing missing data bias is multiple imputation, for which the choice of an imputation model remains nontrivial. We propose a multiple imputation model based on overcomplete deep denoising autoencoders. Our proposed model is capable of handling different data types, missingness patterns, missingness proportions and distributions. Evaluation on several real life datasets show our proposed model significantly outperforms current state-of-the-art methods under varying conditions while simultaneously improving end of the line analytics.},
Doi = {10.1007/978-3-319-93040-4_21},
Url = {https://arxiv.org/abs/1705.02737},
Keywords = {Multiple imputation; denoising autoencoders; DAE},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {multiple imputation; deep learning}
}
@InProceedings{goodfellow_etal_2013,
Title = {Multi-Prediction Deep Boltzmann Machines},
Author = {Goodfellow, I. and Mirza, M. and Courville, A. and Bengio, Y.},
Booktitle = {Proceedings of the 26th International Conference on Neural Information Processing Systems},
Series = {Advances in Neural Information Processing Systems 26},
Editor = {Burges, C. J. C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K. Q.},
Pages = {548--556},
Year = {2013},
Publisher = {Curran Associates, Inc.},
Eventdate = {2013-12-05/2013-12-10},
Abstract = {We introduce the Multi-Prediction Deep Boltzmann Machine (MP-DBM). The MP-DBM can be seen as a single probabilistic model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent nets that share parameters and approximately solve different inference problems. Prior methods of training DBMs either do not perform well on classification tasks or require an initial learning pass that trains the DBM greedily, one layer at a time. The MP-DBM does not require greedy layerwise pretraining, and outperforms the standard DBM at classification, classification with missing inputs, and mean field prediction tasks.},
Url = {http://papers.nips.cc/paper/5024-multi-prediction-deep-boltzmann-machines.pdf},
Keywords = {Classification; deep Boltzmann Machines; DBM; pseudolikelihood},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {classification; deep learning}
}
@Article{graham_ARP2009,
Title = {Missing data analysis: making it work in the real world},
Author = {Graham, J. W.},
Journal = {Annual Review of Psychology},
Year = {2009},
Pages = {549--576},
Volume = {60},
Abstract = {This review presents a practical summary of the missing data literature, including a sketch of missing data theory and descriptions of normal-model multiple imputation (MI) and maximum likelihood methods. Practical missing data analysis issues are discussed, most notably the inclusion of auxiliary variables for improving power and reducing bias. Solutions are given for missing data challenges such as handling longitudinal, categorical, and clustered data with normal-model MI; including interactions in the missing data model; and handling large numbers of variables. The discussion of attrition and nonignorable missingness emphasizes the need for longitudinal diagnostics and for reducing the uncertainty about the missing data mechanism under attrition. Strategies suggested for reducing attrition bias include using auxiliary variables, collecting follow-up data on a sample of those initially missing, and collecting data on intent to drop out. Suggestions are given for moving forward with research on missing data and attrition.},
Doi = {10.1146/annurev.psych.58.110405.085530},
ISSN = {0066-4308},
Mendeley-groups = {missing data},
Owner = {alyssa},
Pmid = {18652544},
Shorttitle = {Missing Data Analysis},
Timestamp = {2016.11.30},
Topics = {general}
}
@Article{graham_etal_PS2007,
Title = {How many imputations are really needed? Some practical clarifications of multiple imputation theory},
Author = {Graham, John W. and Olchowski, Allison E. and Gilreath, Tamika E.},
Journal = {Prevention Science},
Year = {2007},
Number = {3},
Pages = {206--213},
Volume = {8},
Abstract = {Multiple imputation (MI) and full information maximum likelihood (FIML) are the two most common approaches to missing data analysis. In theory, MI and FIML are equivalent when identical models are tested using the same variables, and when m, the number of imputations performed with MI, approaches infinity. However, it is important to know how many imputations are necessary before MI and FIML are sufficiently equivalent in ways that are important to prevention scientists. MI theory suggests that small values of m, even on the order of three to five imputations, yield excellent results. Previous guidelines for sufficient m are based on relative efficiency, which involves the fraction of missing information (gamma) for the parameter being estimated, and m. In the present study, we used a Monte Carlo simulation to test MI models across several scenarios in which gamma and m were varied. Standard errors and p-values for the regression coefficient of interest varied as a function of m, but not at the same rate as relative efficiency. Most importantly, statistical power for small effect sizes diminished as m became smaller, and the rate of this power falloff was much greater than predicted by changes in relative efficiency. Based our findings, we recommend that researchers using MI should perform many more imputations than previously considered sufficient. These recommendations are based on gamma, and take into consideration one's tolerance for a preventable power falloff (compared to FIML) due to using too few imputations.},
Doi = {10.1007/s11121-007-0070-9},
Keywords = {multiple imputation; number of imputations; full information maximum likelihood; missing data; statistical power},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {multiple imputation}