% This file was created with JabRef 2.10.
% Encoding: UTF-8
@Article{yoon2018gain,
Title = {{GAIN}: missing data imputation using generative adversarial nets},
Author = {Yoon, Jinsung and Jordon, James and van der Schaar, Mihaela},
Journal = {arXiv preprint arXiv:1806.02920},
Year = {2018},
Topics = {imputation}
}
@Article{bertsimas2017predictive,
Title = {From predictive methods to missing data imputation: an optimization approach},
Author = {Bertsimas, Dimitris and Pawlowski, Colin and Zhuo, Ying Daisy},
Journal = {The Journal of Machine Learning Research},
Volume = {18},
Number = {1},
Pages = {7133--7171},
Year = {2017},
Publisher = {JMLR.org},
Topics = {imputation}
}
@Article{abayomi_etal_JRSSC2008,
Title = {Diagnostics for multivariate imputations},
Author = {Abayomi, K. and Gelman, A. and Levy, M.},
Journal = {Journal of the Royal Statistical Society, Series C (Applied Statistics)},
Year = {2008},
Number = {3},
Pages = {273-291},
Volume = {57},
Abstract = {We consider three sorts of diagnostics for random imputations: displays of the completed data, which are intended to reveal unusual patterns that might suggest problems with the imputations, comparisons of the distributions of observed and imputed data values and checks of the fit of observed data to the model that is used to create the imputations. We formulate these methods in terms of sequential regression multivariate imputation, which is an iterative procedure in which the missing values of each variable are randomly imputed conditionally on all the other variables in the completed data matrix. We also consider a recalibration procedure for sequential regression imputations. We apply these methods to the 2002 environmental sustainability index, which is a linear aggregation of 64 environmental variables on 142 countries.},
Doi = {10.1111/j.1467-9876.2007.00613.x},
ISSN = {1467-9876},
Keywords = {missing values; multiple imputation; multivariate statistics; sustainability; environmental statistics},
Owner = {alyssa},
Publisher = {Blackwell Publishing Ltd},
Timestamp = {2017.11.08},
Topics = {mi}
}
@Article{albert_follmann_B2000,
Title = {Modeling repeated count data subject to informative dropout},
Author = {Albert, P. S. and Follmann, D. A.},
Journal = {Biometrics},
Year = {2000},
Number = {3},
Pages = {667-677},
Volume = {56},
Abstract = {In certain diseases, outcome is the number of morbid events over the course of follow-up. In epilepsy, e.g., daily seizure counts are often used to reflect disease severity. Follow-up of patients in clinical trials of such diseases is often subject to censoring due to patients dying or dropping out. If the sicker patients tend to be censored in such trials, estimates of the treatment effect that do not incorporate the censoring process may be misleading. We extend the shared random effects approach of Wu and Carroll (1988, Biometrics 44, 175-188) to the setting of repeated counts of events. Three strategies are developed. The first is a likelihood-based approach for jointly modeling the count and censoring processes. A shared random effect is incorporated to introduce dependence between the two processes. The second is a likelihood-based approach that conditions on the dropout times in adjusting for informative dropout. The third is a generalized estimating equations (GEE) approach, which also conditions on the dropout times but makes fewer assumptions about the distribution of the count process. Estimation procedures for each of the approaches are discussed, and the approaches are applied to data from an epilepsy clinical trial. A simulation study is also conducted to compare the various approaches. Through analyses and simulations, we demonstrate the flexibility of the likelihood-based conditional model for analyzing data from the epilepsy trial.},
Doi = {10.1111/j.0006-341X.2000.00667.x},
ISSN = {0006341X, 15410420},
Owner = {alyssa},
Publisher = {[Wiley, International Biometric Society]},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Book{allison_MD2001,
Title = {Missing Data},
Author = {Allison, P. D.},
Publisher = {Sage Publications},
Year = {2001},
Address = {Thousand Oaks, CA, USA},
Series = {Quantitative Applications in the Social Sciences},
Doi = {10.1136/bmj.38977.682025.2C},
ISBN = {9780761916727},
ISSN = {0959-8138},
Mendeley-groups = {missing data},
Owner = {nathalie},
Timestamp = {2017.03.06},
Topics = {general}
}
@Article{andridge_little_ISR2010,
Title = {A review of hot deck imputation for survey non-response},
Author = {Andridge, R. and Little, R. J. A.},
Journal = {International Statistical Review},
Year = {2010},
Number = {1},
Pages = {40-64},
Volume = {78},
Abstract = {Hot deck imputation is a method for handling missing data in which each missing value is replaced with an observed response from a ``similar'' unit. Despite being used extensively in practice, the theory is not as well developed as that of other imputation methods. We have found that no consensus exists as to the best way to apply the hot deck and obtain inferences from the completed data set. Here we review different forms of the hot deck and existing research on its statistical properties. We describe applications of the hot deck currently in use, including the U.S. Census Bureau's hot deck for the Current Population Survey (CPS). We also provide an extended example of variations of the hot deck applied to the third National Health and Nutrition Examination Survey (NHANES III). Some potential areas for future research are highlighted.},
Annote = {A review of Hot deck imputation for survey Non-response},
Doi = {10.1111/j.1751-5823.2010.00103.x},
Keywords = {item non-response; missing data; multiple imputation; variance estimation},
Mendeley-groups = {missing data},
Owner = {alyssa},
Timestamp = {2016.09.27},
Topics = {hot-deck}
}
@Article{audigier_etal_ADAC2016,
Title = {A principal component method to impute missing values for mixed data},
Author = {Audigier, V. and Husson, F. and Josse, J.},
Journal = {Advances in Data Analysis and Classification},
Year = {2016},
Number = {1},
Pages = {5-26},
Volume = {10},
Abstract = {We propose a new method to impute missing values in mixed data sets. It is based on a principal component method, the factorial analysis for mixed data, which balances the influence of all the variables that are continuous and categorical in the construction of the principal components. Because the imputation uses the principal axes and components, the prediction of the missing values is based on the similarity between individuals and on the relationships between variables. The properties of the method are illustrated via simulations and the quality of the imputation is assessed using real data sets. The method is compared to a recent method (Stekhoven and Buhlmann Bioinformatics 28:113-118, 2011) based on random forest and shows better performance especially for the imputation of categorical variables and situations with highly linear relationships between continuous variables.},
Doi = {10.1007/s11634-014-0195-1},
Keywords = {missing values; mixed data; imputation; principal component method; factorial analysis of mixed data},
Owner = {alyssa},
Timestamp = {2017.02.22},
Topics = {factorial data analysis; imputation}
}
@Article{audigier_etal_SC2016,
Title = {{MIMCA}: multiple imputation for categorical variables with multiple correspondence analysis},
Author = {Audigier, V. and Husson, F. and Josse, J.},
Journal = {Statistics and Computing},
Year = {2016},
Number = {2},
Pages = {1-18},
Volume = {27},
Abstract = {We propose a multiple imputation method to deal with incomplete categorical data. This method imputes the missing entries using the principal components method dedicated to categorical data: multiple correspondence analysis (MCA). The uncertainty concerning the parameters of the imputation model is reflected using a non-parametric bootstrap. Multiple imputation using MCA (MIMCA) requires estimating a small number of parameters due to the dimensionality reduction property of MCA. It allows the user to impute a large range of data sets. In particular, a high number of categories per variable, a high number of variables or a small number of individuals are not an issue for MIMCA. Through a simulation study based on real data sets, the method is assessed and compared to the reference methods (multiple imputation using the loglinear model, multiple imputation by logistic regressions) as well as to the latest works on the topic (multiple imputation by random forests or by the Dirichlet process mixture of products of multinomial distributions model). The proposed method shows good performance in terms of bias and coverage for an analysis model such as a main effects logistic regression model. In addition, MIMCA has the great advantage that it is substantially less time consuming on data sets of high dimensions than the other multiple imputation methods.},
Archiveprefix = {arXiv},
Arxivid = {1505.08116},
Doi = {10.1007/s11222-016-9635-4},
Eprint = {1505.08116},
ISSN = {15731375},
Keywords = {bootstrap; categorical data; missing values; multiple correspondence analysis; multiple imputation},
Owner = {alyssa},
Publisher = {Springer US},
Timestamp = {2017.07.06},
Topics = {factorial data analysis; multiple imputation}
}
@Article{audigier_etal_JSCS2015,
Title = {Multiple imputation for continuous variables using a {B}ayesian principal component analysis},
Author = {Audigier, V. and Husson, F. and Josse, J.},
Journal = {Journal of Statistical Computation and Simulation},
Year = {2015},
Number = {11},
Pages = {2140-2156},
Volume = {86},
Abstract = {We propose a multiple imputation method based on principal component analysis (PCA) to deal with incomplete continuous data. To reflect the uncertainty of the parameters from one imputation to the next, we use a Bayesian treatment of the PCA model. Using a simulation study and real data sets, the method is compared to two classical approaches: multiple imputation based on joint modelling and on fully conditional modelling. Contrary to the others, the proposed method can be easily used on data sets where the number of individuals is less than the number of variables and when the variables are highly correlated. In addition, it provides unbiased point estimates of quantities of interest, such as an expectation, a regression coefficient or a correlation coefficient, with a smaller mean squared error. Furthermore, the widths of the confidence intervals built for the quantities of interest are often smaller whilst ensuring a valid coverage.},
Doi = {10.1080/00949655.2015.1104683},
Keywords = {missing values; continuous data; multiple imputation; Bayesian principal component analysis; data augmentation},
Owner = {alyssa},
Timestamp = {2017.02.23},
Topics = {factorial data analysis; multiple imputation}
}
@Article{bang_robins_B2005,
Title = {Doubly robust estimation in missing data and causal inference models},
Author = {Bang, H. and Robins, J. M.},
Journal = {Biometrics},
Year = {2005},
Number = {4},
Pages = {962-973},
Volume = {61},
Abstract = {The goal of this article is to construct doubly robust (DR) estimators in ignorable missing data and causal inference models. In a missing data model, an estimator is DR if it remains consistent when either (but not necessarily both) a model for the missingness mechanism or a model for the distribution of the complete data is correctly specified. Because with observational data one can never be sure that either a missingness model or a complete data model is correct, perhaps the best that can be hoped for is to find a DR estimator. DR estimators, in contrast to standard likelihood-based or (nonaugmented) inverse probability-weighted estimators, give the analyst two chances, instead of only one, to make a valid inference. In a causal inference model, an estimator is DR if it remains consistent when either a model for the treatment assignment mechanism or a model for the distribution of the counterfactual data is correctly specified. Because with observational data one can never be sure that a model for the treatment assignment mechanism or a model for the counterfactual data is correct, inference based on DR estimators should improve upon previous approaches. Indeed, we present the results of simulation studies which demonstrate that the finite sample performance of DR estimators is as impressive as theory would predict. The proposed method is applied to a cardiovascular clinical trial.},
Doi = {10.1111/j.1541-0420.2005.00377.x},
ISBN = {0006-341X},
ISSN = {0006341X},
Keywords = {causal inference; doubly robust estimation; longitudinal data; marginal structural model; missing data; semiparametrics},
Owner = {alyssa},
Pmid = {16401269},
Timestamp = {2017.05.29},
Topics = {causal inference}
}
@Article{baraldi_enders_JSP2010,
Title = {An introduction to modern missing data analysis},
Author = {Baraldi, A. N. and Enders, C. K.},
Journal = {Journal of School Psychology},
Year = {2010},
Number = {1},
Pages = {5-37},
Volume = {48},
Abstract = {A great deal of recent methodological research has focused on two modern missing data analysis methods: maximum likelihood and multiple imputation. These approaches are advantageous to traditional techniques (e.g. deletion and mean imputation techniques) because they require less stringent assumptions and mitigate the pitfalls of traditional techniques. This article explains the theoretical underpinnings of missing data analyses, gives an overview of traditional missing data techniques, and provides accessible descriptions of maximum likelihood and multiple imputation. In particular, this article focuses on maximum likelihood estimation and presents two analysis examples from the Longitudinal Study of American Youth data. One of these examples includes a description of the use of auxiliary variables. Finally, the paper illustrates ways that researchers can use intentional, or planned, missing data to enhance their research designs.},
Doi = {10.1016/j.jsp.2009.10.001},
Keywords = {missing data; multiple imputation; maximum likelihood; planned missingness},
Owner = {alyssa},
Timestamp = {2017.02.21},
Topics = {general_informal}
}
@Article{baretta_santaniello_BMCMIDM2016,
Title = {Nearest neighbor imputation algorithms: a critical evaluation},
Author = {Beretta, L. and Santaniello, A.},
Journal = {BMC Medical Informatics and Decision Making},
Year = {2016},
Number = {Supp. 3},
Pages = {74},
Volume = {16},
Abstract = {Background Nearest neighbor (NN) imputation algorithms are efficient methods to fill in missing data where each missing value on some records is replaced by a value obtained from related cases in the whole set of records. Besides the capability to substitute the missing data with plausible values that are as close as possible to the true value, imputation algorithms should preserve the original data structure and avoid distorting the distribution of the imputed variable. Despite the efficiency of NN algorithms, little is known about the effect of these methods on data structure. Methods Simulations on synthetic datasets with different patterns and degrees of missingness were conducted to evaluate the performance of NN with one single neighbor (1NN) and with k neighbors without (kNN) or with weighting (wkNN) in the context of different learning frameworks: plain set, reduced set after ReliefF filtering, bagging, random choice of attributes, bagging combined with random choice of attributes (Random-Forest-like method). Results Whatever the framework, kNN usually outperformed 1NN in terms of precision of imputation and reduced errors in inferential statistics, 1NN was however the only method capable of preserving the data structure and data were distorted even when small values of k neighbors were considered; distortion was more severe for resampling schemas. Conclusions The use of three neighbors in conjunction with ReliefF seems to provide the best trade-off between imputation error and preservation of the data structure. The very same conclusions can be drawn when imputation experiments were conducted on the single proton emission computed tomography (SPECTF) heart dataset after introduction of missing data completely at random.},
Doi = {10.1186/s12911-016-0318-z},
Keywords = {near neighbour; imputation method; imputation algorithm; near neighbour algorithm; Minkowski norm},
Owner = {nathalie},
Series = {Proceedings of the 5th Translational Bioinformatics Conference (TBC 2015): medical informatics and decision making},
Timestamp = {2018.05.17},
Topics = {knn}
}
@article{bartlett_etal_2015,
Title = {Asymptotically unbiased estimation of exposure odds ratios in complete records logistic regression},
Author = {Bartlett, Jonathan W and Harel, Ofer and Carpenter, James R},
Journal = {American Journal of Epidemiology},
Volume = {182},
Number = {8},
Pages = {730--736},
Year = {2015},
Publisher = {Oxford University Press},
Doi = {10.1093/aje/kwv114},
Abstract = {Missing data are a commonly occurring threat to the validity and efficiency of epidemiologic studies. Perhaps the most common approach to handling missing data is to simply drop those records with 1 or more missing values, in so-called “complete records” or “complete case” analysis. In this paper, we bring together earlier-derived yet perhaps now somewhat neglected results which show that a logistic regression complete records analysis can provide asymptotically unbiased estimates of the association of an exposure of interest with an outcome, adjusted for a number of confounders, under a surprisingly wide range of missing-data assumptions. We give detailed guidance describing how the observed data can be used to judge the plausibility of these assumptions. The results mean that in large epidemiologic studies which are affected by missing data and analyzed by logistic regression, exposure associations may be estimated without bias in a number of settings where researchers might otherwise assume that bias would occur.},
Keywords = {complete case analysis; logistic regression; missing data; odds ratio},
Owner = {imke},
Timestamp = {2019.04.01},
Topics = {causal inference}
}
@InProceedings{bengio_gingras_1995,
Title = {Recurrent neural networks for missing or asynchronous data},
Author = {Bengio, Y. and Gingras, F.},
Booktitle = {Proceedings of the 8th International Conference on Neural Information Processing Systems},
Pages = {395-401},
Year = {1995},
Editor = {-},
Address = {Cambridge, MA, USA},
Eventdate = {1995-11-27/1995-12-02},
Publisher = {MIT Press},
Abstract = {In this paper we propose recurrent neural networks with feedback into the input units for handling two types of data analysis problems. On the one hand, this scheme can be used for static data when some of the input variables are missing. On the other hand, it can also be used for sequential data, when some of the input variables are missing or are available at different frequencies. Unlike in the case of probabilistic models (e.g. Gaussian) of the missing variables, the network does not attempt to model the distribution of the missing variables given the observed variables. Instead it is a more "discriminant" approach that fills in the missing variables for the sole purpose of minimizing a learning criterion (e.g., to minimize an output error).},
Url = {http://papers.nips.cc/paper/1126-recurrent-neural-networks-for-missing-or-asynchronous-data.pdf},
Owner = {imke},
Timestamp = {2018.11.08},
Keywords = {machine learning; deep learning; rnn; sequential data},
Topics = {deep learning; rnn}
}
@InProceedings{biessmann_CIKM2018,
Title = {"Deep" Learning for Missing Value Imputation in Tables with Non-Numerical Data},
Author = {Biessmann, F. and Salinas, D. and Schelter, S. and Schmidt, P. and Lange, D.},
Booktitle = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management},
Series = {CIKM '18},
Year = {2018},
ISBN = {978-1-4503-6014-2},
Location = {Torino, Italy},
Pages = {2017--2025},
Url = {http://doi.acm.org/10.1145/3269206.3272005},
Doi = {10.1145/3269206.3272005},
Publisher = {ACM},
Address = {New York, NY, USA},
Editor = {-},
Abstract = {The success of applications that process data critically depends on the quality of the ingested data. Completeness of a data source is essential in many cases. Yet, most missing value imputation approaches suffer from severe limitations. They are almost exclusively restricted to numerical data, and they either offer only simple imputation methods or are difficult to scale and maintain in production. Here we present a robust and scalable approach to imputation that extends to tables with non-numerical values, including unstructured text data in diverse languages. Experiments on public data sets as well as data sets sampled from a large product catalog in different languages (English and Japanese) demonstrate that the proposed approach is both scalable and yields more accurate imputations than previous approaches. Training on data sets with several million rows is a matter of minutes on a single machine. With a median imputation F1 score of 0.93 across a broad selection of data sets our approach achieves on average a 23-fold improvement compared to mode imputation. While our system allows users to apply state-of-the-art deep learning models if needed, we find that often simple linear n-gram models perform on par with deep learning methods at a much lower operational cost. The proposed method learns all parameters of the entire imputation pipeline automatically in an end-to-end fashion, rendering it attractive as a generic plugin both for engineers in charge of data pipelines where data completeness is relevant, as well as for practitioners without expertise in machine learning who need to impute missing values in tables with non-numerical data.},
Owner = {imke},
Timestamp = {2018.12.18},
Keywords = {data cleaning; missing value imputation},
Topics = {deep learning; neural networks}
}
@Article{blake_etal_2019,
Title = {Propensity scores using missingness pattern information: a practical guide},
Author = {Blake, Helen A. and Leyrat, Clémence and Mansfield, Kate and Seaman, Shaun and Tomlinson, Laurie and Carpenter, James and Williamson, Elizabeth},
Year = {2019},
Journal = {arXiv preprint},
archivePrefix = {arXiv},
eprint = {1901.03981},
primaryClass = {stat.ME},
Abstract = {Electronic health records are a valuable data source for investigating health-related questions, and propensity score analysis has become an increasingly popular approach to address confounding bias in such investigations. However, because electronic health records are typically routinely recorded as part of standard clinical care, there are often missing values, particularly for potential confounders. In our motivating study -- using electronic health records to investigate the effect of renin-angiotensin system blockers on the risk of acute kidney injury -- two key confounders, ethnicity and chronic kidney disease stage, have 59% and 53% missing data, respectively.
The missingness pattern approach (MPA), a variant of the missing indicator approach, has been proposed as a method for handling partially observed confounders in propensity score analysis. In the MPA, propensity scores are estimated separately for each missingness pattern present in the data. Although the assumptions underlying the validity of the MPA are stated in the literature, it can be difficult in practice to assess their plausibility.
In this paper, we explore the MPA's underlying assumptions by using causal diagrams to assess their plausibility in a range of simple scenarios, drawing general conclusions about situations in which they are likely to be violated. We present a framework providing practical guidance for assessing whether the MPA's assumptions are plausible in a particular setting and thus deciding when the MPA is appropriate. We apply our framework to our motivating study, showing that the MPA's underlying assumptions appear reasonable, and we demonstrate the application of MPA to this study.},
Keywords = {Electronic health records; Missing confounder data; Missing indicator; Missingness pattern; Propensity score analysis},
Url = {https://researchonline.lshtm.ac.uk/4651159/1/1901.03981v1.pdf},
Owner = {imke},
Timestamp = {2019.02.13},
Topics = {causal inference}
}
@Article{buck_JRSSB1960,
Title = {A method of estimation of missing values in multivariate data suitable for use with an electronic computer},
Author = {Buck, S. F.},
Journal = {Journal of the Royal Statistical Society, Series B},
Year = {1960},
Pages = {302-306},
Volume = {22},
Owner = {nathalie},
Timestamp = {2016.09.28},
Topics = {survey}
}
@InProceedings{burns_ARC1990,
Title = {Multiple and replicate item imputation in a complex sample survey},
Author = {Burns, R. M.},
Booktitle = {Proceedings of the 6th Annual Research Conference},
Year = {1990},
Address = {Washington DC, USA},
Editor = {Bureau of the Census},
Pages = {655-665},
Owner = {nathalie},
Timestamp = {2018.06.06}
}
@Book{vanbuuren_FIMD2012,
Title = {Flexible Imputation of Missing Data},
Author = {van Buuren, S.},
Publisher = {Chapman and Hall/CRC},
Year = {2018},
Address = {Boca Raton, FL},
Abstract = {Missing data pose challenges to real-life data analysis. Simple ad-hoc fixes, like deletion or mean imputation, only work under highly restrictive conditions, which are often not met in practice. Multiple imputation replaces each missing value by multiple plausible values. The variability between these replacements reflects our ignorance of the true (but missing) value. Each of the completed data sets is then analyzed by standard methods, and the results are pooled to obtain unbiased estimates with correct confidence intervals. Multiple imputation is a general approach that also inspires novel solutions to old problems by reformulating the task at hand as a missing-data problem. This is the second edition of a popular book on multiple imputation, focused on explaining the application of methods through detailed worked examples using the MICE package as developed by the author. This new edition incorporates the recent developments in this fast-moving field. This class-tested book avoids mathematical and technical details as much as possible: formulas are accompanied by verbal statements that explain the formula in accessible terms. The book sharpens the reader’s intuition on how to think about missing data, and provides all the tools needed to execute a well-grounded quantitative analysis in the presence of missing data.},
Url = {https://stefvanbuuren.name/fimd/},
Owner = {imke},
Timestamp = {2018.11.12},
Topics = {general}
}
@Article{vanbuuren_SMMR2007,
Title = {Multiple imputation of discrete and continuous data by fully conditional specification},
Author = {van Buuren, S.},
Journal = {Statistical Methods in Medical Research},
Year = {2007},
Pages = {219-242},
Volume = {16},
Abstract = {The goal of multiple imputation is to provide valid inferences for statistical estimates from incomplete data. To achieve that goal, imputed values should preserve the structure in the data, as well as the uncertainty about this structure, and include any knowledge about the process that generated the missing data. Two approaches for imputing multivariate data exist: joint modeling (JM) and fully conditional specification (FCS). JM is based on parametric statistical theory, and leads to imputation procedures whose statistical properties are known. JM is theoretically sound, but the joint model may lack flexibility needed to represent typical data features, potentially leading to bias. FCS is a semi-parametric and flexible alternative that specifies the multivariate model by a series of conditional models, one for each incomplete variable. FCS provides tremendous flexibility and is easy to apply, but its statistical properties are difficult to establish. Simulation work shows that FCS behaves very well in the cases studied. The present paper reviews and compares the approaches. JM and FCS were applied to pubertal development data of 3801 Dutch girls that had missing data on menarche (two categories), breast development (five categories) and pubic hair development (six stages). Imputations for these data were created under two models: a multivariate normal model with rounding and a conditionally specified discrete model. The JM approach introduced biases in the reference curves, whereas FCS did not. The paper concludes that FCS is a useful and easily applied flexible alternative to JM when no convenient and realistic joint distribution can be specified.},
Doi = {10.1177/0962280206074463},
Owner = {alyssa},
Timestamp = {2016.09.27},
Topics = {multiple imputation; fcs}
}
@Article{vanbuuren_etal_JSCS2006,
Title = {Fully conditional specification in multivariate imputation},
Author = {van Buuren, S. and Brand, J. P. L. and Groothuis-Oudshoorn, C. G. M. and Rubin, D. B.},
Journal = {Journal of Statistical Computation and Simulation},
Year = {2006},
Number = {12},
Pages = {1049-1064},
Volume = {76},
Doi = {10.1080/10629360600810434},
Owner = {nathalie},
Timestamp = {2016.09.28},
Topics = {fcs; multiple imputation}
}
@Article{vanbuuren_groothuisoudshoorn_JSS2011,
Title = {{MICE}: multivariate imputation by chained equations in {R}},
Author = {van Buuren, S. and Groothuis-Oudshoorn, K.},
Journal = {Journal of Statistical Software},
Year = {2011},
Number = {3},
Pages = {1-67},
Volume = {45},
Abstract = {The R package mice imputes incomplete multivariate data by chained equations. The software mice 1.0 appeared in the year 2000 as an S-PLUS library, and in 2001 as an R package. mice 1.0 introduced predictor selection, passive imputation and automatic pooling. This article documents mice 2.9, which extends the functionality of mice 1.0 in several ways. In mice 2.9, the analysis of imputed data is made completely general, whereas the range of models under which pooling works is substantially extended. mice 2.9 adds new functionality for imputing multilevel data, automatic predictor selection, data handling, post-processing imputed values, specialized pooling routines, model selection tools, and diagnostic graphs. Imputation of categorical data is improved in order to bypass problems caused by perfect prediction. Special attention is paid to transformations, sum scores, indices and interactions using passive imputation, and to the proper setup of the predictor matrix. mice 2.9 can be downloaded from the Comprehensive R Archive Network. This article provides a hands-on, stepwise approach to solve applied incomplete data problems.},
Archiveprefix = {arXiv},
Arxivid = {NIHMS150003},
Doi = {10.18637/jss.v045.i03},
Eprint = {NIHMS150003},
ISBN = {9067436771},
ISSN = {1548-7660},
Owner = {alyssa},
Pmid = {22289957},
Timestamp = {2017.10.16},
Topics = {multiple imputation; chained equations}
}
@Article{candes_etal_IEEETSP2013,
Title = {Unbiased risk estimates for singular value thresholding and spectral estimators},
Author = {Cand\`es, E. J. and Sing-Long, C. A. and Trzasko, J. D.},
Journal = {IEEE Transactions on Signal Processing},
Year = {2013},
Number = {19},
Pages = {4643-4657},
Volume = {61},
Abstract = {In an increasing number of applications, it is of interest to recover an approximately low-rank data matrix from noisy observations. This paper develops an unbiased risk estimate -- holding in a Gaussian model -- for any spectral estimator obeying some mild regularity assumptions. In particular, we give an unbiased risk estimate formula for singular value thresholding (SVT), a popular estimation strategy that applies a soft-thresholding rule to the singular values of the noisy observations. Among other things, our formulas offer a principled and automated way of selecting regularization parameters in a variety of problems. In particular, we demonstrate the utility of the unbiased risk estimation for SVT-based denoising of real clinical cardiac MRI series data. We also give new results concerning the differentiability of certain matrix-valued functions.},
Doi = {10.1109/TSP.2013.2270464},
Owner = {nathalie},
Timestamp = {2018.05.09},
Topics = {factorial data analysis; misc}
}
@article{carpenter_etal_JRSS2006,
Title = {A comparison of multiple imputation and doubly robust estimation for analyses with missing data},
Author = {Carpenter, James R. and Kenward, Michael G. and Vansteelandt, Stijn},
Journal = {Journal of the Royal Statistical Society: Series A (Statistics in Society)},
Volume = {169},
Number = {3},
Pages = {571--584},
Year = {2006},
Abstract = {Multiple imputation is now a well-established technique for analysing data sets where some units have incomplete observations. Provided that the imputation model is correct, the resulting estimates are consistent. An alternative, weighting by the inverse probability of observing complete data on a unit, is conceptually simple and involves fewer modelling assumptions, but it is known to be both inefficient (relative to a fully parametric approach) and sensitive to the choice of weighting model. Over the last decade, there has been a considerable body of theoretical work to improve the performance of inverse probability weighting, leading to the development of ‘doubly robust’ or ‘doubly protected’ estimators. We present an intuitive review of these developments and contrast these estimators with multiple imputation from both a theoretical and a practical viewpoint.},
Keywords = {Double robustness; Inverse probability weighting; Missing at random; Multiple imputation},
Doi = {10.1111/j.1467-985X.2006.00407.x},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {ipw; mi}
}
@Book{carpenter_kenward_MIA2013,
Title = {Multiple Imputation and its Application},
Author = {Carpenter, J. and Kenward, M.},
Publisher = {Wiley},
Year = {2013},
Address = {Chichester, West Sussex, UK},
Abstract = {A practical guide to analysing partially observed data. Collecting, analysing and drawing inferences from data is central to research in the medical and social sciences. Unfortunately, it is rarely possible to collect all the intended data. The literature on inference from the resulting incomplete data is now huge, and continues to grow both as methods are developed for large and complex data structures, and as increasing computer power and suitable software enable researchers to apply these methods. This book focuses on a particular statistical method for analysing and drawing inferences from incomplete data, called Multiple Imputation (MI). MI is attractive because it is both practical and widely applicable. The authors' aim is to clarify the issues raised by missing data, describing the rationale for MI, the relationship between the various imputation models and associated algorithms and its application to increasingly complex data structures.},
Doi = {10.1002/9781119942283},
ISBN = {9780470740521},
Owner = {alyssa},
Timestamp = {2017.04.11},
Topics = {multiple imputation; general}
}
@InProceedings{chen_guestrin_2016,
Title = {XGBoost: A Scalable Tree Boosting System},
Author = {Chen, T. and Guestrin, C.},
Booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
Year = {2016},
Editor = {-},
Address = {New York, NY, USA},
Pages = {785-794},
Publisher = {ACM},
Abstract = {Tree boosting is a highly effective and widely used machine learning method. In this paper, we describe a scalable end-to-end tree boosting system called XGBoost, which is used widely by data scientists to achieve state-of-the-art results on many machine learning challenges. We propose a novel sparsity-aware algorithm for sparse data and weighted quantile sketch for approximate tree learning. More importantly, we provide insights on cache access patterns, data compression and sharding to build a scalable tree boosting system. By combining these insights, XGBoost scales beyond billions of examples using far fewer resources than existing systems.},
Doi = {10.1145/2939672.2939785},
Eventdate = {2016-08-13/2016-08-17},
ISBN = {978-1-4503-4232-2},
Owner = {imke},
Timestamp = {2018.10.30},
Keywords = {large-scale machine learning},
Topics = {random forests}
}
@Article{chen_shao_JOS2000,
Title = {Nearest neighbor imputation for survey data},
Author = {Chen, J. and Shao, J.},
Journal = {Journal of Official Statistics},
Year = {2000},
Number = {2},
Pages = {113-131},
Volume = {16},
Abstract = {Nearest neighbor imputation is one of the hot deck methods used to compensate for nonresponse in sample surveys. Although it has a long history of application, few theoretical properties of the nearest neighbor imputation method are known prior to the current article. We show that under some conditions, the nearest neighbor imputation method provides asymptotically unbiased and consistent estimators of functions of population means (or totals), population distributions, and population quantiles. We also derive the asymptotic variances for estimators based on nearest neighbor imputation and consistent estimators of these asymptotic variances. Some simulation results show that the estimators based on nearest neighbor imputation and the proposed variance estimators have good performances.},
ISSN = {0282-423X},
Keywords = {biases; hot deck; quantiles; sample means; variance estimation},
Mendeley-groups = {missing data},
Owner = {alyssa},
Timestamp = {2016.09.27},
Topics = {knn},
Url = {http://www.jos.nu/Articles/abstract.asp?article=162113}
}
@Article{collins_etal_PM2007,
Title = {A comparison of inclusive and restrictive strategies in modern missing data procedures},
Author = {Collins, L. M. and Schafer, J. L. and Kam, C.-M.},
Journal = {Psychological Methods},
Year = {2001},
Number = {4},
Pages = {330-351},
Volume = {6},
Abstract = {Two classes of modern missing data procedures, maximum likelihood (ML) and multiple imputation (MI), tend to yield similar results when implemented in comparable ways. In either approach, it is possible to include auxiliary variables solely for the purpose of improving the missing data procedure. A simulation was presented to assess the potential costs and benefits of a restrictive strategy, which makes minimal use of auxiliary variables, versus an inclusive strategy, which makes liberal use of such variables. The simulation showed that the inclusive strategy is to be greatly preferred. With an inclusive strategy not only is there a reduced chance of inadvertently omitting an important cause of missingness, there is also the possibility of noticeable gains in terms of increased efficiency and reduced bias, with only minor costs. As implemented in currently available software, the ML approach tends to encourage the use of a restrictive strategy, whereas the MI approach makes it relatively simple to use an inclusive strategy.},
Doi = {10.1037/1082-989X.6.4.330},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {multiple imputation; ml}
}
@Article{cranmer_gill_BJPS2012,
Title = {We have to be discrete about this: a non-parametric imputation technique for missing categorical data},
Author = {Cranmer, S. J. and Gill, J.},
Journal = {British Journal of Political Science},
Year = {2012},
Pages = {425-449},
Volume = {43},
Abstract = {Missing values are a frequent problem in empirical political science research. Surprisingly, the match between the measurement of the missing values and the correcting algorithms applied is seldom studied. While multiple imputation is a vast improvement over the deletion of cases with missing values, it is often unsuitable for imputing highly non-granular discrete data. We develop a simple technique for imputing missing values in such situations, which is a variant of hot deck imputation, drawing from the conditional distribution of the variable with missing values to preserve the discrete measure of the variable. This method is tested against existing techniques using Monte Carlo analysis and then applied to real data on democratization and modernization theory. Software for our imputation technique is provided in a free, easy-to-use package for the R statistical environment.},
Doi = {10.1017/S0007123412000312},
Owner = {nathalie},
Timestamp = {2016.02.15},
Topics = {knn; imputation}
}
@Article{crookston_finley_JSS2008,
Title = {{yaImpute}: an {R} package for {kNN} imputation},
Author = {Crookston, N. L. and Finley, A. O.},
Journal = {Journal of Statistical Software},
Year = {2008},
Number = {10},
Pages = {1-16},
Volume = {23},
Abstract = {This article introduces yaImpute, an R package for nearest neighbor search and imputation. Although nearest neighbor imputation is used in a host of disciplines, the methods implemented in the yaImpute package are tailored to imputation-based forest attribute estimation and mapping. The impetus to writing the yaImpute is a growing interest in nearest neighbor imputation methods for spatially explicit forest inventory, and a need within this research community for software that facilitates comparison among different nearest neighbor search algorithms and subsequent imputation techniques. yaImpute provides directives for defining the search space, subsequent distance calculation, and imputation rules for a given number of nearest neighbors. Further, the package offers a suite of diagnostics for comparison among results generated from different imputation analyses and a set of functions for mapping imputation results.},
Doi = {10.18637/jss.v023.i10},
Owner = {nathalie},
Timestamp = {2017.10.09},
Topics = {knn; imputation}
}
@Article{dax_2014,
Title = {Imputing Missing Entries of a Data Matrix: A review},
Author = {Dax, A.},
Journal = {Journal of Advanced Computing},
Year = {2014},
Pages = {98-222},
Volume = {3},
Number = {3},
Abstract = {This review presents a practical summary of the missing data literature, including a sketch of missing data theory and descriptions of normal-model multiple imputation (MI) and maximum likelihood methods. Practical missing data analysis issues are discussed, most notably the inclusion of auxiliary variables for improving power and reducing bias. Solutions are given for missing data challenges such as handling longitudinal, categorical, and clustered data with normal-model MI; including interactions in the missing data model; and handling large numbers of variables. The discussion of attrition and nonignorable missingness emphasizes the need for longitudinal diagnostics and for reducing the uncertainty about the missing data mechanism under attrition. Strategies suggested for reducing attrition bias include using auxiliary variables, collecting follow-up data on a sample of those initially missing, and collecting data on intent to drop out. Suggestions are given for moving forward with research on missing data and attrition.},
Doi = {10.7726/jac.2014.1007},
Keywords = {imputation; missing data; matrix completion problems; low-rank approximations; nearest neighbors; iterative SVD; least squares methods; rank minimization; nuclear norm minimization; error assessment; training set; probe set; cross-validation; rank determination},
Owner = {imke},
Timestamp = {2018.11.07},
Topics = {general_informal; knn; imputation}
}
@Article{dempster_etal_JRSSB1977,
Title = {Maximum likelihood from incomplete data via the {EM} algorithm},
Author = {Dempster, A. P. and Laird, N. M. and Rubin, D. B.},
Journal = {Journal of the Royal Statistical Society, Series B (Methodological)},
Year = {1977},
Number = {1},
Pages = {1-38},
Volume = {39},
Keywords = {maximum likelihood estimation; statistical variance; statism; factor analysis; algorithms; estimation methods; missing data; censored data; perceptron convergence procedure},
Owner = {nathalie},
Timestamp = {2018.05.11},
Topics = {ML},
Url = {http://www.jstor.org/stable/2984875}
}
@Article{diggle_kenward_AP1994,
Title = {Informative drop-out in longitudinal data analysis},
Author = {Diggle, P. and Kenward, M. G.},
Journal = {Journal of the Royal Statistical Society, Series C (Applied Statistics)},
Year = {1994},
Number = {1},
Pages = {49-93},
Volume = {43},
Abstract = {A model is proposed for continuous longitudinal data with non-ignorable or informative drop-out (ID). The model combines a multivariate linear model for the underlying response with a logistic regression model for the drop-out process. The latter incorporates dependence of the probability of drop-out on unobserved, or missing, observations. Parameters in the model are estimated by using maximum likelihood (ML) and inferences drawn through conventional likelihood procedures. In particular, likelihood ratio tests can be used to assess the informativeness of the drop-out process through comparison of the full model with reduced models corresponding to random drop-out (RD) and completely random processes. A simulation study is used to assess the procedure in two settings: the comparison of time trends under a linear regression model with autocorrelated errors and the estimation of period means and treatment differences from a four-period four-treatment crossover trial. It is seen in both settings that, when data are generated under an ID process, the ML estimators from the ID model do not suffer from the bias that is present in the ordinary least squares and RD ML estimators. The approach is then applied to three examples. These derive from a milk protein trial involving three groups of cows, milk yield data from a study of mastitis in dairy cattle and data from a multicentre clinical trial on the study of depression. All three examples provide evidence of an underlying ID process, two with some strength. It is seen that the assumption of an ID rather than an RD process has practical implications for the interpretation of the data.},
Doi = {10.2307/2986113},
ISBN = {00359254},
ISSN = {00359254, 14679876},
Keywords = {longitudinal methods; missing data},
Mendeley-groups = {missing data},
Owner = {alyssa},
Pmid = {6121453},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Article{ding_li_SS2018,
Title = {Causal Inference: A Missing Data Perspective},
Author = {Ding, P. and Li, F.},
Journal = {Statistical Science},
Year = {2018},
Volume = {33},
Number = {2},
Pages = {214--237},
Abstract = {Inferring causal effects of treatments is a central goal in many disciplines. The potential outcomes framework is a main statistical approach to causal inference, in which a causal effect is defined as a comparison of the potential outcomes of the same units under different treatment conditions. Because for each unit at most one of the potential outcomes is observed and the rest are missing, causal inference is inherently a missing data problem. Indeed, there is a close analogy in the terminology and the inferential framework between causal inference and missing data. Despite the intrinsic connection between the two subjects, statistical analyses of causal inference and missing data also have marked differences in aims, settings and methods. This article provides a systematic review of causal inference from the missing data perspective. Focusing on ignorable treatment assignment mechanisms, we discuss a wide range of causal inference methods that have analogues in missing data analysis, such as imputation, inverse probability weighting and doubly robust methods. Under each of the three modes of inference—Frequentist, Bayesian and Fisherian randomization—we present the general structure of inference for both finite-sample and super-population estimands, and illustrate via specific examples. We identify open questions to motivate more research to bridge the two fields.},
Doi = {10.1214/18-STS645},
Keywords = {assignment mechanism; ignorability; imputation; missing data mechanism; observational studies; potential outcome; propensity score; randomization; weighting},
Owner = {imke},
Timestamp = {2018.12.11},
Topics = {causal inference}
}
@Article{ding_simonoff_JMLR2010,
Title = {An investigation of missing data methods for classification trees applied to binary response data},
Author = {Ding, Y. and Simonoff, J. S.},
Journal = {Journal of Machine Learning Research},
Year = {2010},
Pages = {131-170},
Volume = {11},
Number = {1},
Abstract = {There are many different methods used by classification tree algorithms when missing data occur in the predictors, but few studies have been done comparing their appropriateness and performance. This paper provides both analytic and Monte Carlo evidence regarding the effectiveness of six popular missing data methods for classification trees applied to binary response data. We show that in the context of classification trees, the relationship between the missingness and the dependent variable, as well as the existence or non-existence of missing values in the testing data, are the most helpful criteria to distinguish different missing data methods. In particular, separate class is clearly the best method to use when the testing set has missing values and the missingness is related to the response variable. A real data set related to modeling bankruptcy of a firm is then analyzed. The paper concludes with discussion of adaptation of these results to logistic regression, and other potential generalizations.},
Keywords = {classification tree; missing data; separate class; rpart; C4.5; cart},
Owner = {nathalie},
Timestamp = {2016.11.30},
Topics = {imputation; surrogate variables; classification trees},
Url = {http://www.jmlr.org/papers/v11/ding10a.html}
}
@Article{dong_peng_SP2013,
Title = {Principled missing data methods for researchers},
Author = {Dong, Yiran and Peng, Chao-Ying Joanne},
Journal = {SpringerPlus},
Year = {2013},
Pages = {222},
Volume = {2},
Abstract = {The impact of missing data on quantitative research can be serious, leading to biased estimates of parameters, loss of information, decreased statistical power, increased standard errors, and weakened generalizability of findings. In this paper, we discussed and demonstrated three principled missing data methods: multiple imputation, full information maximum likelihood, and expectation-maximization algorithm, applied to a real-world data set. Results were contrasted with those obtained from the complete data set and from the listwise deletion method. The relative merits of each method are noted, along with common features they share. The paper concludes with an emphasis on the importance of statistical assumptions, and recommendations for researchers. Quality of research will be enhanced if (a) researchers explicitly acknowledge missing data problems and the conditions under which they occurred, (b) principled methods are employed to handle missing data, and (c) the appropriate treatment of missing data is incorporated into review standards of manuscripts submitted for publication.},
Doi = {10.1186/2193-1801-2-222},
Keywords = {missing data; listwise deletion; mi; fiml; em; mar; mcar; mnar},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {general_informal}
}
@Book{enders_AMDA2010,
Title = {Applied Missing Data Analysis},
Author = {Enders, C. K.},
Publisher = {Guilford Press},
Year = {2010},
Abstract = {Walking readers step by step through complex concepts, this book translates missing data techniques into something that applied researchers and graduate students can understand and utilize in their own research. Enders explains the rationale and procedural details for maximum likelihood estimation, Bayesian estimation, multiple imputation, and models for handling missing not at random (MNAR) data. Easy-to-follow examples and small simulated data sets illustrate the techniques and clarify the underlying principles. The companion website includes data files and syntax for the examples in the book as well as up-to-date information on software. The book is accessible to substantive researchers while providing a level of detail that will satisfy quantitative specialists.},
ISBN = {9781606236390},
Owner = {alyssa},
Pages = {401},
Timestamp = {2016.09.27},
Topics = {general}
}
@Article{enders_SEM2001,
Title = {A primer on maximum likelihood algorithms available for use with missing data},
Author = {Enders, C. K.},
Journal = {Structural Equation Modeling},
Year = {2001},
Number = {1},
Pages = {128-141},
Volume = {8},
Abstract = {Maximum likelihood algorithms for use with missing data are becoming commonplace in microcomputer packages. Specifically, 3 maximum likelihood algorithms are currently available in existing software packages: the multiple-group approach, full information maximum likelihood estimation, and the EM algorithm. Although they belong to the same family of estimator, confusion appears to exist over the differences among the 3 algorithms. This article provides a comprehensive, nontechnical overview of the 3 maximum likelihood algorithms. Multiple imputation, which is frequently used in conjunction with the EM algorithm, is also discussed.},
Doi = {10.1207/S15328007SEM0801_7},
Owner = {alyssa},
Timestamp = {2017.07.07},
Topics = {ml}
}
@article{fang_etal_2018,
Title = {Imputation-based adjusted score equations in generalized linear models with nonignorable missing covariate values},
Author = {Fang, F. and Zhao, J. and Shao, J.},
Journal = {Statistica Sinica},
Volume = {28},
Year = {2018},
Number = {4},
Pages = {1677--1701},
Publisher = {Institute of Statistical Science},
Abstract = {We consider the estimation of unknown parameters in a generalized linear model when some covariates have nonignorable missing values. When an instrument, a covariate that helps identifying parameters under nonignorable missingness, is appropriately specified, a pseudo likelihood approach similar to that in Tang, Little and Raghunathan (2003) or Zhao and Shao (2015) can be applied. However, this approach does not work well when the instrument is a weak predictor of the response given other covariates. We show that the asymptotic variances of the pseudo likelihood estimators for the regression coefficients of covariates other than the instrument diverge to infinity as the regression coefficient of the instrument goes to 0. By an imputation-based adjustment for the score equations, we propose a new estimator for the regression coefficients of the covariates other than the instrument. This works well even if the instrument is a weak predictor. It is semiparametric since the propensity of missing covariate data is completely unspecified. To solve the adjusted score equation, we develop an iterative algorithm that can be applied by using standard softwares at each iteration. We establish some theoretical results on the convergence of the proposed iterative algorithm and asymptotic normality of the resulting estimators. A variance estimation formula is also derived. Some simulation results and a data example are presented for illustration.},
Doi = {10.5705/ss.202015.0437},
Keywords = {Adjusted likelihood; Identifiability; Nonignorable missing covariate data; Pseudo-likelihood; Semiparametric},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar}
}
@Article{fay_JASA1996,
Title = {Alternative paradigms for the analysis of imputed survey data},
Author = {Fay, R. E.},
Journal = {Journal of the American Statistical Association},
Year = {1996},
Number = {434},
Pages = {490-498},
Volume = {91},
Abstract = {Rubin has offered multiple imputation as a general approach to inference from survey data sets with missing values filled in through imputation. In many situations the multiple imputation variance estimator is consistent. In turn, this observation has lent support to a number of complex applications. In fact, however, the multiple imputation variance estimator is inconsistent under some simple conditions. This article extends previous work of Rao and Shao and of Fay directed toward consistent variance estimation under wider conditions. Extensions of Rao and Shao's results to fractionally weighted imputation combine the estimation efficiency of multiple imputation and the consistency of the Rao-Shao variance estimator.},
Doi = {10.1080/01621459.1996.10476909},
Keywords = {fractionally weighted imputation; missing data; multiple imputation; Rao-Shao variance estimator},
Owner = {nathalie},
Timestamp = {2018.05.16},
Topics = {multiple imputation}
}
@Article{fellegi_holt_JASA1976,
Title = {A systematic approach to automatic edit and imputation},
Author = {Fellegi, I. P. and Holt, D.},
Journal = {Journal of the American Statistical Association},
Year = {1976},
Number = {353},
Pages = {17-35},
Volume = {71},
Doi = {10.2307/2285726},
Owner = {nathalie},
Timestamp = {2018.05.23},
Topics = {imputation}
}
@Article{ferrari_etal_CSDA2011,
Title = {An imputation method for categorical variables with application to nonlinear principal component analysis},
Author = {Ferrari, Pier Alda and Annoni, Paola and Barbiero, Alessandro and Manzi, Giancarlo},
Journal = {Computational Statistics \& Data Analysis},
Year = {2011},
Number = {7},
Pages = {2410-2420},
Volume = {55},
Abstract = {The problem of missing data in building multidimensional composite indicators is a delicate problem which is often underrated. An imputation method particularly suitable for categorical data is proposed. This method is discussed in detail in the framework of nonlinear principal component analysis and compared to other missing data treatments which are commonly used in this analysis. Its performance vs. these other methods is evaluated throughout a simulation procedure performed on both an artificial case, varying the experimental conditions, and a real case. The proposed procedure is implemented using R.},
Doi = {10.1016/j.csda.2011.02.007},
Keywords = {composite indicators; forward imputation; imputation procedure; listwise deletion; nearest neighbor; ordinal data; passive treatment},
Owner = {nathalie},
Timestamp = {2018.06.07},
Topics = {imputation; knn; factorial data analysis}
}
@Article{finkbeiner_P1979,
Title = {Estimation for the multiple factor model when data are missing},
Author = {Finkbeiner, C.},
Journal = {Psychometrika},
Year = {1979},
Number = {4},
Pages = {409-420},
Volume = {44},
Abstract = {A maximum likelihood method of estimating the parameters of the multiple factor model when data are missing from the sample is presented. A Monte Carlo study compares the method with 5 heuristic methods of dealing with the problem. The present method shows some advantage in accuracy of estimation over the heuristic methods but is considerably more costly computationally.},
Doi = {10.1007/BF02296204},
Keywords = {factor analysis; missing data},
Owner = {nathalie},
Timestamp = {2018.05.11},
Topics = {imputation; ml}
}
@Article{fitzmorice_etal_JRSS1995,
Title = {Regression Models for Longitudinal Binary Responses with Informative Drop-Outs},
Author = {Fitzmaurice, Garrett M. and Molenberghs, Geert and Lipsitz, Stuart R.},
Journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
Number = {4},
Pages = {691--704},
Publisher = {[Royal Statistical Society, Wiley]},
Volume = {57},
Year = {1995},
Abstract = {This paper reviews both likelihood-based and non-likelihood (generalized estimating equations) regression models for longitudinal binary responses when there are drop-outs. Throughout, it is assumed that the regression parameters for the marginal expectations of the binary responses are of primary scientific interest. The association or time dependence between the responses is largely regarded as a nuisance characteristic of the data. The performance of the methods is compared, in terms of asymptotic bias, under misspecification of the association between the responses and the missing data mechanism or drop-out process.},
ISSN = {00359246},
Url = {http://www.jstor.org/stable/2345937},
Keywords = {Generalized Estimating Equations; Maximum Likelihood Estimation; Missing Data; Repeated Measures},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {survey}
}
@Article{follman_wu_B1995,
Title = {An approximate generalized linear model with random effects for informative missing data},
Author = {Follmann, D. and Wu, M.},
Journal = {Biometrics},
Year = {1995},
Number = {1},
Pages = {151-168},
Volume = {51},
Abstract = {This paper develops a class of models to deal with missing data from longitudinal studies. We assume that separate models for the primary response and missingness (e.g., number of missed visits) are linked by a common random parameter. Such models have been developed in the econometrics (Heckman, 1979, Econometrica 47, 153-161) and biostatistics (Wu and Carroll, 1988, Biometrics 44, 175-188) literature for a Gaussian primary response. We allow the primary response, conditional on the random parameter, to follow a generalized linear model and approximate the generalized linear model by conditioning on the data that describes missingness. The resultant approximation is a mixed generalized linear model with possibly heterogeneous random effects. An example is given to illustrate the approximate approach, and simulations are performed to critique the adequacy of the approximation for repeated binary data.},
Doi = {10.2307/2533322},
ISSN = {0006341X, 15410420},
Owner = {alyssa},
Publisher = {[Wiley, International Biometric Society]},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Article{gad_darwish_AJAMS2013,
Title = {A shared parameter model for longitudinal data with missing values},
Author = {Gad, A. M. and Darwish, N. M. M.},
Journal = {American Journal of Applied Mathematics and Statistics},
Year = {2013},
Number = {2},
Pages = {30-35},
Volume = {1},
Abstract = {Longitudinal studies represent one of the principal research strategies employed in medical and social research. These studies are the most appropriate for studying individual change over time. The premature withdrawal of some subjects from the study (dropout) is termed nonrandom when the probability of missingness depends on the missing value. Nonrandom dropout is a common phenomenon associated with longitudinal data, and it complicates statistical inference. The shared parameter model is used to fit longitudinal data in the presence of nonrandom dropout. The stochastic EM algorithm is developed to obtain the model parameter estimates. Also, parameter estimates of the dropout model have been obtained. Standard errors of estimates have been calculated using the developed Monte Carlo method. The performance of the proposed approach is evaluated through a simulation study. Also, the proposed approach is applied to a real data set.},
Owner = {alyssa},
Timestamp = {2017.08.07},
Topics = {mnar},
Url = {http://pubs.sciepub.com/ajams/1/2/3}
}
@Article{gelman_etal_1998,
Title = {Not asked and not answered: Multiple imputation for multiple surveys},
Author = {Gelman, A. and King, G. and Liu, C.},
Journal = {Journal of the American Statistical Association},
Volume = {93},
Number = {443},
Pages = {846--857},
Year = {1998},
Publisher = {Taylor \& Francis Group},
Abstract = {We present a method of analyzing a series of independent cross-sectional surveys in which some questions are not answered in some surveys and some respondents do not answer some of the questions posed. The method is also applicable to a single survey in which different questions are asked or different sampling methods are used in different strata or clusters. Our method involves multiply imputing the missing items and questions by adding to existing methods of imputation designed for single surveys a hierarchical regression model that allows covariates at the individual and survey levels. Information from survey weights is exploited by including in the analysis the variables on which the weights were based, and then reweighting individual responses (observed and imputed) to estimate population quantities. We also develop diagnostics for checking the fit of the imputation model based on comparing imputed data to nonimputed data. We illustrate with the example that motivated this project: a study of pre-election public opinion polls in which not all the questions of interest are asked in all the surveys, so that it is infeasible to impute within each survey separately.},
Keywords = {Bayesian inference; cluster sampling; diagnostics; hierarchical models; ignorable nonresponse; missing data; political science; sample surveys; stratified sampling},
Doi = {10.1080/01621459.1998.10473737},
Owner = {imke},
Timestamp = {2018.11.19},
Topics = {mi; survey}
}
@Article{gelman_etal_2005,
Title = {Multiple Imputation for Model Checking: Completed-Data Plots with Missing and Latent Data},
Author = {Gelman, A. and van Mechelen, I. and Verbeke, G. and Heitjan, D. F. and Meulders, M.},
Journal = {Biometrics},
Volume = {61},
Number = {1},
Pages = {74--85},
Year = {2005},
Publisher = {Wiley Online Library},
Abstract = {In problems with missing or latent data, a standard approach is to first impute the unobserved data, then perform all statistical analyses on the completed dataset -- corresponding to the observed data and imputed unobserved data -- using standard procedures for complete‐data inference. Here, we extend this approach to model checking by demonstrating the advantages of the use of completed‐data model diagnostics on imputed completed datasets. The approach is set in the theoretical framework of Bayesian posterior predictive checks (but, as with missing‐data imputation, our methods of missing‐data model checking can also be interpreted as “predictive inference” in a non‐Bayesian context). We consider the graphical diagnostics within this framework. Advantages of the completed‐data approach include: (1) One can often check model fit in terms of quantities that are of key substantive interest in a natural way, which is not always possible using observed data alone. (2) In problems with missing data, checks may be devised that do not require to model the missingness or inclusion mechanism; the latter is useful for the analysis of ignorable but unknown data collection mechanisms, such as are often assumed in the analysis of sample surveys and observational studies. (3) In many problems with latent data, it is possible to check qualitative features of the model (for example, independence of two variables) that can be naturally formalized with the help of the latent data. We illustrate with several applied examples.},
Keywords = {Bayesian model checking; exploratory data analysis; multiple imputation; nonresponse; posterior predictive checks; realized discrepancies; residuals},
Doi = {10.1111/j.0006-341X.2005.031010.x},
Owner = {imke},
Timestamp = {2018.11.19},
Topics = {mi}
}
@InProceedings{gill_etal_1997,
Title = {Coarsening at random: Characterizations, conjectures, counter-examples},
Author = {Gill, Richard D and Van Der Laan, Mark J and Robins, James M},
Booktitle = {Proceedings of the First Seattle Symposium in Biostatistics},
Pages = {255--294},
Year = {1997},
Organization = {Springer},
Abstract = {The notion of coarsening at random (CAR) was introduced by Heitjan and Rubin (1991) to describe the most general form of randomly grouped, censored, or missing data, for which the coarsening mechanism can be ignored when making likelihood-based inference about the parameters of the distribution of the variable of interest. The CAR assumption is popular, and applications abound. However the full implications of the assumption have not been realized. Moreover a satisfactory theory of CAR for continuously distributed data -- which is needed in many applications, particularly in survival analysis -- hardly exists as yet. This paper gives a detailed study of CAR. We show that grouped data from a finite sample space always fit a CAR model: a nonparametric model for the variable of interest together with the assumption of an arbitrary CAR mechanism puts no restriction at all on the distribution of the observed data. In a slogan, CAR is everything. We describe what would seem to be the most general way CAR data could occur in practice, a sequential procedure called randomized monotone coarsening. We show that CAR mechanisms exist which are not of this type. Such a coarsening mechanism uses information about the underlying data which is not revealed to the observer, without this affecting the observer’s conclusions. In a second slogan, CAR is more than it seems. This implies that if the analyst can argue from subject-matter considerations that coarsened data is CAR, he or she has knowledge about the structure of the coarsening mechanism which can be put to good use in non-likelihood-based inference procedures. We argue that this is a valuable option in multivariate survival analysis. We give a new definition of CAR in general sample spaces, criticising earlier proposals, and we establish parallel results to the discrete case. The new definition focusses on the distribution rather than the density of the data. It allows us to generalise the theory of CAR to the important situation where coarsening variables (e.g., censoring times) are partially observed as well as the variables of interest.},
Keywords = {coarsening at random; CAR; missingness mechanisms; survival analysis},
Doi = {10.1007/978-1-4684-6316-3_14},
Owner = {imke},
Timestamp = {2019.08.02},
Topics = {mnar; mechanisms}
}
@InProceedings{gondara_wang_2018,
Title = {MIDA: Multiple Imputation using Denoising Autoencoders},
Author = {Gondara, L. and Wang, K.},
Booktitle = {Proceedings of the 22nd Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 2018)},
Series = {Lecture Notes in Computer Science},
Year = {2018},
Editor = {Phung, D. and Tseng, V. and Webb, G. and Ho, B. and Ganji, M. and Rashidi, L.},
Pages = {260-272},
Publisher = {Springer International Publishing},
Eventdate = {2018-06-03/2018-06-06},
ISBN = {3319930404},
Abstract = {Missing data is a significant problem impacting all domains. State-of-the-art framework for minimizing missing data bias is multiple imputation, for which the choice of an imputation model remains nontrivial. We propose a multiple imputation model based on overcomplete deep denoising autoencoders. Our proposed model is capable of handling different data types, missingness patterns, missingness proportions and distributions. Evaluation on several real life datasets show our proposed model significantly outperforms current state-of-the-art methods under varying conditions while simultaneously improving end of the line analytics.},
Doi = {10.1007/978-3-319-93040-4_21},
Url = {https://arxiv.org/abs/1705.02737},
Keywords = {Multiple imputation; denoising autoencoders; DAE},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {multiple imputation; deep learning}
}
@InProceedings{goodfellow_etal_2013,
Title = {Multi-Prediction Deep Boltzmann Machines},
Author = {Goodfellow, I. and Mirza, M. and Courville, A. and Bengio, Y.},
Booktitle = {Proceedings of the 26th International Conference on Neural Information Processing Systems},
Series = {Advances in Neural Information Processing Systems 26},
Editor = {Burges, C.J.C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K.Q.},
Pages = {548--556},
Year = {2013},
Publisher = {Curran Associates, Inc.},
Eventdate = {2013-12-05/2013-12-10},
Abstract = {We introduce the Multi-Prediction Deep Boltzmann Machine (MP-DBM). The MP-DBM can be seen as a single probabilistic model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent nets that share parameters and approximately solve different inference problems. Prior methods of training DBMs either do not perform well on classification tasks or require an initial learning pass that trains the DBM greedily, one layer at a time. The MP-DBM does not require greedy layerwise pretraining, and outperforms the standard DBM at classification, classification with missing inputs, and mean field prediction tasks.},
Url = {http://papers.nips.cc/paper/5024-multi-prediction-deep-boltzmann-machines.pdf},
Keywords = {Classification; deep Boltzmann Machines; DBM; pseudolikelihood},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {classification; deep learning}
}
@Article{graham_ARP2009,
Title = {Missing data analysis: making it work in the real world},
Author = {Graham, J. W.},
Journal = {Annual Review of Psychology},
Year = {2009},
Pages = {549-576},
Volume = {60},
Abstract = {This review presents a practical summary of the missing data literature, including a sketch of missing data theory and descriptions of normal-model multiple imputation (MI) and maximum likelihood methods. Practical missing data analysis issues are discussed, most notably the inclusion of auxiliary variables for improving power and reducing bias. Solutions are given for missing data challenges such as handling longitudinal, categorical, and clustered data with normal-model MI; including interactions in the missing data model; and handling large numbers of variables. The discussion of attrition and nonignorable missingness emphasizes the need for longitudinal diagnostics and for reducing the uncertainty about the missing data mechanism under attrition. Strategies suggested for reducing attrition bias include using auxiliary variables, collecting follow-up data on a sample of those initially missing, and collecting data on intent to drop out. Suggestions are given for moving forward with research on missing data and attrition.},
Doi = {10.1146/annurev.psych.58.110405.085530},
ISBN = {0066-4308 (Print) 0066-4308 (Linking)},
ISSN = {0066-4308},
Mendeley-groups = {missing data},
Owner = {alyssa},
Pmid = {18652544},
Shorttitle = {Missing Data Analysis},
Timestamp = {2016.11.30},
Topics = {general}
}
@InBook{graham_etal_SP1997,
Title = {The Science of Prevention: Methodological Advances from Alcohol and Substance Abuse Research},
Booktitle = {The Science of Prevention: Methodological Advances from Alcohol and Substance Abuse Research},
Author = {Graham, J. W. and Hofer, S. M. and Donaldson, S. I. and MacKinnon, D. P. and Schafer, J. L.},
Chapter = {Analysis with missing data in prevention research},
Editor = {Bryant, K.J. and Windle, M. and West, S.G.},
Pages = {325-366},
Publisher = {American Psychological Association},
Year = {1997},
Address = {Washington, DC, USA},
Abstract = {(from the chapter) Outlines leading approaches to dealing with missing data problems, specifically as they apply to alcohol and drug prevention research. First, the authors discuss methods for missing continuous data. (the expectation-maximization algorithm, multiple imputation, multiple-group structural equation modeling, and raw maximum-likelihood). Next, they discuss missing categorical data, present the beginnings of a maximum-likelihood approach to analysis with missing categorical data, discuss the use of a multiple imputation procedure for categorical data, and touch on the use of continuous-data methods for analyzing categorical data. The authors then discuss what happens when the assumptions underlying their recommended approach are not met fully with new data being presented relating to the causes of missingness. A general sensitivity analysis for the case in which the assumptions of the recommended missing data procedures are not met fully is presented. Finally, the authors discuss new approaches available to prevention and applied psychological researchers and suggest that prevention studies in general may be relatively free from serious attrition biases (if recommended analyses are used). (PsycINFO Database Record (c) 2002 APA, all rights reserved).},
Doi = {10.1037/10222-010},
ISBN = {1-55798-439-5},
ISSN = {1046-9516},
Keywords = {alcohol abuse; drug abuse prevention; experimentation; statistical estimation; maximum likelihood},
Owner = {nathalie},
Pmid = {9243532},
Timestamp = {2018.07.12},
Topics = {}
}
@Article{graham_etal_PS2007,
Title = {How many imputations are really needed? Some practical clarifications of multiple imputation theory},
Author = {Graham, John W. and Olchowski, Allison E. and Gilreath, Tamika E.},
Journal = {Prevention Science},
Year = {2007},
Number = {3},
Pages = {206-213},
Volume = {8},
Abstract = {Multiple imputation (MI) and full information maximum likelihood (FIML) are the two most common approaches to missing data analysis. In theory, MI and FIML are equivalent when identical models are tested using the same variables, and when m, the number of imputations performed with MI, approaches infinity. However, it is important to know how many imputations are necessary before MI and FIML are sufficiently equivalent in ways that are important to prevention scientists. MI theory suggests that small values of m, even on the order of three to five imputations, yield excellent results. Previous guidelines for sufficient m are based on relative efficiency, which involves the fraction of missing information (gamma) for the parameter being estimated, and m. In the present study, we used a Monte Carlo simulation to test MI models across several scenarios in which gamma and m were varied. Standard errors and p-values for the regression coefficient of interest varied as a function of m, but not at the same rate as relative efficiency. Most importantly, statistical power for small effect sizes diminished as m became smaller, and the rate of this power falloff was much greater than predicted by changes in relative efficiency. Based on our findings, we recommend that researchers using MI should perform many more imputations than previously considered sufficient. These recommendations are based on gamma, and take into consideration one's tolerance for a preventable power falloff (compared to FIML) due to using too few imputations.},
Doi = {10.1007/s11121-007-0070-9},
Keywords = {multiple imputation; number of imputations; full information maximum likelihood; missing data; statistical power},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {multiple imputation}
}
@Article{heckman_E1979,
Title = {Sample selection bias as a specification error},
Author = {Heckman, J.},
Journal = {Econometrica},
Year = {1979},
Number = {1},
Pages = {153-161},
Volume = {47},
Abstract = {This paper discusses the bias that results from using nonrandomly selected samples to estimate behavioral relationships as an ordinary specification error or "omitted variables" bias. A simple consistent two stage estimator is considered that enables analysts to utilize simple regression methods to estimate behavioral functions by least squares methods. The asymptotic distribution of the estimator is derived.},
Doi = {10.2307/1912352},
ISSN = {00129682, 14680262},
Owner = {alyssa},
Publisher = {[Wiley, Econometric Society]},
Timestamp = {2017.10.25},
Topics = {}
}
@Article{heckman_AESM1976,
Title = {The common structure of statistical models of truncation, sample selection and limited dependent variables and a simple estimator for such models},
Author = {Heckman, J. J.},
Journal = {Annals of Economic and Social Measurement},
Year = {1976},
Number = {4},
Pages = {475-492},
Volume = {5},
ISBN = {0691003637},
ISSN = {0361-8595},
Mendeley-groups = {missing data},
Owner = {alyssa},
Pmid = {9590},
Timestamp = {2017.10.25},
Topics = {mnar},
Url = {http://ideas.repec.org/h/nbr/nberch/10491.html}
}
@Article{hogan_laird_SM1997,
Title = {Mixture models for the joint distribution of repeated measures and event times},
Author = {Hogan, J. W. and Laird, N. M.},
Journal = {Statistics in Medicine},
Year = {1997},
Number = {1-3},
Pages = {239-257},
Volume = {16},
Abstract = {Many long-term clinical trials collect both a vector of repeated measurements and an event time on each subject; often, the two outcomes are dependent. One example is the use of surrogate markers to predict disease onset or survival. Another is longitudinal trials which have outcome-related dropout. We describe a mixture model for the joint distribution which accommodates incomplete repeated measures and right-censored event times, and provide methods for full maximum likelihood estimation. The methods are illustrated through analysis of data from a clinical trial for a new schizophrenia therapy; in the trial, dropout time is closely related to outcome, and the dropout process differs between treatments. The parameter estimates from the model are used to make a treatment comparison after adjusting for the effects of dropout. An added benefit of the analysis is that it permits using the repeated measures to increase efficiency of estimates of the event time distribution.},
Keywords = {mnar; mixture model; ml; clinical trial},
Owner = {alyssa},
Timestamp = {2017.08.07},
Topics = {}
}
@Article{hogan_lancaster_2004,
Title = {Instrumental variables and inverse probability weighting for causal inference from longitudinal observational studies},
Author = {Hogan, J. W. and Lancaster, T.},
Journal = {Statistical Methods in Medical Research},
Volume = {13},
Number = {1},
Pages = {17-48},
Year = {2004},
Doi = {10.1191/0962280204sm351ra},
Abstract = {Inferring causal effects from longitudinal repeated measures data has high relevance to a number of areas of research, including economics, social sciences and epidemiology. In observational studies in particular, the treatment receipt mechanism is typically not under the control of the investigator; it can depend on various factors, including the outcome of interest. This results in differential selection into treatment levels, and can lead to selection bias when standard routines such as least squares regression are used to estimate causal effects. Interestingly, both the characterization of and methodology for handling selection bias can differ substantially by disciplinary tradition. In social sciences and economics, instrumental variables (IV) is the standard method for estimating linear and nonlinear models in which the error term may be correlated with an observed covariate. When such correlation is not ruled out, the covariate is called endogenous and least squares estimates of the covariate effect are typically biased. The availability of an instrumental variable can be used to reduce or eliminate the bias. In public health and clinical medicine (e.g., epidemiology and biostatistics), selection bias is typically viewed in terms of confounders, and the prevailing methods are geared toward making proper adjustments via explicit use of observed confounders (e.g., stratification, standardization). A class of methods known as inverse probability weighting (IPW) estimators, which relies on modeling selection in terms of confounders, is gaining in popularity for making such adjustments. Our objective is to review and compare IPW and IV for estimating causal treatment effects from longitudinal data, where the treatment may vary with time. We accomplish this by defining the causal estimands in terms of a linear stochastic model of potential outcomes (counterfactuals). Our comparison includes a review of terminology typically used in discussions of causal inference (e.g., confounding, endogeneity); a review of assumptions required to identify causal effects and their implications for estimation and interpretation; description of estimation via inverse weighting and instrumental variables; and a comparative analysis of data from a longitudinal cohort study of HIV-infected women. In our discussion of assumptions and estimation routines, we try to emphasize sufficient conditions needed to implement relatively standard analyses that can essentially be formulated as regression models. In that sense this review is geared toward the quantitative practitioner. The objective of the data analysis is to estimate the causal (therapeutic) effect of receiving combination antiviral therapy on longitudinal CD4 cell counts, where receipt of therapy varies with time and depends on CD4 count and other covariates. Assumptions are reviewed in context, and resulting inferences are compared. The analysis illustrates the importance of considering the existence of unmeasured confounding and of checking for ‘weak instruments.’ It also suggests that IV methodology may have a role in longitudinal cohort studies where potential instrumental variables are available.},
Keywords = {causal inference; longitudinal data; ipw; instrumental variables},
Owner = {imke},
Timestamp = {2018.11.12},
Topics = {causal inference}
}
@Article{honaker_etal_JSS2011,
Title = {Amelia {II}: a program for missing data},
Author = {Honaker, J. and King, G. and Blackwell, M.},
Journal = {Journal of Statistical Software},
Year = {2011},
Number = {7},
Volume = {45},
Abstract = {Amelia II "multiply imputes" missing data in a single cross-section (such as a survey), from a time series (like variables collected for each year in a country), or from a time-series-cross-sectional data set (such as collected by years for each of several countries). Amelia II implements our bootstrapping-based algorithm that gives essentially the same answers as the standard IP or EMis approaches, is usually considerably faster than existing approaches and can handle many more variables. Unlike Amelia I and other statistically rigorous imputation software, it virtually never crashes (but please let us know if you find to the contrary!). The program also generalizes existing approaches by allowing for trends in time series across observations within a cross-sectional unit, as well as priors that allow experts to incorporate beliefs they have about the values of missing cells in their data. Amelia II also includes useful diagnostics of the fit of multiple imputation models. The program works from the R command line or via a graphical user interface that does not require users to know R.},
Archiveprefix = {arXiv},
Arxivid = {arXiv:1501.0228},
Doi = {10.18637/jss.v045.i07},
Eprint = {arXiv:1501.0228},
ISBN = {1548-7660},
ISSN = {15487660},
Owner = {alyssa},
Pmid = {18291371},
Timestamp = {2017.10.16},
Topics = {multiple imputation}
}
@Article{hothorn_etal_2012,
Title = {Unbiased Recursive Partitioning: A Conditional Inference Framework},
Author = {Hothorn, T. and Hornik, K. and Zeileis, A.},
Journal = {Journal of Computational and Graphical Statistics},
Year = {2006},
Number = {3},
Pages = {651-674},
Volume = {15},
Abstract = {Recursive binary partitioning is a popular tool for regression analysis. Two fundamental problems of exhaustive search procedures usually applied to fit such models have been known for a long time: overfitting and a selection bias towards covariates with many possible splits or missing values. While pruning procedures are able to solve the overfitting problem, the variable selection bias still seriously affects the interpretability of tree-structured regression models. For some special cases unbiased procedures have been suggested, however lacking a common theoretical foundation. We propose a unified framework for recursive partitioning which embeds tree-structured regression models into a well defined theory of conditional inference procedures. Stopping criteria based on multiple test procedures are implemented and it is shown that the predictive performance of the resulting trees is as good as the performance of established exhaustive search procedures. It turns out that the partitions and therefore the models induced by both approaches are structurally different, confirming the need for an unbiased variable selection. Moreover, it is shown that the prediction accuracy of trees with early stopping is equivalent to the prediction accuracy of pruned trees with unbiased variable selection. The methodology presented here is applicable to all kinds of regression problems, including nominal, ordinal, numeric, censored as well as multivariate response variables and arbitrary measurement scales of the covariates. Data from studies on glaucoma classification, node positive breast cancer survival and mammography experience are re-analyzed.},
Doi = {10.1198/106186006X133933},
Keywords = {multiple testing; multivariate regression trees; ordinal regression trees; permutation tests; variable selection},
Owner = {imke},
Timestamp = {2018.10.30},
Topics = {regression trees; variable selection}
}
@Article{horton_kleinman_2007,
Title = {Much Ado About Nothing: A Comparison of Missing Data Methods and Software to Fit Incomplete Data Regression Models},
Author = {Horton, N. J. and Kleinman, K. P.},
Journal = {The American Statistician},
Year = {2007},
Number = {1},
Pages = {79-90},
Volume = {61},
Shorttitle = {Much Ado About Nothing},
Abstract = {Missing data are a recurring problem that can cause bias or lead to inefficient analyses. Statistical methods to address missingness have been actively pursued in recent years, including imputation, likelihood, and weighting approaches. Each approach is more complicated when there are many patterns of missing values, or when both categorical and continuous random variables are involved. Implementations of routines to incorporate observations with incomplete variables in regression models are now widely available. We review these routines in the context of a motivating example from a large health services research dataset. While there are still limitations to the current implementations, and additional efforts are required of the analyst, it is feasible to incorporate partially observed values, and these methods should be used in practice.},
Doi = {10.1198/000313007X172556},
Keywords = {multiple imputation; conditional gaussian; health services research; maximum likelihood; psychiatric epidemiology},
Owner = {imke},
Timestamp = {2018.11.07},
Topics = {general_informal}
}
@Article{huisman_QQ2000,
Title = {Imputation of missing item responses: some simple techniques},
Author = {Huisman, M.},
Journal = {Quality \& Quantity},
Year = {2000},
Number = {4},
Pages = {331-351},
Volume = {34},
Abstract = {Among the wide variety of procedures to handle missing data, imputing the missing values is a popular strategy to deal with missing item responses. In this paper some simple and easily implemented imputation techniques like item and person mean substitution, and some hot-deck procedures, are investigated. A simulation study was performed based on responses to items forming a scale to measure a latent trait of the respondents. The effects of different imputation procedures on the estimation of the latent ability of the respondents were investigated, as well as the effect on the estimation of Cronbach's alpha (indicating the reliability of the test) and Loevinger's H-coefficient (indicating scalability). The results indicate that procedures which use the relationships between items perform best, although they tend to overestimate the scale quality.},
Doi = {10.1023/A:1004782230065},
Keywords = {missing data; mean imputation; hot-deck imputation; item response theory; simulation},
Owner = {nathalie},
Timestamp = {2018.05.16},
Topics = {imputation; hot-deck}
}
@Article{husson_josse_FQP2013,
Title = {Handling missing values in multiple factor analysis},
Author = {Husson, F. and Josse, J.},
Journal = {Food Quality and Preference},
Year = {2013},
Pages = {77-85},
Volume = {30},
Doi = {10.1016/j.foodqual.2013.04.013},
Keywords = {exploratory multivariate analysis; missing values; multi-table data; multiple factor analysis; napping data},
Owner = {nathalie},
Timestamp = {2016.09.28},
Topics = {factorial data analysis; imputation}
}
@Article{ibrahim_etal_RSS1999,
Title = {Missing Covariates in Generalized Linear Models When the Missing Data Mechanism is Non-Ignorable},
Author = {Ibrahim, J. G. and Lipsitz, S. R. and Chen, M.},
Journal = {Journal of the Royal Statistical Society},
Year = {1999},
Number = {1},
Pages = {173-190},
Volume = {61},
Series = {Series B (Statistical Methodology)},
Publisher = {Wiley for the Royal Statistical Society},
Abstract = {We propose a method for estimating parameters in generalized linear models with missing covariates and a non‐ignorable missing data mechanism. We use a multinomial model for the missing data indicators and propose a joint distribution for them which can be written as a sequence of one‐dimensional conditional distributions, with each one‐dimensional conditional distribution consisting of a logistic regression. We allow the covariates to be either categorical or continuous. The joint covariate distribution is also modelled via a sequence of one‐dimensional conditional distributions, and the response variable is assumed to be completely observed. We derive the E‐ and M‐steps of the EM algorithm with non‐ignorable missing covariate data. For categorical covariates, we derive a closed form expression for the E‐ and M‐steps of the EM algorithm for obtaining the maximum likelihood estimates (MLEs). For continuous covariates, we use a Monte Carlo version of the EM algorithm to obtain the MLEs via the Gibbs sampler. Computational techniques for Gibbs sampling are proposed and implemented. The parametric form of the assumed missing data mechanism itself is not `testable' from the data, and thus the non‐ignorable modelling considered here can be viewed as a sensitivity analysis concerning a more complicated model. Therefore, although a model may have `passed' the tests for a certain missing data mechanism, this does not mean that we have captured, even approximately, the correct missing data mechanism. Hence, model checking for the missing data mechanism and sensitivity analyses play an important role in this problem and are discussed in detail. Several simulations are given to demonstrate the methodology. In addition, a real data set from a melanoma cancer clinical trial is presented to illustrate the methods proposed.},
Url = {http://www.jstor.org/stable/2680744},
Doi = {10.1111/1467-9868.00170},
Owner = {imke},
Timestamp = {2018.11.07},
Keywords = {EM algorithm; Gibbs sampling; Logistic regression; Maximum likelihood estimation; Missing data mechanism; Monte Carlo EM algorithm},
Topics = {em; mnar; glm; mcem; ml}
}
@Article{ibrahim_etal_2001,
Title = {Missing responses in generalised linear mixed models when the missing data mechanism is nonignorable},
Author = {Ibrahim, J. G. and Chen, M. and Lipsitz, S. R.},
Journal = {Biometrika},
Year = {2001},
Number = {2},
Pages = {551-564},
Volume = {88},
Publisher = {Oxford University Press},
Abstract = {We propose a method for estimating parameters in the generalised linear mixed model with nonignorable missing response data and with nonmonotone patterns of missing data in the response variable. We develop a Monte Carlo EM algorithm for estimating the parameters in the model via the Gibbs sampler. For the normal random effects model, we derive a novel analytical form for the E‐ and M‐steps, which is facilitated by integrating out the random effects. This form leads to a computationally feasible and extremely efficient Monte Carlo EM algorithm for computing maximum likelihood estimates and standard errors. In addition, we propose a very general joint multinomial model for the missing data indicators, which can be specified via a sequence of one‐dimensional conditional distributions. This multinomial model allows for an arbitrary correlation structure between the missing data indicators, and has the potential of reducing the number of nuisance parameters. Real datasets from the International Breast Cancer Study Group and an environmental study involving dyspnoea in cotton workers are presented to illustrate the proposed methods.},
Doi = {10.1093/biomet/88.2.551},
Owner = {imke},
Timestamp = {2018.11.11},
Keywords = {EM algorithm; Gibbs sampling; Maximum likelihood estimation; Missing data mechanism; Monte Carlo EM algorithm; Random effects model},
Topics = {em; mnar; glm; mcem; ml}
}
@Article{ilin_raiko_JMLR2010,
Title = {Practical approaches to {P}rincipal {C}omponent {A}nalysis in the presence of missing values},
Author = {Ilin, A. and Raiko, T.},
Journal = {Journal of Machine Learning Research},
Year = {2010},
Pages = {1957-2000},
Volume = {11},
Abstract = {Principal component analysis (PCA) is a classical data analysis technique that finds linear transformations of data that retain the maximal amount of variance. We study a case where some of the data values are missing, and show that this problem has many features which are usually associated with nonlinear models, such as overfitting and bad locally optimal solutions. A probabilistic formulation of PCA provides a good foundation for handling missing values, and we provide formulas for doing that. In case of high dimensional and very sparse data, overfitting becomes a severe problem and traditional algorithms for PCA are very slow. We introduce a novel fast algorithm and extend it to variational Bayesian learning. Different versions of PCA are compared in artificial experiments, demonstrating the effects of regularization and modeling of posterior variance. The scalability of the proposed algorithm is demonstrated by applying it to the Netflix problem.},
ISBN = {1532-4435},
Keywords = {missing values; overfitting; principal component analysis; regularization; variational},
Owner = {alyssa},
Timestamp = {2016.11.30},
Topics = {factorial data analysis; imputation},
Url = {http://jmlr.csail.mit.edu/papers/v11/ilin10a.html}
}
@Article{imbert_etal_B2018,
Title = {Multiple hot-deck imputation for network inference from {RNA} sequencing data},
Author = {Imbert, A. and Valsesia, A. and Le Gall, C. and Armenise, C. and Lefebvre, G. and Gourraud, P. and Viguerie, N. and Villa-Vialaneix, N.},
Journal = {Bioinformatics},
Year = {2018},
Number = {10},
Pages = {1726-1732},
Volume = {34},
Abstract = {Motivation: Network inference provides a global view of the relations existing between gene expression in a given transcriptomic experiment (often only for a restricted list of chosen genes). However, it is still a challenging problem: even if the cost of sequencing techniques has decreased over the last years, the number of samples in a given experiment is still (very) small compared to the number of genes. Results: We propose a method to increase the reliability of the inference when RNA-seq expression data have been measured together with an auxiliary dataset that can provide external information on gene expression similarity between samples. Our statistical approach, hd-MI, is based on imputation for samples without available RNA-seq data that are considered as missing data but are observed on the secondary dataset. hd-MI can improve the reliability of the inference for missing rates up to 30% and provides more stable networks with a smaller number of false positive edges. On a biological point of view, hd-MI was also found relevant to infer networks from RNA-seq data acquired in adipose tissue during a nutritional intervention in obese individuals. In these networks, novel links between genes were highlighted, as well as an improved comparability between the two steps of the nutritional intervention. Availability: Software and sample data are available as an R package, RNAseqNet, that can be downloaded from the Comprehensive R Archive Network (CRAN).},
Doi = {10.1093/bioinformatics/btx819},
Owner = {nathalie},
Timestamp = {2017.07.07},
Topics = {multiple imputation; hot-deck},
Website = {https://academic.oup.com/bioinformatic}
}
@InProceedings{jonsson_wohlin_ISSM2004,
Title = {An evaluation of k-nearest neighbour imputation using {L}ikert data},
Author = {J{\"{o}}nsson, P. and Wohlin, C.},
Booktitle = {Proceedings of the 10th International Symposium on Software Metrics},
Year = {2004},
Editor = {-},
Address = {Chicago, IL, USA},
Publisher = {IEEE},
Abstract = {Studies in many different fields of research suffer from the problem of missing data. With missing data, statistical tests will lose power, results may be biased, or analysis may not be feasible at all. There are several ways to handle the problem, for example through imputation. With imputation, missing values are replaced with estimated values according to an imputation method or model. In the k-nearest neighbour (k-NN) method, a case is imputed using values from the k most similar cases. In this paper, we present an evaluation of the k-NN method using Likert data in a software engineering context. We simulate the method with different values of k and for different percentages of missing data. Our findings indicate that it is feasible to use the k-NN method with Likert data. We suggest that a suitable value of k is approximately the square root of the number of complete cases. We also show that by relaxing the method rules with respect to selecting neighbours, the ability of the method remains high for large amounts of missing data without affecting the quality of the imputation.},
Doi = {10.1109/METRIC.2004.1357895},
Eventdate = {2004-09-14/2004-09-16},
ISBN = {0769521290},
ISSN = {15301435},
Owner = {alyssa},
Timestamp = {2017.05.29},
Topics = {knn}
}
@Article{jamshidian_jalal_P2010,
Title = {Tests of homoscedasticity, normality, and missing completely at random for incomplete multivariate data},
Author = {Jamshidian, M. and Jalal, S.},
Journal = {Psychometrika},
Year = {2010},
Number = {4},
Pages = {649-674},
Volume = {75},
Abstract = {Test of homogeneity of covariances (or homoscedasticity) among several groups has many applications in statistical analysis. In the context of incomplete data analysis, tests of homoscedasticity among groups of cases with identical missing data patterns have been proposed to test whether data are missing completely at random (MCAR). These tests of MCAR require large sample sizes n and/or large group sample sizes n(i), and they usually fail when applied to non-normal data. Hawkins (1981) proposed a test of multivariate normality and homoscedasticity that is an exact test for complete data when n(i) are small. This paper proposes a modification of this test for complete data to improve its performance, and extends its application to test of homoscedasticity and MCAR when data are multivariate normal and incomplete. Moreover, it is shown that the statistic used in the Hawkins test in conjunction with a nonparametric k-sample test can be used to obtain a nonparametric test of homoscedasticity that works well for both normal and non-normal data. It is explained how a combination of the proposed normal-theory Hawkins test and the nonparametric test can be employed to test for homoscedasticity, MCAR, and multivariate normality. Simulation studies show that the newly proposed tests generally outperform their existing competitors in terms of Type I error rejection rates. Also, a power study of the proposed tests indicates good power. The proposed methods use appropriate missing data imputations to impute missing data. Methods of multiple imputation are described and one of the methods is employed to confirm the result of our single imputation methods. Examples are provided where multiple imputation enables one to identify a group or groups whose covariance matrices differ from the majority of other groups.},
Archiveprefix = {arXiv},
Arxivid = {NIHMS150003},
Doi = {10.1007/s11336-010-9175-3},
Eprint = {NIHMS150003},
ISBN = {1133601091753},
ISSN = {00333123},
Keywords = {covariance structures; k-sample test; missing data; multiple imputation; nonparametric test; structural equations; test of homogeneity of covariances},
Owner = {alyssa},
Pmid = {21720450},
Timestamp = {2017.05.09},
Topics = {diagnosis}
}
@Article{jamshidian_etal_JSS2014,
Title = {{MissMech}: an {R} package for testing homoscedasticity, multivariate normality, and missing completely at random ({MCAR})},
Author = {Jamshidian, M. and Jalal, S. and Jansen, C.},
Journal = {Journal of Statistical Software},
Year = {2014},
Number = {6},
Pages = {1-31},
Volume = {56},
Abstract = {Researchers are often faced with analyzing data sets that are not complete. To properly analyze such data sets requires the knowledge of the missing data mechanism. If data are missing completely at random (MCAR), then many missing data analysis techniques lead to valid inference. Thus, tests of MCAR are desirable. The package MissMech implements two tests developed by Jamshidian and Jalal (2010) for this purpose. These tests can be run using a function called TestMCARNormality. One of the tests is valid if data are normally distributed, and another test does not require any distributional assumptions for the data. In addition to testing MCAR, in some special cases, the function TestMCARNormality is also able to test whether data have a multivariate normal distribution. As a bonus, the functions in MissMech can also be used for the following additional tasks: (i) test of homoscedasticity for several groups when data are completely observed, (ii) perform the k-sample test of Anderson-Darling to determine whether k groups of univariate data come from the same distribution, (iii) impute incomplete data sets using two methods, one where normality is assumed and one where no specific distributional assumptions are made, (iv) obtain normal-theory maximum likelihood estimates for mean and covariance matrix when data are incomplete, along with their standard errors, and finally (v) perform the Neyman's test of uniformity. All of these features are explained in the paper, including examples.},
Doi = {10.18637/jss.v056.i06},
ISBN = {1548-7660},
ISSN = {1548-7660},
Keywords = {Anderson-Darling test; goodness of fit test; Hawkins test; homogeneity of covariances; incomplete data; maximum likelihood estimate; missing data; Neyman's test; imputation},
Owner = {alyssa},
Timestamp = {2017.05.09},
Topics = {diagnosis}
}
@Article{jiang_etal_2018,
Title = {Logistic Regression with Missing Covariates--Parameter Estimation, Model Selection and Prediction},
Author = {Jiang, Wei and Josse, Julie and Lavielle, Marc},
Journal = {arXiv preprint},
ArchivePrefix = {arXiv},
Eprint = {1805.04602},
PrimaryClass = {stat.ME},
Year = {2018},
Abstract = {Logistic regression is a common classification method in supervised learning. Surprisingly, there are very few solutions for performing it and selecting variables in the presence of missing values. We develop a complete approach, including the estimation of parameters and variance of estimators, derivation of confidence intervals and a model selection procedure, for cases where the missing values can be anywhere in covariates. By well organizing different patterns of missingness in each observation, we propose a stochastic approximation version of the EM algorithm based on Metropolis-Hastings sampling, to perform statistical inference for logistic regression with incomplete data. We also tackle the problem of prediction for a new individual with missing values, which is never addressed. The methodology is computationally efficient, and its good coverage and variable selection properties are demonstrated in a simulation study where we contrast its performances to other methods. For instance, the popular multiple imputation by chained equation can lead to biased estimates while our method is unbiased. We then illustrate the method on a dataset of severely traumatized patients from Paris hospitals to predict the occurrence of hemorrhagic shock, a leading cause of early preventable death in severe trauma cases. The aim is to consolidate the current red flag procedure, a binary alert identifying patients with a high risk of severe hemorrhage. The methodology is implemented in the R package misaem.},
Keywords = {incomplete data; observed likelihood; variable selection; major trauma; public health},
Owner = {imke},
Timestamp = {2019.03.31},
Topics = {ml; regression}
}
@Article{joenssen_bankhofer_JTACS2012,
Title = {Donor limited hot deck imputation: effect on parameter estimation},
Author = {Joenssen, D. W. and Bankhofer, U.},
Journal = {Journal of Theoretical and Applied Computer Science},
Year = {2012},
Number = {3},
Pages = {58-70},
Volume = {6},
Abstract = {Methods for dealing with missing data in the context of large surveys or data mining projects are limited by the computational complexity that they may exhibit. Hot deck imputation methods are computationally simple, yet effective for creating complete data sets from which correct inferences may be drawn. All hot deck methods draw values for the imputation of missing values from the data matrix that will later be analyzed. The object, from which these available values are taken for imputation within another, is called the donor. This duplication of values may lead to the problem that using any donor ``too often'' will induce incorrect estimates. To mitigate this dilemma some hot deck methods limit the amount of times any one donor may be selected. This study answers which conditions influence whether or not any such limitation is sensible for six different hot deck methods. In addition, five factors that influence the strength of any such advantage are identified and possibilities for further research are discussed.},
Keywords = {hot deck imputation; missing data; non-response; imputation; simulation},
ISSN = {2299-2634},
Url = {http://www.jtacs.org/archive/2012/3/6},
Owner = {aimbert},
Timestamp = {2017.02.21},
Topics = {imputation; hot-deck}
}
@Article{jones_JASA1996,
Title = {Indicator and Stratification Methods for Missing Explanatory Variables in Multiple Linear Regression},
Author = {Jones, Michael P.},
Journal = {Journal of the American Statistical Association},
Volume = {91},
Number = {433},
Pages = {222-230},
Year = {1996},
Publisher = {Taylor \& Francis},
Doi = {10.1080/01621459.1996.10476680},
Abstract = {The statistical literature and folklore contain many methods for handling missing explanatory variable data in multiple linear regression. One such approach is to incorporate into the regression model an indicator variable for whether an explanatory variable is observed. Another approach is to stratify the model based on the range of values for an explanatory variable, with a separate stratum for those individuals in which the explanatory variable is missing. For a least squares regression analysis using either of these two missing-data approaches, the exact biases of the estimators for the regression coefficients and the residual variance are derived and reported. The complete-case analysis, in which individuals with any missing data are omitted, is also investigated theoretically and is found to be free of bias in many situations, though often wasteful of information. A numerical evaluation of the bias of two missing-indicator methods and the complete-case analysis is reported. The missing-indicator methods show unacceptably large biases in practical situations and are not advisable in general.},
Keywords = {incomplete data; regression; stratification; missing-indicator methods},
Owner = {imke},
Timestamp = {2019.02.04},
Topics = {ml; regression}
}
@Article{josse_etal_JC2012,
Title = {Handling missing values with regularized iterative multiple correspondence analysis},
Author = {Josse, Julie and Chavent, Marie and Liquet, Beno\^{i}t and Husson, Fran\c{c}ois},
Journal = {Journal of Classification},
Year = {2012},
Number = {1},
Pages = {91-116},
Volume = {29},
Abstract = {A common approach to deal with missing values in multivariate exploratory data analysis consists in minimizing the loss function over all non-missing elements, which can be achieved by EM-type algorithms where an iterative imputation of the missing values is performed during the estimation of the axes and components. This paper proposes such an algorithm, named iterative multiple correspondence analysis, to handle missing values in multiple correspondence analysis (MCA). The algorithm, based on an iterative PCA algorithm, is described and its properties are studied. We point out the overfitting problem and propose a regularized version of the algorithm to overcome this major issue. Finally, performances of the regularized iterative MCA algorithm (implemented in the R-package named missMDA) are assessed from both simulations and a real dataset. Results are promising with respect to other methods such as the missing-data passive modified margin method, an adaptation of the missing passive method used in Gifi’s Homogeneity analysis framework.},
Doi = {10.1007/s00357-012-9097-0},
Keywords = {multiple correspondence analysis; categorical data; missing values; imputation; regularization},
Owner = {alyssa},
Timestamp = {2017.02.22},
Topics = {factorial data analysis; imputation}
}
@Article{josse_husson_JSS2016,
Title = {{missMDA}: a package for handling missing values in multivariate data analysis},
Author = {Josse, J. and Husson, F.},
Journal = {Journal of Statistical Software},
Year = {2016},
Number = {1},
Pages = {1-31},
Volume = {70},
Doi = {10.18637/jss.v070.i01},
Owner = {nathalie},
Timestamp = {2016.10.17},
Topics = {multiple imputation; factorial data analysis}
}
@Article{josse_husson_JSFdS2012,
Title = {Handling missing values in exploratory multivariate data analysis methods},
Author = {Josse, J. and Husson, F.},
Journal = {Journal de la Soci\'et\'e Fran\c{c}aise de Statistique},
Year = {2012},
Number = {2},
Pages = {79-99},
Volume = {153},
Abstract = {This paper is a written version of the talk Julie Josse delivered at the 44 Journ{\'{e}}es de Statistique (Bruxelles, 2012), when being awarded the Marie-Jeanne Laurent-Duhamel prize for her Ph.D. dissertation by the French Statistical Society. It proposes an overview of some results, proposed in Julie Josse and Fran{\c{c}}ois Husson's papers, as well as new challenges in the field of handling missing values in exploratory multivariate data analysis methods and especially in principal component analysis (PCA). First we describe a regularized iterative PCA algorithm to provide point estimates of the principal axes and components and to overcome the major issue of overfitting. Then, we give insight in the parameters variance using a non parametric multiple imputation procedure. Finally, we discuss the problem of the choice of the number of dimensions and we detail cross-validation approximation criteria. The proposed methodology is implemented in the R package missMDA. R{\'{e}}sum{\'{e}} : Cet article fait suite {\`{a}} la conf{\'{e}}rence de Julie Josse sur ses travaux de th{\`{e}}se lors de la r{\'{e}}ception du prix Marie-Jeanne Laurent-Duhamel, dans le cadre des 44e Journ{\'{e}}es de Statistique (Bruxelles, 2012). Il reprend les principaux r{\'{e}}sultats des papiers de Julie Josse et Fran{\c{c}}ois Husson sur la gestion des donn{\'{e}}es manquantes en analyse factorielle et d{\'{e}}crit de nouvelles avanc{\'{e}}es sur le sujet. Dans un premier temps, nous d{\'{e}}taillons un algorithme d'ACP it{\'{e}}rative r{\'{e}}gularis{\'{e}}e qui permet d'estimer les axes et composantes principales en pr{\'{e}}sence de donn{\'{e}}es manquantes et qui pallie le probl{\`{e}}me majeur du surajustement. L'estimation ponctuelle est enrichie par la construction de zone de confiance. Une m{\'{e}}thode d'imputation multiple non-param{\'{e}}trique est alors d{\'{e}}velopp{\'{e}}e pour prendre en compte l'incertitude due aux donn{\'{e}}es manquantes. Enfin, nous abordons le probl{\`{e}}me r{\'{e}}current du choix du nombre de dimensions et d{\'{e}}finissons des approximations de la validation crois{\'{e}}e de type validation crois{\'{e}}e g{\'{e}}n{\'{e}}ralis{\'{e}}e. Tous ces travaux sont mis {\`{a}} disposition de l'utilisateur gr{\^{a}}ce au package missMDA du logiciel libre R.},
ISSN = {2102-6238},
Owner = {alyssa},
Timestamp = {2016.09.27},
Topics = {multiple imputation; factorial data analysis},
Url = {http://publications-sfds.fr/ojs/index.php/J-SFdS/article/view/122/112}
}
@Article{josse_etal_JSFdS2009,
Title = {Gestion des donn\'ees manquantes en {A}nalyse en {C}omposantes {P}rincipales},
Author = {Josse, J. and Husson, F. and Pag\`es, J.},
Journal = {Journal de la Soci\'et\'e Fran\c{c}aise de Statistique},
Year = {2009},
Number = {2},
Pages = {28-51},
Volume = {150},
  Abstract = {An approach commonly used to handle missing values in Principal Component Analysis (PCA) consists in ignoring the missing values by optimizing the loss function over all non-missing elements. This can be achieved by several methods, including the use of NIPALS, weighted regression or iterative PCA. The latter is based on iterative imputation of the missing elements during the estimation of the parameters, and can be seen as a particular EM algorithm. First, we review these approaches with respect to the criterion minimization. This presentation gives a good understanding of their properties and the difficulties encountered. Then, we point out the problem of overfitting and we show how the probabilistic formulation of PCA (Tipping {\&} Bishop, 1997) offers a proper and convenient regularization term to overcome this problem. Finally, the performances of the new algorithm are compared to those of the other algorithms from simulations.},
Keywords = {ACP; ACP probabiliste; ACP-GEM; algorithme EM; donn\'ees manquantes; moindres carr\'es altern\'es pond\'er\'es; surajustement},
Owner = {alyssa},
Timestamp = {2016.11.30},
Topics = {factorial data analysis; imputation},
Url = {http://journal-sfds.fr/ojs/index.php/J-SFdS/article/view/33/27}
}
@Article{josse_etal_ADAC2011,
Title = {Multiple imputation in principal component analysis},
Author = {Josse, J. and Pag\`es, J. and Husson, F.},
Journal = {Advances in Data Analysis and Classification},
Year = {2011},
Number = {3},
Pages = {231-246},
Volume = {5},
Abstract = {The available methods to handle missing values in principal component analysis only provide point estimates of the parameters (axes and components) and estimates of the missing values. To take into account the variability due to missing values a multiple imputation method is proposed. First a method to generate multiple imputed data sets from a principal component analysis model is defined. Then, two ways to visualize the uncertainty due to missing values onto the principal component analysis results are described. The first one consists in projecting the imputed data sets onto a reference configuration as supplementary elements to assess the stability of the individuals (respectively of the variables). The second one consists in performing a principal component analysis on each imputed data set and fitting each obtained configuration onto the reference one with Procrustes rotation. The latter strategy allows to assess the variability of the principal component analysis parameters induced by the missing values. The methodology is then evaluated from a real data set.},
Doi = {10.1007/s11634-011-0086-7},
Keywords = {Bootstrap; EM algorithm; Missing values; Multiple imputation; Principal component analysis;Procrustes rotation},
Owner = {alyssa},
Timestamp = {2016.12.20},
Topics = {multiple imputation; factorial data analysis}
}
@Article{josse_etal_2019,
Title = {On the consistency of supervised learning with missing values},
Author = {Josse, Julie and Prost, Nicolas and Scornet, Erwan and Varoquaux, Ga{\"e}l},
Journal = {arXiv preprint},
archivePrefix = {arXiv},
eprint = {1902.06931},
primaryClass = {stat.ML},
Year = {2019},
Url = {https://arxiv.org/abs/1902.06931},
Abstract = {In many application settings, the data are plagued with missing features. These hinder data analysis. An abundant literature addresses missing values in an inferential framework, where the aim is to estimate parameters and their variance from incomplete tables. Here, we consider supervised-learning settings where the objective is to best predict a target when missing values appear in both training and test sets. We analyze which missing-values strategies lead to good prediction. We show the consistency of two approaches to estimating the prediction function. The most striking one shows that the widely-used mean imputation prior to learning method is consistent when missing values are not informative. This is in contrast with inferential settings as mean imputation is known to have serious drawbacks in terms of deformation of the joint and marginal distribution of the data. That such a simple approach can be consistent has important consequences in practice. This result holds asymptotically when the learning algorithm is consistent in itself. We contribute additional analysis on decision trees as they can naturally tackle empirical risk minimization with missing values. This is due to their ability to handle the half-discrete nature of variables with missing values. After comparing theoretically and empirically different missing-values strategies in trees, we recommend using the missing incorporated in attributes method as it can handle both non-informative and informative missing values.},
Keywords = {Imputation; decision trees; expectation maximization},
Owner = {imke},
Timestamp = {2019.03.17},
Topics = {random trees; random forests}
}
@Article{kaiser_JSI2014,
Title = {Dealing with missing values in data},
Author = {Kaiser, J.},
Journal = {Journal of Systems Integration},
Year = {2014},
Number = {1},
Pages = {42-51},
Volume = {5},
  Abstract = {Many existing industrial and research data sets contain missing values due to various reasons, such as manual data entry procedures, equipment errors and incorrect measurements. Problems associated with missing values are loss of efficiency, complications in handling and analyzing the data, and bias resulting from differences between missing and complete data. An important factor in selecting an approach to missing values is the missing data mechanism. There are various strategies for dealing with missing values. Some analytical methods have their own approach to handling missing values. Data set reduction is another option. Finally, the missing values problem can be handled by missing value imputation. This paper presents simple methods for missing value imputation, such as using the most common value, the mean or median, or the closest fit, as well as methods based on data mining algorithms such as k-nearest neighbor, neural networks and association rules; it discusses their usability and illustrates issues with their applicability on examples.},
Doi = {10.20470/jsi.v5i1.178},
ISBN = {18042724},
ISSN = {18042724},
Owner = {alyssa},
Pmid = {94265105},
Timestamp = {2017.05.29},
Topics = {general}
}
@InProceedings{kallus_etal_2018,
Title = {Causal Inference with Noisy and Missing Covariates via Matrix Factorization},
Author = {Kallus, N. and Mao, X. and Udell, M.},
Booktitle = {Advances in Neural Information Processing Systems},
Year = {2018},
Editor = {-},
Abstract = {Valid causal inference in observational studies often requires controlling for confounders. However, in practice measurements of confounders may be noisy, and can lead to biased estimates of causal effects. We show that we can reduce the bias caused by measurement noise using a large number of noisy measurements of the underlying confounders. We propose the use of matrix factorization to infer the confounders from noisy covariates, a flexible and principled framework that adapts to missing values, accommodates a wide variety of data types, and can augment many causal inference methods. We bound the error for the induced average treatment effect estimator and show it is consistent in a linear regression setting, using Exponential Family Matrix Completion preprocessing. We demonstrate the effectiveness of the proposed procedure in numerical experiments with both synthetic data and real clinical data.},
Archiveprefix = {arXiv},
Url = {https://arxiv.org/abs/1806.00811},
Eprint = {1806.00811},
Owner = {imke},
Timestamp = {2018.11.12},
Topics = {causal inference}
}
@Article{kalton_kasprzyk_SM1986,
Title = {The treatment of missing survey data},
Author = {Kalton, G. and Kasprzyk, D.},
Journal = {Survey Methodology},
Year = {1986},
Number = {1},
Pages = {1-16},
Volume = {12},
Abstract = {Missing survey data occur because of total nonresponse and item nonresponse. The standard way to attempt to compensate for total nonresponse is by some form of weighting adjustment, whereas item nonresponses are handled by some form of imputation. This paper reviews methods of weighting adjustment and imputation and discusses their properties.},
  Keywords = {nonresponse; item nonresponse; weighting adjustments; imputation},
Owner = {alyssa},
Timestamp = {2017.06.07},
Topics = {imputation; survey; ipw},
Url = {http://www.statcan.gc.ca/pub/12-001-x/1986001/article/14404-eng.pdf}
}
@Article{kapelner_bleich_2014,
Title = {Prediction with missing data via Bayesian additive regression trees},
Author = {Kapelner, A. and Bleich, J.},
Journal = {Canadian Journal of Statistics},
Year = {2015},
Number = {2},
Pages = {224-239},
Volume = {43},
Abstract = {We present a method for incorporating missing data into general prediction problems which use non-parametric statistical learning. We focus on a tree‐based method, Bayesian Additive Regression Trees (BART), enhanced with “Missingness Incorporated in Attributes,” a recently proposed approach for incorporating missingness into decision trees. This procedure extends the native partitioning mechanisms found in tree‐based models and does not require imputation. Simulations on generated models and real data indicate that our procedure offers promise for both selection model and pattern‐mixture frameworks as measured by out‐of‐sample predictive accuracy. We also illustrate BART's abilities to incorporate missingness into uncertainty intervals. Our implementation is readily available in the R package bartMachine.},
Doi = {10.1002/cjs.11248},
ISSN = {0319-5724},
Keywords = {statistical learning; non-parametric statistical learning; BART; random forests; decision trees; missing data},
Owner = {imke},
Timestamp = {2018.10.30},
Url = {https://arxiv.org/abs/1306.0618v3},
Topics = {random trees; random forests; prediction; statistical learning}
}
@Book{kim_shao_2013,
Title = {Statistical Methods for Handling Incomplete Data},
Author = {Kim, J. K. and Shao, J.},
Publisher = {Chapman and Hall/CRC},
Year = {2013},
Address = {Boca Raton, FL, USA},
Abstract = {Due to recent theoretical findings and advances in statistical computing, there has been a rapid development of techniques and applications in the area of missing data analysis. Statistical Methods for Handling Incomplete Data covers the most up-to-date statistical theories and computational methods for analyzing incomplete data. The book presents thorough treatments of: (1) Statistical theories of likelihood-based inference with missing data, (2) Computational techniques and theories on imputation, (3) Methods involving propensity score weighting, nonignorable missing data, longitudinal missing data, survey sampling, and statistical matching.},
ISBN = {9781482205077},
Keywords = {imputation; EM algorithm; MC EM; latent variable model; propensity scoring; longitudinal data; survey sampling},
Owner = {imke},
Timestamp = {2018.10.26},
Topics = {general}
}
@Article{kohn_ansley_JASA1986,
Title = {Estimation, prediction, and interpolation for {ARIMA} models with missing data},
Author = {Kohn, Robert and Ansley, Craig F.},
Journal = {Journal of the American Statistical Association},
Year = {1986},
Number = {395},
Pages = {751-761},
Volume = {81},
Abstract = {We show how to define and then compute efficiently the marginal likelihood of an ARIMA model with missing observations. The computation is carried out by using the univariate version of the modified Kalman filter introduced by Ansley and Kohn (1985a), which allows a partially diffuse initial state vector. We also show how to predict and interpolate missing observations and obtain the mean squared error of the estimate.},
Doi = {10.2307/2289007},
ISSN = {01621459},
Keywords = {Kalman filters; datasets; data smoothing; modeling; missing data; interpolation; state vectors; covariance matrices; maximum likelihood estimation; time series models},
Owner = {alyssa},
Publisher = {[American Statistical Association, Taylor \& Francis, Ltd.]},
Timestamp = {2018.06.07},
Topics = {time series; imputation}
}
@Article{kowarik_templ_JSS2016,
Title = {Imputation with the {R} Package {VIM}},
Author = {Kowarik, A. and Templ, M.},
Journal = {Journal of Statistical Software},
Year = {2016},
Number = {7},
Pages = {1-16},
Volume = {74},
Abstract = {The package VIM (Templ, Alfons, Kowarik, and Prantner 2016) is developed to explore and analyze the structure of missing values in data using visualization methods, to impute these missing values with the built-in imputation methods and to verify the imputation process using visualization tools, as well as to produce high-quality graphics for publications. This article focuses on the different imputation techniques available in the package. Four different imputation methods are currently implemented in VIM, namely hot-deck imputation, k-nearest neighbor imputation, regression imputation and iterative robust model-based imputation (Templ, Kowarik, and Filzmoser 2011). All of these methods are implemented in a flexible manner with many options for customization. Furthermore in this article practical examples are provided to highlight the use of the implemented methods on real-world applications. In addition, the graphical user interface of VIM has been re-implemented from scratch resulting in the package VIMGUI (Schopfhauser, Templ, Alfons, Kowarik, and Prantner 2016) to enable users without extensive R skills to access these imputation and visualization methods.},
Doi = {10.18637/jss.v074.i07},
Owner = {nathalie},
Timestamp = {2017.05.29},
Keywords = {imputation; visualization; diagnosis},
Topics = {imputation}
}
@article{kropko_etal_2014,
Title = {Multiple Imputation for Continuous and Categorical Data: Comparing Joint Multivariate Normal and Conditional Approaches},
Author = {Kropko, J. and Goodrich, B. and Gelman, A. and Hill, J.},
Year = {2014},
Volume = {22},
Number = {4},
Journal = {Political Analysis},
Publisher = {Cambridge University Press},
Pages = {497--519},
Doi = {10.1093/pan/mpu007},
Abstract = {We consider the relative performance of two common approaches to multiple imputation (MI): joint multivariate normal (MVN) MI, in which the data are modeled as a sample from a joint MVN distribution; and conditional MI, in which each variable is modeled conditionally on all the others. In order to use the multivariate normal distribution, implementations of joint MVN MI typically assume that categories of discrete variables are probabilistically constructed from continuous values. We use simulations to examine the implications of these assumptions. For each approach, we assess (1) the accuracy of the imputed values; and (2) the accuracy of coefficients and fitted values from a model fit to completed data sets. These simulations consider continuous, binary, ordinal, and unordered-categorical variables. One set of simulations uses multivariate normal data, and one set uses data from the 2008 American National Election Studies. We implement a less restrictive approach than is typical when evaluating methods using simulations in the missing data literature: in each case, missing values are generated by carefully following the conditions necessary for missingness to be “missing at random” (MAR). We find that in these situations conditional MI is more accurate than joint MVN MI whenever the data include categorical variables.},
Keywords = {Data imputation; missing data; multivariate normal distribution; mixed data; mar},
Owner = {imke},
Timestamp = {2018.11.19},
Topics = {mi}
}
@Article{lee_etal_2018,
Title = {Optimal design when outcome values are not missing at random},
Author = {Lee, K. M. and Mitra, R. and Biedermann, S.},
Journal = {Statistica Sinica},
Year = {2018},
Number = {4},
Pages = {1821--1838},
Volume = {28},
Publisher = {Institute of Statistical Science},
Abstract = {The presence of missing values complicates statistical analyses. In design of experiments, missing values are particularly problematic when constructing optimal designs, as it is not known which values are missing at the design stage. When data are missing at random it is possible to incorporate this information into the optimality criterion that is used to find designs; Imhof, Song and Wong (2002) develop such a framework. However, when data are not missing at random this framework can lead to inefficient designs. We investigate and address the specific challenges that not missing at random values present when finding optimal designs for linear regression models. We show that the optimality criteria depend on model parameters that traditionally do not affect the design, such as regression coefficients and the residual variance. We also develop a framework that improves efficiency of designs over those found when values are missing at random.},
Doi = {10.5705/ss.202016.0526},
Keywords = {Covariance matrix; information matrix; linear regression model; missing observations; not missing at random; optimal design},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar}
}
@Article{little_JASA1995,
Title = {Modeling the drop-out mechanism in repeated-measures studies},
Author = {Little, R. J. A.},
Journal = {Journal of the American Statistical Association},
Year = {1995},
Number = {431},
Pages = {1112-1121},
Volume = {90},
Abstract = {Subjects often drop out of longitudinal studies prematurely, yielding unbalanced data with unequal numbers of measures for each subject. Modern software programs for handling unbalanced longitudinal data improve on methods that discard the incomplete cases by including all the data, but also yield biased inferences under plausible models for the drop-out process. This article discusses methods that simultaneously model the data and the drop-out process within a unified model-based framework. Models are classified into two broad classes--random-coefficient selection models and random-coefficient pattern-mixture models--depending on how the joint distribution of the data and drop-out mechanism is factored. Inference is likelihood-based, via maximum likelihood or Bayesian methods. A number of examples in the literature are placed in this framework, and possible extensions outlined. Data collection on the nature of the drop-out process is advocated to guide the choice of model. In cases where the drop-out mechanism is not well understood, sensitivity analyses are suggested to assess the effect on inferences about target quantities of alternative assumptions about the drop-out process.},
Doi = {10.2307/2291350},
Keywords = {attrition; longitudinal data; missing data; nonrandom nonresponse; selection bias},
Owner = {alyssa},
Timestamp = {2016.12.12},
Topics = {mnar}
}
@Article{little_JASA1993,
Title = {Pattern-mixture models for multivariate incomplete data},
Author = {Little, R. J. A.},
Journal = {Journal of the American Statistical Association},
Year = {1993},
Number = {421},
Pages = {125-134},
Volume = {88},
Abstract = {Consider a random sample on variables X1, ..., XV with some values of XV missing. Selection models specify the distribution of X1, ..., XV over respondents and nonrespondents to XV, and the conditional distribution that XV is missing given X1, ..., XV. In contrast, pattern-mixture models specify the conditional distribution of X1, ..., XV given that XV is observed or missing respectively and the marginal distribution of the binary indicator for whether or not XV is missing. For multivariate data with a general pattern of missing values, the literature has tended to adopt the selection-modeling approach (see for example Little and Rubin); here, pattern-mixture models are proposed for this more general problem. Pattern-mixture models are chronically underidentified; in particular for the case of univariate nonresponse mentioned above, there are no data on the distribution of XV given X1, ..., XV-1 in the stratum with XV missing. Thus the models require restrictions or prior information to identify the parameters. Complete-case restrictions tie unidentified parameters to their (identified) analogs in the stratum of complete cases. Alternative types of restriction tie unidentified parameters to parameters in other missing-value patterns or sets of such patterns. This large set of possible identifying restrictions yields a rich class of missing-data models. Unlike ignorable selection models, which generally requires iterative methods except for special missing-data patterns, some pattern-mixture models yield explicit ML estimates for general patterns. Such models are readily amenable to Bayesian methods and form a convenient basis for multiple imputation. Some previously considered noniterative estimation methods are shown to be maximum likelihood (ML) under a pattern-mixture model. For example, Buck's method for continuous data, corrected as in Beale and Little (1975), and Brown's estimators for nonrandomly missing data are ML for pattern-mixture models with particular complete-case restrictions. Available-case analyses, where the mean and variance of Xj are computed using all cases with Xj observed and the correlation (or covariance) of Xj and Xk is computed using all cases with Xj and Xk observed, are also close to ML for another pattern-mixture model. Asymptotic theory for this class of estimators is outlined.},
Doi = {10.2307/2290705},
ISSN = {01621459},
Keywords = {parametric models; statistical estimation; statistical models; missing data; covariance matrices; statistical variance; modeling; mathematical models; data models; sample mean},
Owner = {alyssa},
Publisher = {[American Statistical Association, Taylor \& Francis, Ltd.]},
Timestamp = {2017.11.14},
Topics = {mnar}
}
@Article{little_JASA1988,
Title = {A test of missing completely at random for multivariate data with missing values},
Author = {Little, R. J. A.},
Journal = {Journal of the American Statistical Association},
Year = {1988},
Number = {404},
Pages = {1198-1202},
Volume = {83},
  Doi = {10.2307/2290157},
  ISSN = {01621459},
Keywords = {datasets; missing data; covariance matrices; sampling distributions; T tests; data sampling; maximum likelihood estimation; standard error; ratio test},
Owner = {alyssa},
Pmid = {9280038},
Timestamp = {2018.05.11},
Topics = {diagnosis}
}
@Book{little_rubin_SAMD2002,
Title = {Statistical Analysis with Missing Data},
Author = {Little, R. J. A. and Rubin, D. B.},
Publisher = {Wiley},
Year = {2002},
  Edition = {Second},
Doi = {10.2307/1533221},
ISBN = {0471183865},
ISSN = {00324663},
Owner = {alyssa},
Pages = {408},
Pmid = {10403256},
Timestamp = {2016.09.27},
Topics = {general}
}
@Article{little_JASA1992,
Title = {Regression with missing {X}'s: a review},
Author = {Little, R. J. A.},
Journal = {Journal of the American Statistical Association},
Year = {1992},
Number = {420},
Pages = {1227-1237},
Volume = {87},
Abstract = {The literature of regression analysis with missing values of the independent variables is reviewed. Six classes of procedures are distinguished: complete case analysis, available case methods, least squares on imputed data, maximum likelihood, Bayesian methods, and multiple imputation. Methods are compared and illustrated when missing data are confined to one independent variable, and extensions to more general patterns are indicated. Attention is paid to the performance of methods when the missing data are not missing completely at random. Least squares methods that fill in missing X's using only data on the X's are contrasted with likelihood-based methods that use data on the X's and Y. The latter approach is preferred and provides methods for elaboration of the basic normal linear regression model. It is suggested that more widely distributed software is needed that advances beyond complete-case analysis, available-case analysis, and naive imputation methods. Bayesian simulation methods and multiple imputation are reviewed; these provide fruitful avenues for future research.},
Doi = {10.2307/2290664},
ISBN = {01621459},
ISSN = {01621459},
Keywords = {bayesian inference; imputation; incomplete data; multiple imputation},
Owner = {alyssa},
Pmid = {318},
Timestamp = {2018.06.07},
Topics = {ml}
}
@Article{louis_JRSS1982,
  Title = {Finding the observed information matrix when using the {EM} algorithm},
Author = {Louis, Thomas A.},
  Journal = {Journal of the Royal Statistical Society, Series B (Methodological)},
Number = {2},
Pages = {226--233},
Publisher = {Royal Statistical Society, Wiley},
Volume = {44},
Year = {1982},
Url = {http://www.jstor.org/stable/2345828},
Abstract = {A procedure is derived for extracting the observed information matrix when the EM algorithm is used to find maximum likelihood estimates in incomplete data problems. The technique requires computation of a complete-data gradient vector or second derivative matrix, but not those associated with the incomplete data likelihood. In addition, a method useful in speeding up the convergence of the EM algorithm is developed. Two examples are presented.},
ISSN = {00359246},
Keywords = {EM algorithm; Observed Information; Maximum Likelihood; Speeding Convergence},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {ml; em}
}
@Book{mclachlan_krishnan_2008,
Title = {The EM Algorithm and Extensions},
Author = {McLachlan, G. J. and Krishnan, T.},
Publisher = {Wiley},
Year = {2008},
  Edition = {Second},
Address = {Hoboken, NJ, USA},
Series = {Wiley series in probability and statistics},
  Abstract = {The first unified account of the theory, methodology, and applications of the EM algorithm and its extensions. Since its inception in 1977, the Expectation-Maximization (EM) algorithm has been the subject of intense scrutiny, dozens of applications, numerous extensions, and thousands of publications. The algorithm and its extensions are now standard tools applied to incomplete data problems in virtually every field in which statistical methods are used. Until now, however, no single source offered a complete and unified treatment of the subject. The EM Algorithm and Extensions describes the formulation of the EM algorithm, details its methodology, discusses its implementation, and illustrates applications in many statistical contexts. Employing numerous examples, Geoffrey McLachlan and Thriyambakam Krishnan examine applications both in evidently incomplete data situations (where data are missing, distributions are truncated, or observations are censored or grouped) and in a broad variety of situations in which incompleteness is neither natural nor evident. They point out the algorithm's shortcomings and explain how these are addressed in the various extensions. Areas of application discussed include: regression, medical imaging, categorical data analysis, finite mixture analysis, factor analysis, robust statistical modeling, variance-components estimation, survival analysis, and repeated-measures designs. For theoreticians, practitioners, and graduate students in statistics as well as researchers in the social and physical sciences, The EM Algorithm and Extensions opens the door to the tremendous potential of this remarkably versatile statistical tool.},
ISBN = {9780471201700},
Owner = {imke},
Timestamp = {2018.10.26},
Topics = {em}
}
@Article{meng_rubin_B1993,
Title = {Maximum likelihood estimation via the {ECM} algorithm: a general framework},
  Author = {Meng, X. L. and Rubin, D. B.},
Journal = {Biometrika},
Year = {1993},
Number = {2},
Pages = {267-278},
Volume = {80},
  Abstract = {Two major reasons for the popularity of the EM algorithm are that its maximum step involves only complete-data maximum likelihood estimation, which is often computationally simple, and that its convergence is stable, with each iteration increasing the likelihood. When the associated complete-data maximum likelihood estimation itself is complicated, EM is less attractive because the M-step is computationally unattractive. In many cases, however, complete-data maximum likelihood estimation is relatively simple when conditional on some function of the parameters being estimated. We introduce a class of generalized EM algorithms, which we call the ECM algorithm, for Expectation/Conditional Maximization (CM), that takes advantage of the simplicity of complete-data conditional maximum likelihood estimation by replacing a complicated M-step of EM with several computationally simpler CM-steps. We show that the ECM algorithm shares all the appealing convergence properties of EM, such as always increasing the likelihood, and present several illustrative examples.},
Doi = {10.1093/biomet/80.2.267},
ISBN = {00063444},
ISSN = {00063444},
Keywords = {Bayesian inference; conditional maximization; constrained optimization; EM algorithm; Gibbs sampler; incomplete data; iterated conditional modes; iterative proportional fitting; missing data},
Owner = {alyssa},
Timestamp = {2017.08.31},
Topics = {ml}
}
@Article{meng_rubin_JASA1991,
Title = {Using {EM} to obtain asymptotic variance-covariance matrices: the {SEM} algorithm},
Author = {Meng, X. L. and Rubin, D. B.},
Journal = {Journal of the American Statistical Association},
Year = {1991},
Number = {416},
Pages = {899-909},
Volume = {86},
  Abstract = {The expectation maximization (EM) algorithm is a popular, and often remarkably simple, method for maximum likelihood estimation in incomplete-data problems. One criticism of EM in practice is that asymptotic variance-covariance matrices for parameters (e.g., standard errors) are not automatic byproducts, as they are when using some other methods, such as Newton-Raphson. In this article we define and illustrate a procedure that obtains numerically stable asymptotic variance-covariance matrices using only the code for computing the complete-data variance-covariance matrix, the code for EM itself, and code for standard matrix operations. The basic idea is to use the fact that the rate of convergence of EM is governed by the fractions of missing information to find the increased variability due to missing information to add to the complete-data variance-covariance matrix. We call this supplemented EM algorithm the SEM algorithm. Theory and particular examples reinforce the conclusion that the SEM algorithm can be a practically important supplement to EM in many problems. SEM is especially useful in multiparameter problems where only a subset of the parameters are affected by missing information and in parallel computing environments. SEM can also be used as a tool for monitoring whether EM has converged to a (local) maximum.},
Doi = {10.1080/01621459.1991.10475130},
ISBN = {01621459},
ISSN = {1537274X},
Keywords = {Bayesian inference; convergence rate; EM algorithm; incomplete data; maximum likelihood estimation; observed information},
Owner = {alyssa},
Pmid = {298},
Timestamp = {2017.10.25},
Topics = {ml}
}
@Article{meng_SAP2012,
  Title = {You want me to analyze data I don't have? Are you insane?},
Author = {Meng, X. L.},
Journal = {Shanghai Archives of Psychiatry},
Year = {2012},
Number = {5},
  Pages = {297-301},
Volume = {24},
Abstract = {Eighteen years ago, Professor Xinming Tu (one of the journal's biostatistical editors) and I were coauthors of a paper that involved missing data in chemometrics. One of the reviewer's comments included the following:
The statement, ‘The naive approach of ignoring the missing data and using only the observed portion could provide very misleading conclusions’ is nonsense to me (and I think the authors should also recognize it as nonsense in the real world). Similarly, what does it mean, ‘When analyzing such missing data, ...’; if the data are missing, you can't analyze them.
If you find nothing nonsensical in this reviewer's comments, then the current article is worth a few minutes of your time. Statistical analysis has the same inductive nature as detective work: inferring unknowns from whatever one knows and observes, including the evidence that something is missing. Few qualified detectives would ignore suspicious absences in drawing their overall conclusions. Similarly, understanding the complications and consequences of having missing data is essential to reaching statistically meaningful and scientifically defensible conclusions.},
Doi = {10.3969/j.issn.1002-0829.2012.05.011},
Url = {http://stat.harvard.edu/XLM/ShanghaiArchivesofPsychiatry/ShanghaiArchivesofPsychiatry_v24_n5_2012_pp297-301.pdf},
Keywords = {missing data; incomplete data},
Owner = {imke},
Timestamp = {2019.01.09},
Topics = {general_informal}
}
@Article{miao_tchetgen_2018,
Title = {Identification and inference with nonignorable missing covariate data},
Author = {Miao, W. and Tchetgen Tchetgen, E. J.},
Journal = {Statistica Sinica},
Year = {2018},
Number = {4},
Pages = {2049--2067},
Volume = {28},
Publisher = {Institute of Statistical Science},
Abstract = {We study identification of parametric and semiparametric models with missing covariate data. When covariate data are missing not at random, identification is not guaranteed even under fairly restrictive parametric assumptions, a fact that is illustrated with several examples. We propose a general approach to establish identification of parametric and semiparametric models when a covariate is missing not at random. Without auxiliary information about the missingness process, identification of parametric models is strongly dependent on model specification. However, in the presence of a fully observed shadow variable that is correlated with the missing covariate but otherwise independent of the missingness conditional on the covariate, identification is more broadly achievable, including in fairly large semiparametric models. Special consideration is given to the generalized linear models with the missingness process unrestricted. Under such a setting, the outcome model is identified for a number of familiar generalized linear models, and we provide counterexamples when identification fails. For estimation, we describe an inverse probability weighted estimator that incorporates the shadow variable to estimate the propensity score model, and we evaluate its performance via simulations. We further illustrate the shadow variable approach with a data example about home prices in China.},
Doi = {10.5705/ss.202016.0322},
Keywords = {Identification; missing covariate data; missing not at random; shadow variable},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar}
}
@Article{moeur_stage_FS1995,
  Title = {Most similar neighbor: an improved sampling inference procedure for natural resource planning},
Author = {Moeur, M. and Stage, A. R.},
Journal = {Forest Science},
Year = {1995},
  Number = {2},
  Pages = {337-359},
  Volume = {41},
Abstract = {To model ecosystem functioning for landscape design, analysts would like detailed data about each parcel of land in the landscape. Usually, only information of low resolution is available for the entire area, supplemented by detailed information for a sample of the parcels. These sample data, usually obtained through two-phase sampling, provide initial values of important design elements for dynamic, often nonlinear, models of ecosystem functioning. However, to represent the contribution of the nonsampled portions of the landscape to ecosystem functioning, it would be convenient to be able to operate as if the detailed design information were available for each and every parcel in the analysis. Inference procedures to complete the design information for the unsampled parcels have usually followed the techniques of stratified or regression sampling. These procedures have been developed with regard to their efficiency for estimating population means and totals rather than for their utility to model ecosystem functioning and response to intervention. Stratified sampling or regression estimates therefore do not retain the complex relationships between multivariate design attributes. We present a new multivariate inference procedure for use in such circumstances. In place of estimating design attributes element-by-element in a traditional sense for each first-phase observation, the procedure simply chooses the most similar parcel from the set of parcels with detailed examinations to act as its stand-in. The stand-in is chosen on the basis of a similarity measure that summarizes the multivariate relationships between the set of low resolution indicator attributes and the set of detailed design attributes derived from the second-phase sample. Canonical correlation analysis is used to derive a similarity function for this procedure, which we call "Most Similar Neighbor Inference." We compared most similar neighbor estimates for a multivariate forest inventory to estimates from regression, stratified sampling, and a Swedish National Forest Survey method. The indicator attributes were recorded from stand records, maps, and aerial photographs, while the design attributes were stand yield characteristics derived from on-the-ground inventories. The most similar neighbor estimates have prediction errors that are comparable in magnitude to the traditional estimates for easy-to-predict design attributes. Thus, most similar neighbor inference should be expected to perform almost as well as regression in sampling contexts requiring estimates of population means or totals. More importantly, the most similar neighbor procedure more closely reproduces the covariance structure of the design attributes. Preserving the relationships among design attributes is a vital feature when the purpose of the modeling is to evaluate management options. Furthermore, because most similar neighbor is an exact interpolator, estimates derived from it are consistent in a finite population sense.},
Keywords = {Canonical correlation analysis; retaining sample variability; preserving covariance; data-splitting; jackknifing},
Owner = {nathalie},
Timestamp = {2017.10.12},
Doi = {10.1093/forestscience/41.2.337},
Topics = {knn}
}
@InProceedings{mohan_etal_2018,
Title = {Estimation with Incomplete Data: The Linear Case},
  Author = {Mohan, K. and Thoemmes, F. and Pearl, J.},
Booktitle = {Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence, {IJCAI-18}},
Publisher = {International Joint Conferences on Artificial Intelligence Organization},
Pages = {5082--5088},
Year = {2018},
Month = {7},
Doi = {10.24963/ijcai.2018/705},
Url = {https://doi.org/10.24963/ijcai.2018/705},
Keywords = {MNAR; estimation; graphical models},
Owner = {imke},
Timestamp = {2019.07.29},
Topics = {mnar}
}
@TechReport{mohan_pearl_2019,
Author = {Mohan, K. and Pearl, J.},
Title = {Graphical Models for Processing Missing Data},
Institution = {Department of Computer Science, University of California, Los Angeles},
Address = {CA},
Year = {2019},
Number = {R-473-L},
  Note = {Forthcoming, Journal of the American Statistical Association (JASA)},
  Abstract = {This paper reviews recent advances in missing data research using graphical models to represent multivariate dependencies. We first examine the limitations of traditional frameworks from three different perspectives: transparency, estimability and testability. We then show how procedures based on graphical models can overcome these limitations and provide meaningful performance guarantees even when data are Missing Not At Random (MNAR). In particular, we identify conditions that guarantee consistent estimation in broad categories of missing data problems, and derive procedures for implementing this estimation. Finally we derive testable implications for missing data models in both MAR (Missing At Random) and MNAR categories.},
Url = {http://ftp.cs.ucla.edu/pub/stat\_ser/r473-L.pdf},
Keywords = {Missing data; Graphical Models; Testability; Recoverability; Non-Ignorable; Missing Not At Random (MNAR)},
Owner = {imke},
Timestamp = {2019.07.29},
Topics = {mnar}
}
@Book{molenberghs_etal_HMDM2014,
Title = {Handbook of Missing Data Methodology},
  Author = {Molenberghs, G. and Fitzmaurice, G. and Kenward, M. G. and Tsiatis, A. and Verbeke, G.},
Publisher = {Chapman and Hall/CRC},
Year = {2014},
Address = {New York, NY, USA},
Series = {Chapman \& Hall/CRC Handbooks of Modern Statistical Methods},
Abstract = {Missing data affect nearly every discipline by complicating the statistical analysis of collected data. But since the 1990s, there have been important developments in the statistical methodology for handling missing data. Written by renowned statisticians in this area, Handbook of Missing Data Methodology presents many methodological advances and the latest applications of missing data methods in empirical research. Divided into six parts, the handbook begins by establishing notation and terminology. It reviews the general taxonomy of missing data mechanisms and their implications for analysis and offers a historical perspective on early methods for handling missing data. The following three parts cover various inference paradigms when data are missing, including likelihood and Bayesian methods; semi-parametric methods, with particular emphasis on inverse probability weighting; and multiple imputation methods. The next part of the book focuses on a range of approaches that assess the sensitivity of inferences to alternative, routinely non-verifiable assumptions about the missing data process. The final part discusses special topics, such as missing data in clinical trials and sample surveys as well as approaches to model diagnostics in the missing data setting. In each part, an introduction provides useful background material and an overview to set the stage for subsequent chapters. Covering both established and emerging methodologies for missing data, this book sets the scene for future research. It provides the framework for readers to delve into research and practical applications of missing data methods},
ISBN = {9781439854624},
Owner = {alyssa},
Timestamp = {2017.11.14},
Topics = {general}
}
@Book{molenberghs_kenward_2007,
Title = {Missing Data in Clinical Studies},
Author = {Molenberghs, G. and Kenward, M. G.},
Publisher = {Wiley},
Year = {2007},
Address = {Chichester, West Sussex, UK},
Abstract = {Missing Data in Clinical Studies provides a comprehensive account of the problems arising when data from clinical and related studies are incomplete, and presents the reader with approaches to effectively address them. The text provides a critique of conventional and simple methods before moving on to discuss more advanced approaches. The authors focus on practical and modeling concepts, providing an extensive set of case studies to illustrate the problems described. Provides a practical guide to the analysis of clinical trials and related studies with missing data. Examines the problems caused by missing data, enabling a complete understanding of how to overcome them. Presents conventional, simple methods to tackle these problems, before addressing more advanced approaches, including sensitivity analysis, and the MAR missingness mechanism. Illustrated throughout with real-life case studies and worked examples from clinical trials. Details the use and implementation of the necessary statistical software, primarily SAS. Missing Data in Clinical Studies has been developed through a series of courses and lectures. Its practical approach will appeal to applied statisticians and biomedical researchers, in particular those in the biopharmaceutical industry, medical and public health organisations. Graduate students of biostatistics will also find much of benefit.},
Doi = {10.1002/9780470510445},
ISBN = {9780470849811},
Owner = {imke},
Timestamp = {2018.10.26},
Topics = {general}
}
@Article{molenberghs_etal_SN1998,
Title = {Monotone missing data and pattern-mixture models},
Author = {Molenberghs, G. and Michiels, B. and Kenward, M. G. and Diggle, P. J.},
Journal = {Statistica Neerlandica},
Year = {1998},
Number = {2},
Pages = {153-161},
Volume = {52},
Abstract = {It is shown that the classical taxonomy of missing data models, namely missing completely at random, missing at random and informative missingness, which has been developed almost exclusively within a selection modelling framework, can also be applied to pattern-mixture models. In particular, intuitively appealing identifying restrictions are proposed for a pattern-mixture MAR mechanism.},
Doi = {10.1111/1467-9574.00075},
ISBN = {0039-0402},
ISSN = {0039-0402},
Keywords = {missing at random; phrases; selection model},
Owner = {alyssa},
Timestamp = {2017.11.14},
Topics = {mnar}
}
@Article{molnar_etal_CMAJ2008,
Title = {Does analysis using ``last observation carried forward'' introduce bias in dementia research?},
Author = {Molnar, F. J. and Hutton, B. and Fergusson, D.},
Journal = {Canadian Medical Association Journal},
Year = {2008},
Number = {8},
Pages = {751-753},
Volume = {179},
Abstract = {If there were a prize for the most inappropriate analytical technique in dementia research, ``last observation carried forward'' would be the runaway winner. As a society, we have spent millions of dollars on drug research in the hope of improving the care of the estimated 24.3 million people who have dementia worldwide. Researchers, patients and families have dedicated countless hours to carrying out trials to test the efficacy of drugs to treat dementia. We then take this invaluable data and, in accordance with US Food and Drug Administration regulation, subject it to last observation carried forward, a form of analysis that introduces bias.},
Doi = {10.1503/cmaj.080820},
Owner = {nathalie},
Timestamp = {2018.05.09},
Topics = {time series}
}
@Article{moritz_bartzbeielstein_RJ2017,
Title = {{imputeTS}: time series missing value imputation in {R}},
Author = {Moritz, Steffen and Bartz-Beielstein, Thomas},
Journal = {The R Journal},
Year = {2017},
Number = {1},
Pages = {207-218},
Volume = {9},
Abstract = {The imputeTS package specializes on univariate time series imputation. It offers multiple state-of-the-art imputation algorithm implementations along with plotting functions for time series missing data statistics. While imputation in general is a well-known problem and widely covered by R packages, finding packages able to fill missing values in univariate time series is more complicated. The reason for this lies in the fact that most imputation algorithms rely on inter-attribute correlations, while univariate time series imputation instead needs to employ time dependencies. This paper provides an introduction to the imputeTS package and its provided algorithms and tools. Furthermore, it gives a short overview about univariate time series imputation in R.},
ISSN = {20734859},
Owner = {alyssa},
Timestamp = {2018.06.07},
Topics = {time series; imputation},
Url = {https://journal.r-project.org/archive/2017/RJ-2017-009/index.html}
}
@Unpublished{moritz_etal_p2015,
Title = {Comparison of different methods for univariate time series imputation in {R}},
Author = {Moritz, S. and Sard\'a, A. and Bartz-Beielstein, T. and Zaefferer, M. and Stork, J.},
  Note = {Preprint arXiv:1510.03924},
Year = {2015},
Owner = {nathalie},
Timestamp = {2017.07.13},
Url = {https://arxiv.org/abs/1510.03924},
Topics = {time series; imputation}
}
@Article{murray_reiter_2016,
Author = {Murray, J. S. and Reiter, J. P.},
Title = {Multiple Imputation of Missing Categorical and Continuous Values via Bayesian Mixture Models With Local Dependence},
Journal = {Journal of the American Statistical Association},
Volume = {111},
Number = {516},
Pages = {1466-1479},
Year = {2016},
Publisher = {Taylor \& Francis},
Doi = {10.1080/01621459.2016.1174132},
Abstract = {We present a nonparametric Bayesian joint model for multivariate continuous and categorical variables, with the intention of developing a flexible engine for multiple imputation of missing values. The model fuses Dirichlet process mixtures of multinomial distributions for categorical variables with Dirichlet process mixtures of multivariate normal distributions for continuous variables. We incorporate dependence between the continuous and categorical variables by (1) modeling the means of the normal distributions as component-specific functions of the categorical variables and (2) forming distinct mixture components for the categorical and continuous data with probabilities that are linked via a hierarchical model. This structure allows the model to capture complex dependencies between the categorical and continuous data with minimal tuning by the analyst. We apply the model to impute missing values due to item nonresponse in an evaluation of the redesign of the Survey of Income and Program Participation (SIPP). The goal is to compare estimates from a field test with the new design to estimates from selected individuals from a panel collected under the old design. We show that accounting for the missing data changes some conclusions about the comparability of the distributions in the two datasets. We also perform an extensive repeated sampling simulation using similar data from complete cases in an existing SIPP panel, comparing our proposed model to a default application of multiple imputation by chained equations. Imputations based on the proposed model tend to have better repeated sampling properties than the default application of chained equations in this realistic setting. Supplementary materials for this article are available online.},
Keywords = {hierarchical mixture model; missing data; nonparametric bayes; stick-breaking process},
Owner = {imke},
Timestamp = {2018.11.12},
Topics = {multiple imputation}
}
@Book{national_research_council_2010,
Title = {The Prevention and Treatment of Missing Data in Clinical Trials},
  Author = {{National Research Council, U.S.}},
Publisher = {National Academies Press},
Year = {2010},
Address = {Washington (DC), USA},
Abstract = {Randomized clinical trials are the primary tool for evaluating new medical interventions. Randomization provides for a fair comparison between treatment and control groups, balancing out, on average, distributions of known and unknown factors among the participants. Unfortunately, these studies often lack a substantial percentage of data. This missing data reduces the benefit provided by the randomization and introduces potential biases in the comparison of the treatment groups. Missing data can arise for a variety of reasons, including the inability or unwillingness of participants to meet appointments for evaluation. And in some studies, some or all of data collection ceases when participants discontinue study treatment. Existing guidelines for the design and conduct of clinical trials, and the analysis of the resulting data, provide only limited advice on how to handle missing data. Thus, approaches to the analysis of data with an appreciable amount of missing values tend to be ad hoc and variable. The Prevention and Treatment of Missing Data in Clinical Trials concludes that a more principled approach to design and analysis in the presence of missing data is both needed and possible. Such an approach needs to focus on two critical elements: (1) careful design and conduct to limit the amount and impact of missing data and (2) analysis that makes full use of information on all randomized participants and is based on careful attention to the assumptions about the nature of the missing data underlying estimates of treatment effects. In addition to the highest priority recommendations, the book offers more detailed recommendations on the conduct of clinical trials and techniques for analysis of trial data.},
Doi = {10.17226/12955},
ISBN = {9780309158145},
Owner = {imke},
Timestamp = {2018.10.26},
Topics = {general_informal}
}
@InProceedings{nowicki_etal_2016,
Title = {Novel rough neural network for classification with missing data},
Author = {Nowicki, R. K. and Scherer, R. and Rutkowski, L.},
Booktitle = {21st International Conference on Methods and Models in Automation and Robotics (MMAR)},
Pages = {820--825},
Year = {2016},
Publisher = {IEEE},
  Eventdate = {2016-08-29/2016-09-01},
Editor = {-},
Abstract = {The paper presents a new feedforward neural network architecture. Thanks to incorporating the rough set theory, the new network is able to process imperfect input data, i.e. in the form of intervals or with missing values. The paper focuses on the last case. In contrast to imputation, marginalisation and similar solutions, the proposed architecture is able to give an imprecise answer as the result of input data imperfection. In the extreme case, the answer can be indefinite contrary to a confabulation specific for the aforementioned methods. The results of experiments performed on three classification benchmark datasets for every possible combination of missing attribute values showed the proposed solution works well with missing data with accuracy dependent on the level of missing data.},
Doi = {10.1109/MMAR.2016.7575243},
Keywords = {Classification; rough set theory; rough neural network; imperfect input data; feedforward neural network architecture; missing data},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {classification; deep learning; neural network}
}
@Book{okelly_ratitch_2014,
Title = {Clinical Trials with Missing Data: A Guide for Practitioners},
Author = {O'Kelly, M. and Ratitch, B.},
Year = {2014},
Publisher = {John Wiley \& Sons, Ltd},
Doi = {10.1002/9781118762516},
Abstract = {This book provides practical guidance for statisticians, clinicians, and researchers involved in clinical trials in the biopharmaceutical industry, medical and public health organisations. Academics and students needing an introduction to handling missing data will also find this book invaluable.},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {general}
}
@InProceedings{orchard_woodbury_1972,
Title = {A missing information principle: theory and applications},
  Author = {Orchard, Terence and Woodbury, Max A.},
  Booktitle = {Proceedings of the Sixth Berkeley Symposium on Mathematical Statistics and Probability, Volume 1: Theory of Statistics},
Year = {1972},
Volume = {1},
Pages = {697--715},
  Editor = {Le Cam, L. M. and Neyman, J. and Scott, E. L.},
Publisher = {University of California Press},
Abstract = {The problem that a relatively simple analysis is changed into a complex one just because some of the information is missing, is one which faces most practicing statisticians at some point in their career. Obviously the best way to treat missing information problems is not to have them. Unfortunately circumstances arise in which information is missing and nothing can be done to replace it for one reason or another.},
Keywords = {maximum likelihood estimation; analysis of variance; distribution functions; random variables; factor analysis; covariance; regression analysis; discrete distribution; discriminant analysis; factorial design; probability; statistical inference},
Url = {https://apps.dtic.mil/dtic/tr/fulltext/u2/1022173.pdf},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {general}
}
@Article{peugh_enders_RER2004,
Title = {Missing data in educational research: a review of reporting practices and suggestions for improvement},
Author = {Peugh, J. L. and Enders, C. K.},
Journal = {Review of Educational Research},
Year = {2004},
Number = {4},
Pages = {525--556},
Volume = {74},
Abstract = {Missing data analyses have received considerable recent attention in the methodological literature, and two “modern” methods, multiple imputation and maximum likelihood estimation, are recommended. The goals of this article are to (a) provide an overview of missing-data theory, maximum likelihood estimation, and multiple imputation; (b) conduct a methodological review of missing-data reporting practices in 23 applied research journals; and (c) provide a demonstration of multiple imputation and maximum likelihood estimation using the Longitudinal Study of American Youth data. The results indicated that explicit discussions of missing data increased substantially between 1999 and 2003, but the use of maximum likelihood estimation or multiple imputation was rare; the studies relied almost exclusively on listwise and pairwise deletion.},
Doi = {10.3102/00346543074004525},
Owner = {alyssa},
Timestamp = {2018.07.12},
Topics = {general_informal},
Url = {http://dx.doi.org/10.3102/00346543074004525}
}
@Article{pigott_ERE2001,
Title = {A review of methods for missing data},
Author = {Pigott, T. D.},
Journal = {Educational Research and Evaluation},
Year = {2001},
Number = {4},
Pages = {353--383},
Volume = {7},
Abstract = {This paper reviews methods for handling missing data in a research study. Many researchers use ad hoc methods such as complete case analysis, available case analysis (pairwise deletion), or single-value imputation. Though these methods are easily implemented, they require assumptions about the data that rarely hold in practice. Model-based methods such as maximum likelihood using the EM algorithm and multiple imputation hold more promise for dealing with difficulties caused by missing data. While model-based methods require specialized computer programs and assumptions about the nature of the missing data, these methods are appropriate for a wider range of situations than the more commonly used ad hoc methods. The paper provides an illustration of the methods using data from an intervention study designed to increase students' ability to control their asthma symptoms.},
Doi = {10.1076/edre.7.4.353.8937},
Owner = {alyssa},
Timestamp = {2017.02.21},
Topics = {general}
}
@article{preisser_etal_SIM2002,
Title = {Performance of weighted estimating equations for longitudinal binary data with drop-outs missing at random},
Author = {Preisser, John S. and Lohman, Kurt K. and Rathouz, Paul J.},
Journal = {Statistics in Medicine},
Volume = {21},
Number = {20},
Pages = {3035--3054},
Year = {2002},
Publisher = {John Wiley \& Sons, Ltd.},
Abstract = {The generalized estimating equations (GEE) approach is commonly used to model incomplete longitudinal binary data. When drop-outs are missing at random through dependence on observed responses (MAR), GEE may give biased parameter estimates in the model for the marginal means. A weighted estimating equations approach gives consistent estimation under MAR when the drop-out mechanism is correctly specified. In this approach, observations or person-visits are weighted inversely proportional to their probability of being observed. Using a simulation study, we compare the performance of unweighted and weighted GEE in models for time-specific means of a repeated binary response with MAR drop-outs. Weighted GEE resulted in smaller finite sample bias than GEE. However, when the drop-out model was misspecified, weighted GEE sometimes performed worse than GEE. Weighted GEE with observation-level weights gave more efficient estimates than a weighted GEE procedure with cluster-level weights.},
Keywords = {correlated data; drop-outs; estimating equations; logistic models; repeated measures},
Doi = {10.1002/sim.1241},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {survey}
}
@Article{rahman_islam_KBS2013,
Title = {Missing value imputation using decision trees and decision forests by splitting and merging records: Two novel techniques},
Author = {Rahman, Geaur and Islam, Zahidul},
Journal = {Knowledge-Based Systems},
Volume = {53},
Pages = {51--65},
Year = {2013},
Publisher = {Elsevier},
Doi = {10.1016/j.knosys.2013.08.023},
Url = {http://www.sciencedirect.com/science/article/pii/S0950705113002591},
Abstract = {We present two novel techniques for the imputation of both categorical and numerical missing values. The techniques use decision trees and forests to identify horizontal segments of a data set where the records belonging to a segment have higher similarity and attribute correlations. Using the similarity and correlations, missing values are then imputed. To achieve a higher quality of imputation some segments are merged together using a novel approach. We use nine publicly available data sets to experimentally compare our techniques with a few existing ones in terms of four commonly used evaluation criteria. The experimental results indicate a clear superiority of our techniques based on statistical analyses such as confidence interval.},
Keywords = {Data pre-processing; Data cleansing; Missing value imputation; Decision tree algorithm; Decision forest algorithm; EM algorithm},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {decision trees}
}
@Article{rao_shao_B1992,
Title = {Jackknife variance estimation with survey data under hot deck imputation},
Author = {Rao, J. N. K. and Shao, J.},
Journal = {Biometrika},
Year = {1992},
Number = {4},
Pages = {811-822},
Volume = {79},
Abstract = {Hot deck imputation is commonly employed for item nonresponse in sample surveys. It is also a common practice to treat the imputed values as if they are true values, and then compute the variance estimates using standard formulae. This procedure, however, could lead to serious underestimation of the true variance, when the proportion of missing values for an item is appreciable. We propose a jackknife variance estimator for stratified multistage surveys which is obtained by first adjusting the imputed values for each pseudo-replicate and then applying the standard jackknife formula. The proposed jack-knife variance estimator is shown to be consistent as the sample size increases, assuming equal response probabilities within imputation classes and using a particular hot deck imputation.},
Doi = {10.2307/2337236},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {hot-deck}
}
@Article{reilly_pepe_SM1997,
Title = {The relationship between hot-deck multiple imputation and weighted likelihood},
Author = {Reilly, M. and Pepe, M.},
Journal = {Statistics in Medicine},
Year = {1997},
Number = {1-3},
Pages = {5-19},
Volume = {16},
Abstract = {Hot-deck imputation is an intuitively simple and popular method of accommodating incomplete data. Users of the method will often use the usual multiple imputation variance estimator which is not appropriate in this case. However, no variance expression has yet been derived for this easily implemented method applied to missing covariates in regression models. The simple hot-deck method is in fact asymptotically equivalent to the mean-score method for the estimation of a regression model parameter, so that hot-deck can be understood in the context of likelihood methods. Both of these methods accommodate data where missingness may depend on the observed variables but not on the unobserved value of the incomplete covariate, that is, missing at random (MAR). The asymptotic properties of hot-deck are derived here for the case where the fully observed variables are categorical, though the incomplete covariate(s) may be continuous. Simulation studies indicate that the two methods compare well in small samples and for small numbers of imputations. Current users of hot-deck may now conduct their analysis using mean-score, which is a weighted likelihood method and can thus be implemented by a single pass through the data using any standard package which accommodates weighted regression models. Valid inference is now straightforward using the variance expression provided here. The equivalence of mean-score and hot-deck is illustrated using three clinical data sets where an important covariate is missing for a large number of study subjects.},
Doi = {10.1002/(SICI)1097-0258(19970115)16:1%3C5::AID-SIM469%3E3.0.CO;2-8},
Owner = {nathalie},
Timestamp = {2018.04.16},
Topics = {hot-deck}
}
@Article{reiter_sadinle_2017,
Author = {Reiter, Jerome P. and Sadinle, Mauricio},
Title = {Itemwise conditionally independent nonresponse modelling for incomplete multivariate data},
Journal = {Biometrika},
Volume = {104},
Number = {1},
Pages = {207-220},
Year = {2017},
Month = {01},
Abstract = {We introduce a nonresponse mechanism for multivariate missing data in which each study variable and its nonresponse indicator are conditionally independent given the remaining variables and their nonresponse indicators. This is a nonignorable missingness mechanism, in that nonresponse for any item can depend on values of other items that are themselves missing. We show that under this itemwise conditionally independent nonresponse assumption, one can define and identify nonparametric saturated classes of joint multivariate models for the study variables and their missingness indicators. We also show how to perform sensitivity analysis with respect to violations of the conditional independence assumptions encoded by this missingness mechanism. We illustrate the proposed modelling approach with data analyses.},
Doi = {10.1093/biomet/asw063},
Url = {https://doi.org/10.1093/biomet/asw063},
eprint = {http://oup.prod.sis.lan/biomet/article-pdf/104/1/207/13066719/asw063.pdf},
Keywords = {Loglinear model; Missing not at random; Missingness mechanism; Nonignorable; Nonparametric saturated; Sensitivity analysis},
Owner = {imke},
Timestamp = {2019.03.28},
Topics = {mnar; sensitivity analysis}
}
@TechReport{rieger_etal_2010,
Title = {Random forests with missing values in the covariates},
Author = {Rieger, A. and Hothorn, T. and Strobl, C.},
Institution = {University of Munich, Department of Statistics},
Year = {2010},
Number = {79},
Abstract = {In Random Forests several trees are constructed from bootstrap- or subsamples of the original data. Random Forests have become very popular, e.g., in the fields of genetics and bioinformatics, because they can deal with high-dimensional problems including complex interaction effects. Conditional Inference Forests provide an implementation of Random Forests with unbiased variable selection. Like the original Random Forests, they employ surrogate variables to handle missing values in the predictor variables. In this paper we report the results of an extensive simulation study covering both classification and regression problems under a variety of scenarios, including different missing value generating processes as well as different correlation structures between the variables. Moreover, a high dimensional setting with a high number of noise variables was considered in each case. The results compare the performance of Conditional Inference Forests with surrogate variables to that of knn imputation prior to fitting. The results show that while in some settings one or the other approach is slightly superior, there is no overall difference in the performance of Conditional Inference Forests with surrogate variables and with prior knn-imputation.},
Keywords = {surrogate variables; knn; imputation; missing at random; MAR},
Url = {https://epub.ub.uni-muenchen.de/11481/1/techreport.pdf},
Owner = {imke},
Timestamp = {2018.10.30},
Topics = {knn; random forests}
}
@Article{robins_etal_JASA1994,
Title = {Estimation of Regression Coefficients When Some Regressors are not Always Observed},
Author = {Robins, J. M. and Rotnitzky, A. and Zhao, L. P.},
Journal = {Journal of the American Statistical Association},
Year = {1994},
Number = {427},
Pages = {846-866},
Volume = {89},
Abstract = {In applied problems it is common to specify a model for the conditional mean of a response given a set of regressors. A subset of the regressors may be missing for some study subjects either by design or happenstance. In this article we propose a new class of semiparametric estimators, based on inverse probability weighted estimating equations, that are consistent for parameter vector α0 of the conditional mean model when the data are missing at random in the sense of Rubin and the missingness probabilities are either known or can be parametrically modeled. We show that the asymptotic variance of the optimal estimator in our class attains the semiparametric variance bound for the model by first showing that our estimation problem is a special case of the general problem of parameter estimation in an arbitrary semiparametric model in which the data are missing at random and the probability of observing complete data is bounded away from 0, and then deriving a representation for the efficient score, the semiparametric variance bound, and the influence function of any regular, asymptotically linear estimator in this more general estimation problem. Because the optimal estimator depends on the unknown probability law generating the data, we propose locally and globally adaptive semiparametric efficient estimators. We compare estimators in our class with previously proposed estimators. We show that each previous estimator is asymptotically equivalent to some, usually inefficient, estimator in our class. This equivalence is a consequence of a proposition stating that every regular asymptotic linear estimator of α0 is asymptotically equivalent to some estimator in our class. We compare various estimators in a small simulation study and offer some practical recommendations.},
Keywords = {Cox proportional hazards model; Linear regression; Logistic regression; Measurement error; Missing covariates; Missing data; Nonlinear regression; Semiparametric efficiency; Survey sampling; Two-stage case-control studies; Validation study},
Doi = {10.1080/01621459.1994.10476818},
Owner = {imke},
Publisher = {American Statistical Association, Taylor \& Francis},
Timestamp = {2018.12.19},
Topics = {ipw}
}
@Article{robins_etal_JASA1995,
Title = {Analysis of semiparametric regression models for repeated outcomes in the presence of missing data},
Author = {Robins, J. M. and Rotnitzky, A. and Zhao, L. P.},
Journal = {Journal of the American Statistical Association},
Year = {1995},
Number = {429},
Pages = {106-121},
Volume = {90},
Abstract = {We propose a class of inverse probability of censoring weighted estimators for the parameters of models for the dependence of the mean of a vector of correlated response variables on a vector of explanatory variables in the presence of missing response data. The proposed estimators do not require full specification of the likelihood. They can be viewed as an extension of generalized estimating equations estimators that allow for the data to be missing at random but not missing completely at random. These estimators can be used to correct for dependent censoring and nonrandom noncompliance in randomized clinical trials studying the effect of a treatment on the evolution over time of the mean of a response variable. The likelihood-based parametric G-computation algorithm estimator may also be used to attempt to correct for dependent censoring and nonrandom noncompliance. But because of possible model misspecification, the parametric G-computation algorithm estimator, in contrast with the proposed weighted estimators, may be inconsistent for the difference in treatment-arm-specific means, even when compliance is completely at random and censoring is independent. We illustrate our methods with the analysis of the effect of zidovudine (AZT) treatment on the evolution of mean CD4 count with data from an AIDS clinical trial.},
Doi = {10.2307/2291134},
ISSN = {01621459},
Owner = {alyssa},
Publisher = {American Statistical Association, Taylor \& Francis},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Article{robins_wang_B2000,
Title = {Inference for imputation estimators},
Author = {Robins, J. M. and Wang, N.},
Journal = {Biometrika},
Year = {2000},
Number = {1},
Pages = {113-124},
Volume = {87},
Abstract = {We derive an estimator of the asymptotic variance of both single and multiple imputation estimators. We assume a parametric imputation model but allow for non- and semiparametric analysis models. Our variance estimator, in contrast to the estimator proposed by Rubin (1987), is consistent even when the imputation and analysis models are misspecified and incompatible with one another.},
Keywords = {estimators; statistical variance; missing data; parametric models; data imputation; consistent estimators; modeling; datasets; analytical estimating; estimation bias},
Owner = {nathalie},
Timestamp = {2018.04.16},
Topics = {multiple imputation},
Url = {https://www.jstor.org/stable/2673565}
}
@Article{rosseel_JSS2012,
Title = {{lavaan}: an {R} package for structural equation modeling},
Author = {Rosseel, Y.},
Journal = {Journal of Statistical Software},
Year = {2012},
Number = {2},
Volume = {48},
Abstract = {Structural equation modeling (SEM) is a vast field and widely used by many applied researchers in the social and behavioral sciences. Over the years, many software packages for structural equation modeling have been developed, both free and commercial. However, perhaps the best state-of-the-art software packages in this field are still closed-source and/or commercial. The R package lavaan has been developed to provide applied researchers, teachers, and statisticians, a free, fully open-source, but commercial-quality package for latent variable modeling. This paper explains the aims behind the development of the package, gives an overview of its most important features, and provides some examples to illustrate how lavaan works in practice.},
Doi = {10.18637/jss.v048.i02},
Owner = {nathalie},
Timestamp = {2018.05.15},
Topics = {ml}
}
@Article{rotnitzky_etal_JASA1998,
Title = {Semiparametric regression for repeated outcomes with nonignorable nonresponse},
Author = {Rotnitzky, A. and Robins, J. M. and Scharfstein, D. O.},
Journal = {Journal of the American Statistical Association},
Year = {1998},
Number = {444},
Pages = {1321-1339},
Volume = {93},
Abstract = {We consider inference about the parameter beta* indexing the conditional mean of a vector of correlated outcomes given a vector of explanatory variables when some of the outcomes are missing in a subsample of the study and the probability of response depends on both observed and unobserved data values; that is, nonresponse is nonignorable. We propose a class of augmented inverse probability of response weighted estimators that are consistent and asymptotically normal (CAN) for estimating beta* when the response probabilities can be parametrically modeled and a CAN estimator exists. The proposed estimators do not require full specification of a parametric likelihood, and their computation does not require numerical integration. Our estimators can be viewed as an extension of generalized estimating equation estimators that allows for nonignorable nonresponse. We show that our class essentially consists of all CAN estimators of beta*. We also show that the asymptotic variance of the optimal estimator in our class attains the semiparametric variance bound for the model. When the model for nonresponse is richly parameterized, joint estimation of the regression parameter beta* and the nonresponse model parameter tau* which encodes the magnitude of nonignorable selection bias, may be difficult or impossible. Therefore we propose regarding the selection bias parameter tau* as known, rather than estimating it from the data. We then perform a sensitivity analysis that examines how inference concerning the regression parameter beta* changes as we vary tau* over a range of plausible values. We apply our approach to the analysis of ACTG Trial 002, an AIDS clinical trial.},
Doi = {10.2307/2670049},
ISSN = {01621459},
Keywords = {curse of dimensionality; estimating equations; identification; missing data; semiparametric efficiency; sensitivity analysis},
Owner = {alyssa},
Publisher = {American Statistical Association, Taylor \& Francis},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Article{rubin_JASA2012,
Title = {Multiple imputation after 18+ years},
Author = {Rubin, D. B.},
Journal = {Journal of the American Statistical Association},
Year = {1996},
Number = {434},
Pages = {473-489},
Volume = {91},
Abstract = {Multiple imputation was designed to handle the problem of missing data in public-use data bases where the data-base constructor and the ultimate user are distinct entities. The objective is valid frequency inference for ultimate users who in general have access only to complete-data software and possess limited knowledge of specific reasons and models for nonresponse. For this situation and objective, I believe that multiple imputation by the data-base constructor is the method of choice. This article first provides a description of the assumed context and objectives, and second, reviews the multiple imputation framework and its standard results. These preliminary discussions are especially important because some recent commentaries on multiple imputation have reflected either misunderstandings of the practical objectives of multiple imputation or misunderstandings of fundamental theoretical results. Then, criticisms of multiple imputation are considered, and, finally, comparisons are made to alternative strategies},
Doi = {10.1080/01621459.1996.10476908},
ISBN = {0162-1459},
ISSN = {0162-1459},
Keywords = {confidence validity; missing data; nonresponse in surveys; public-use files; sample surveys; superefficient procedures},
Owner = {alyssa},
Timestamp = {2016.09.27},
Topics = {multiple imputation}
}
@Book{rubin_MINS1987,
Title = {Multiple Imputation for Nonresponse in Surveys},
Author = {Rubin, D. B.},
Publisher = {Wiley},
Year = {1987},
Address = {Hoboken, NJ, USA},
Abstract = {Multiple imputation is a statistical technique designed to take advantage of the flexibility in modern computing to handle missing data. With it, each missing value is replaced by two or more imputed values in order to represent the uncertainty about which value to impute. The ideas for multiple imputation first arose in the early 1970s when I was working on a problem of survey nonresponse at Educational Testing Service, here summarized as Example 1.1. This work was published several years later as Rubin (1977a). The real impetus for multiple imputation, however, came from work encouraged and supported by Fritz Scheuren, then of the United States Social Security Administration and now head of the Statistics of Income Division at the United States Internal Revenue Service. His concern for problems of nonresponse in the Current Population Survey led to a working paper for the Social Security Administration (Rubin, 1977b), which explicitly proposed multiple imputation. Fritz's continued support and encouragement for the idea of multiple imputation resulted in (1) an American Statistical Association invited address on multiple imputation (Rubin, 1978a); (2) continued research, such as published in Rubin (1979a); (3) joint work with Fritz and Thomas N. Herzog in the late 1970s, summarized in several papers including Herzog and Rubin (1983); and (4) application of the ideas in 1980 to file matching, which eventually was published as Rubin (1986). Another important contributor to the development of multiple imputation has been the United States Census Bureau, which several years ago supported the production of a monograph on multiple imputation (Rubin, 1980a). This monograph was the first of four nearly complete drafts that were supposed to become this book.},
ISBN = {9780471655740},
Owner = {alyssa},
Timestamp = {2016.09.27},
Topics = {multiple imputation}
}
@Article{rubin_JASA1977,
Title = {Formalizing subjective notions about the effect of nonrespondents in sample surveys},
Author = {Rubin, D. B.},
Journal = {Journal of the American Statistical Association},
Year = {1977},
Number = {359},
Pages = {538-543},
Volume = {72},
Abstract = {A method is given for estimating, in a subjective sense, the effect of nonresponse in sample surveys. Based on Bayesian techniques, this method produces a subjective probability interval for the statistic that would have been calculated if all nonrespondents had responded. Background information which is recorded for both respondents and nonrespondents plays an important role in sharpening the subjective interval. Real survey data of 660 schools with 188 nonrespondents indicates that the method can be useful in practical problems. The general idea can be applied to any problem with nonrespondents or missing data.},
Doi = {10.2307/2286214},
ISSN = {01621459},
Owner = {alyssa},
Publisher = {American Statistical Association, Taylor \& Francis},
Timestamp = {2016.12.12},
Topics = {survey}
}
@Article{rubin_B1976,
Title = {Inference and missing data},
Author = {Rubin, D. B.},
Journal = {Biometrika},
Year = {1976},
Number = {3},
Pages = {581-592},
Volume = {63},
Abstract = {When making sampling distribution inferences about the parameter of the data, theta, it is appropriate to ignore the process that causes missing data if the missing data are "missing at random" and the observed data are "observed at random", but these inferences are generally conditional on the observed pattern of missing data. When making direct-likelihood or Bayesian inferences about theta, it is appropriate to ignore the process that causes missing data if the missing data are missing at random and the parameter of the missing data process is "distinct" from theta. These conditions are the weakest general conditions under which ignoring the process that causes missing data always leads to correct inferences.},
Doi = {10.1093/biomet/63.3.581},
Keywords = {Bayesian inference; incomplete data; likelihood inference; missing at random; missing data; observed at random; sampling distribution inference},
Owner = {nathalie},
Timestamp = {2018.04.16},
Topics = {ml}
}
@Article{sadinle_reiter_2018,
Title = {Sequential Identification of Nonignorable Missing Data Mechanisms},
Author = {Sadinle, M. and Reiter, J. P.},
Journal = {Statistica Sinica},
Year = {2018},
Number = {4},
Pages = {1741--1759},
Volume = {28},
Publisher = {Institute of Statistical Science},
Abstract = {With nonignorable missing data, likelihood-based inference should be based on the joint distribution of the study variables and their missingness indicators. These joint models cannot be estimated from the data alone, thus requiring the analyst to impose restrictions that make the models uniquely obtainable from the distribution of the observed data. We present an approach for constructing classes of identifiable nonignorable missing data models. The main idea is to use a sequence of carefully set up identifying assumptions, whereby we specify potentially different missingness mechanisms for different blocks of variables. We show that the procedure results in models with the desirable property of being non-parametric saturated.},
Doi = {10.5705/ss.202016.0328},
Keywords = {Identification; Non-parametric saturated; Missing not at random; Partial ignorability; Sensitivity analysis},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar; sensitivity analysis}
}
@Article{santos2019,
Title = {Generating Synthetic Missing Data: A Review by Missing Mechanism},
Author = {Santos, M. S. and Pereira, R. C. and Costa, A. F. and Soares, J. P. and Santos, J. and Abreu, P. H.},
Journal = {IEEE Access},
Year = {2019},
Pages = {11651--11667},
Volume = {7}
}
@Article{schafer_SMMR1999,
Title = {Multiple imputation: a primer},
Author = {Schafer, J. L.},
Journal = {Statistical Methods in Medical Research},
Year = {1999},
Number = {1},
Pages = {3-15},
Volume = {8},
Abstract = {In recent years, multiple imputation has emerged as a convenient and flexible paradigm for analysing data with missing values. Essential features of multiple imputation are reviewed, with answers to frequently asked questions about using the method in practice.},
Doi = {10.1191/096228099671525676},
ISBN = {0962-2802 (Print)},
ISSN = {09622802},
Owner = {alyssa},
Pmid = {10347857},
Timestamp = {2016.09.27},
Topics = {multiple imputation}
}
@Book{schafer_AIMD1997,
Title = {Analysis of Incomplete Multivariate Data},
Author = {Schafer, J. L.},
Publisher = {Chapman and Hall/CRC},
Year = {1997},
Address = {Boca Raton, FL, USA},
Series = {CRC Monographs on Statistics \& Applied Probability},
Abstract = {The last two decades have seen enormous developments in statistical methods for incomplete data. The EM algorithm and its extensions, multiple imputation, and Markov Chain Monte Carlo provide a set of flexible and reliable tools for inference in large classes of missing-data problems. Yet, in practical terms, those developments have had surprisingly little impact on the way most data analysts handle missing values on a routine basis. Analysis of Incomplete Multivariate Data helps bridge the gap between theory and practice, making these missing-data tools accessible to a broad audience. It presents a unified, Bayesian approach to the analysis of incomplete multivariate data, covering datasets in which the variables are continuous, categorical, or both. The focus is applied, where necessary, to help readers thoroughly understand the statistical properties of those methods, and the behavior of the accompanying algorithms. All techniques are illustrated with real data examples, with extended discussion and practical advice. All of the algorithms described in this book have been implemented by the author for general use in the statistical languages S and S Plus. The software is available free of charge on the Internet.},
ISBN = {0412040611},
Owner = {aimbert},
Timestamp = {2017.04.11},
Topics = {general}
}
@Article{schafer_graham_PM2002,
Title = {Missing data: our view of the state of the art},
Author = {Schafer, J. L. and Graham, J. W.},
Journal = {Psychological Methods},
Year = {2002},
Number = {2},
Pages = {147-177},
Volume = {7},
Abstract = {Statistical procedures for missing data have vastly improved, yet misconception and unsound practice still abound. The authors frame the missing-data problem, review methods, offer advice, and raise issues that remain unresolved. They clear up common misunderstandings regarding the missing at random (MAR) concept. They summarize the evidence against older procedures and, with few exceptions, discourage their use. They present, in both technical and practical language, 2 general approaches that come highly recommended: maximum likelihood (ML) and Bayesian multiple imputation (MI). Newer developments are discussed, including some for dealing with missing data that are not MAR. Although not yet in the mainstream, these procedures may eventually extend the ML and MI methods that currently represent the state of the art.},
Doi = {10.1037/1082-989X.7.2.147},
Owner = {alyssa},
Timestamp = {2017.02.21},
Topics = {general}
}
@Article{schafer_olsen_MBR1998,
Title = {Multiple Imputation for multivariate missing-data problems: a data analyst's perspective},
Author = {Schafer, J. L. and Olsen, M. K.},
Journal = {Multivariate Behavioral Research},
Year = {1998},
Number = {4},
Pages = {545-571},
Volume = {33},
Abstract = {Analyses of multivariate data are frequently hampered by missing values. Until recently, the only missing-data methods available to most data analysts have been relatively ad hoc practices such as listwise deletion. Recent dramatic advances in theoretical and computational statistics, however, have produced a new generation of flexible procedures with a sound statistical basis. These procedures involve multiple imputation (Rubin, 1987), a simulation technique that replaces each missing datum with a set of m > 1 plausible values. The m versions of the complete data are analyzed by standard complete-data methods, and the results are combined using simple rules to yield estimates, standard errors, and p-values that formally incorporate missing-data uncertainty. New computational algorithms and software described in a recent book (Schafer, 1997a) allow us to create proper multiple imputations in complex multivariate settings. This article reviews the key ideas of multiple imputation, discusses the software programs currently available, and demonstrates their use on data from the Adolescent Alcohol Prevention Trial (Hansen & Graham, 1991).},
Doi = {10.1207/s15327906mbr3304_5},
Owner = {nathalie},
Timestamp = {2018.05.14},
Topics = {multiple imputation}
}
@Article{seaman_etal_2013,
Title = {What Is Meant by "Missing at Random"?},
Author = {Seaman, S. and Galati, J. and Jackson, D. and Carlin, J.},
Journal = {Statistical Science},
Number = {2},
Pages = {257--268},
Publisher = {Institute of Mathematical Statistics},
Volume = {28},
Year = {2013},
Abstract = {The concept of missing at random is central in the literature on statistical analysis with missing data. In general, inference using incomplete data should be based not only on observed data values but should also take account of the pattern of missing values. However, it is often said that if data are missing at random, valid inference using likelihood approaches (including Bayesian) can be obtained ignoring the missingness mechanism. Unfortunately, the term "missing at random" has been used inconsistently and not always clearly; there has also been a lack of clarity around the meaning of "valid inference using likelihood". These issues have created potential for confusion about the exact conditions under which the missingness mechanism can be ignored, and perhaps fed confusion around the meaning of "analysis ignoring the missingness mechanism". Here we provide standardised precise definitions of "missing at random" and "missing completely at random", in order to promote unification of the theory. Using these definitions we clarify the conditions that suffice for "valid inference" to be obtained under a variety of inferential paradigms.},
Doi = {10.2307/43288491},
ISSN = {08834237, 21688745},
URL = {http://www.jstor.org/stable/43288491},
Owner = {imke},
Timestamp = {2018.11.17},
Keywords = {ignorability; direct-likelihood inference; frequentist inference; repeated sampling; missing completely at random},
Topics = {mar}
}
@Article{seaman_white_SMMR2011,
Title = {Review of inverse probability weighting for dealing with missing data},
Author = {Seaman, S. R. and White, I. R.},
Journal = {Statistical Methods in Medical Research},
Year = {2011},
Number = {3},
Pages = {278-295},
Volume = {22},
Abstract = {The simplest approach to dealing with missing data is to restrict the analysis to complete cases, i.e. individuals with no missing values. This can induce bias, however. Inverse probability weighting (IPW) is a commonly used method to correct this bias. It is also used to adjust for unequal sampling fractions in sample surveys. This article is a review of the use of IPW in epidemiological research. We describe how the bias in the complete-case analysis arises and how IPW can remove it. IPW is compared with multiple imputation (MI) and we explain why, despite MI generally being more efficient, IPW may sometimes be preferred. We discuss the choice of missingness model and methods such as weight truncation, weight stabilisation and augmented IPW. The use of IPW is illustrated on data from the 1958 British Birth Cohort},
Doi = {10.1177/0962280210395740},
Owner = {alyssa},
Timestamp = {2017.03.06},
Topics = {causal inference}
}
@article{seaman_vansteelandt_SS2018,
Title = {Introduction to Double Robust Methods for Incomplete Data},
Author = {Seaman, Shaun R and Vansteelandt, Stijn},
Journal = {Statistical Science},
Volume = {33},
Number = {2},
Pages = {184},
Year = {2018},
Publisher = {Europe PMC Funders},
Doi = {10.1214/18-STS647},
Abstract = {Most methods for handling incomplete data can be broadly classified as inverse probability weighting (IPW) strategies or imputation strategies. The former model the occurrence of incomplete data; the latter, the distribution of the missing variables given observed variables in each missingness pattern. Imputation strategies are typically more efficient, but they can involve extrapolation, which is difficult to diagnose and can lead to large bias. Double robust (DR) methods combine the two approaches. They are typically more efficient than IPW and more robust to model misspecification than imputation. We give a formal introduction to DR estimation of the mean of a partially observed variable, before moving to more general incomplete-data scenarios. We review strategies to improve the performance of DR estimators under model misspecification, reveal connections between DR estimators for incomplete data and ‘design-consistent’ estimators used in sample surveys, and explain the value of double robustness when using flexible data-adaptive methods for IPW or imputation.},
Keywords = {augmented inverse probability weighting; calibration estimators; data-adaptive methods; doubly robust; empirical likelihood; imputation; inverse probability weighting; missing data; semiparametric methods},
Owner = {imke},
Timestamp = {2019.03.28},
Topics = {causal inference}
}
@Article{shao_zhang_2015,
Title = {A transformation approach in linear mixed-effects models with informative missing responses},
Author = {Shao, J. and Zhang, J.},
Journal = {Biometrika},
Volume = {102},
Number = {1},
Pages = {107-119},
Year = {2015},
Doi = {10.1093/biomet/asu069},
Abstract = {We consider a linear mixed-effects model in which the response panel vector has missing components and the missing data mechanism depends on observed data as well as missing responses through unobserved random effects. Using a transformation of the data that eliminates the random effects, we derive asymptotically unbiased and normally distributed estimators of certain model parameters. Estimators of model parameters that cannot be estimated using the transformed data are also constructed, and their asymptotic unbiasedness and normality are established. Simulation results are presented to examine the finite sample performance of the proposed estimators and a real data example is discussed.},
Keywords = {nonignorable missing data; panel data; random effect dependent missingness; unbiasedness; unspecified missing data mechanism},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar; nonignorable}
}
@Article{sharpe_solly_1995,
Title = {Dealing with missing values in neural network-based diagnostic systems},
Author = {Sharpe, P. K. and Solly, R. J.},
Journal = {Neural Computing \& Applications},
Year = {1995},
Number = {2},
Pages = {73-77},
Volume = {3},
Publisher = {Springer-Verlag},
ISSN = {1433-3058},
Abstract = {Backpropagation neural networks have been applied to prediction and classification problems in many real world situations. However, a drawback of this type of neural network is that it requires a full set of input data, and real world data is seldom complete. We have investigated two ways of dealing with incomplete data — network reduction using multiple neural network classifiers, and value substitution using estimated values from predictor networks — and compared their performance with an induction method. On a thyroid disease database collected in a clinical situation, we found that the network reduction method was superior. We conclude that network reduction can be a useful method for dealing with missing values in diagnostic systems based on backpropagation neural networks.},
Doi = {10.1007/BF01421959},
Keywords = {Backpropagation; classification; decision support; neural networks; incomplete data},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {deep learning; neural networks}
}
@Article{simon_simonoff_JASA1986,
Title = {Diagnostic plots for missing data in least squares regression},
Author = {Simon, G. A. and Simonoff, J. S.},
Journal = {Journal of the American Statistical Association},
Year = {1986},
Number = {394},
Pages = {501-509},
Volume = {81},
Abstract = {The usual approach to handling missing data in a regression is to assume that the points are missing at random (MAR) and use either a fill-in method to replace the missing points or a method using maximally available pairs in the sample covariance matrix. We derive limits for the values of the least squares estimates of the coefficients (and their associated t statistics) when there are missing observations in one carrier. These limits are derived subject to a constraint on the relationship of the missing data to the present data. Calculating these limits while varying this constrained value results in a series of diagnostic plots that can be used to study the potential effect of the missing points on the regression (without assuming that the points are MAR). Simulations are performed to illustrate the use of the plots, and two real data sets are analyzed. The more general case of missing data in more than one carrier is also discussed.},
Doi = {10.1080/01621459.1986.10478296},
Keywords = {constrained optimization; missing at random; missing by unknown mechanism; regression diagnostics},
Owner = {nathalie},
Timestamp = {2018.04.16},
Topics = {diagnosis}
}
@Article{smieja_etal_2018,
Title = {Processing of missing data by neural networks},
Author = {\'Smieja, M. and Struski, \L{}. and Tabor, J. and Zieli\'nski, B. and Spurek, P.},
Journal = {Computing Research Repository},
Year = {2018},
Volume = {abs/1805.07405},
Abstract = {We propose a general, theoretically justified mechanism for processing missing data by neural networks. Our idea is to replace typical neuron response in the first hidden layer by its expected value. This approach can be applied for various types of networks at minimal cost in their modification. Moreover, in contrast to recent approaches, it does not require complete data for training. Experimental results performed on different types of architectures show that our method gives better results than typical imputation strategies and other methods dedicated for incomplete data.},
Archiveprefix = {arXiv},
Arxivid = {1805.07405},
Eprint = {1805.07405},
Url = {https://arxiv.org/abs/1805.07405},
Keywords = {missing data; neural networks; incomplete samples; probability density estimation},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {deep learning}
}
@Article{sovilj_etal_2016,
Title = {Extreme learning machine for missing data using multiple imputations},
Author = {Sovilj, D. and Eirola, E. and Miche, Y. and Bj\"ork, J. M. and Nian, R. and Akusok, A. and Lendasse, A.},
Journal = {Neurocomputing},
Year = {2016},
Number = {A},
Pages = {220-231},
Volume = {174},
Abstract = {In the paper, we examine the general regression problem under the missing data scenario. In order to provide reliable estimates for the regression function (approximation), a novel methodology based on Gaussian Mixture Model and Extreme Learning Machine is developed. Gaussian Mixture Model is used to model the data distribution which is adapted to handle missing values, while Extreme Learning Machine enables to devise a multiple imputation strategy for final estimation. With multiple imputation and ensemble approach over many Extreme Learning Machines, final estimation is improved over the mean imputation performed only once to complete the data. The proposed methodology has longer running times compared to simple methods, but the overall increase in accuracy justifies this trade-off.},
Doi = {10.1016/j.neucom.2015.03.108},
Owner = {imke},
Timestamp = {2018.11.08},
Keywords = {Extreme Learning Machine; Missing data; Multiple imputation; Gaussian mixture model; Mixture of Gaussians; Conditional distribution},
Topics = {machine learning}
}
@Article{sportisse_etal_2018,
Title = {Imputation and low-rank estimation with Missing Not At Random data},
Author = {Sportisse, Aude and Boyer, Claire and Josse, Julie},
Journal = {arXiv preprint},
archivePrefix = {arXiv},
eprint = {1812.11409},
primaryClass = {stat.ML},
Year = {2018},
Url = {https://arxiv.org/abs/1812.11409},
Abstract = {Missing values challenge data analysis because many supervised and unsupervised learning methods cannot be applied directly to incomplete data. Matrix completion based on low-rank assumptions is a very powerful solution for dealing with missing values. However, existing methods do not consider the case of informative missing values which are widely encountered in practice. This paper proposes matrix completion methods to recover Missing Not At Random (MNAR) data. Our first contribution is to suggest a model-based estimation strategy by modelling the missing mechanism distribution. An EM algorithm is then implemented, involving a Fast Iterative Soft-Thresholding Algorithm (FISTA). Our second contribution is to suggest a computationally efficient surrogate estimation by implicitly taking into account the joint distribution of the data and the missing mechanism: the data matrix is concatenated with the mask coding for the missing values; a low-rank structure for exponential family is assumed on this new matrix, in order to encode links between variables and missing mechanisms. The methodology has the great advantage of handling different missing value mechanisms and is robust to model specification errors.},
Keywords = {Informative missing values; denoising; matrix completion; accelerated proximal gradient method; EM algorithm; nuclear norm penalty},
Owner = {imke},
Timestamp = {2019.03.28},
Topics = {mnar}
}
@Article{stacklies_etal_B2007,
Title = {{pcaMethods} -- a bioconductor package providing {PCA} methods for incomplete data},
Author = {Stacklies, W. and Redestig, H. and Scholz, M. and Walther, D. and Selbig, J.},
Journal = {Bioinformatics},
Year = {2007},
Number = {9},
Pages = {1164-1167},
Volume = {23},
Abstract = {pcaMethods is a Bioconductor compliant library for computing principal component analysis (PCA) on incomplete data sets. The results can be analyzed directly or used to estimate missing values to enable the use of missing value sensitive statistical methods. The package was mainly developed with microarray and metabolite data sets in mind, but can be applied to any other incomplete data set as well.},
Doi = {10.1093/bioinformatics/btm069},
Owner = {nathalie},
Timestamp = {2018.05.09},
Topics = {imputation; factorial data analysis}
}
@Article{stage_crookston_FS2007,
Title = {Partitioning error components for accuracy-assessment of near-neighbor methods of imputation},
Author = {Stage, A. R. and Crookston, N. L.},
Journal = {Forest Science},
Year = {2007},
Number = {1},
Pages = {62-72},
Volume = {53},
Abstract = {Imputation is applied for two quite different purposes: to supply missing data to complete a data set for subsequent modeling analyses or to estimate subpopulation totals. Error properties of the imputed values have different effects in these two contexts. We partition errors of imputation derived from similar observation units as arising from three sources: observation error, the distribution of observation units with respect to their similarity, and pure error given a particular choice of variables known for all observation units. Two new statistics based on this partitioning measure the accuracy of the imputations, facilitating comparison of imputation to alternative methods of estimation such as regression and comparison of alternative methods of imputation generally. Knowing the relative magnitude of the errors arising from these partitions can also guide efficient investment in obtaining additional data. We illustrate this partitioning using three extensive data sets from western North America. Application of this partitioning to compare near-neighbor imputation is illustrated for Mahalanobis- and two canonical correlation-based measures of similarity.},
Doi = {10.1093/forestscience/53.1.62},
ISBN = {0015-749X},
ISSN = {0015749X},
Keywords = {landscape modeling; missing data; most similar neighbor; k-nn inference; diagnosis},
Owner = {alyssa},
Timestamp = {2017.11.13},
Topics = {}
}
@Article{stekhoven_buhlmann_B2012,
Title = {{MissForest} -- non-parametric missing value imputation for mixed-type data},
Author = {Stekhoven, D. J. and B\"uhlmann, P.},
Journal = {Bioinformatics},
Year = {2012},
Number = {1},
Pages = {112-118},
Volume = {28},
Abstract = {Modern data acquisition based on high-throughput technology is often facing the problem of missing data. Algorithms commonly used in the analysis of such large-scale data often depend on a complete set. Missing value imputation offers a solution to this problem. However, the majority of available imputation methods are restricted to one type of variable only: continuous or categorical. For mixed-type data, the different types are usually handled separately. Therefore, these methods ignore possible relations between variable types. We propose a non-parametric method which can cope with different types of variables simultaneously. We compare several state of the art methods for the imputation of missing values. We propose and evaluate an iterative imputation method (missForest) based on a random forest. By averaging over many unpruned classification or regression trees, random forest intrinsically constitutes a multiple imputation scheme. Using the built-in out-of-bag error estimates of random forest, we are able to estimate the imputation error without the need of a test set. Evaluation is performed on multiple datasets coming from a diverse selection of biological fields with artificially introduced missing values ranging from 10{\%} to 30{\%}. We show that missForest can successfully handle missing values, particularly in datasets including different types of variables. In our comparative study, missForest outperforms other methods of imputation especially in data settings where complex interactions and non-linear relations are suspected. The out-of-bag imputation error estimates of missForest prove to be adequate in all settings. Additionally, missForest exhibits attractive computational efficiency and can cope with high-dimensional data. The package missForest is freely available from http://stat.ethz.ch/CRAN/.},
Archiveprefix = {arXiv},
Arxivid = {1105.0828},
Doi = {10.1093/bioinformatics/btr597},
Eprint = {1105.0828},
ISBN = {1367-4811 (Electronic)$\backslash$n1367-4803 (Linking)},
ISSN = {13674803},
Owner = {alyssa},
Pmid = {22039212},
Timestamp = {2016.09.27},
Topics = {random tree}
}
@Article{strobl_etal_2007,
Title = {Unbiased split selection for classification trees based on the Gini Index},
Author = {Strobl, C. and Boulesteix, A. L. and Augustin, T.},
Journal = {Computational Statistics \& Data Analysis},
Year = {2007},
Number = {1},
Pages = {483-501},
Volume = {52},
Publisher = {Elsevier},
Abstract = {Classification trees are a popular tool in applied statistics because their heuristic search approach based on impurity reduction is easy to understand and the interpretation of the output is straightforward. However, all standard algorithms suffer from a major problem: variable selection based on standard impurity measures as the Gini Index is biased. The bias is such that, e.g., splitting variables with a high amount of missing values—even if missing completely at random (MCAR)—are artificially preferred. A new split selection criterion that avoids variable selection bias is introduced. The exact distribution of the maximally selected Gini gain is derived by means of a combinatorial approach and the resulting p-value is suggested as an unbiased split selection criterion in recursive partitioning algorithms. The efficiency of the method is demonstrated in simulation studies and a real data study from veterinary gynecology in the context of binary classification and continuous predictor variables with different numbers of missing values. The proposed method is extendible to categorical and ordinal predictor variables and to other split selection criteria such as the cross-entropy.},
Doi = {10.1016/j.csda.2006.12.030},
ISSN = {0167-9473},
Keywords = {classification trees; variable selection bias; Gini gain; missing values},
Owner = {imke},
Timestamp = {2018.10.30},
Topics = {classification trees; decision trees; random forests}
}
@Article{stuart_etal_AJE2009,
Title = {Multiple imputation with large data sets: a case study of the children's mental health initiative},
Author = {Stuart, E. A. and Azur, M. and Frangakis, C. and Leaf, P.},
Journal = {American Journal of Epidemiology},
Year = {2009},
Number = {9},
Pages = {1133-1139},
Volume = {169},
Abstract = {Multiple imputation is an effective method for dealing with missing data, and it is becoming increasingly common in many fields. However, the method is still relatively rarely used in epidemiology, perhaps in part because relatively few studies have looked at practical questions about how to implement multiple imputation in large data sets used for diverse purposes. This paper addresses this gap by focusing on the practicalities and diagnostics for multiple imputation in large data sets. It primarily discusses the method of multiple imputation by chained equations, which iterates through the data, imputing one variable at a time conditional on the others. Illustrative data were derived from 9,186 youths participating in the national evaluation of the Community Mental Health Services for Children and Their Families Program, a US federally funded program designed to develop and enhance community-based systems of care to meet the needs of children with serious emotional disturbances and their families. Multiple imputation was used to ensure that data analysis samples reflect the full population of youth participating in this program. This case study provides an illustration to assist researchers in implementing multiple imputation in their own data.},
Doi = {10.1093/aje/kwp026},
ISBN = {1476-6256 (Electronic)$\backslash$n0002-9262 (Linking)},
ISSN = {00029262},
Keywords = {mental health services; missing at random; missing data; multiple imputation},
Owner = {alyssa},
Pmid = {19318618},
Timestamp = {2017.11.08},
Topics = {multiple imputation; chained equations}
}
@Article{stubbendick_ibrahim_2003,
Title = {Maximum Likelihood Methods for Nonignorable Missing Responses and Covariates in Random Effects Models},
Author = {Stubbendick, A. L. and Ibrahim, J. G.},
Journal = {Biometrics},
Pages = {1140--1150},
Year = {2003},
Publisher = {Wiley-Blackwell},
Volume = {59},
Number = {4},
Abstract = {This article analyzes quality of life (QOL) data from an Eastern Cooperative Oncology Group (ECOG) melanoma trial that compared treatment with ganglioside vaccination to treatment with high‐dose interferon. The analysis of this data set is challenging due to several difficulties, namely, nonignorable missing longitudinal responses and baseline covariates. Hence, we propose a selection model for estimating parameters in the normal random effects model with nonignorable missing responses and covariates. Parameters are estimated via maximum likelihood using the Gibbs sampler and a Monte Carlo expectation maximization (EM) algorithm. Standard errors are calculated using the bootstrap. The method allows for nonmonotone patterns of missing data in both the response variable and the covariates. We model the missing data mechanism and the missing covariate distribution via a sequence of one‐dimensional conditional distributions, allowing the missing covariates to be either categorical or continuous, as well as time‐varying. We apply the proposed approach to the ECOG quality‐of‐life data and conduct a small simulation study evaluating the performance of the maximum likelihood estimates. Our results indicate that a patient treated with the vaccine has a higher QOL score on average at a given time point than a patient treated with high‐dose interferon.},
Keywords = {Nonignorable missing data mechanism; Gibbs sampling; Monte Carlo EM algorithm; Normal random effects model; Selection model},
Doi = {10.1111/j.0006-341X.2003.00131.x},
ISSN = {0006341X},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar; ml}
}
@Article{stubbendick_ibrahim_2006,
Title = {Likelihood-based inference with nonignorable missing responses and covariates in models for discrete longitudinal data},
Author = {Stubbendick, A. L. and Ibrahim, J. G.},
Journal = {Statistica Sinica},
Pages = {1143--1167},
Year = {2006},
Publisher = {Institute of Statistical Science},
Volume = {16},
Number = {4},
Abstract = {We propose methods for estimating parameters in two types of models for discrete longitudinal data in the presence of nonignorable missing responses and covariates. We first present the generalized linear model with random effects, also known as the generalized linear mixed model. We specify a missing data mechanism and a missing covariate distribution and incorporate them into the complete data log-likelihood. Parameters are estimated via maximum likelihood using the Gibbs sampler and a Monte Carlo EM algorithm. The second model is a marginal model for correlated binary responses and discrete covariates with finite range, both of which may be nonignorably missing. We incorporate the missing data mechanism and the missing covariate distribution into the multivariate probit model defined by Chib and Greenberg (1998). We use the EM by method of weights (Ibrahim, 1990) and sample the latent normal variables conditional on a particular response and covariate pattern. The M-steps for each model are like a complete data maximization problem, and standard methods are used. Standard errors for the parameter estimates are computed using the multiple imputation method of Goetghebeur and Ryan (2000). We discuss the advantages and disadvantages of each model and give some guidance as to when one model might be chosen over the other. We illustrate both models using data from an environmental study of dyspnea in Chinese cotton factory workers.},
Keywords = {generalized linear mixed model; Gibbs sampling; Monte Carlo EM algorithm; multivariate probit model; nonignorable missing data; sensitivity analysis},
Url = {https://www.jstor.org/stable/24307781},
ISSN = {10170405},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar; ml; longitudinal}
}
@Article{su_etal_JSS2011,
Title = {Multiple imputation with diagnostics (mi) in {R}: opening windows into the black box},
Author = {Su, Y. S. and Gelman, A. and Hill, J. and Yajima, M.},
Journal = {Journal of Statistical Software},
Year = {2011},
Pages = {2},
Volume = {45},
Abstract = {Our mi package in R has several features that allow the user to get inside the imputation process and evaluate the reasonableness of the resulting models and imputations. These features include: choice of predictors, models, and transformations for chained imputation models; standard and binned residual plots for checking the fit of the conditional distributions used for imputation; and plots for comparing the distributions of observed and imputed data. In addition, we use Bayesian models and weakly informative prior distributions to construct more stable estimates of imputation models. Our goal is to have a demonstration package that (a) avoids many of the practical problems that arise with existing multivariate imputation programs, and (b) demonstrates state-of-the-art diagnostics that can be applied more generally and can be incorporated into the software of others.},
Doi = {10.18637/jss.v045.i02},
Owner = {nathalie},
Timestamp = {2017.10.16},
Topics = {multiple imputation; chained equations}
}