% rmisstastic_biblio.bib
% Encoding: UTF-8
@Article{abayomi_etal_JRSSC2008,
  Title = {Diagnostics for multivariate imputations},
  Author = {Abayomi, K. and Gelman, A. and Levy, M.},
  Journal = {Journal of the Royal Statistical Society, Series C (Applied Statistics)},
  Year = {2008},
  Number = {3},
  Pages = {273--291},
  Volume = {57},
  Abstract = {We consider three sorts of diagnostics for random imputations: displays of the completed data, which are intended to reveal unusual patterns that might suggest problems with the imputations, comparisons of the distributions of observed and imputed data values and checks of the fit of observed data to the model that is used to create the imputations. We formulate these methods in terms of sequential regression multivariate imputation, which is an iterative procedure in which the missing values of each variable are randomly imputed conditionally on all the other variables in the completed data matrix. We also consider a recalibration procedure for sequential regression imputations. We apply these methods to the 2002 environmental sustainability index, which is a linear aggregation of 64 environmental variables on 142 countries.},
  Doi = {10.1111/j.1467-9876.2007.00613.x},
  ISSN = {1467-9876},
  Keywords = {missing values; multiple imputation; multivariate statistics; sustainability; environmental statistics},
  Owner = {alyssa},
  Publisher = {Blackwell Publishing Ltd},
  Timestamp = {2017.11.08},
  Topics = {mi}
}
@Article{albert_follmann_B2000,
  Title = {Modeling repeated count data subject to informative dropout},
  Author = {Albert, P. S. and Follmann, D. A.},
  Journal = {Biometrics},
  Year = {2000},
  Number = {3},
  Pages = {667--677},
  Volume = {56},
  Abstract = {In certain diseases, outcome is the number of morbid events over the course of follow-up. In epilepsy, e.g., daily seizure counts are often used to reflect disease severity. Follow-up of patients in clinical trials of such diseases is often subject to censoring due to patients dying or dropping out. If the sicker patients tend to be censored in such trials, estimates of the treatment effect that do not incorporate the censoring process may be misleading. We extend the shared random effects approach of Wu and Carroll (1988, Biometrics 44, 175-188) to the setting of repeated counts of events. Three strategies are developed. The first is a likelihood-based approach for jointly modeling the count and censoring processes. A shared random effect is incorporated to introduce dependence between the two processes. The second is a likelihood-based approach that conditions on the dropout times in adjusting for informative dropout. The third is a generalized estimating equations (GEE) approach, which also conditions on the dropout times but makes fewer assumptions about the distribution of the count process. Estimation procedures for each of the approaches are discussed, and the approaches are applied to data from an epilepsy clinical trial. A simulation study is also conducted to compare the various approaches. Through analyses and simulations, we demonstrate the flexibility of the likelihood-based conditional model for analyzing data from the epilepsy trial.},
  Doi = {10.1111/j.0006-341X.2000.00667.x},
  ISSN = {0006341X, 15410420},
  Owner = {alyssa},
  Publisher = {[Wiley, International Biometric Society]},
  Timestamp = {2017.10.25},
  Topics = {mnar}
}
@Book{allison_MD2001,
  Title = {Missing Data},
  Author = {Allison, P. D.},
  Publisher = {Sage Publications},
  Year = {2001},
  Address = {Thousand Oaks, CA, USA},
  Series = {Quantitative Applications in the Social Sciences},
  Doi = {10.1136/bmj.38977.682025.2C},
  ISBN = {9780761916727},
  ISSN = {0959-8138},
  Internal-note = {NOTE(review): the Doi (10.1136/bmj...) and ISSN (0959-8138) are BMJ identifiers and appear to belong to a BMJ review of this book, not to the book itself -- verify and remove if so},
  Mendeley-groups = {missing data},
  Owner = {nathalie},
  Timestamp = {2017.03.06},
  Topics = {general}
}
@Article{andridge_little_ISR2010,
  Title = {A review of hot deck imputation for survey non-response},
  Author = {Andridge, R. and Little, R. J. A.},
  Journal = {International Statistical Review},
  Year = {2010},
  Number = {1},
  Pages = {40--64},
  Volume = {78},
  Abstract = {Hot deck imputation is a method for handling missing data in which each missing value is replaced with an observed response from a ``similar'' unit. Despite being used extensively in practice, the theory is not as well developed as that of other imputation methods. We have found that no consensus exists as to the best way to apply the hot deck and obtain inferences from the completed data set. Here we review different forms of the hot deck and existing research on its statistical properties. We describe applications of the hot deck currently in use, including the U.S. Census Bureau's hot deck for the Current Population Survey (CPS). We also provide an extended example of variations of the hot deck applied to the third National Health and Nutrition Examination Survey (NHANES III). Some potential areas for future research are highlighted.},
  Annote = {A review of Hot deck imputation for survey Non-response},
  Doi = {10.1111/j.1751-5823.2010.00103.x},
  Keywords = {item non-response; missing data; multiple imputation; variance estimation},
  Mendeley-groups = {missing data},
  Owner = {alyssa},
  Timestamp = {2016.09.27},
  Topics = {hot-deck}
}
@Article{audigier_etal_ADAC2016,
  Title = {A principal component method to impute missing values for mixed data},
  Author = {Audigier, V. and Husson, F. and Josse, J.},
  Journal = {Advances in Data Analysis and Classification},
  Year = {2016},
  Number = {1},
  Pages = {5--26},
  Volume = {10},
  Abstract = {We propose a new method to impute missing values in mixed data sets. It is based on a principal component method, the factorial analysis for mixed data, which balances the influence of all the variables that are continuous and categorical in the construction of the principal components. Because the imputation uses the principal axes and components, the prediction of the missing values is based on the similarity between individuals and on the relationships between variables. The properties of the method are illustrated via simulations and the quality of the imputation is assessed using real data sets. The method is compared to a recent method (Stekhoven and Buhlmann Bioinformatics 28:113_118, 2011) based on random forest and shows better performance especially for the imputation of categorical variables and situations with highly linear relationships between continuous variables.},
  Doi = {10.1007/s11634-014-0195-1},
  Keywords = {missing values; mixed data; imputation; principal component method; factorial analysis of mixed data},
  Owner = {alyssa},
  Timestamp = {2017.02.22},
  Topics = {factorial data analysis; imputation}
}
@Article{audigier_etal_JSCS2015,
  Title = {Multiple imputation for continuous variables using a {B}ayesian principal component analysis},
  Author = {Audigier, V. and Husson, F. and Josse, J.},
  Journal = {Journal of Statistical Computation and Simulation},
  Year = {2015},
  Number = {11},
  Pages = {2140--2156},
  Volume = {86},
  Abstract = {We propose a multiple imputation method based on principal component analysis (PCA) to deal with incomplete continuous data. To reflect the uncertainty of the parameters from one imputation to the next, we use a Bayesian treatment of the PCA model. Using a simulation study and real data sets, the method is compared to two classical approaches: multiple imputation based on joint modelling and on fully conditional modelling. Contrary to the others, the proposed method can be easily used on data sets where the number of individuals is less than the number of variables and when the variables are highly correlated. In addition, it provides unbiased point estimates of quantities of interest, such as an expectation, a regression coefficient or a correlation coefficient, with a smaller mean squared error. Furthermore, the widths of the confidence intervals built for the quantities of interest are often smaller whilst ensuring a valid coverage.},
  Doi = {10.1080/00949655.2015.1104683},
  Keywords = {missing values; continuous data; multiple imputation; bayesian principal component analysis; data augmentation},
  Owner = {alyssa},
  Timestamp = {2017.02.23},
  Topics = {factorial data analysis; multiple imputation}
}
@Article{audigier_etal_SC2016,
  Title = {{MIMCA}: multiple imputation for categorical variables with multiple correspondence analysis},
  Author = {Audigier, V. and Husson, F. and Josse, J.},
  Journal = {Statistics and Computing},
  Year = {2016},
  Number = {2},
  Pages = {1--18},
  Volume = {27},
  Abstract = {We propose a multiple imputation method to deal with incomplete categorical data. This method imputes the missing entries using the principal components method dedicated to categorical data: multiple correspondence analysis (MCA). The uncertainty concerning the parameters of the imputation model is reflected using a non-parametric bootstrap. Multiple imputation using MCA (MIMCA) requires estimating a small number of parameters due to the dimensionality reduction property of MCA. It allows the user to impute a large range of data sets. In particular, a high number of categories per variable, a high number of variables or a small number of individuals are not an issue for MIMCA. Through a simulation study based on real data sets, the method is assessed and compared to the reference methods (multiple imputation using the loglinear model, multiple imputation by logistic regressions) as well to the latest works on the topic (multiple imputation by random forests or by the Dirichlet process mixture of products of multinomial distributions model). The proposed method shows good performances in terms of bias and coverage for an analysis model such as a main effects logistic regression model. In addition, MIMCA has the great advantage that it is substantially less time consuming on data sets of high dimensions than the other multiple imputation methods.},
  Archiveprefix = {arXiv},
  Arxivid = {1505.08116},
  Doi = {10.1007/s11222-016-9635-4},
  Eprint = {1505.08116},
  ISSN = {15731375},
  Keywords = {bootstrap; categorical data; missing values; multiple correspondence analysis; multiple imputation},
  Owner = {alyssa},
  Publisher = {Springer US},
  Timestamp = {2017.07.06},
  Topics = {factorial data analysis; multiple imputation}
}
@Article{bang_robins_B2005,
  Title = {Doubly robust estimation in missing data and causal inference models},
  Author = {Bang, H. and Robins, J. M.},
  Journal = {Biometrics},
  Year = {2005},
  Number = {4},
  Pages = {962--973},
  Volume = {61},
  Abstract = {The goal of this article is to construct doubly robust (DR) estimators in ignorable missing data and causal inference models. In a missing data model, an estimator is DR if it remains consistent when either (but not necessarily both) a model for the missingness mechanism or a model for the distribution of the complete data is correctly specified. Because with observational data one can never be sure that either a missingness model or a complete data model is correct, perhaps the best that can be hoped for is to find a DR estimator. DR estimators, in contrast to standard likelihood-based or (nonaugmented) inverse probability-weighted estimators, give the analyst two chances, instead of only one, to make a valid inference. In a causal inference model, an estimator is DR if it remains consistent when either a model for the treatment assignment mechanism or a model for the distribution of the counterfactual data is correctly specified. Because with observational data one can never be sure that a model for the treatment assignment mechanism or a model for the counterfactual data is correct, inference based on DR estimators should improve upon previous approaches. Indeed, we present the results of simulation studies which demonstrate that the finite sample performance of DR estimators is as impressive as theory would predict. The proposed method is applied to a cardiovascular clinical trial.},
  Doi = {10.1111/j.1541-0420.2005.00377.x},
  ISSN = {0006341X},
  Keywords = {causal inference; doubly robust estimation; longitudinal data; marginal structural model; missing data; semiparametrics},
  Owner = {alyssa},
  Pmid = {16401269},
  Timestamp = {2017.05.29},
  Topics = {causal inference}
}
@Article{baraldi_enders_JSP2010,
  Title = {An introduction to modern missing data analysis},
  Author = {Baraldi, A. N. and Enders, C. K.},
  Journal = {Journal of School Psychology},
  Year = {2010},
  Number = {1},
  Pages = {5--37},
  Volume = {48},
  Abstract = {A great deal of recent methodological research has focused on two modern missing data analysis methods: maximum likelihood and multiple imputation. These approaches are advantageous to traditional techniques (e.g. deletion and mean imputation techniques) because they require less stringent assumptions and mitigate the pitfalls of traditional techniques. This article explains the theoretical underpinnings of missing data analyses, gives an overview of traditional missing data techniques, and provides accessible descriptions of maximum likelihood and multiple imputation. In particular, this article focuses on maximum likelihood estimation and presents two analysis examples from the Longitudinal Study of American Youth data. One of these examples includes a description of the use of auxiliary variables. Finally, the paper illustrates ways that researchers can use intentional, or planned, missing data to enhance their research designs.},
  Doi = {10.1016/j.jsp.2009.10.001},
  Keywords = {missing data; multiple imputation; maximum likelihood; planned missingness},
  Owner = {alyssa},
  Timestamp = {2017.02.21},
  Topics = {general_informal}
}
@Article{baretta_santaniello_BMCMIDM2016,
  Title     = {Nearest neighbor imputation algorithms: a critical evaluation},
  Author    = {Baretta, L. and Santaniello, A.},
  Journal   = {BMC Medical Informatics and Decision Making},
  Year      = {2016},
  Number    = {Supp. 3},
  Pages     = {74},
  Volume    = {16},
  Abstract  = {Background Nearest neighbor (NN) imputation algorithms are efficient methods to fill in missing data where each missing value on some records is replaced by a value obtained from related cases in the whole set of records. Besides the capability to substitute the missing data with plausible values that are as close as possible to the true value, imputation algorithms should preserve the original data structure and avoid to distort the distribution of the imputed variable. Despite the efficiency of NN algorithms little is known about the effect of these methods on data structure. Methods Simulation on synthetic datasets with different patterns and degrees of missingness were conducted to evaluate the performance of NN with one single neighbor (1NN) and with k neighbors without (kNN) or with weighting (wkNN) in the context of different learning frameworks: plain set, reduced set after ReliefF filtering, bagging, random choice of attributes, bagging combined with random choice of attributes (Random-Forest-like method). Results Whatever the framework, kNN usually outperformed 1NN in terms of precision of imputation and reduced errors in inferential statistics, 1NN was however the only method capable of preserving the data structure and data were distorted even when small values of k neighbors were considered; distortion was more severe for resampling schemas. Conclusions The use of three neighbors in conjunction with ReliefF seems to provide the best trade-off between imputation error and preservation of the data structure. The very same conclusions can be drawn when imputation experiments were conducted on the single proton emission computed tomography (SPECTF) heart dataset after introduction of missing data completely at random.},
  Doi       = {10.1186/s12911-016-0318-z},
  Keywords  = {near neighbour; imputation method; imputation algorithm; near neighbour algorithm; Minkowski norm},
  Owner     = {nathalie},
  Series    = {Proceedings of the 5th Translational Bioinformatics Conference (TBC 2015): medical informatics and decision making},
  Timestamp = {2018.05.17},
  Topics    = {knn}
}
@Article{bartlett_etal_2015,
  Title = {Asymptotically unbiased estimation of exposure odds ratios in complete records logistic regression},
  Author = {Bartlett, Jonathan W and Harel, Ofer and Carpenter, James R},
  Journal = {American Journal of Epidemiology},
  Year = {2015},
  Number = {8},
  Pages = {730--736},
  Volume = {182},
  Abstract = {Missing data are a commonly occurring threat to the validity and efficiency of epidemiologic studies. Perhaps the most common approach to handling missing data is to simply drop those records with 1 or more missing values, in so-called “complete records” or “complete case” analysis. In this paper, we bring together earlier-derived yet perhaps now somewhat neglected results which show that a logistic regression complete records analysis can provide asymptotically unbiased estimates of the association of an exposure of interest with an outcome, adjusted for a number of confounders, under a surprisingly wide range of missing-data assumptions. We give detailed guidance describing how the observed data can be used to judge the plausibility of these assumptions. The results mean that in large epidemiologic studies which are affected by missing data and analyzed by logistic regression, exposure associations may be estimated without bias in a number of settings where researchers might otherwise assume that bias would occur.},
  Doi = {10.1093/aje/kwv114},
  Keywords = {complete case analysis; logistic regression; missing data; odds ratio},
  Owner = {imke},
  Publisher = {Oxford University Press},
  Timestamp = {2019.04.01},
  Topics = {causal inference}
}
@Article{beaulac_rosenthal_2018,
  Title = {{BEST}: A decision tree algorithm that handles missing values},
  Author = {Beaulac, C{\'e}dric and Rosenthal, Jeffrey S},
  Journal = {arXiv preprint},
  Archiveprefix = {arXiv},
  Eprint = {1804.10168},
  Year = {2018},
  Url = {https://arxiv.org/pdf/1804.10168.pdf},
  Abstract = {The main contribution of this paper is the development of a new decision tree algorithm. The proposed approach allows users to guide the algorithm through the data partitioning process. We believe this feature has many applications but in this paper we demonstrate how to utilize this algorithm to analyse data sets containing missing values. We tested our algorithm against simulated data sets with various missing data structures and a real data set. The results demonstrate that this new classification procedure efficiently handles missing values and produces results that are slightly more accurate and more interpretable than most common procedures without any imputations or pre-processing.},
  Keywords = {cart; machine learning; variable importance analysis},
  Owner = {imke},
  Timestamp = {2019.12.12},
  Topics = {random forests; regression trees; variable selection}
}
@InProceedings{bengio_gingras_1995,
  Title = {Recurrent neural networks for missing or asynchronous data},
  Author = {Bengio, Y. and Gingras, F.},
  Booktitle = {Proceedings of the 8th International Conference on Neural Information Processing Systems},
  Pages = {395--401},
  Year = {1995},
  Address = {Cambridge, MA, USA},
  Eventdate = {1995-11-27/1995-12-02},
  Publisher = {MIT Press},
  Abstract = {In this paper we propose recurrent neural networks with feedback into the input units for handling two types of data analysis problems. On the one hand, this scheme can be used for static data when some of the input variables are missing. On the other hand, it can also be used for sequential data, when some of the input variables are missing or are available at different frequencies. Unlike in the case of probabilistic models (e.g. Gaussian) of the missing variables, the network does not attempt to model the distribution of the missing variables given the observed variables. Instead it is a more ``discriminant'' approach that fills in the missing variables for the sole purpose of minimizing a learning criterion (e.g., to minimize an output error).},
  Url = {http://papers.nips.cc/paper/1126-recurrent-neural-networks-for-missing-or-asynchronous-data.pdf},
  Owner = {imke},
  Timestamp = {2018.11.08},
  Keywords = {machine learning; deep learning; rnn; sequential data},
  Topics = {deep learning; rnn}
}
@Article{bertsimas_etal_2017,
  Title     = {From predictive methods to missing data imputation: an optimization approach},
  Author    = {Bertsimas, Dimitris and Pawlowski, Colin and Zhuo, Ying Daisy},
  Journal   = {The Journal of Machine Learning Research},
  Year      = {2017},
  Number    = {1},
  Pages     = {7133--7171},
  Volume    = {18},
  Abstract  = {Missing data is a common problem in real-world settings and for this reason has attracted significant attention in the statistical literature. We propose a flexible framework based on formal optimization to impute missing data with mixed continuous and categorical variables. This framework can readily incorporate various predictive models including K nearest neighbors, support vector machines, and decision tree based methods, and can be adapted for multiple imputation. We derive fast first-order methods that obtain high quality solutions in seconds following a general imputation algorithm opt.impute presented in this paper. We demonstrate that our proposed method improves out-of-sample accuracy in large-scale computational experiments across a sample of 84 data sets taken from the UCI Machine Learning Repository. In all scenarios of missing at random mechanisms and various missing percentages, opt.impute produces the best overall imputation in most data sets benchmarked against five other methods: mean impute, K-nearest neighbors, iterative knn, Bayesian PCA, and predictive-mean matching, with an average reduction in mean absolute error of 8.3\% against the best cross-validated benchmark method. Moreover, opt.impute leads to improved out-of-sample performance of learning algorithms trained using the imputed data, demonstrated by computational experiments on 10 downstream tasks. For models trained using opt.impute single imputations with 50\% data missing, the average out-of-sample R2 is 0.339 in the regression tasks and the average out-of-sample accuracy is 86.1\% in the classification tasks, compared to 0.315 and 84.4\% for the best cross-validated benchmark method. In the multiple imputation setting, downstream models trained using opt.impute obtain a statistically significant improvement over models trained using multivariate imputation by chained equations (mice) in 8/10 missing data scenarios considered.},
  Keywords  = {missing data imputation; K-NN; SVM; optimal decision trees},
  Owner     = {imke},
  Publisher = {JMLR.org},
  Timestamp = {2019.12.12},
  Topics    = {imputation; knn; decision trees}
}
@Article{beunckens_etal_2008,
  Title = {A latent-class mixture model for incomplete longitudinal {Gaussian} data},
  Author = {Beunckens, Caroline and Molenberghs, Geert and Verbeke, Geert and Mallinckrodt, Craig},
  Journal = {Biometrics},
  Year = {2008},
  Number = {1},
  Pages = {96--105},
  Volume = {64},
  Abstract = {In the analyses of incomplete longitudinal clinical trial data, there has been a shift, away from simple methods that are valid only if the data are missing completely at random, to more principled ignorable analyses, which are valid under the less restrictive missing at random assumption. The availability of the necessary standard statistical software nowadays allows for such analyses in practice. While the possibility of data missing not at random (MNAR) cannot be ruled out, it is argued that analyses valid under MNAR are not well suited for the primary analysis in clinical trials. Rather than either forgetting about or blindly shifting to an MNAR framework, the optimal place for MNAR analyses is within a sensitivity‐analysis context. One such route for sensitivity analysis is to consider, next to selection models, pattern‐mixture models or shared‐parameter models. The latter can also be extended to a latent‐class mixture model, the approach taken in this article. The performance of the so‐obtained flexible model is assessed through simulations and the model is applied to data from a depression trial.},
  Doi = {10.1111/j.1541-0420.2007.00837.x},
  Publisher = {Wiley Online Library},
  Topics = {mnar},
  Owner = {aude},
  Timestamp = {2021.01.20}
}
@Article{bianchi_etal_2019,
  Title     = {Learning representations of multivariate time series with missing data},
  Author    = {Bianchi, Filippo Maria and Livi, Lorenzo and Mikalsen, Karl {\O}yvind and Kampffmeyer, Michael and Jenssen, Robert},
  Journal   = {Pattern Recognition},
  Year      = {2019},
  Pages     = {106973},
  Volume    = {96},
  Abstract  = {Learning compressed representations of multivariate time series (MTS) facilitates data analysis in the presence of noise and redundant information, and for a large number of variates and time steps. However, classical dimensionality reduction approaches are designed for vectorial data and cannot deal explicitly with missing values. In this work, we propose a novel autoencoder architecture based on recurrent neural networks to generate compressed representations of MTS. The proposed model can process inputs characterized by variable lengths and it is specifically designed to handle missing data. Our autoencoder learns fixed-length vectorial representations, whose pairwise similarities are aligned to a kernel function that operates in input space and that handles missing values. This allows to learn good representations, even in the presence of a significant amount of missing data. To show the effectiveness of the proposed approach, we evaluate the quality of the learned representations in several classification tasks, including those involving medical data, and we compare to other methods for dimensionality reduction. Successively, we design two frameworks based on the proposed architecture: one for imputing missing data and another for one-class classification. Finally, we analyze under what circumstances an autoencoder with recurrent layers can learn better compressed representations of MTS than feed-forward architectures.},
  Doi       = {10.1016/j.patcog.2019.106973},
  Keywords  = {Representation learning; Multivariate time series; Autoencoders; Recurrent neural networks; Kernel methods},
  Owner     = {imke},
  Publisher = {Elsevier},
  Timestamp = {2019.12.12},
  Topics    = {time series; deep learning; neural network}
}
@InProceedings{biessmann_CIKM2018,
  Title = {{``Deep''} Learning for Missing Value Imputation in Tables with Non-Numerical Data},
  Author = {Biessmann, F. and Salinas, D. and Schelter, S. and Schmidt, P. and Lange, D.},
  Booktitle = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management},
  Series = {CIKM '18},
  Year = {2018},
  ISBN = {978-1-4503-6014-2},
  Location = {Torino, Italy},
  Pages = {2017--2025},
  Url = {http://doi.acm.org/10.1145/3269206.3272005},
  Doi = {10.1145/3269206.3272005},
  Publisher = {ACM},
  Address = {New York, NY, USA},
  Abstract = {The success of applications that process data critically depends on the quality of the ingested data. Completeness of a data source is essential in many cases. Yet, most missing value imputation approaches suffer from severe limitations. They are almost exclusively restricted to numerical data, and they either offer only simple imputation methods or are difficult to scale and maintain in production. Here we present a robust and scalable approach to imputation that extends to tables with non-numerical values, including unstructured text data in diverse languages. Experiments on public data sets as well as data sets sampled from a large product catalog in different languages (English and Japanese) demonstrate that the proposed approach is both scalable and yields more accurate imputations than previous approaches. Training on data sets with several million rows is a matter of minutes on a single machine. With a median imputation F1 score of 0.93 across a broad selection of data sets our approach achieves on average a 23-fold improvement compared to mode imputation. While our system allows users to apply state-of-the-art deep learning models if needed, we find that often simple linear n-gram models perform on par with deep learning methods at a much lower operational cost. The proposed method learns all parameters of the entire imputation pipeline automatically in an end-to-end fashion, rendering it attractive as a generic plugin both for engineers in charge of data pipelines where data completeness is relevant, as well as for practitioners without expertise in machine learning who need to impute missing values in tables with non-numerical data.},
  Owner = {imke},
  Timestamp = {2018.12.18},
  Keywords = {data cleaning; missing value imputation},
  Topics = {deep learning; neural networks}
}
@Article{blake_etal_2019,
Title = {Propensity scores using missingness pattern information: a practical guide},
Author = {Blake, Helen A. and Leyrat, Cl{\'e}mence and Mansfield, Kate and Seaman, Shaun and Tomlinson, Laurie and Carpenter, James and Williamson, Elizabeth},
Year = {2019},
Journal = {arXiv preprint},
archivePrefix = {arXiv},
eprint = {1901.03981},
primaryClass = {stat.ME},
Abstract = {Electronic health records are a valuable data source for investigating health-related questions, and propensity score analysis has become an increasingly popular approach to address confounding bias in such investigations. However, because electronic health records are typically routinely recorded as part of standard clinical care, there are often missing values, particularly for potential confounders. In our motivating study -- using electronic health records to investigate the effect of renin-angiotensin system blockers on the risk of acute kidney injury -- two key confounders, ethnicity and chronic kidney disease stage, have 59% and 53% missing data, respectively.
The missingness pattern approach (MPA), a variant of the missing indicator approach, has been proposed as a method for handling partially observed confounders in propensity score analysis. In the MPA, propensity scores are estimated separately for each missingness pattern present in the data. Although the assumptions underlying the validity of the MPA are stated in the literature, it can be difficult in practice to assess their plausibility.
In this paper, we explore the MPA's underlying assumptions by using causal diagrams to assess their plausibility in a range of simple scenarios, drawing general conclusions about situations in which they are likely to be violated. We present a framework providing practical guidance for assessing whether the MPA's assumptions are plausible in a particular setting and thus deciding when the MPA is appropriate. We apply our framework to our motivating study, showing that the MPA's underlying assumptions appear reasonable, and we demonstrate the application of MPA to this study.},
Keywords = {Electronic health records; Missing confounder data; Missing indicator; Missingness pattern; Propensity score analysis},
Url = {https://researchonline.lshtm.ac.uk/4651159/1/1901.03981v1.pdf},
Owner = {imke},
Timestamp = {2019.02.13},
Topics = {causal inference}
}
@Article{brinis_etal_2019,
Title = {Hollow-tree: a metric access method for data with missing values},
Author = {Brinis, Safia and Traina, Caetano and Traina, Agma JM},
Journal = {Journal of Intelligent Information Systems},
Pages = {1--28},
Year = {2019},
Publisher = {Springer},
DOI = {10.1007/s10844-019-00567-8},
Abstract = {Similarity search is fundamental to store and retrieve large volumes of complex data required by many real world applications. A useful mechanism for such concept is the query-by-similarity. Based on their topological properties, metric similarity functions can be used to index sets of data which can be queried effectively and efficiently by the so-called metric access methods. However, data produced by various application domains and the varying data types handled often lead to missing data, hence, they do not follow the metric similarity requirements. As a consequence, missing data cause distortions in the index structure and yield bias in the query answer. In this paper, we propose the Hollow-tree, a novel access method aimed at successfully retrieving data with missing attribute values. It employs new strategies for indexing and searching data elements, capable of handling the missing data issues when the cause of missingness is ignorable. The indexing strategy is based on a family of distance functions that allow measuring the distance between elements with missing values, along with a set of policies able to organize the elements in the index without causing distortions to its internal structure. The searching strategy employs fractal dimension property of the data to achieve accurate query answer while considering data with missing values part of the response. Results from experiments performed on a variety of real and synthetic data sets showed that, while other metric access methods deteriorate with small amounts of missing values, the Hollow-tree maintains a remarkable performance with almost 100\% of precision and recall for range queries and more than 90\% for k-nearest neighbor queries, for up to 40\% of missing values.},
Keywords = {Missing at random; Similarity search; Fractal dimension},
Owner = {imke},
Timestamp = {2019.12.12},
Topics = {classification; knn; clustering}
}
@Article{buck_JRSSB1960,
Title = {A method of estimation of missing values in multivariate data suitable for use with an electronic computer},
Author = {Buck, S. F.},
Journal = {Journal of the Royal Statistical Society, Series B},
Year = {1960},
Pages = {302--306},
Volume = {22},
Number = {2},
Doi = {10.1111/j.2517-6161.1960.tb00375.x},
Owner = {nathalie},
Timestamp = {2016.09.28},
Topics = {survey}
}
@InProceedings{burns_ARC1990,
Title = {Multiple and replicate item imputation in a complex sample survey},
Author = {Burns, R. M.},
Booktitle = {Proceedings of the 6th Annual Research Conference},
Year = {1990},
Address = {Washington DC, USA},
Editor = {{Bureau of the Census}},
Pages = {655--665},
Owner = {nathalie},
Timestamp = {2018.06.06}
}
@Article{candes_etal_IEEETSP2013,
Title = {Unbiased risk estimates for singular value thresholding and spectral estimators},
Author = {Cand{\`e}s, E. J. and Sing-Long, C. A. and Trzasko, J. D.},
Journal = {IEEE Transactions on Signal Processing},
Year = {2013},
Number = {19},
Pages = {4643--4657},
Volume = {61},
Abstract = {In an increasing number of applications, it is of interest to recover an approximately low-rank data matrix from noisy observations. This paper develops an unbiased risk estimate -- holding in a Gaussian model -- for any spectral estimator obeying some mild regularity assumptions. In particular, we give an unbiased risk estimate formula for singular value thresholding (SVT), a popular estimation strategy that applies a soft-thresholding rule to the singular values of the noisy observations. Among other things, our formulas offer a principled and automated way of selecting regularization parameters in a variety of problems. In particular, we demonstrate the utility of the unbiased risk estimation for SVT-based denoising of real clinical cardiac MRI series data. We also give new results concerning the differentiability of certain matrix-valued functions.},
Doi = {10.1109/TSP.2013.2270464},
Owner = {nathalie},
Timestamp = {2018.05.09},
Topics = {factorial data analysis; misc}
}
@article{carpenter_etal_JRSS2006,
Title = {A comparison of multiple imputation and doubly robust estimation for analyses with missing data},
Author = {Carpenter, James R. and Kenward, Michael G. and Vansteelandt, Stijn},
Journal = {Journal of the Royal Statistical Society: Series A (Statistics in Society)},
Volume = {169},
Number = {3},
Pages = {571--584},
Year = {2006},
Abstract = {Multiple imputation is now a well-established technique for analysing data sets where some units have incomplete observations. Provided that the imputation model is correct, the resulting estimates are consistent. An alternative, weighting by the inverse probability of observing complete data on a unit, is conceptually simple and involves fewer modelling assumptions, but it is known to be both inefficient (relative to a fully parametric approach) and sensitive to the choice of weighting model. Over the last decade, there has been a considerable body of theoretical work to improve the performance of inverse probability weighting, leading to the development of ‘doubly robust’ or ‘doubly protected’ estimators. We present an intuitive review of these developments and contrast these estimators with multiple imputation from both a theoretical and a practical viewpoint.},
Keywords = {Double robustness; Inverse probability weighting; Missing at random; Multiple imputation},
Doi = {10.1111/j.1467-985X.2006.00407.x},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {ipw; mi}
}
@Book{carpenter_kenward_MIA2013,
Title = {Multiple Imputation and its Application},
Author = {Carpenter, J. and Kenward, M.},
Publisher = {Wiley},
Year = {2013},
Address = {Chichester, West Sussex, UK},
Abstract = {A practical guide to analysing partially observed data. Collecting, analysing and drawing inferences from data is central to research in the medical and social sciences. Unfortunately, it is rarely possible to collect all the intended data. The literature on inference from the resulting incomplete data is now huge, and continues to grow both as methods are developed for large and complex data structures, and as increasing computer power and suitable software enable researchers to apply these methods. This book focuses on a particular statistical method for analysing and drawing inferences from incomplete data, called Multiple Imputation (MI). MI is attractive because it is both practical and widely applicable. The authors aim is to clarify the issues raised by missing data, describing the rationale for MI, the relationship between the various imputation models and associated algorithms and its application to increasingly complex data structures.},
Doi = {10.1002/9781119942283},
ISBN = {9780470740521},
Owner = {alyssa},
Timestamp = {2017.04.11},
Topics = {multiple imputation; general}
}
@InProceedings{chen_guestrin_2016,
Title = {{XGBoost}: A Scalable Tree Boosting System},
Author = {Chen, T. and Guestrin, C.},
Booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
Year = {2016},
Editor = {-},
Address = {New York, NY, USA},
Pages = {785--794},
Publisher = {ACM},
Abstract = {Tree boosting is a highly effective and widely used machine learning method. In this paper, we describe a scalable end-to-end tree boosting system called XGBoost, which is used widely by data scientists to achieve state-of-the-art results on many machine learning challenges. We propose a novel sparsity-aware algorithm for sparse data and weighted quantile sketch for approximate tree learning. More importantly, we provide insights on cache access patterns, data compression and sharding to build a scalable tree boosting system. By combining these insights, XGBoost scales beyond billions of examples using far fewer resources than existing systems.},
Doi = {10.1145/2939672.2939785},
Eventdate = {2016-08-13/2016-08-17},
ISBN = {978-1-4503-4232-2},
Owner = {imke},
Timestamp = {2018.10.30},
Keywords = {large-scale machine learning},
Topics = {random forests}
}
@Article{chen_reiter_2019,
Title = {Nonparametric Pattern-Mixture Models for Inference with Missing Data},
Author = {Chen, Yen-Chi and Sadinle, Mauricio},
Journal = {arXiv preprint},
archivePrefix = {arXiv},
eprint = {1904.11085},
Year = {2019},
primaryClass = {stat.ME},
Url = {https://arxiv.org/pdf/1904.11085.pdf},
Abstract = {Pattern-mixture models provide a transparent approach for handling missing data, where the full-data distribution is factorized in a way that explicitly shows the parts that can be estimated from observed data alone, and the parts that require identifying restrictions. We introduce a nonparametric estimator of the full-data distribution based on the pattern-mixture model factorization. Our approach uses the empirical observed-data distribution and augments it with a nonparametric estimator of the missing-data distributions under a given identifying restriction. Our results apply to a large class of donor-based identifying restrictions that encompasses commonly used ones and can handle
both monotone and nonmonotone missingness. We propose a Monte Carlo procedure to derive point estimates of functionals of interest, and the bootstrap to construct confidence intervals.},
Keywords = {Bootstrap; Missingness mechanism; Nonignorable nonresponse; Nonparametric identification; Nonparametric inference},
Owner = {imke},
Timestamp = {2019.12.12},
Topics = {mnar}
}
@Article{chen_shao_JOS2000,
Title = {Nearest neighbor imputation for survey data},
Author = {Chen, J. and Shao, J.},
Journal = {Journal of Official Statistics},
Year = {2000},
Number = {2},
Pages = {113--131},
Volume = {16},
Abstract = {Nearest neighbor imputation is one of the hot deck methods used to compensate for nonresponse in sample surveys. Although it has a long history of application, few theoretical properties of the nearest neighbor imputation method are known prior to the current article. We show that under some conditions, the nearest neighbor imputation method provides asymptotically unbiased and consistent estimators of functions of population means (or totals), population distributions, and population quantiles. We also derive the asymptotic variances for estimators based on nearest neighbor imputation and consistent estimators of these asymptotic variances. Some simulation results show that the estimators based on nearest neighbor imputation and the proposed variance estimators have good performances.},
ISSN = {0282-423X},
Keywords = {biases; hot deck; quantiles; sample means; variance estimation},
Mendeley-groups = {missing data},
Owner = {alyssa},
Timestamp = {2016.09.27},
Topics = {knn},
Url = {http://www.jos.nu/Articles/abstract.asp?article=162113}
}
@Article{collins_etal_PM2007,
Title = {A comparison of inclusive and restrictive strategies in modern missing data procedures},
Author = {Collins, L. M. and Schafer, J. L. and Kam, C.-M.},
Journal = {Psychological Methods},
Year = {2001},
Number = {4},
Pages = {330--351},
Volume = {6},
Abstract = {Two classes of modem missing data procedures, maximum likelihood (ML) and multiple imputation (MI), tend to yield similar results when implemented in comparable ways. In either approach, it is possible to include auxiliary variables solely for the purpose of improving the missing data procedure. A simulation was presented to assess the potential costs and benefits of a restrictive strategy, which makes minimal use of auxiliary variables, versus an inclusive strategy, which makes liberal use of such variables. The simulation showed that the inclusive strategy is to be greatly preferred. With an inclusive strategy not only is there a reduced chance of inadvertently omitting an important cause of missingness, there is also the possibility of noticeable gains in terms of increased efficiency and reduced bias, with only minor costs. As implemented in currently available software, the ML approach tends to encourage the use of a restrictive strategy, whereas the MI approach makes it relatively simple to use an inclusive strategy.},
Doi = {10.1037/1082-989X.6.4.330},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {multiple imputation; ml}
}
@Article{cranmer_gill_BJPS2012,
Title = {We have to be discrete about this: a non-parametric imputation technique for missing categorical data},
Author = {Cranmer, S. J. and Gill, J.},
Journal = {British Journal of Political Science},
Year = {2012},
Pages = {425--449},
Volume = {43},
Abstract = {Missing values are a frequent problem in empirical political science research. Surprisingly, the match between the measurement of the missing values and the correcting algorithms applied is seldom studied. While multiple imputation is a vast improvement over the deletion of cases with missing values, it is often unsuitable for imputing highly non-granular discrete data. We develop a simple technique for imputing missing values in such situations, which is a variant of hot deck imputation, drawing from the conditional distribution of the variable with missing values to preserve the discrete measure of the variable. This method is tested against existing techniques using Monte Carlo analysis and then applied to real data on democratization and modernization theory. Software for our imputation technique is provided in a free, easy-to-use package for the R statistical environment.},
Doi = {10.1017/S0007123412000312},
Owner = {nathalie},
Timestamp = {2016.02.15},
Topics = {knn; imputation}
}
@Article{crookston_finley_JSS2008,
Title = {{yaImpute}: an {R} package for {kNN} imputation},
Author = {Crookston, N. L. and Finley, A. O.},
Journal = {Journal of Statistical Software},
Year = {2008},
Number = {10},
Volume = {23},
Abstract = {This article introduces yaImpute, an R package for nearest neighbor search and imputation. Although nearest neighbor imputation is used in a host of disciplines, the methods implemented in the yaImpute package are tailored to imputation-based forest attribute estimation and mapping. The impetus to writing the yaImpute is a growing interest in nearest neighbor imputation methods for spatially explicit forest inventory, and a need within this research community for software that facilitates comparison among different nearest neighbor search algorithms and subsequent imputation techniques. yaImpute provides directives for defining the search space, subsequent distance calculation, and imputation rules for a given number of nearest neighbors. Further, the package offers a suite of diagnostics for comparison among results generated from different imputation analyses and a set of functions for mapping imputation results.},
Doi = {10.18637/jss.v023.i10},
Owner = {nathalie},
Timestamp = {2017.10.09},
Topics = {knn; imputation}
}
@Article{dax_2014,
Title = {Imputing Missing Entries of a Data Matrix: A review},
Author = {Dax, A.},
Journal = {Journal of Advanced Computing},
Year = {2014},
Pages = {98--222},
Volume = {3},
Number = {3},
Abstract = {This review presents a practical summary of the missing data literature, including a sketch of missing data theory and descriptions of normal-model multiple imputation (MI) and maximum likelihood methods. Practical missing data analysis issues are discussed, most notably the inclusion of auxiliary variables for improving power and reducing bias. Solutions are given for missing data challenges such as handling longitudinal, categorical, and clustered data with normal-model MI; including interactions in the missing data model; and handling large numbers of variables. The discussion of attrition and nonignorable missingness emphasizes the need for longitudinal diagnostics and for reducing the uncertainty about the missing data mechanism under attrition. Strategies suggested for reducing attrition bias include using auxiliary variables, collecting follow-up data on a sample of those initially missing, and collecting data on intent to drop out. Suggestions are given for moving forward with research on missing data and attrition.},
Doi = {10.7726/jac.2014.1007},
Keywords = {imputation; missing data; matrix completion problems; low-rank approximations; nearest neighbors; iterative SVD; least squares methods; rank minimization; nuclear norm minimization; error assessment; training set; probe set; cross-validation; rank determination},
Owner = {imke},
Timestamp = {2018.11.07},
Topics = {general_informal; knn; imputation}
}
@Article{dempster_etal_JRSSB1977,
Title = {Maximum likelihood from incomplete data via the {EM} algorithm},
Author = {Dempster, A. P. and Laird, N. M. and Rubin, D. B.},
Journal = {Journal of the Royal Statistical Society, Series B (Methodological)},
Year = {1977},
Number = {1},
Pages = {1--38},
Volume = {39},
Keywords = {maximum likelihood estimation; statistical variance; statism; factor analysis; algorithms; estimation methods; missing data; censored data; perceptron convergence procedure},
Owner = {nathalie},
Timestamp = {2018.05.11},
Topics = {ML},
Url = {http://www.jstor.org/stable/2984875}
}
@Article{diggle_kenward_AP1994,
Title = {Informative drop-out in longitudinal data analysis},
Author = {Diggle, P. and Kenward, M. G.},
Journal = {Journal of the Royal Statistical Society, Series C (Applied Statistics)},
Year = {1994},
Number = {1},
Pages = {49--93},
Volume = {43},
Abstract = {A model is proposed for continuous longitudinal data with non-ignorable or informative drop-out (ID). The model combines a multivariate linear model for the underlying response with a logistic regression model for the drop-out process. The latter incorporates dependence of the probability of drop-out on unobserved, or missing, observations. Parameters in the model are estimated by using maximum likelihood (ML) and inferences drawn through conventional likelihood procedures. In particular, likelihood ratio tests can be used to assess the informativeness of the drop-out process through comparison of the full model with reduced models corresponding to random drop-out (RD) and completely random processes. A simulation study is used to assess the procedure in two settings: the comparison of time trends under a linear regression model with autocorrelated errors and the estimation of period means and treatment differences from a four-period four-treatment crossover trial. It is seen in both settings that, when data are generated under an ID process, the ML estimators from the ID model do not suffer from the bias that is present in the ordinary least squares and RD ML estimators. The approach is then applied to three examples. These derive from a milk protein trial involving three groups of cows, milk yield data from a study of mastitis in dairy cattle and data from a multicentre clinical trial on the study of depression. All three examples provide evidence of an underlying ID process, two with some strength. It is seen that the assumption of an ID rather than an RD process has practical implications for the interpretation of the data.},
Doi = {10.2307/2986113},
ISSN = {0035-9254},
Keywords = {longitudinal methods; missing data},
Mendeley-groups = {missing data},
Owner = {alyssa},
Pmid = {6121453},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Article{ding_li_SS2018,
Title = {Causal Inference: A Missing Data Perspective},
Author = {Ding, P. and Li, F.},
Journal = {Statistical Science},
Year = {2018},
Volume = {33},
Number = {2},
Pages = {214--237},
Abstract = {Inferring causal effects of treatments is a central goal in many disciplines. The potential outcomes framework is a main statistical approach to causal inference, in which a causal effect is defined as a comparison of the potential outcomes of the same units under different treatment conditions. Because for each unit at most one of the potential outcomes is observed and the rest are missing, causal inference is inherently a missing data problem. Indeed, there is a close analogy in the terminology and the inferential framework between causal inference and missing data. Despite the intrinsic connection between the two subjects, statistical analyses of causal inference and missing data also have marked differences in aims, settings and methods. This article provides a systematic review of causal inference from the missing data perspective. Focusing on ignorable treatment assignment mechanisms, we discuss a wide range of causal inference methods that have analogues in missing data analysis, such as imputation, inverse probability weighting and doubly robust methods. Under each of the three modes of inference—Frequentist, Bayesian and Fisherian randomization—we present the general structure of inference for both finite-sample and super-population estimands, and illustrate via specific examples. We identify open questions to motivate more research to bridge the two fields.},
Doi = {10.1214/18-STS645},
Keywords = {assignment mechanism; ignorability; imputation; missing data mechanism; observational studies; potential outcome; propensity score; randomizatoin; weighting},
Owner = {imke},
Timestamp = {2018.12.11},
Topics = {causal inference}
}
@Article{ding_simonoff_JMLR2010,
Title = {An investigation of missing data methods for classification trees applied to binary response data},
Author = {Ding, Y. and Simonoff, J. S.},
Journal = {Journal of Machine Learning Research},
Year = {2010},
Pages = {131--170},
Volume = {11},
Number = {1},
Abstract = {There are many different methods used by classification tree algorithms when missing data occur in the predictors, but few studies have been done comparing their appropriateness and performance. This paper provides both analytic and Monte Carlo evidence regarding the effectiveness of six popular missing data methods for classification trees applied to binary response data. We show that in the context of classification trees, the relationship between the missingness and the dependent variable, as well as the existence or non-existence of missing values in the testing data, are the most helpful criteria to distinguish different missing data methods. In particular, separate class is clearly the best method to use when the testing set has missing values and the missingness is related to the response variable. A real data set related to modeling bankruptcy of a firm is then analyzed. The paper concludes with discussion of adaptation of these results to logistic regression, and other potential generalizations.},
Keywords = {classification tree; missing data; separate class; rpart; C4.5; cart},
Owner = {nathalie},
Timestamp = {2016.11.30},
Topics = {imputation; surrogate variables; classification trees},
Url = {http://www.jmlr.org/papers/v11/ding10a.html}
}
@Article{dong_peng_SP2013,
Title = {Principled missing data methods for researchers},
Author = {Dong, Yiran and Peng, Chao-Ying Joanne},
Journal = {SpringerPlus},
Year = {2013},
Pages = {222},
Volume = {2},
Abstract = {The impact of missing data on quantitative research can be serious, leading to biased estimates of parameters, loss of information, decreased statistical power, increased standard errors, and weakened generalizability of findings. In this paper, we discussed and demonstrated three principled missing data methods: multiple imputation, full information maximum likelihood, and expectation-maximization algorithm, applied to a real-world data set. Results were contrasted with those obtained from the complete data set and from the listwise deletion method. The relative merits of each method are noted, along with common features they share. The paper concludes with an emphasis on the importance of statistical assumptions, and recommendations for researchers. Quality of research will be enhanced if (a) researchers explicitly acknowledge missing data problems and the conditions under which they occurred, (b) principled methods are employed to handle missing data, and (c) the appropriate treatment of missing data is incorporated into review standards of manuscripts submitted for publication.},
Doi = {10.1186/2193-1801-2-222},
Keywords = {missing data; listwise deletion; mi; fiml; em; mar; mcar; mnar},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {general_informal}
}
@Book{enders_AMDA2010,
Title = {Applied Missing Data Analysis},
Author = {Enders, C. K.},
Publisher = {Guilford Press},
Year = {2010},
Abstract = {Walking readers step by step through complex concepts, this book translates missing data techniques into something that applied researchers and graduate students can understand and utilize in their own research. Enders explains the rationale and procedural details for maximum likelihood estimation, Bayesian estimation, multiple imputation, and models for handling missing not at random (MNAR) data. Easy-to-follow examples and small simulated data sets illustrate the techniques and clarify the underlying principles. The companion website includes data files and syntax for the examples in the book as well as up-to-date information on software. The book is accessible to substantive researchers while providing a level of detail that will satisfy quantitative specialists.},
ISBN = {9781606236390},
Owner = {alyssa},
Pages = {401},
Timestamp = {2016.09.27},
Topics = {general}
}
@Article{enders_SEM2001,
Title = {A primer on maximum likelihood algorithms available for use with missing data},
Author = {Enders, C. K.},
Journal = {Structural Equation Modeling},
Year = {2001},
Number = {1},
Pages = {128--141},
Volume = {8},
Abstract = {Maximum likelihood algorithms for use with missing data are becoming commonplace in microcomputer packages. Specifically, 3 maximum likelihood algorithms are currently available in existing software packages: the multiple-group approach, full information maximum likelihood estimation, and the EM algorithm. Although they belong to the same family of estimator, confusion appears to exist over the differences among the 3 algorithms. This article provides a comprehensive, nontechnical overview of the 3 maximum likelihood algorithms. Multiple imputation, which is frequently used in conjunction with the EM algorithm, is also discussed.},
Doi = {10.1207/S15328007SEM0801_7},
Owner = {alyssa},
Timestamp = {2017.07.07},
Topics = {ml}
}
@Article{erler_etal_2019,
author = {Erler, Nicole S and Rizopoulos, Dimitris and Lesaffre, Emmanuel MEH},
journal = {arXiv preprint},
title = {{JointAI}: joint analysis and imputation of incomplete data in {R}},
year = {2019},
abstract = {Missing data occur in many types of studies and typically complicate the analysis. Multiple imputation, either using joint modelling or the more flexible fully conditional specification approach, are popular and work well in standard settings. In settings involving non-linear associations or interactions, however, incompatibility of the imputation model with the analysis model is an issue often resulting in bias. Similarly, complex outcomes such as longitudinal or survival outcomes cannot be adequately handled by standard implementations. In this paper, we introduce the R package JointAI, which utilizes the Bayesian framework to perform simultaneous analysis and imputation in regression models with incomplete covariates. Using a fully Bayesian joint modelling approach it overcomes the issue of uncongeniality while retaining the attractive flexibility of fully conditional specification multiple imputation by specifying the joint distribution of analysis and imputation models as a sequence of univariate models that can be adapted to the type of variable. JointAI provides functions for Bayesian inference with generalized linear and generalized linear mixed models and extensions thereof as well as survival models and joint models for longitudinal and survival data, that take arguments analogous to corresponding well known functions for the analysis of complete data from base R and other packages. Usage and features of JointAI are described and illustrated using various examples and the theoretical background is outlined.},
archiveprefix = {arXiv},
arxivid = {1907.10867v3},
keywords = {multiple imputation; Bayesian inference; R},
owner = {aude},
timestamp = {2021.01.12},
topics = {multiple imputation},
url = {https://arxiv.org/abs/1907.10867},
}
@article{fang_etal_2018,
Title = {Imputation-based adjusted score equations in generalized linear models with nonignorable missing covariate values},
Author = {Fang, F. and Zhao, J. and Shao, J.},
Journal = {Statistica Sinica},
Volume = {28},
Year = {2018},
Number = {4},
Pages = {1677--1701},
Publisher = {Institute of Statistical Science},
Abstract = {We consider the estimation of unknown parameters in a generalized linear model when some covariates have nonignorable missing values. When an instrument, a covariate that helps identifying parameters under nonignorable missingness, is appropriately specified, a pseudo likelihood approach similar to that in Tang, Little and Raghunathan (2003) or Zhao and Shao (2015) can be applied. However, this approach does not work well when the instrument is a weak predictor of the response given other covariates. We show that the asymptotic variances of the pseudo likelihood estimators for the regression coefficients of covariates other than the instrument diverge to infinity as the regression coefficient of the instrument goes to 0. By an imputation-based adjustment for the score equations, we propose a new estimator for the regression coefficients of the covariates other than the instrument. This works well even if the instrument is a weak predictor. It is semiparametric since the propensity of missing covariate data is completely unspecified. To solve the adjusted score equation, we develop an iterative algorithm that can be applied by using standard softwares at each iteration. We establish some theoretical results on the convergence of the proposed iterative algorithm and asymptotic normality of the resulting estimators. A variance estimation formula is also derived. Some simulation results and a data example are presented for illustration.},
Doi = {10.5705/ss.202015.0437},
Keywords = {Adjusted likelihood; Identifiability; Nonignorable missing covariate data; Pseudo-likelihood; Semiparametric},
Owner = {imke},
Timestamp = {2018.11.11},
Topics = {mnar}
}
@Article{fay_JASA1996,
Title = {Alternative paradigms for the analysis of imputed survey data},
Author = {Fay, R. E.},
Journal = {Journal of the American Statistical Association},
Year = {1996},
Number = {434},
Pages = {490--498},
Volume = {91},
Abstract = {Rubin has offered multiple imputation as a general approach to inference from survey data sets with missing values filled in through imputation. In many situations the multiple imputation variance estimator is consistent. In turn, this observation has lent support to a number of complex applications. In fact, however, the multiple imputation variance estimator is inconsistent under some simple conditions. This article extends previous work of Rao and Shao and of Fay directed toward consistent variance estimation under wider conditions. Extensions of Rao and Shao's results to fractionally weighted imputation combines the estimation efficiency of multiple imputation and the consistency of the Rao-Shao variance estimator.},
Doi = {10.1080/01621459.1996.10476909},
Keywords = {fractionally weighted imputation; missing data; multiple imputation; Rao-Shao variance estimator},
Owner = {nathalie},
Timestamp = {2018.05.16},
Topics = {multiple imputation}
}
@Article{fellegi_holt_JASA1976,
Title = {A systematic approach to automatic edit and imputation},
Author = {Fellegi, I. P. and Holt, D.},
Journal = {Journal of the American Statistical Association},
Year = {1976},
Number = {353},
Pages = {17--35},
Volume = {71},
Doi = {10.2307/2285726},
Owner = {nathalie},
Timestamp = {2018.05.23},
Topics = {imputation}
}
@Article{ferrari_etal_CSDA2011,
Title = {An imputation method for categorical variables with application to nonlinear principal component analysis},
Author = {Ferrari, Pier Alda and Annoni, Paola and Barbiero, Alessandro and Manzi, Giancarlo},
Journal = {Computational Statistics \& Data Analysis},
Year = {2011},
Number = {7},
Pages = {2410--2420},
Volume = {55},
Abstract = {The problem of missing data in building multidimensional composite indicators is a delicate problem which is often underrated. An imputation method particularly suitable for categorical data is proposed. This method is discussed in detail in the framework of nonlinear principal component analysis and compared to other missing data treatments which are commonly used in this analysis. Its performance vs. these other methods is evaluated throughout a simulation procedure performed on both an artificial case, varying the experimental conditions, and a real case. The proposed procedure is implemented using R.},
Doi = {10.1016/j.csda.2011.02.007},
Keywords = {composite indicators; forward imputation; imputation procedure; listwise deletion; nearest neighbor; ordinal data; passive treatment},
Owner = {nathalie},
Timestamp = {2018.06.07},
Topics = {imputation; knn; factorial data analysis}
}
@Article{finkbeiner_P1979,
Title = {Estimation for the multiple factor model when data are missing},
Author = {Finkbeiner, C.},
Journal = {Psychometrika},
Year = {1979},
Number = {4},
Pages = {409--420},
Volume = {44},
Abstract = {A maximum likelihood method of estimating the parameters of the multiple factor model when data are missing from the sample is presented. A Monte Carlo study compares the method with 5 heuristic methods of dealing with the problem. The present method shows some advantage in accuracy of estimation over the heuristic methods but is considerably more costly computationally.},
Doi = {10.1007/BF02296204},
Keywords = {factor analysis; missing data},
Owner = {nathalie},
Timestamp = {2018.05.11},
Topics = {imputation; ml}
}
@Article{fitzmorice_etal_JRSS1995,
Title = {Regression Models for Longitudinal Binary Responses with Informative Drop-Outs},
Author = {Fitzmaurice, Garrett M. and Molenberghs, Geert and Lipsitz, Stuart R.},
Journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
Year = {1995},
Number = {4},
Pages = {691--704},
Volume = {57},
Publisher = {Royal Statistical Society, Wiley},
Abstract = {This paper reviews both likelihood-based and non-likelihood (generalized estimating equations) regression models for longitudinal binary responses when there are drop-outs. Throughout, it is assumed that the regression parameters for the marginal expectations of the binary responses are of primary scientific interest. The association or time dependence between the responses is largely regarded as a nuisance characteristic of the data. The performance of the methods is compared, in terms of asymptotic bias, under misspecification of the association between the responses and the missing data mechanism or drop-out process.},
ISSN = {0035-9246},
Url = {http://www.jstor.org/stable/2345937},
Keywords = {Generalized Estimating Equations; Maximum Likelihood Estimation; Missing Data; Repeated Measures},
Owner = {imke},
Timestamp = {2018.12.19},
Topics = {survey}
}
@Article{follman_wu_B1995,
Title = {An approximate generalized linear model with random effects for informative missing data},
Author = {Follmann, D. and Wu, M.},
Journal = {Biometrics},
Year = {1995},
Number = {1},
Pages = {151--168},
Volume = {51},
Abstract = {This paper develops a class of models to deal with missing data from longitudinal studies. We assume that separate models for the primary response and missingness (e.g., number of missed visits) are linked by a common random parameter. Such models have been developed in the econometrics (Heckman, 1979, Econometrica 47, 153-161) and biostatistics (Wu and Carroll, 1988, Biometrics 44, 175-188) literature for a Gaussian primary response. We allow the primary response, conditional on the random parameter, to follow a generalized linear model and approximate the generalized linear model by conditioning on the data that describes missingness. The resultant approximation is a mixed generalized linear model with possibly heterogeneous random effects. An example is given to illustrate the approximate approach, and simulations are performed to critique the adequacy of the approximation for repeated binary data.},
Doi = {10.2307/2533322},
ISSN = {0006-341X, 1541-0420},
Owner = {alyssa},
Publisher = {Wiley, International Biometric Society},
Timestamp = {2017.10.25},
Topics = {mnar}
}
@Article{gad_darwish_AJAMS2013,
Title = {A shared parameter model for longitudinal data with missing values},
Author = {Gad, A. M. and Darwish, N. M. M.},
Journal = {American Journal of Applied Mathematics and Statistics},
Year = {2013},
Number = {2},
Pages = {30--35},
Volume = {1},
Abstract = {Longitudinal studies represent one of the principal research strategies employed in medical and social research. These studies are the most appropriate for studying individual change over time. The prematurely withdrawal of some subjects from the study (dropout) is termed nonrandom when the probability of missingness depends on the missing value. Nonrandom dropout is common phenomenon associated with longitudinal data and it complicates statistical inference. The shared parameter model is used to fit longitudinal data in the presence of nonrandom dropout. The stochastic EM algorithm is developed to obtain the model parameter estimates. Also, parameter estimates of the dropout model have been obtained. Standard errors of estimates have been calculated using the developed Monte Carlo method. The proposed approach performance is evaluated through a simulation study. Also, the proposed approach is applied to a real data set.},
Owner = {alyssa},
Timestamp = {2017.08.07},
Topics = {mnar},
Url = {http://pubs.sciepub.com/ajams/1/2/3}
}
@Article{gelman_etal_1998,
Title = {Not asked and not answered: Multiple imputation for multiple surveys},
Author = {Gelman, A. and King, G. and Liu, C.},
Journal = {Journal of the American Statistical Association},
Year = {1998},
Number = {443},
Pages = {846--857},
Volume = {93},
Publisher = {Taylor \& Francis Group},
Abstract = {We present a method of analyzing a series of independent cross-sectional surveys in which some questions are not answered in some surveys and some respondents do not answer some of the questions posed. The method is also applicable to a single survey in which different questions are asked or different sampling methods are used in different strata or clusters. Our method involves multiply imputing the missing items and questions by adding to existing methods of imputation designed for single surveys a hierarchical regression model that allows covariates at the individual and survey levels. Information from survey weights is exploited by including in the analysis the variables on which the weights were based, and then reweighting individual responses (observed and imputed) to estimate population quantities. We also develop diagnostics for checking the fit of the imputation model based on comparing imputed data to nonimputed data. We illustrate with the example that motivated this project: a study of pre-election public opinion polls in which not all the questions of interest are asked in all the surveys, so that it is infeasible to impute within each survey separately.},
Doi = {10.1080/01621459.1998.10473737},
Keywords = {Bayesian inference; cluster sampling; diagnostics; hierarchical models; ignorable nonresponse; missing data; political science; sample surveys; stratified sampling},
Owner = {imke},
Timestamp = {2018.11.19},
Topics = {mi; survey}
}
@Article{gelman_etal_2005,
Title = {Multiple Imputation for Model Checking: Completed-Data Plots with Missing and Latent Data},
Author = {Gelman, A. and van Mechelen, I. and Verbeke, G. and Heitjan, D. F. and Meulders, M.},
Journal = {Biometrics},
Volume = {61},
Number = {1},
Pages = {74--85},
Year = {2005},
Publisher = {Wiley Online Library},
Abstract = {In problems with missing or latent data, a standard approach is to first impute the unobserved data, then perform all statistical analyses on the completed dataset -- corresponding to the observed data and imputed unobserved data -- using standard procedures for complete-data inference. Here, we extend this approach to model checking by demonstrating the advantages of the use of completed-data model diagnostics on imputed completed datasets. The approach is set in the theoretical framework of Bayesian posterior predictive checks (but, as with missing-data imputation, our methods of missing-data model checking can also be interpreted as ``predictive inference'' in a non-Bayesian context). We consider the graphical diagnostics within this framework. Advantages of the completed-data approach include: (1) One can often check model fit in terms of quantities that are of key substantive interest in a natural way, which is not always possible using observed data alone. (2) In problems with missing data, checks may be devised that do not require to model the missingness or inclusion mechanism; the latter is useful for the analysis of ignorable but unknown data collection mechanisms, such as are often assumed in the analysis of sample surveys and observational studies. (3) In many problems with latent data, it is possible to check qualitative features of the model (for example, independence of two variables) that can be naturally formalized with the help of the latent data. We illustrate with several applied examples.},
Keywords = {Bayesian model checking; exploratory data analysis; multiple imputation; nonresponse; posterior predictive checks; realized discrepancies; residuals},
Doi = {10.1111/j.0006-341X.2005.031010.x},
Owner = {imke},
Timestamp = {2018.11.19},
Topics = {mi}
}
@InProceedings{gill_etal_1997,
Title = {Coarsening at random: Characterizations, conjectures, counter-examples},
Author = {Gill, Richard D. and van der Laan, Mark J. and Robins, James M.},
Booktitle = {Proceedings of the First Seattle Symposium in Biostatistics},
Pages = {255--294},
Year = {1997},
Organization = {Springer},
Abstract = {The notion of coarsening at random (CAR) was introduced by Heitjan and Rubin (1991) to describe the most general form of randomly grouped, censored, or missing data, for which the coarsening mechanism can be ignored when making likelihood-based inference about the parameters of the distribution of the variable of interest. The CAR assumption is popular, and applications abound. However the full implications of the assumption have not been realized. Moreover a satisfactory theory of CAR for continuously distributed data -- which is needed in many applications, particularly in survival analysis -- hardly exists as yet. This paper gives a detailed study of CAR. We show that grouped data from a finite sample space always fit a CAR model: a nonparametric model for the variable of interest together with the assumption of an arbitrary CAR mechanism puts no restriction at all on the distribution of the observed data. In a slogan, CAR is everything. We describe what would seem to be the most general way CAR data could occur in practice, a sequential procedure called randomized monotone coarsening. We show that CAR mechanisms exist which are not of this type. Such a coarsening mechanism uses information about the underlying data which is not revealed to the observer, without this affecting the observer's conclusions. In a second slogan, CAR is more than it seems. This implies that if the analyst can argue from subject-matter considerations that coarsened data is CAR, he or she has knowledge about the structure of the coarsening mechanism which can be put to good use in non-likelihood-based inference procedures. We argue that this is a valuable option in multivariate survival analysis. We give a new definition of CAR in general sample spaces, criticising earlier proposals, and we establish parallel results to the discrete case. The new definition focusses on the distribution rather than the density of the data. It allows us to generalise the theory of CAR to the important situation where coarsening variables (e.g., censoring times) are partially observed as well as the variables of interest.},
Keywords = {coarsening at random; CAR; missingness mechanisms; survival analysis},
Doi = {10.1007/978-1-4684-6316-3_14},
Owner = {imke},
Timestamp = {2019.08.02},
Topics = {mnar; mechanisms}
}
@Article{golden_etal_2019,
Title = {Consequences of model misspecification for maximum likelihood estimation with missing data},
Author = {Golden, Richard M. and Henley, Steven S. and White, Halbert and Kashner, T. Michael},
Journal = {Econometrics},
Volume = {7},
Number = {3},
Pages = {37},
Year = {2019},
Publisher = {Multidisciplinary Digital Publishing Institute},
Doi = {10.3390/econometrics7030037},
Abstract = {Researchers are often faced with the challenge of developing statistical models with incomplete data. Exacerbating this situation is the possibility that either the researcher's complete-data model or the model of the missing-data mechanism is misspecified. In this article, we create a formal theoretical framework for developing statistical models and detecting model misspecification in the presence of incomplete data where maximum likelihood estimates are obtained by maximizing the observable-data likelihood function when the missing-data mechanism is assumed ignorable. First, we provide sufficient regularity conditions on the researcher's complete-data model to characterize the asymptotic behavior of maximum likelihood estimates in the simultaneous presence of both missing data and model misspecification. These results are then used to derive robust hypothesis testing methods for possibly misspecified models in the presence of Missing at Random (MAR) or Missing Not at Random (MNAR) missing data. Second, we introduce a method for the detection of model misspecification in missing data problems using recently developed Generalized Information Matrix Tests (GIMT). Third, we identify regularity conditions for the Missing Information Principle (MIP) to hold in the presence of model misspecification so as to provide useful computational covariance matrix estimation formulas. Fourth, we provide regularity conditions that ensure the observable-data expected negative log-likelihood function is convex in the presence of partially observable data when the amount of missingness is sufficiently small and the complete-data likelihood is convex. Fifth, we show that when the researcher has correctly specified a complete-data model with a convex negative likelihood function and an ignorable missing-data mechanism, then its strict local minimizer is the true parameter value for the complete-data model when the amount of missingness is sufficiently small. Our results thus provide new robust estimation, inference, and specification analysis methods for developing statistical models with incomplete data.},
Keywords = {asymptotic theory; ignorable; Generalized Information Matrix Test; misspecification; missing data; nonignorable; sandwich estimator; specification analysis},
Owner = {imke},
Timestamp = {2019.12.12},
Topics = {ml; regression}
}
@InProceedings{gondara_wang_2018,
Title = {{MIDA}: Multiple Imputation using Denoising Autoencoders},
Author = {Gondara, L. and Wang, K.},
Booktitle = {Proceedings of the 22nd Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 2018)},
Series = {Lecture Notes in Computer Science},
Year = {2018},
Editor = {Phung, D. and Tseng, V. and Webb, G. and Ho, B. and Ganji, M. and Rashidi, L.},
Pages = {260--272},
Publisher = {Springer International Publishing},
Eventdate = {2018-06-03/2018-06-06},
ISBN = {3319930404},
Abstract = {Missing data is a significant problem impacting all domains. State-of-the-art framework for minimizing missing data bias is multiple imputation, for which the choice of an imputation model remains nontrivial. We propose a multiple imputation model based on overcomplete deep denoising autoencoders. Our proposed model is capable of handling different data types, missingness patterns, missingness proportions and distributions. Evaluation on several real life datasets show our proposed model significantly outperforms current state-of-the-art methods under varying conditions while simultaneously improving end of the line analytics.},
Doi = {10.1007/978-3-319-93040-4_21},
Url = {https://arxiv.org/abs/1705.02737},
Keywords = {Multiple imputation; denoising autoencoders; DAE},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {multiple imputation; deep learning}
}
@InProceedings{goodfellow_etal_2013,
Title = {Multi-Prediction Deep Boltzmann Machines},
Author = {Goodfellow, I. and Mirza, M. and Courville, A. and Bengio, Y.},
Booktitle = {Proceedings of the 26th International Conference on Neural Information Processing Systems},
Series = {Advances in Neural Information Processing Systems 26},
Editor = {Burges, C. J. C. and Bottou, L. and Welling, M. and Ghahramani, Z. and Weinberger, K. Q.},
Pages = {548--556},
Year = {2013},
Publisher = {Curran Associates, Inc.},
Eventdate = {2013-12-05/2013-12-10},
Abstract = {We introduce the Multi-Prediction Deep Boltzmann Machine (MP-DBM). The MP-DBM can be seen as a single probabilistic model trained to maximize a variational approximation to the generalized pseudolikelihood, or as a family of recurrent nets that share parameters and approximately solve different inference problems. Prior methods of training DBMs either do not perform well on classification tasks or require an initial learning pass that trains the DBM greedily, one layer at a time. The MP-DBM does not require greedy layerwise pretraining, and outperforms the standard DBM at classification, classification with missing inputs, and mean field prediction tasks.},
Url = {http://papers.nips.cc/paper/5024-multi-prediction-deep-boltzmann-machines.pdf},
Keywords = {Classification; deep Boltzmann Machines; DBM; pseudolikelihood},
Owner = {imke},
Timestamp = {2018.11.08},
Topics = {classification; deep learning}
}
@Article{graham_ARP2009,
Title = {Missing data analysis: making it work in the real world},
Author = {Graham, J. W.},
Journal = {Annual Review of Psychology},
Year = {2009},
Pages = {549--576},
Volume = {60},
Abstract = {This review presents a practical summary of the missing data literature, including a sketch of missing data theory and descriptions of normal-model multiple imputation (MI) and maximum likelihood methods. Practical missing data analysis issues are discussed, most notably the inclusion of auxiliary variables for improving power and reducing bias. Solutions are given for missing data challenges such as handling longitudinal, categorical, and clustered data with normal-model MI; including interactions in the missing data model; and handling large numbers of variables. The discussion of attrition and nonignorable missingness emphasizes the need for longitudinal diagnostics and for reducing the uncertainty about the missing data mechanism under attrition. Strategies suggested for reducing attrition bias include using auxiliary variables, collecting follow-up data on a sample of those initially missing, and collecting data on intent to drop out. Suggestions are given for moving forward with research on missing data and attrition.},
Doi = {10.1146/annurev.psych.58.110405.085530},
ISSN = {0066-4308},
Mendeley-groups = {missing data},
Owner = {alyssa},
Pmid = {18652544},
Shorttitle = {Missing Data Analysis},
Timestamp = {2016.11.30},
Topics = {general}
}
@Article{graham_etal_PS2007,
Title = {How many imputations are really needed? Some practical clarifications of multiple imputation theory},
Author = {Graham, John W. and Olchowski, Allison E. and Gilreath, Tamika E.},
Journal = {Prevention Science},
Year = {2007},
Number = {3},
Pages = {206--213},
Volume = {8},
Abstract = {Multiple imputation (MI) and full information maximum likelihood (FIML) are the two most common approaches to missing data analysis. In theory, MI and FIML are equivalent when identical models are tested using the same variables, and when m, the number of imputations performed with MI, approaches infinity. However, it is important to know how many imputations are necessary before MI and FIML are sufficiently equivalent in ways that are important to prevention scientists. MI theory suggests that small values of m, even on the order of three to five imputations, yield excellent results. Previous guidelines for sufficient m are based on relative efficiency, which involves the fraction of missing information (gamma) for the parameter being estimated, and m. In the present study, we used a Monte Carlo simulation to test MI models across several scenarios in which gamma and m were varied. Standard errors and p-values for the regression coefficient of interest varied as a function of m, but not at the same rate as relative efficiency. Most importantly, statistical power for small effect sizes diminished as m became smaller, and the rate of this power falloff was much greater than predicted by changes in relative efficiency. Based our findings, we recommend that researchers using MI should perform many more imputations than previously considered sufficient. These recommendations are based on gamma, and take into consideration one's tolerance for a preventable power falloff (compared to FIML) due to using too few imputations.},
Doi = {10.1007/s11121-007-0070-9},
Keywords = {multiple imputation; number of imputations; full information maximum likelihood; missing data; statistical power},
Owner = {nathalie},
Timestamp = {2018.06.06},
Topics = {multiple imputation}