data/references.bib

% Boland, Karczewski and Tatonetti (2017), PLoS Computational Biology 13(1):
% Ten Simple Rules on enabling multi-site collaborations through data sharing.
% The abstract is an excerpt of the article's opening text, not a full abstract.
@article{Boland2017,
  author    = {Boland, M. R. and Karczewski, K. J. and Tatonetti, N. P.},
  title     = {Ten Simple Rules to Enable Multi-site Collaborations through Data Sharing},
  journal   = {PLoS Computational Biology},
  year      = {2017},
  month     = jan,
  volume    = {13},
  number    = {1},
  pages     = {e1005278},
  publisher = {Public Library of Science},
  issn      = {1553-7358},
  doi       = {10.1371/journal.pcbi.1005278},
  pmid      = {28103227},
  url       = {https://doi.org/10.1371/journal.pcbi.1005278},
  abstract  = {Open access, open data, and software are critical for advancing science and enabling collaboration across multiple institutions and throughout the world. Despite near universal recognition of its importance, major barriers still exist to sharing raw data, software, and research products throughout the scientific community. Many of these barriers vary by specialty [1], increasing the difficulties for interdisciplinary and/or translational researchers to engage in collaborative research. Multi-site collaborations are vital for increasing both the impact and the generalizability of research results. However, they often present unique data sharing challenges. We discuss enabling multi-site collaborations through enhanced data sharing in this set of Ten Simple Rules. Collaboration is an essential component of research [2] that takes many forms, including internal (across departments within a single institution) and external collaborations (across institutions). However, multi-site collaborations with more than two institutions encounter more complex challenges because of institutional-specific restrictions and guidelines [3]. Vicens and Bourne focus on collaborators working together on a shared research grant [4]. They do not discuss the specific complexities of multi-site collaborations and the vital need for enhanced data sharing in the multi-site and large-scale collaboration context, in which participants may or may not have the same funding source and/or research grant. While challenging, multi-site collaborations are equally rewarding and result in increased research productivity [5, 6]. One highly successful multi-site and translational collaboration is the Electronic Medical Records and Genomics (eMERGE) network.},
}
% Brazma et al. (2001), Nature Genetics 29(4): the MIAME proposal for
% reporting microarray experiments.
@article{Brazma2001,
  author    = {Brazma, A. and Hingamp, P. and Quackenbush, J. and Sherlock, G. and Spellman, P. and Stoeckert, C. and Aach, J. and Ansorge, W. and Ball, C. A. and Causton, H. C. and Gaasterland, T. and Glenisson, P. and Holstege, F. C. P. and Kim, I. F. and Markowitz, V. and Matese, J. C. and Parkinson, H. and Robinson, A. and Sarkans, U. and Schulze-Kremer, S. and Stewart, J. and Taylor, R. and Vilo, J. and Vingron, M.},
  title     = {Minimum information about a microarray experiment ({MIAME})---toward standards for microarray data},
  journal   = {Nature Genetics},
  year      = {2001},
  month     = dec,
  volume    = {29},
  number    = {4},
  pages     = {365--371},
  publisher = {Nature Publishing Group},
  issn      = {1061-4036},
  doi       = {10.1038/ng1201-365},
  pmid      = {11726920},
  url       = {http://www.nature.com/doifinder/10.1038/ng1201-365},
  abstract  = {Microarray analysis has become a widely used tool for the generation of gene expression data on a genomic scale. Although many significant results have been derived from microarray studies, one limitation has been the lack of standards for presenting and exchanging such data. Here we present a proposal, the Minimum Information About a Microarray Experiment (MIAME), that describes the minimum information required to ensure that microarray data can be easily interpreted and that results derived from its analysis can be independently verified. The ultimate goal of this work is to establish a standard for recording and reporting microarray-based gene expression data, which will in turn facilitate the establishment of databases and public repositories and enable the development of data analysis tools. With respect to MIAME, we concentrate on defining the content and structure of the necessary information rather than the technical format for capturing it.},
}
% Buneman, Khanna and Tan (2000): conference paper published in an LNCS volume,
% so it is typed as inproceedings with the series and volume in their own
% fields rather than as a journal article.
@inproceedings{Buneman2000,
  author    = {Buneman, P. and Khanna, S. and Tan, W.-C.},
  title     = {Data Provenance: Some Basic Issues},
  booktitle = {Foundations of Software Technology and Theoretical Computer Science},
  series    = {Lecture Notes in Computer Science},
  volume    = {1974},
  year      = {2000},
  month     = dec,
  pages     = {87--93},
  publisher = {Springer},
  address   = {Berlin, Heidelberg},
  isbn      = {978-3-540-41413-1},
  doi       = {10.1007/3-540-44450-5_6},
  url       = {http://db.cis.upenn.edu/DL/fsttcs.pdf},
  abstract  = {The ease with which one can copy and transform data on the Web, has made it increasingly difficult to determine the origins of a piece of data. We use the term data provenance to refer to the process of tracing and recording the origins of data and its movement between databases. Provenance is now an acute issue in scientific databases where it is central to the validation of data. In this paper we discuss some of the technical issues that have emerged in an initial exploration of the topic.},
}
% Chavan and Penev (2011), BMC Bioinformatics 12(Suppl 15): proposes the
% biodiversity data paper as an incentive for data publishing.
@article{Chavan2011,
  author   = {Chavan, V. and Penev, L.},
  title    = {The data paper: a mechanism to incentivize data publishing in biodiversity science},
  journal  = {BMC Bioinformatics},
  year     = {2011},
  volume   = {12},
  number   = {Suppl 15},
  pages    = {S2},
  issn     = {1471-2105},
  doi      = {10.1186/1471-2105-12-S15-S2},
  pmid     = {22373175},
  url      = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-S15-S2},
  abstract = {BACKGROUND: Free and open access to primary biodiversity data is essential for informed decision-making to achieve conservation of biodiversity and sustainable development. However, primary biodiversity data are neither easily accessible nor discoverable. Among several impediments, one is a lack of incentives to data publishers for publishing of their data resources. One such mechanism currently lacking is recognition through conventional scholarly publication of enriched metadata, which should ensure rapid discovery of 'fit-for-use' biodiversity data resources. DISCUSSION: We review the state of the art of data discovery options and the mechanisms in place for incentivizing data publishers efforts towards easy, efficient and enhanced publishing, dissemination, sharing and re-use of biodiversity data. We propose the establishment of the 'biodiversity data paper' as one possible mechanism to offer scholarly recognition for efforts and investment by data publishers in authoring rich metadata and publishing them as citable academic papers. While detailing the benefits to data publishers, we describe the objectives, work flow and outcomes of the pilot project commissioned by the Global Biodiversity Information Facility in collaboration with scholarly publishers and pioneered by Pensoft Publishers through its journals Zookeys, PhytoKeys, MycoKeys, BioRisk, NeoBiota, Nature Conservation and the forthcoming Biodiversity Data Journal. We then debate further enhancements of the data paper beyond the pilot project and attempt to forecast the future uptake of data papers as an incentivization mechanism by the stakeholder communities. CONCLUSIONS: We believe that in addition to recognition for those involved in the data publishing enterprise, data papers will also expedite publishing of fit-for-use biodiversity data resources. However, uptake and establishment of the data paper as a potential mechanism of scholarly recognition requires a high degree of commitment and investment by the cross-sectional stakeholder communities.},
}
% Cock et al., Nucleic Acids Research 38(6): definition of the Sanger FASTQ
% format and its Solexa/Illumina variants.
% NOTE: the citation key keeps 2009 (online-first year) so existing citations
% still resolve; volume 38, issue 6 is the April 2010 print issue, hence the
% year field below.
@article{Cock2009,
  author    = {Cock, P. J. A. and Fields, C. J. and Goto, N. and Heuer, M. L. and Rice, P. M.},
  title     = {The {Sanger} {FASTQ} file format for sequences with quality scores, and the {Solexa/Illumina} {FASTQ} variants},
  journal   = {Nucleic Acids Research},
  year      = {2010},
  month     = apr,
  volume    = {38},
  number    = {6},
  pages     = {1767--1771},
  publisher = {Oxford University Press},
  issn      = {0305-1048},
  doi       = {10.1093/nar/gkp1137},
  pmid      = {20015970},
  url       = {https://academic.oup.com/nar/article-lookup/doi/10.1093/nar/gkp1137},
  keywords  = {bioinformatics},
  abstract  = {FASTQ has emerged as a common file format for sharing sequencing read data combining both the sequence and an associated per base quality score, despite lacking any formal definition to date, and existing in at least three incompatible variants. This article defines the FASTQ format, covering the original Sanger standard, the Solexa/Illumina variants and conversion between them, based on publicly available information such as the MAQ documentation and conventions recently agreed by the Open Bioinformatics Foundation projects Biopython, BioPerl, BioRuby, BioJava and EMBOSS. Being an open access publication, it is hoped that this description, with the example files provided as Supplementary Data, will serve in future as a reference for this important file format.},
}
% Gewin (2016), Nature 529(7584): careers feature on adopting open data.
@article{Gewin2016,
  author    = {Gewin, V.},
  title     = {Data sharing: An open mind on open data},
  journal   = {Nature},
  year      = {2016},
  month     = jan,
  volume    = {529},
  number    = {7584},
  pages     = {117--119},
  publisher = {Nature Research},
  issn      = {0028-0836},
  doi       = {10.1038/nj7584-117a},
  pmid      = {26744755},
  url       = {http://www.nature.com/doifinder/10.1038/nj7584-117a},
  abstract  = {The move to make scientific findings transparent can be a major boon to research, but it can be tricky to embrace the change.},
}
% Goodman, Fanelli and Ioannidis (2016), Science Translational Medicine 8(341):
% perspective on the terminology of research reproducibility.
% The article is identified by a single article number, so pages is not a range.
@article{Goodman2016,
  author   = {Goodman, S. N. and Fanelli, D. and Ioannidis, J. P. A.},
  title    = {What does research reproducibility mean?},
  journal  = {Science Translational Medicine},
  year     = {2016},
  volume   = {8},
  number   = {341},
  pages    = {341ps12},
  issn     = {1946-6234},
  doi      = {10.1126/scitranslmed.aaf5027},
  pmid     = {27252173},
  url      = {http://stm.sciencemag.org/content/8/341/341ps12},
  abstract = {The language and conceptual framework of ``research reproducibility'' are nonstandard and unsettled across the sciences. In this Perspective, we review an array of explicit and implicit definitions of reproducibility and related terminology, and discuss how to avoid potential misunderstandings when these terms are used as a surrogate for ``truth.''},
}
% Hart et al. (2016), PLoS Computational Biology 12(10): Ten Simple Rules for
% digital data storage.
@article{Hart2016,
  author    = {Hart, E. M. and Barmby, P. and LeBauer, D. and Michonneau, F. and Mount, S. and Mulrooney, P. and Poisot, T. and Woo, K. H. and Zimmerman, N. B. and Hollister, J. W.},
  title     = {Ten Simple Rules for Digital Data Storage},
  journal   = {PLoS Computational Biology},
  year      = {2016},
  month     = oct,
  volume    = {12},
  number    = {10},
  pages     = {e1005097},
  publisher = {Public Library of Science},
  issn      = {1553-7358},
  doi       = {10.1371/journal.pcbi.1005097},
  pmid      = {27764088},
  url       = {http://www.ncbi.nlm.nih.gov/pubmed/27764088},
  abstract  = {Data is the central currency of science, but the nature of scientific data has changed dramatically with the rapid pace of technology. This change has led to the development of a wide variety of data formats, dataset sizes, data complexity, data use cases, and data sharing practices. Improvements in high throughput DNA sequencing, sustained institutional support for large sensor networks, and sky surveys with large-format digital cameras have created massive quantities of data. At the same time, the combination of increasingly diverse research teams and data aggregation in portals (e.g. for biodiversity data, GBIF or iDigBio) necessitates increased coordination among data collectors and institutions. As a consequence, ``data'' can now mean anything from petabytes of information stored in professionally-maintained databases, through spreadsheets on a single computer, to hand-written tables in lab notebooks on shelves. All remain important, but data curation practices must continue to keep pace with the changes brought about by new forms and practices of data collection and storage.},
}
% Hoehndorf et al. (2016), Journal of Biomedical Semantics 7(1): the Flora
% Phenotype Ontology (FLOPO) for plant traits described in Floras.
@article{Hoehndorf2016,
  author   = {Hoehndorf, R. and Alshahrani, M. and Gkoutos, G. V. and Gosline, G. and Groom, Q. and Hamann, T. and Kattge, J. and de Oliveira, S. M. and Schmidt, M. and Sierra, S. and Smets, E. and Vos, R. A. and Weiland, C.},
  title    = {The flora phenotype ontology ({FLOPO}): tool for integrating morphological traits and phenotypes of vascular plants},
  journal  = {Journal of Biomedical Semantics},
  year     = {2016},
  month    = dec,
  volume   = {7},
  number   = {1},
  pages    = {65},
  issn     = {2041-1480},
  doi      = {10.1186/s13326-016-0107-8},
  url      = {http://jbiomedsem.biomedcentral.com/articles/10.1186/s13326-016-0107-8},
  abstract = {The systematic analysis of a large number of comparable plant trait data can support investigations into phylogenetics and ecological adaptation, with broad applications in evolutionary biology, agriculture, conservation, and the functioning of ecosystems. Floras, i.e., books collecting the information on all known plant species found within a region, are a potentially rich source of such plant trait data. Floras describe plant traits with a focus on morphology and other traits relevant for species identification in addition to other characteristics of plant species, such as ecological affinities, distribution, economic value, health applications, traditional uses, and so on. However, a key limitation in systematically analyzing information in Floras is the lack of a standardized vocabulary for the described traits as well as the difficulties in extracting structured information from free text. We have developed the Flora Phenotype Ontology (FLOPO), an ontology for describing traits of plant species found in Floras. We used the Plant Ontology (PO) and the Phenotype And Trait Ontology (PATO) to extract entity-quality relationships from digitized taxon descriptions in Floras, and used a formal ontological approach based on phenotype description patterns and automated reasoning to generate the FLOPO. The resulting ontology consists of 25,407 classes and is based on the PO and PATO. The classified ontology closely follows the structure of Plant Ontology in that the primary axis of classification is the observed plant anatomical structure, and more specific traits are then classified based on parthood and subclass relations between anatomical structures as well as subclass relations between phenotypic qualities. The FLOPO is primarily intended as a framework based on which plant traits can be integrated computationally across all species and higher taxa of flowering plants. Importantly, it is not intended to replace established vocabularies or ontologies, but rather serve as an overarching framework based on which different application- and domain-specific ontologies, thesauri and vocabularies of phenotypes observed in flowering plants can be integrated.},
}
% Lariviere et al. (2016), bioRxiv preprint: method for publishing journal
% citation distributions. The pages field carries the bioRxiv article id.
% The citation key stays ASCII; the accent lives only in the author field.
@article{Lariviere2016,
  author   = {Larivi{\`e}re, V. and Kiermer, V. and MacCallum, C. J. and McNutt, M. and Patterson, M. and Pulverer, B. and Swaminathan, S. and Taylor, S. and Curry, S.},
  title    = {A simple proposal for the publication of journal citation distributions},
  journal  = {bioRxiv},
  year     = {2016},
  pages    = {062109},
  doi      = {10.1101/062109},
  url      = {http://biorxiv.org/lookup/doi/10.1101/062109},
  abstract = {Although the Journal Impact Factor (JIF) is widely acknowledged to be a poor indicator of the quality of individual papers, it is used routinely to evaluate research and researchers. Here, we present a simple method for generating the citation distributions that underlie JIFs. Application of this straightforward protocol reveals the full extent of the skew of distributions and variation in citations received by published papers that is characteristic of all scientific journals. Although there are differences among journals across the spectrum of JIFs, the citation distributions overlap extensively, demonstrating that the citation performance of individual papers cannot be inferred from the JIF. We propose that this methodology be adopted by all journals as a move to greater transparency, one that should help to refocus attention on individual pieces of work and counter the inappropriate usage of JIFs during the process of research assessment.},
}
% List, Ebert and Albrecht (2017), PLoS Computational Biology 13(1): Ten Simple
% Rules for developing usable software in computational biology.
% The abstract is an excerpt of the article's introduction.
@article{List2017,
  author    = {List, M. and Ebert, P. and Albrecht, F.},
  title     = {Ten Simple Rules for Developing Usable Software in Computational Biology},
  journal   = {PLoS Computational Biology},
  year      = {2017},
  month     = jan,
  volume    = {13},
  number    = {1},
  pages     = {e1005265},
  editor    = {Markel, Scott},
  publisher = {Public Library of Science},
  issn      = {1553-7358},
  doi       = {10.1371/journal.pcbi.1005265},
  pmid      = {28056032},
  url       = {https://doi.org/10.1371/journal.pcbi.1005265},
  abstract  = {The rise of high-throughput technologies in molecular biology has led to a massive amount of publicly available data. While computational method development has been a cornerstone of biomedical research for decades, the rapid technological progress in the wet lab makes it difficult for software development to keep pace. Wet lab scientists rely heavily on computational methods, especially since more research is now performed in silico. However, suitable tools do not always exist, and not everyone has the skills to write complex software. Computational biologists are required to close this gap, but they often lack formal training in software engineering. To alleviate this, several related challenges have been previously addressed in the Ten Simple Rules series, including reproducibility [1], effectiveness [2], and open-source development of software [3, 4].},
}
% Malone et al. (2016), PLoS Computational Biology 12(2): Ten Simple Rules for
% selecting a bio-ontology.
@article{Malone2016,
  author    = {Malone, J. and Stevens, R. and Jupp, S. and Hancocks, T. and Parkinson, H. and Brooksbank, C.},
  title     = {Ten Simple Rules for Selecting a Bio-ontology},
  journal   = {PLoS Computational Biology},
  year      = {2016},
  month     = feb,
  volume    = {12},
  number    = {2},
  pages     = {e1004743},
  editor    = {Markel, Scott},
  publisher = {Public Library of Science},
  issn      = {1553-7358},
  doi       = {10.1371/journal.pcbi.1004743},
  pmid      = {26867217},
  url       = {https://doi.org/10.1371/journal.pcbi.1004743},
}
% Markowetz (2017), PLoS Biology 15(3): essay arguing that all biology is
% computational biology.
@article{Markowetz2017,
  author    = {Markowetz, F.},
  title     = {All biology is computational biology},
  journal   = {PLoS Biology},
  year      = {2017},
  month     = mar,
  volume    = {15},
  number    = {3},
  pages     = {e2002050},
  publisher = {Public Library of Science},
  issn      = {1545-7885},
  doi       = {10.1371/journal.pbio.2002050},
  pmid      = {28278152},
  url       = {https://doi.org/10.1371/journal.pbio.2002050},
  abstract  = {Here, I argue that computational thinking and techniques are so central to the quest of understanding life that today all biology is computational biology. Computational biology brings order into our understanding of life, it makes biological concepts rigorous and testable, and it provides a reference map that holds together individual insights. The next modern synthesis in biology will be driven by mathematical, statistical, and computational methods being absorbed into mainstream biological training, turning biology into a quantitative science.},
}
% Michener (2015), PLoS Computational Biology 11(10): Ten Simple Rules for
% creating a data management plan.
@article{Michener2015,
  author    = {Michener, W. K.},
  title     = {Ten Simple Rules for Creating a Good Data Management Plan},
  journal   = {PLoS Computational Biology},
  year      = {2015},
  month     = oct,
  volume    = {11},
  number    = {10},
  pages     = {e1004525},
  editor    = {Bourne, Philip E.},
  publisher = {Public Library of Science},
  issn      = {1553-7358},
  doi       = {10.1371/journal.pcbi.1004525},
  pmid      = {26492633},
  url       = {https://doi.org/10.1371/journal.pcbi.1004525},
  abstract  = {In this article, the author offers suggestions on improving data management during research. Topics discussed include creating data management plan (DMP) by identifying the data to be collected, understanding the organization of the data collected, improving the quality of products by taking adequate measures for quality control, and development of a data storage strategy with reference to accessibility and preservation.},
}
% Mobley et al. (2013), PLoS ONE 8(5): survey on data reproducibility among
% faculty and trainees at MD Anderson Cancer Center.
@article{Mobley2013,
  author    = {Mobley, A. and Linder, S. K. and Braeuer, R. and Ellis, L. M. and Zwelling, L.},
  title     = {A Survey on Data Reproducibility in Cancer Research Provides Insights into Our Limited Ability to Translate Findings from the Laboratory to the Clinic},
  journal   = {PLoS ONE},
  year      = {2013},
  month     = may,
  volume    = {8},
  number    = {5},
  pages     = {e63221},
  editor    = {Arakawa, Hirofumi},
  publisher = {Public Library of Science},
  issn      = {1932-6203},
  doi       = {10.1371/journal.pone.0063221},
  pmid      = {23691000},
  url       = {https://doi.org/10.1371/journal.pone.0063221},
  abstract  = {BACKGROUND: The pharmaceutical and biotechnology industries depend on findings from academic investigators prior to initiating programs to develop new diagnostic and therapeutic agents to benefit cancer patients. The success of these programs depends on the validity of published findings. This validity, represented by the reproducibility of published findings, has come into question recently as investigators from companies have raised the issue of poor reproducibility of published results from academic laboratories. Furthermore, retraction rates in high impact journals are climbing. METHODS AND FINDINGS: To examine a microcosm of the academic experience with data reproducibility, we surveyed the faculty and trainees at MD Anderson Cancer Center using an anonymous computerized questionnaire; we sought to ascertain the frequency and potential causes of non-reproducible data. We found that $\sim$50{\%} of respondents had experienced at least one episode of the inability to reproduce published data; many who pursued this issue with the original authors were never able to identify the reason for the lack of reproducibility; some were even met with a less than "collegial" interaction. CONCLUSIONS: These results suggest that the problem of data reproducibility is real. Biomedical science needs to establish processes to decrease the problem and adjudicate discrepancies in findings when they are discovered.},
}
% Noble (2009), PLoS Computational Biology 5(7): guide to organizing files,
% directories and notes for computational biology projects.
@article{Noble2009,
  author    = {Noble, W. S.},
  title     = {A quick guide to organizing computational biology projects},
  journal   = {PLoS Computational Biology},
  year      = {2009},
  month     = jul,
  volume    = {5},
  number    = {7},
  pages     = {e1000424},
  editor    = {Lewitter, Fran},
  publisher = {Public Library of Science},
  issn      = {1553-734X},
  doi       = {10.1371/journal.pcbi.1000424},
  pmid      = {19649301},
  url       = {https://doi.org/10.1371/journal.pcbi.1000424},
  abstract  = {Most bioinformatics coursework focuses on algorithms, with perhaps some components devoted to learning programming skills and learning how to use existing bioinformatics software. Unfortunately, for students who are preparing for a research career, this type of curriculum fails to address many of the day-to-day organizational challenges associated with performing computational experiments. In practice, the principles behind organizing and documenting computational experiments are often learned on the fly, and this learning is strongly influenced by personal predilections as well as by chance interactions with collaborators or colleagues. The purpose of this article is to describe one good strategy for carrying out computational experiments. I will not describe profound issues such as how to formulate hypotheses, design experiments, or draw conclusions. Rather, I will focus on relatively mundane issues such as organizing files and directories and documenting progress. These issues are important because poor organizational choices can lead to significantly slower research progress. I do not claim that the strategies I outline here are optimal. These are simply the principles and practices that I have developed over 12 years of bioinformatics research, augmented with various suggestions from other researchers with whom I have discussed these issues.},
}
% Nosek et al. (2015), Science 348(6242): journal guidelines for promoting
% transparency, openness and reproducibility.
% pmcid is not a standard field; it preserves the PubMed Central id that was
% previously embedded in the url field.
@article{Nosek2015,
  author    = {Nosek, B. A. and Alter, G. and Banks, G. C. and Borsboom, D. and Bowman, S. D. and Breckler, S. J. and Buck, S. and Chambers, C. D. and Chin, G. and Christensen, G. and Contestabile, M. and Dafoe, A. and Eich, E. and Freese, J. and Glennerster, R. and Goroff, D. and Green, D. P. and Hesse, B. and Humphreys, M. and Ishiyama, J. and Karlan, D. and Kraut, A. and Lupia, A. and Mabry, P. and Madon, T. and Malhotra, N. and Mayo-Wilson, E. and McNutt, M. and Miguel, E. and Paluck, E. L. and Simonsohn, U. and Soderberg, C. and Spellman, B. A. and Turitto, J. and VandenBos, G. and Vazire, S. and Wagenmakers, E. J. and Wilson, R. and Yarkoni, T.},
  title     = {Promoting an open research culture},
  journal   = {Science},
  year      = {2015},
  month     = jun,
  volume    = {348},
  number    = {6242},
  pages     = {1422--1425},
  publisher = {American Association for the Advancement of Science},
  issn      = {0036-8075},
  doi       = {10.1126/science.aab2374},
  pmid      = {26113702},
  pmcid     = {PMC4550299},
  url       = {https://doi.org/10.1126/science.aab2374},
  abstract  = {Author guidelines for journals could help to promote transparency, openness, and reproducibility},
}
% Pepe, Cantiello and Nicholson (2017), Authorea preprint on the future of
% preprint servers. It has no volume or pages; the DOI is the stable locator.
@article{Pepe2017,
  author    = {Pepe, A. and Cantiello, M. and Nicholson, J.},
  title     = {The {arXiv} of the future will not look like the {arXiv}},
  journal   = {Authorea preprint},
  year      = {2017},
  publisher = {Authorea, Inc.},
  doi       = {10.22541/au.149693987.70506124},
  url       = {https://doi.org/10.22541/au.149693987.70506124},
}
% Perez-Riverol et al. (2016), PLoS Computational Biology 12(7): Ten Simple
% Rules for using git and GitHub.
@article{Perez2016,
  author    = {Perez-Riverol, Y. and Gatto, L. and Wang, R. and Sachsenberg, T. and Uszkoreit, J. and Leprevost, F. and Fufezan, C. and Ternent, T. and Eglen, S. J. and Katz, D. S. and Pollard, T. J. and Konovalov, A. and Flight, R. M. and Blin, K. and Vizca{\'i}no, J. A.},
  title     = {Ten Simple Rules for Taking Advantage of {Git} and {GitHub}},
  journal   = {PLoS Computational Biology},
  year      = {2016},
  month     = jul,
  volume    = {12},
  number    = {7},
  pages     = {e1004947},
  editor    = {Markel, Scott},
  publisher = {Public Library of Science},
  issn      = {1553-7358},
  doi       = {10.1371/journal.pcbi.1004947},
  pmid      = {27415786},
  url       = {https://doi.org/10.1371/journal.pcbi.1004947},
  abstract  = {A "Ten Simple Rules" guide to git and GitHub. We describe and provide examples on how to use these software to track projects, as users, teams and organizations. We document collaborative development using branching and forking, interaction between collaborators using issues and continuous integration and automation using, for example, Travis CI and codecov. We also describe dissemination and social aspects of GitHub such as GitHub pages, following and watching repositories, and give advice on how to make code citable.},
}
% Piwowar and Vision (2013), PeerJ 1: citation advantage of openly archived
% gene expression microarray data.
% preprintdoi is a non-standard field that styles ignore; it keeps the PeerJ
% preprint DOI without presenting it as an arXiv identifier.
@article{Piwowar2013,
  author      = {Piwowar, H. A. and Vision, T. J.},
  title       = {Data reuse and the open data citation advantage},
  journal     = {PeerJ},
  year        = {2013},
  month       = oct,
  volume      = {1},
  pages       = {e175},
  publisher   = {PeerJ Inc.},
  issn        = {2167-8359},
  doi         = {10.7717/peerj.175},
  pmid        = {24109559},
  preprintdoi = {10.7287/peerj.preprints.270v1},
  url         = {https://peerj.com/articles/175},
  keywords    = {Bibliometrics,Data archiving,Data repositories,Data reuse,Gene expression microarray,Incentives,Information science,Open data},
  abstract    = {Background. Attribution to the original contributor upon reuse of published data is important both as a reward for data creators and to document the provenance of research findings. Previous studies have found that papers with publicly available datasets receive a higher number of citations than similar studies without available data. However, few previous analyses have had the statistical power to control for the many variables known to predict citation rate, which has led to uncertain estimates of the "citation benefit". Furthermore, little is known about patterns in data reuse over time and across datasets. Method and Results. Here, we look at citation rates while controlling for many known citation predictors and investigate the variability of data reuse. In a multivariate regression on 10,555 studies that created gene expression microarray data, we found that studies that made data available in a public repository received 9{\%} (95{\%} confidence interval: 5{\%} to 13{\%}) more citations than similar studies for which the data was not made available. Date of publication, journal impact factor, open access status, number of authors, first and last author publication history, corresponding author country, institution citation history, and study topic were included as covariates. The citation benefit varied with date of dataset deposition: a citation benefit was most clear for papers published in 2004 and 2005, at about 30{\%}. Authors published most papers using their own datasets within two years of their first publication on the dataset, whereas data reuse papers published by third-party investigators continued to accumulate for at least six years. To study patterns of data reuse directly, we compiled 9,724 instances of third party data reuse via mention of GEO or ArrayExpress accession numbers in the full text of papers. The level of third-party data use was high: for 100 datasets deposited in year 0, we estimated that 40 papers in PubMed reused a dataset by year 2, 100 by year 4, and more than 150 data reuse papers had been published by year 5. Data reuse was distributed across a broad base of datasets: a very conservative estimate found that 20{\%} of the datasets deposited between 2003 and 2007 had been reused at least once by third parties. Conclusion. After accounting for other factors affecting citation rate, we find a robust citation benefit from open data, although a smaller one than previously reported. We conclude there is a direct effect of third-party data reuse that persists for years beyond the time when researchers have published most of the papers reusing their own data. Other factors that may also contribute to the citation benefit are considered. We further conclude that, at least for gene expression microarray data, a substantial fraction of archived datasets are reused, and that the intensity of dataset reuse has been steadily increasing since 2003.},
}
% Pond et al. (2009), Genome Research 19(11): phylogenetic profiling of metagenomic samples with the Galaxy pipeline.
@article{Pond2009,
abstract = {How many species inhabit our immediate surroundings? A straightforward collection technique suitable for answering this question is known to anyone who has ever driven a car at highway speeds. The windshield of a moving vehicle is subjected to numerous insect strikes and can be used as a collection device for representative sampling. Unfortunately the analysis of biological material collected in that manner, as with most metagenomic studies, proves to be rather demanding due to the large number of required tools and considerable computational infrastructure. In this study, we use organic matter collected by a moving vehicle to design and test a comprehensive pipeline for phylogenetic profiling of metagenomic samples that includes all steps from processing and quality control of data generated by next-generation sequencing technologies to statistical analyses and data visualization. To the best of our knowledge, this is also the first publication that features a live online supplement providing access to exact analyses and workflows used in the article.},
author = {Pond, S. K. and Wadhawan, S. and Chiaromonte, F. and Ananda, G. and Chung, W. Y. and Taylor, J. and Nekrutenko, A.},
doi = {10.1101/gr.094508.109},
issn = {1088-9051},
journal = {Genome Research},
month = nov,
number = {11},
pages = {2144--2153},
pmid = {19819906},
publisher = {Cold Spring Harbor Laboratory Press},
title = {Windshield splatter analysis with the {Galaxy} metagenomic pipeline},
url = {http://www.ncbi.nlm.nih.gov/pubmed/19819906},
volume = {19},
year = {2009}
}
% Sandve et al. (2013), PLoS Computational Biology 9(10): ten simple rules for reproducible computational research.
@article{Sandve2013,
author = {Sandve, G. K. and Nekrutenko, A. and Taylor, J. and Hovig, E.},
doi = {10.1371/journal.pcbi.1003285},
editor = {Bourne, Philip E.},
issn = {1553-734X},
journal = {PLoS Computational Biology},
month = oct,
number = {10},
pages = {e1003285},
pmid = {24204232},
publisher = {Public Library of Science},
title = {Ten Simple Rules for Reproducible Computational Research},
url = {http://dx.plos.org/10.1371/journal.pcbi.1003285},
volume = {9},
year = {2013}
}
% Taschuk and Wilson (2017), PLoS Computational Biology 13(4): ten simple rules for robust research software; arXiv eprint 1610.04546.
@article{Taschuk2017,
abstract = {Software produced for research, published and otherwise, suffers from a number of common problems that make it difficult or impossible to run outside the original institution, or even off the primary developer's computer. We present ten simple rules to make such software robust enough to run anywhere, and inspire confidence in your reproducibility, and thereby delight your users and collaborators.},
archivePrefix = {arXiv},
arxivId = {1610.04546},
author = {Taschuk, M. and Wilson, G.},
doi = {10.1371/journal.pcbi.1005412},
eprint = {1610.04546},
issn = {1553-7358},
journal = {PLoS Computational Biology},
month = apr,
number = {4},
pages = {e1005412},
pmid = {28407023},
publisher = {Public Library of Science},
title = {Ten simple rules for making research software more robust},
url = {http://dx.plos.org/10.1371/journal.pcbi.1005412},
volume = {13},
year = {2017}
}
% Taylor et al. (2007), Nature Biotechnology 25(8): the MIAPE reporting guidelines for proteomics experiments.
@article{Taylor2007,
abstract = {Both the generation and the analysis of proteomics data are now widespread, and high-throughput approaches are commonplace. Protocols continue to increase in complexity as methods and technologies evolve and diversify. To encourage the standardized collection, integration, storage and dissemination of proteomics data, the Human Proteome Organization's Proteomics Standards Initiative develops guidance modules for reporting the use of techniques such as gel electrophoresis and mass spectrometry. This paper describes the processes and principles underpinning the development of these modules; discusses the ramifications for various interest groups such as experimentalists, funders, publishers and the private sector; addresses the issue of overlap with other reporting guidelines; and highlights the criticality of appropriate tools and resources in enabling 'MIAPE-compliant' reporting.},
author = {Taylor, C. F. and Paton, N. W. and Lilley, K. S. and Binz, P.-A. and Julian, R. K. and Jones, A. R. and Zhu, W. and Apweiler, R. and Aebersold, R. and Deutsch, E. W. and Dunn, M. J. and Heck, A. J. R. and Leitner, A. and Macht, M. and Mann, M. and Martens, L. and Neubert, A. A. and Patterson, S. D. and Ping, P. and Seymour, S. L. and Souda, P. and Tsugita, A. and Vandekerckhove, J. and Vondriska, T. M. and Whitelegge, J. P. and Wilkins, M. R. and Xenarios, I. and Yates, J. R. and Hermjakob, H.},
doi = {10.1038/nbt1329},
issn = {1087-0156},
journal = {Nature Biotechnology},
month = aug,
number = {8},
pages = {887--893},
pmid = {17687369},
publisher = {Nature Publishing Group},
title = {The minimum information about a proteomics experiment ({MIAPE})},
url = {http://www.nature.com/doifinder/10.1038/nbt1329},
volume = {25},
year = {2007}
}
% Weinberger, Evans and Allesina (2015), PLoS Computational Biology 11(4): empirical rules for writing science.
% The abstract field holds the epigraph quoted from Robert Boyle; its period spelling is intentional.
@article{Weinberger2015,
abstract = {``...though a Philosopher need not be sollicitous that his style should delight its Reader with his Floridnesse, yet I think he may very well be allow'd to take a Care that it disgust not his Reader by its Flatness, especially when he does not so much deliver Experiments or explicate them, as make Reflections or Discourses on them; for on such Occasions he may be allow'd the liberty of recreating his Reader and himself, and manifesting that he declin'd the Ornaments of Language, not out of Necessity, but Discretion...''---Robert Boyle, Pro{\"{e}}mial Essay},
author = {Weinberger, C. J. and Evans, J. A. and Allesina, S.},
doi = {10.1371/journal.pcbi.1004205},
issn = {1553-7358},
journal = {PLoS Computational Biology},
month = apr,
number = {4},
pages = {e1004205},
pmid = {25928031},
publisher = {Public Library of Science},
title = {Ten Simple (Empirical) Rules for Writing Science},
url = {http://dx.plos.org/10.1371/journal.pcbi.1004205},
volume = {11},
year = {2015}
}
% Wilkinson et al. (2016), Scientific Data 3: first formal publication of the FAIR Data Principles.
@article{Wilkinson2016,
abstract = {There is an urgent need to improve the infrastructure supporting the reuse of scholarly data. A diverse set of stakeholders---representing academia, industry, funding agencies, and scholarly publishers---have come together to design and jointly endorse a concise and measureable set of principles that we refer to as the FAIR Data Principles. The intent is that these may act as a guideline for those wishing to enhance the reusability of their data holdings. Distinct from peer initiatives that focus on the human scholar, the FAIR Principles put specific emphasis on enhancing the ability of machines to automatically find and use the data, in addition to supporting its reuse by individuals. This Comment is the first formal publication of the FAIR Principles, and includes the rationale behind them, and some exemplar implementations in the community.},
author = {Wilkinson, M. D. and Dumontier, M. and Aalbersberg, I. J. J. and Appleton, G. and Axton, M. and Baak, A. and Blomberg, N. and Boiten, J.-W. and {da Silva Santos}, L. B. and Bourne, P. E. and Bouwman, J. and Brookes, A. J. and Clark, T. and Crosas, M. and Dillo, I. and Dumon, O. and Edmunds, S. and Evelo, C. T. and Finkers, R. and Gonzalez-Beltran, A. and Gray, A. J. G. and Groth, P. and Goble, C. and Grethe, J. S. and Heringa, J. and {'t Hoen}, P. A. C. and Hooft, R. and Kuhn, T. and Kok, R. and Kok, J. and Lusher, S. J. and Martone, M. E. and Mons, A. and Packer, A. L. and Persson, B. and Rocca-Serra, P. and Roos, M. and van Schaik, R. and Sansone, S.-A. and Schultes, E. and Sengstag, T. and Slater, T. and Strawn, G. and Swertz, M. A. and Thompson, M. and van der Lei, J. and van Mulligen, E. and Velterop, J. and Waagmeester, A. and Wittenburg, P. and Wolstencroft, K. and Zhao, J. and Mons, B.},
doi = {10.1038/sdata.2016.18},
issn = {2052-4463},
journal = {Scientific Data},
month = mar,
pages = {160018},
pmid = {26978244},
title = {The {FAIR} Guiding Principles for scientific data management and stewardship},
url = {http://www.nature.com/articles/sdata201618},
volume = {3},
year = {2016}
}
% Zhang (2014), PLoS Computational Biology 10(1): ten simple rules for writing research papers.
% The abstract is an excerpt that ends in an ellipsis in the source record.
@article{Zhang2014,
abstract = {The importance of writing well can never be overstated for a successful professional career, and the ability to write solid papers is an essential trait of a productive researcher. Writing and publishing a paper has its own life cycle; properly following a course of action and ...},
author = {Zhang, W.},
doi = {10.1371/journal.pcbi.1003453},
editor = {Bourne, Philip E.},
issn = {1553-734X},
journal = {PLoS Computational Biology},
month = jan,
number = {1},
pages = {e1003453},
pmid = {24499936},
publisher = {Public Library of Science},
title = {Ten Simple Rules for Writing Research Papers},
url = {http://dx.plos.org/10.1371/journal.pcbi.1003453},
volume = {10},
year = {2014}
}
% Ziemann, Eren and El-Osta (2016), Genome Biology 17(1): gene name conversion errors caused by Microsoft Excel.
@article{Ziemann2016,
abstract = {The spreadsheet software Microsoft Excel, when used with default settings, is known to convert gene names to dates and floating-point numbers. A programmatic scan of leading genomics journals reveals that approximately one-fifth of papers with supplementary Excel gene lists contain erroneous gene name conversions.},
author = {Ziemann, M. and Eren, Y. and El-Osta, A.},
doi = {10.1186/s13059-016-1044-7},
issn = {1474-760X},
journal = {Genome Biology},
month = dec,
number = {1},
pages = {177},
pmid = {27552985},
title = {Gene name errors are widespread in the scientific literature},
url = {http://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1044-7},
volume = {17},
year = {2016}
}